All_URL_Extractor/all_url_extractor.py

import os
import gzip
import re
from multiprocessing import Pool

def extract_urls_from_line(line):
    # Extract URLs using regular expression
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return re.findall(url_pattern, line)

def process_file(file_path, output_directory):
    # Create the output file path with '_urls.txt' extension in the specified output directory
    output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')

    # Process the file line by line and extract URLs
    with gzip.open(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:
        for line in file:
            # Extract URLs from the line
            line_urls = extract_urls_from_line(line)
            
            # Write the URLs to the output file
            output_file.write('\n'.join(line_urls) + '\n')

    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")

    # Remove the original gzipped file
    os.remove(file_path)

def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
    # Get the list of files in the directory and sort them
    file_list = sorted(os.listdir(directory_path))

    # Create a multiprocessing Pool with the specified concurrency level
    pool = Pool(processes=concurrency_level)

    # Map the file processing function to the list of files with '.txt', '.txt.gz', and '.csv.gz' extensions
    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))])

    # Close the pool to free up resources
    pool.close()
    pool.join()

# Prompt the user to input the directory containing the files
directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")

# Prompt the user to input the output directory
output_directory = input("Enter the output directory path to store the extracted URLs files: ")

# Prompt the user to input the concurrency level
concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))

# Extract URLs from the '.txt', '.txt.gz', and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
extract_urls_from_directory(directory_path, output_directory, concurrency_level)
Updated script to support .txt files 2023-12-13 11:32:17 +00:00			`import os`
			`import gzip`
			`import re`
			`from multiprocessing import Pool`

			`def extract_urls_from_line(line):`
			`# Extract URLs using regular expression`
			`url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\\(\\),]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+')`
			`return re.findall(url_pattern, line)`

			`def process_file(file_path, output_directory):`
			`# Create the output file path with '_urls.txt' extension in the specified output directory`
			`output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')`

			`# Process the file line by line and extract URLs`
			`with gzip.open(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:`
			`for line in file:`
			`# Extract URLs from the line`
			`line_urls = extract_urls_from_line(line)`

			`# Write the URLs to the output file`
			`output_file.write('\n'.join(line_urls) + '\n')`

			`print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")`

			`# Remove the original gzipped file`
			`os.remove(file_path)`

			`def extract_urls_from_directory(directory_path, output_directory, concurrency_level):`
			`# Get the list of files in the directory and sort them`
			`file_list = sorted(os.listdir(directory_path))`

			`# Create a multiprocessing Pool with the specified concurrency level`
			`pool = Pool(processes=concurrency_level)`

			`# Map the file processing function to the list of files with '.txt', '.txt.gz', and '.csv.gz' extensions`
			`pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))])`

			`# Close the pool to free up resources`
			`pool.close()`
			`pool.join()`

			`# Prompt the user to input the directory containing the files`
			`directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")`

			`# Prompt the user to input the output directory`
			`output_directory = input("Enter the output directory path to store the extracted URLs files: ")`

			`# Prompt the user to input the concurrency level`
			`concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))`

			`# Extract URLs from the '.txt', '.txt.gz', and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level`
			`extract_urls_from_directory(directory_path, output_directory, concurrency_level)`