diff --git a/all_url_extractor.py b/all_url_extractor.py
index 90e2050..3454d72 100644
--- a/all_url_extractor.py
+++ b/all_url_extractor.py
@@ -1,53 +1,53 @@
-import os
-import gzip
-import re
-from multiprocessing import Pool
-
-def extract_urls_from_line(line):
-    # Extract URLs using regular expression
-    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
-    return re.findall(url_pattern, line)
-
-def process_file(file_path, output_directory):
-    # Create the output file path with '_urls.txt' extension in the specified output directory
-    output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')
-
-    # Process the file line by line and extract URLs
-    with gzip.open(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:
-        for line in file:
-            # Extract URLs from the line
-            line_urls = extract_urls_from_line(line)
-
-            # Write the URLs to the output file
-            output_file.write('\n'.join(line_urls) + '\n')
-
-    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")
-
-    # Remove the original gzipped file
-    os.remove(file_path)
-
-def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
-    # Get the list of files in the directory and sort them
-    file_list = sorted(os.listdir(directory_path))
-
-    # Create a multiprocessing Pool with the specified concurrency level
-    pool = Pool(processes=concurrency_level)
-
-    # Map the file processing function to the list of files with '.txt.gz' and '.csv.gz' extensions
-    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt.gz', '.csv.gz'))])
-
-    # Close the pool to free up resources
-    pool.close()
-    pool.join()
-
-# Prompt the user to input the directory containing the files
-directory_path = input("Enter the directory path containing the '.txt.gz' and '.csv.gz' files to extract URLs from: ")
-
-# Prompt the user to input the output directory
-output_directory = input("Enter the output directory path to store the extracted URLs files: ")
-
-# Prompt the user to input the concurrency level
-concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))
-
-# Extract URLs from the '.txt.gz' and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
-extract_urls_from_directory(directory_path, output_directory, concurrency_level)
+import os
+import gzip
+import re
+from multiprocessing import Pool
+
+def extract_urls_from_line(line):
+    # Extract URLs using regular expression
+    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
+    return re.findall(url_pattern, line)
+
+def process_file(file_path, output_directory):
+    # Create the output file path with '_urls.txt' extension in the specified output directory
+    output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')
+
+    # Process the file line by line and extract URLs
+    with gzip.open(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:
+        for line in file:
+            # Extract URLs from the line
+            line_urls = extract_urls_from_line(line)
+
+            # Write the URLs to the output file
+            output_file.write('\n'.join(line_urls) + '\n')
+
+    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")
+
+    # Remove the original gzipped file
+    os.remove(file_path)
+
+def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
+    # Get the list of files in the directory and sort them
+    file_list = sorted(os.listdir(directory_path))
+
+    # Create a multiprocessing Pool with the specified concurrency level
+    pool = Pool(processes=concurrency_level)
+
+    # Map the file processing function to the list of files with '.txt', '.txt.gz', and '.csv.gz' extensions
+    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))])
+
+    # Close the pool to free up resources
+    pool.close()
+    pool.join()
+
+# Prompt the user to input the directory containing the files
+directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")
+
+# Prompt the user to input the output directory
+output_directory = input("Enter the output directory path to store the extracted URLs files: ")
+
+# Prompt the user to input the concurrency level
+concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))
+
+# Extract URLs from the '.txt', '.txt.gz', and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
+extract_urls_from_directory(directory_path, output_directory, concurrency_level)
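
Review note: the new filter admits plain '.txt' files, but process_file still opens every input with gzip.open, so an uncompressed '.txt' file will fail on the first read (gzip.BadGzipFile on Python 3.8+, OSError on earlier versions). A minimal sketch of one way to close the gap, assuming anything without a '.gz' suffix is uncompressed text; open_input is a hypothetical helper, not part of this diff:

    import gzip

    def open_input(file_path):
        # Hypothetical helper, not in the diff. Assumption: only
        # '.gz'-suffixed files are gzip-compressed; everything else
        # (e.g. plain '.txt') is read as ordinary text. latin-1 matches
        # the encoding the script already uses and cannot raise decode
        # errors, since every byte maps to a character.
        if file_path.lower().endswith('.gz'):
            return gzip.open(file_path, 'rt', encoding='latin-1')
        return open(file_path, 'r', encoding='latin-1')

The with-statement in process_file would then read "with open_input(file_path) as file, open(output_file_path, 'w') as output_file:", and the "Remove the original gzipped file" comment should be generalized, since os.remove(file_path) now deletes uncompressed inputs as well.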
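
A smaller, pre-existing quirk on both sides of the diff: when a line contains no URL, '\n'.join(line_urls) is the empty string, yet the trailing '\n' is still written, so every non-matching input line produces a blank line in the output file. A one-line guard in the loop body would avoid that:

            # Only write when the line actually contained URLs; otherwise
            # '\n'.join([]) + '\n' emits a bare newline for every input line.
            if line_urls:
                output_file.write('\n'.join(line_urls) + '\n')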