import os
import gzip
import re
from multiprocessing import Pool

# Compiled once at import time so the per-line hot path does not pay a module
# lookup / recompile on every call.
# NOTE(review): the char class `[$-_@.&+]` is a *range* from '$' (0x24) to
# '_' (0x5F), which already covers digits, uppercase letters and much
# punctuation — kept exactly as-is to preserve the original matching behavior.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def extract_urls_from_line(line):
    """Return the (possibly empty) list of http(s) URLs found in *line*."""
    return URL_PATTERN.findall(line)


def process_file(file_path, output_directory):
    """Extract URLs from one gzipped text file and write them to a sidecar file.

    The output file is named after the input with the '.gz' suffix replaced by
    '_urls.txt' and is created inside *output_directory*. On success the
    original gzipped file is deleted (original behavior, preserved).

    Args:
        file_path: path to a '.txt.gz' or '.csv.gz' file.
        output_directory: directory that receives the '*_urls.txt' file.
    """
    # 'sample.txt.gz' -> 'sample.txt' -> 'sample.txt_urls.txt'
    output_file_path = os.path.join(
        output_directory,
        os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt',
    )

    # latin-1 never raises on decode, so arbitrary binary junk in the archive
    # cannot abort the run.
    with gzip.open(file_path, 'rt', encoding='latin-1') as file, \
            open(output_file_path, 'w') as output_file:
        for line in file:
            line_urls = extract_urls_from_line(line)
            # Fix: only write when URLs were found. The original wrote
            # '\n'.join([]) + '\n' — a blank line — for every input line
            # that contained no URL, bloating the output with empty lines.
            if line_urls:
                output_file.write('\n'.join(line_urls) + '\n')

    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")

    # Only reached when extraction completed without raising; a failed file
    # is left in place instead of being silently destroyed.
    os.remove(file_path)


def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
    """Process every '.txt.gz'/'.csv.gz' file in *directory_path* in parallel.

    Args:
        directory_path: directory scanned (non-recursively) for input files.
        output_directory: directory that receives the '*_urls.txt' outputs.
        concurrency_level: number of worker processes in the pool.
    """
    # Sort so work is dispatched in a deterministic order.
    file_list = sorted(os.listdir(directory_path))

    tasks = [
        (os.path.join(directory_path, filename), output_directory)
        for filename in file_list
        if filename.lower().endswith(('.txt.gz', '.csv.gz'))
    ]

    # Context manager guarantees pool teardown even if a worker raises;
    # starmap blocks until every task has finished.
    with Pool(processes=concurrency_level) as pool:
        pool.starmap(process_file, tasks)


if __name__ == '__main__':
    # Fix: the interactive entry point must be guarded. multiprocessing on
    # spawn-based platforms (Windows, macOS default) re-imports this module in
    # every worker; without the guard each worker would re-run the input()
    # prompts and try to spawn its own pool.
    directory_path = input("Enter the directory path containing the '.txt.gz' and '.csv.gz' files to extract URLs from: ")

    output_directory = input("Enter the output directory path to store the extracted URLs files: ")

    concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))

    extract_urls_from_directory(directory_path, output_directory, concurrency_level)