import os
import gzip
import re
from multiprocessing import Pool

# Compiled once at module level so the pattern is not rebuilt for every
# input line (the original compiled it on each call).
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def extract_urls_from_line(line):
    """Return a list of every http/https URL found in *line*.

    Parameters:
        line: a single line of text to scan.

    Returns:
        list[str] of matched URL substrings (empty if none found).
    """
    return URL_PATTERN.findall(line)


def process_file(file_path, output_directory):
    """Extract URLs from one file and save them to '<stem>_urls.txt'.

    Reads *file_path* line by line, writes each line's URLs (one per line)
    into *output_directory*, then deletes the original file.

    Parameters:
        file_path: path to a '.txt', '.txt.gz', or '.csv.gz' input file.
        output_directory: directory where '<stem>_urls.txt' is written.

    Side effects:
        Creates the output file, prints a progress message, and REMOVES
        the input file (intentional behavior of this script).
    """
    # Output file: same base name with '_urls.txt', placed in output_directory.
    output_file_path = os.path.join(
        output_directory,
        os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')

    # BUG FIX: plain '.txt' files are not gzip-compressed; opening them with
    # gzip.open() raises gzip.BadGzipFile. Pick the opener by extension.
    opener = gzip.open if file_path.lower().endswith('.gz') else open

    with opener(file_path, 'rt', encoding='latin-1') as file, \
            open(output_file_path, 'w') as output_file:
        for line in file:
            line_urls = extract_urls_from_line(line)
            # BUG FIX: only write when URLs were found — previously every
            # URL-free input line produced a blank line in the output.
            if line_urls:
                output_file.write('\n'.join(line_urls) + '\n')

    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")

    # Remove the original input file (kept from the original script).
    os.remove(file_path)


def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
    """Process every matching file in *directory_path* in parallel.

    Parameters:
        directory_path: directory scanned for '.txt', '.txt.gz', '.csv.gz' files.
        output_directory: where per-file '<stem>_urls.txt' outputs are written.
        concurrency_level: number of worker processes in the pool.
    """
    # Sort so files are dispatched in a deterministic order.
    file_list = sorted(os.listdir(directory_path))

    tasks = [
        (os.path.join(directory_path, filename), output_directory)
        for filename in file_list
        if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))
    ]

    # Context manager guarantees the pool is cleaned up even on error
    # (the original called close()/join() manually with no error handling).
    with Pool(processes=concurrency_level) as pool:
        pool.starmap(process_file, tasks)


# BUG FIX: the main guard is REQUIRED when using multiprocessing — on
# spawn-based platforms (Windows, macOS) each worker re-imports this module,
# and without the guard the input() prompts and pool creation would run
# again in every child process.
if __name__ == "__main__":
    # Prompt the user to input the directory containing the files
    directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")
    # Prompt the user to input the output directory
    output_directory = input("Enter the output directory path to store the extracted URLs files: ")
    # Prompt the user to input the concurrency level
    concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))
    # Extract URLs from the matching files with the specified concurrency level
    extract_urls_from_directory(directory_path, output_directory, concurrency_level)