import os
import re
from concurrent.futures import ThreadPoolExecutor

# http(s) URL heuristic: the first character after "://" must not be
# whitespace or a URL delimiter; the match then runs to the next whitespace.
# Compiled once at module level instead of on every call.
URL_PATTERN = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)


def filter_and_extract_urls(input_file_path, output_file_path):
    """Extract the unique host part of every URL found in a text file.

    Reads *input_file_path* line by line, finds all http(s) URLs, keeps only
    the host component (the text between "://" and the next "/"), and writes
    the de-duplicated hosts to *output_file_path*, one per line, sorted for
    deterministic output.

    Prints a message and returns early if the input file does not exist.
    """
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    unique_urls = set()
    # Explicit encoding so behavior does not depend on the platform default;
    # errors='replace' keeps one bad byte from aborting the whole job.
    with open(input_file_path, 'r', encoding='utf-8', errors='replace') as input_file:
        for line in input_file:
            for url in URL_PATTERN.findall(line):
                # "scheme://host/rest".split('/') -> element 2 is the host.
                unique_urls.add(url.split('/')[2])

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        # Sets have no stable iteration order; sort for reproducible files.
        for unique_url in sorted(unique_urls):
            output_file.write(unique_url + '\n')

    print(f"Extracted and filtered URLs in {input_file_path}. Unique filtered URLs saved to {output_file_path}.")


def process_directory(directory_path, output_directory, concurrency):
    """Run filter_and_extract_urls over every .txt file in *directory_path*.

    Each input file ``name.txt`` produces ``filtered_name.txt`` inside
    *output_directory*. Files are processed by a thread pool of size
    *concurrency*; any worker exception is re-raised here via ``result()``.
    """
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Make sure the destination exists before workers try to write into it;
    # the original crashed in every worker when it was missing.
    os.makedirs(output_directory, exist_ok=True)

    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(
                filter_and_extract_urls,
                os.path.join(directory_path, text_file),
                os.path.join(output_directory, f"filtered_{text_file}"),
            )
            for text_file in text_files
        ]
        # Block until every file is done and surface any worker exception.
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")


def main():
    """Prompt for the input/output directories and concurrency, then run."""
    directory_path = input("Enter the path to the directory containing text files: ")
    output_directory = input("Enter the path to the directory for saving filtered URLs: ")
    concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))
    process_directory(directory_path, output_directory, concurrency)


if __name__ == "__main__":
    # Guarded so importing this module does not trigger the interactive prompts.
    main()