"""Strip lines that mention image file extensions from text files.

Run as a script, it prompts for an input directory, an output directory and
a concurrency level, then filters every ``.txt`` file in the input directory
into ``filtered_<name>`` in the output directory.
"""
import os
import re
from concurrent.futures import ThreadPoolExecutor

# Image extensions matched as whole words, case-insensitively. Compiled once
# at import so every worker thread reuses the same pattern object.
_IMAGE_EXTENSIONS_PATTERN = re.compile(r'\b(?:jpg|png|gif|jpeg)\b', re.IGNORECASE)


def filter_image_lines(input_file_path, output_file_path):
    """Copy the lines of a file that do not mention an image extension.

    A line is dropped when it contains ``jpg``, ``png``, ``gif`` or ``jpeg``
    as a whole word (any letter case); every other line is written unchanged
    to the output file, which is created or overwritten.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path of the file that receives the kept lines.

    Returns:
        None. If the input file does not exist, a message is printed and no
        output file is created.
    """
    # Checked before opening anything so a missing input never leaves an
    # empty output file behind.
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # UTF-8 with surrogateescape lets bytes that are not valid UTF-8 pass
    # through unchanged instead of aborting the file with a decode error.
    with open(input_file_path, 'r', encoding='utf-8',
              errors='surrogateescape') as input_file, \
            open(output_file_path, 'w', encoding='utf-8',
                 errors='surrogateescape') as output_file:
        output_file.writelines(
            line for line in input_file
            if not _IMAGE_EXTENSIONS_PATTERN.search(line)
        )

    print(f"Filtered lines with image file extensions in {input_file_path}. Filtered URLs saved to {output_file_path}.")


def process_directory(directory_path, output_directory, concurrency):
    """Filter every ``.txt`` file in a directory using a thread pool.

    Each ``<name>.txt`` directly inside ``directory_path`` is passed to
    ``filter_image_lines`` and written to ``output_directory`` as
    ``filtered_<name>.txt``.

    Args:
        directory_path: Directory whose ``.txt`` files are processed.
        output_directory: Directory that receives the filtered files; it is
            created if it does not exist yet.
        concurrency: Maximum number of worker threads.

    Returns:
        None. If the directory is missing or holds no ``.txt`` files, a
        message is printed and nothing is processed.

    Raises:
        Any exception raised while filtering a file is re-raised here once
        that file's task is collected.
    """
    # Report a missing directory the same way a missing input file is reported.
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return

    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Without this, every worker would fail to open its output file when the
    # target directory is missing. An empty path means the current directory,
    # which always exists and which os.makedirs would reject.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(
                filter_image_lines,
                os.path.join(directory_path, text_file),
                os.path.join(output_directory, f"filtered_{text_file}"),
            )
            for text_file in text_files
        ]
        # result() re-raises any exception a worker hit, so failures are
        # not silently lost.
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")


def main():
    """Prompt for the directories and concurrency level, then run the filter."""
    directory_path = input("Enter the path to the directory containing text files: ")
    output_directory = input("Enter the path to the directory for saving filtered URLs: ")
    concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))
    process_directory(directory_path, output_directory, concurrency)


# Guarded so that importing this module does not start prompting for input.
if __name__ == "__main__":
    main()