diff --git a/archiveteam_project_url_extractor.py b/archiveteam_project_url_extractor.py index 886a480..38445fd 100644 --- a/archiveteam_project_url_extractor.py +++ b/archiveteam_project_url_extractor.py @@ -2,6 +2,7 @@ import subprocess import os from concurrent.futures import ThreadPoolExecutor from threading import Lock +import re # Function to process each file def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter): @@ -19,20 +20,30 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock, print(f"Skipping {file_path}. Unsupported file extension.") return - # Iterate through each pattern and process the file accordingly - for pattern, output_filename in patterns.items(): - grep_command = f"grep -E '{pattern}'" - full_command = f"{command} | {grep_command}" + # Load the entire file content into memory + try: + file_content = subprocess.check_output(command, shell=True, text=True) + except subprocess.CalledProcessError as e: + print(f"Error processing {file_path}: {e}") + return - # Open the output file in append mode + # Create output files if they don't exist and prepare for appending matches + output_files = {} + for output_filename in patterns.values(): output_file_path = os.path.join(output_directory, output_filename) if not os.path.exists(output_file_path): open(output_file_path, 'a').close() # Create the file if it doesn't exist + output_files[output_filename] = open(output_file_path, "a") - with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc: - with open(output_file_path, "a") as output_file: - for line in proc.stdout: - output_file.write(line) + # Process the file content and apply all patterns + for line in file_content.splitlines(): + for pattern, output_filename in patterns.items(): + if re.search(pattern, line): + output_files[output_filename].write(line + '\n') + + # Close all output files + for output_file in output_files.values(): + output_file.close() # Update the processed files count outside the inner loop with counter_lock: @@ -99,4 +110,4 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor: # Check file size for any remaining files check_file_size(output_directory) -print(f"\nAll files processed. URLs appended to corresponding output files.") \ No newline at end of file +print(f"\nAll files processed. URLs appended to corresponding output files.")