Load command output into RAM once and apply all regex patterns in a single pass

2024-06-22 12:47:49 +00:00 · 2024-06-22 12:47:49 +00:00 · 46d8e2e718
commit 46d8e2e718
parent b4f357090f
1 changed files with 21 additions and 10 deletions
--- a/archiveteam_project_url_extractor.py
+++ b/archiveteam_project_url_extractor.py
@ -2,6 +2,7 @@ import subprocess
 import os
 from concurrent.futures import ThreadPoolExecutor
 from threading import Lock
 import re
 # Function to process each file
 def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
@ -19,20 +20,30 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
        print(f"Skipping {file_path}. Unsupported file extension.")
        return
-    # Iterate through each pattern and process the file accordingly
+    # Load the entire file content into memory
-    for pattern, output_filename in patterns.items():
+    try:
-        grep_command = f"grep -E '{pattern}'"
+        file_content = subprocess.check_output(command, shell=True, text=True)
-        full_command = f"{command} | {grep_command}"
+    except subprocess.CalledProcessError as e:
        print(f"Error processing {file_path}: {e}")
        return
-        # Open the output file in append mode
+    # Create output files if they don't exist and prepare for appending matches
    output_files = {}
    for output_filename in patterns.values():
        output_file_path = os.path.join(output_directory, output_filename)
        if not os.path.exists(output_file_path):
            open(output_file_path, 'a').close()  # Create the file if it doesn't exist
        output_files[output_filename] = open(output_file_path, "a")
-        with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
+    # Process the file content and apply all patterns
-            with open(output_file_path, "a") as output_file:
+    for line in file_content.splitlines():
-                for line in proc.stdout:
+        for pattern, output_filename in patterns.items():
-                    output_file.write(line)
+            if re.search(pattern, line):
                output_files[output_filename].write(line + '\n')
    # Close all output files
    for output_file in output_files.values():
        output_file.close()
    # Update the processed files count outside the inner loop
    with counter_lock:
@ -99,4 +110,4 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
 # Check file size for any remaining files
 check_file_size(output_directory)
-print(f"\nAll files processed. URLs appended to corresponding output files.")
+print(f"\nAll files processed. URLs appended to corresponding output files.")