Stream to RAM + regex at same time

datechnoman 2024-06-22 12:47:49 +00:00
parent b4f357090f
commit 46d8e2e718

@@ -2,6 +2,7 @@ import subprocess
 import os
 from concurrent.futures import ThreadPoolExecutor
 from threading import Lock
+import re
 
 # Function to process each file
 def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
@@ -19,20 +20,30 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
         print(f"Skipping {file_path}. Unsupported file extension.")
         return
 
-    # Iterate through each pattern and process the file accordingly
-    for pattern, output_filename in patterns.items():
-        grep_command = f"grep -E '{pattern}'"
-        full_command = f"{command} | {grep_command}"
+    # Load the entire file content into memory
+    try:
+        file_content = subprocess.check_output(command, shell=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error processing {file_path}: {e}")
+        return
 
-        # Open the output file in append mode
+    # Create output files if they don't exist and prepare for appending matches
+    output_files = {}
+    for output_filename in patterns.values():
         output_file_path = os.path.join(output_directory, output_filename)
         if not os.path.exists(output_file_path):
             open(output_file_path, 'a').close()  # Create the file if it doesn't exist
+        output_files[output_filename] = open(output_file_path, "a")
 
-        with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
-            with open(output_file_path, "a") as output_file:
-                for line in proc.stdout:
-                    output_file.write(line)
+    # Process the file content and apply all patterns
+    for line in file_content.splitlines():
+        for pattern, output_filename in patterns.items():
+            if re.search(pattern, line):
+                output_files[output_filename].write(line + '\n')
+
+    # Close all output files
+    for output_file in output_files.values():
+        output_file.close()
 
     # Update the processed files count outside the inner loop
     with counter_lock:
@@ -99,4 +110,4 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
 
 # Check file size for any remaining files
 check_file_size(output_directory)
 print(f"\nAll files processed. URLs appended to corresponding output files.")