Load each file's output into RAM and apply all regex patterns in a single pass

This commit is contained in:
datechnoman 2024-06-22 12:47:49 +00:00
parent b4f357090f
commit 46d8e2e718

View File

@ -2,6 +2,7 @@ import subprocess
import os
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import re
# Function to process each file
def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
@ -19,20 +20,30 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
print(f"Skipping {file_path}. Unsupported file extension.")
return
# Iterate through each pattern and process the file accordingly
for pattern, output_filename in patterns.items():
grep_command = f"grep -E '{pattern}'"
full_command = f"{command} | {grep_command}"
# Load the entire file content into memory
try:
file_content = subprocess.check_output(command, shell=True, text=True)
except subprocess.CalledProcessError as e:
print(f"Error processing {file_path}: {e}")
return
# Open the output file in append mode
# Create output files if they don't exist and prepare for appending matches
output_files = {}
for output_filename in patterns.values():
output_file_path = os.path.join(output_directory, output_filename)
if not os.path.exists(output_file_path):
open(output_file_path, 'a').close() # Create the file if it doesn't exist
output_files[output_filename] = open(output_file_path, "a")
with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
with open(output_file_path, "a") as output_file:
for line in proc.stdout:
output_file.write(line)
# Process the file content and apply all patterns
for line in file_content.splitlines():
for pattern, output_filename in patterns.items():
if re.search(pattern, line):
output_files[output_filename].write(line + '\n')
# Close all output files
for output_file in output_files.values():
output_file.close()
# Update the processed files count outside the inner loop
with counter_lock:
@ -99,4 +110,4 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
# Check file size for any remaining files
check_file_size(output_directory)
print(f"\nAll files processed. URLs appended to corresponding output files.")
print(f"\nAll files processed. URLs appended to corresponding output files.")