Load each file into RAM and apply all regex patterns in a single pass
This commit is contained in:
parent
b4f357090f
commit
46d8e2e718
@ -2,6 +2,7 @@ import subprocess
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from threading import Lock
|
||||
import re
|
||||
|
||||
# Function to process each file
|
||||
def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
|
||||
@ -19,20 +20,30 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
|
||||
print(f"Skipping {file_path}. Unsupported file extension.")
|
||||
return
|
||||
|
||||
# Iterate through each pattern and process the file accordingly
|
||||
for pattern, output_filename in patterns.items():
|
||||
grep_command = f"grep -E '{pattern}'"
|
||||
full_command = f"{command} | {grep_command}"
|
||||
# Load the entire file content into memory
|
||||
try:
|
||||
file_content = subprocess.check_output(command, shell=True, text=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error processing {file_path}: {e}")
|
||||
return
|
||||
|
||||
# Open the output file in append mode
|
||||
# Create output files if they don't exist and prepare for appending matches
|
||||
output_files = {}
|
||||
for output_filename in patterns.values():
|
||||
output_file_path = os.path.join(output_directory, output_filename)
|
||||
if not os.path.exists(output_file_path):
|
||||
open(output_file_path, 'a').close() # Create the file if it doesn't exist
|
||||
output_files[output_filename] = open(output_file_path, "a")
|
||||
|
||||
with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
|
||||
with open(output_file_path, "a") as output_file:
|
||||
for line in proc.stdout:
|
||||
output_file.write(line)
|
||||
# Process the file content and apply all patterns
|
||||
for line in file_content.splitlines():
|
||||
for pattern, output_filename in patterns.items():
|
||||
if re.search(pattern, line):
|
||||
output_files[output_filename].write(line + '\n')
|
||||
|
||||
# Close all output files
|
||||
for output_file in output_files.values():
|
||||
output_file.close()
|
||||
|
||||
# Update the processed files count outside the inner loop
|
||||
with counter_lock:
|
||||
@ -99,4 +110,4 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
|
||||
# Check file size for any remaining files
|
||||
check_file_size(output_directory)
|
||||
|
||||
print(f"\nAll files processed. URLs appended to corresponding output files.")
|
||||
print(f"\nAll files processed. URLs appended to corresponding output files.")
|
||||
|
Loading…
Reference in New Issue
Block a user