From 3649ea306fecafeac272842d270cc26ea83a302e Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Sat, 2 Mar 2024 00:47:00 +0000
Subject: [PATCH] Update archiveteam_project_url_extractor.py

---
 archiveteam_project_url_extractor.py | 37 ++++++++++++++++++----------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/archiveteam_project_url_extractor.py b/archiveteam_project_url_extractor.py
index 954854a..72d0fee 100644
--- a/archiveteam_project_url_extractor.py
+++ b/archiveteam_project_url_extractor.py
@@ -3,9 +3,9 @@ import os
 from concurrent.futures import ThreadPoolExecutor
 from threading import Lock
 
+# Function to process each file
 def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
     file_path = os.path.join(directory_path, file)
-
     print(f"\nProcessing {file_path}...")
 
     # Determine the appropriate command based on file extension
@@ -21,18 +21,18 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
 
     # Iterate through each pattern and process the file accordingly
     for pattern, output_filename in patterns.items():
-        # Run the command and append the output to the corresponding output file
         grep_command = f"grep -E '{pattern}'"
         full_command = f"{command} | {grep_command}"
 
-        result = subprocess.run(full_command, shell=True, stdout=subprocess.PIPE, text=True)
-
-        # Generate the output file path based on the output filename
+        # Open the output file in append mode
         output_file_path = os.path.join(output_directory, output_filename)
+        if not os.path.exists(output_file_path):
+            open(output_file_path, 'a').close()  # Create the file if it doesn't exist
 
-        # Append the output to the corresponding output file
-        with open(output_file_path, "a") as output_file:
-            output_file.write(result.stdout)
+        with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
+            with open(output_file_path, "a") as output_file:
+                for line in proc.stdout:
+                    output_file.write(line)
 
     # Update the processed files count outside the inner loop
     with counter_lock:
@@ -40,6 +40,18 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
         remaining_count = len(gzipped_files) - processed_counter[0]
         print(f"{file_path} processed. Remaining files: {remaining_count}")
 
+# Function to check the size of the output files
+def check_file_size(output_directory):
+    for filename in os.listdir(output_directory):
+        output_file_path = os.path.join(output_directory, filename)
+        output_file_size = os.path.getsize(output_file_path)
+
+        # If the output file size is greater than or equal to 125GB (125 * 1024 * 1024 * 1024 bytes),
+        # create a new output file with a suffix indicating its sequence number
+        if output_file_size >= 125 * 1024 * 1024 * 1024:
+            new_filename = f"{filename[:-4]}_{processed_counter[0]}.txt"
+            os.rename(output_file_path, os.path.join(output_directory, new_filename))
+
 # Ask the user for the directory containing .txt, .txt.gz, and .zst files
 directory_path = input("Enter the directory path containing .txt, .txt.gz, and .zst files: ")
 
@@ -59,10 +71,6 @@ if not gzipped_files:
 # Ask the user for the output directory
 output_directory = input("Enter the output directory path: ")
 
-# Ensure the output directory exists; if not, create it
-if not os.path.exists(output_directory):
-    os.makedirs(output_directory)
-
 # Define the URL patterns and their corresponding output filenames
 url_patterns = {
     r'(\S+\.blogspot|\S*blogger)\.\S+': 'filtered_blogspot_blogger.txt',
@@ -87,4 +95,7 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
     for future in futures:
         future.result()
 
-print(f"\nAll files processed. URLs appended to corresponding output files.")
+# Check file size for any remaining files
+check_file_size(output_directory)
+
+print(f"\nAll files processed. URLs appended to corresponding output files.")
\ No newline at end of file
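
A note on the streaming change above: the patched loop still builds full_command with shell=True and an f-string, so a file path containing spaces or shell metacharacters can break the pipeline or be interpreted by the shell, and the exit status of the pipeline is never checked. A minimal sketch of the same decompress-and-grep streaming without a shell, assuming zcat/zstdcat-style decompressors as the script's extension handling suggests (the helper name and its arguments are hypothetical, not from the patch):

import subprocess

def stream_matches(decompress_argv, pattern, file_path, output_file_path):
    # Hypothetical helper, not from the patch: pipe a decompressor into
    # grep -E without invoking a shell, streaming matches to the output file.
    decomp = subprocess.Popen(decompress_argv + [file_path], stdout=subprocess.PIPE)
    grep = subprocess.Popen(["grep", "-E", pattern],
                            stdin=decomp.stdout, stdout=subprocess.PIPE, text=True)
    decomp.stdout.close()  # let the decompressor see SIGPIPE if grep exits first
    with open(output_file_path, "a") as output_file:
        for line in grep.stdout:
            output_file.write(line)
    grep.wait()
    decomp.wait()

# Example usage (command name assumed from the script's .gz handling):
# stream_matches(["zcat"], r'(\S+\.blogspot|\S*blogger)\.\S+',
#                "input.txt.gz", "filtered_blogspot_blogger.txt")

Passing argv lists to Popen keeps file paths and patterns out of the shell entirely, which is why no quoting or escaping is needed.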
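Separately, the new check_file_size takes only output_directory but reads processed_counter from module scope for its rotation suffix, and it runs once after all workers finish, so an output file can grow past 125 GB mid-run before it is ever renamed. A self-contained sketch of the same size-based rotation with its own sequence numbering, suitable for calling periodically (the _partNNN suffix is a hypothetical naming choice, not from the patch):

import os

SIZE_LIMIT = 125 * 1024 * 1024 * 1024  # 125 GB threshold, as in the patch

def rotate_large_outputs(output_directory):
    # Rename any .txt output at or over SIZE_LIMIT so that appends under the
    # original name start a fresh file; no shared counter is needed.
    for filename in os.listdir(output_directory):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(output_directory, filename)
        if os.path.getsize(path) < SIZE_LIMIT:
            continue
        stem = filename[:-4]  # strip ".txt"
        seq = 1  # first unused sequence number (hypothetical _partNNN scheme)
        while os.path.exists(os.path.join(output_directory, f"{stem}_part{seq:03d}.txt")):
            seq += 1
        os.rename(path, os.path.join(output_directory, f"{stem}_part{seq:03d}.txt"))

Note that on POSIX a rename does not redirect open file handles: a worker that still holds the old file open keeps appending to the renamed file, so rotation is safest between batches, as the patch's end-of-run call effectively does.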