From 3649ea306fecafeac272842d270cc26ea83a302e Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Sat, 2 Mar 2024 00:47:00 +0000
Subject: [PATCH] Update archiveteam_project_url_extractor.py

---
 archiveteam_project_url_extractor.py | 37 ++++++++++++++++++----------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/archiveteam_project_url_extractor.py b/archiveteam_project_url_extractor.py
index 954854a..72d0fee 100644
--- a/archiveteam_project_url_extractor.py
+++ b/archiveteam_project_url_extractor.py
@@ -3,9 +3,9 @@ import os
 from concurrent.futures import ThreadPoolExecutor
 from threading import Lock
 
+# Function to process each file
 def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
     file_path = os.path.join(directory_path, file)
-
     print(f"\nProcessing {file_path}...")
 
     # Determine the appropriate command based on file extension
@@ -21,18 +21,18 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
 
     # Iterate through each pattern and process the file accordingly
     for pattern, output_filename in patterns.items():
-        # Run the command and append the output to the corresponding output file
         grep_command = f"grep -E '{pattern}'"
         full_command = f"{command} | {grep_command}"
 
-        result = subprocess.run(full_command, shell=True, stdout=subprocess.PIPE, text=True)
-
-        # Generate the output file path based on the output filename
+        # Open the output file in append mode
         output_file_path = os.path.join(output_directory, output_filename)
+        if not os.path.exists(output_file_path):
+            open(output_file_path, 'a').close()  # Create the file if it doesn't exist
 
-        # Append the output to the corresponding output file
-        with open(output_file_path, "a") as output_file:
-            output_file.write(result.stdout)
+        with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
+            with open(output_file_path, "a") as output_file:
+                for line in proc.stdout:
+                    output_file.write(line)
 
     # Update the processed files count outside the inner loop
     with counter_lock:
@@ -40,6 +40,18 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
         remaining_count = len(gzipped_files) - processed_counter[0]
         print(f"{file_path} processed. Remaining files: {remaining_count}")
 
+# Function to check the size of the output files
+def check_file_size(output_directory):
+    for filename in os.listdir(output_directory):
+        output_file_path = os.path.join(output_directory, filename)
+        output_file_size = os.path.getsize(output_file_path)
+
+        # If the output file size is greater than or equal to 125GB (125 * 1024 * 1024 * 1024 bytes),
+        # create a new output file with a suffix indicating its sequence number
+        if output_file_size >= 125 * 1024 * 1024 * 1024:
+            new_filename = f"{filename[:-4]}_{processed_counter[0]}.txt"
+            os.rename(output_file_path, os.path.join(output_directory, new_filename))
+
 # Ask the user for the directory containing .txt, .txt.gz, and .zst files
 directory_path = input("Enter the directory path containing .txt, .txt.gz, and .zst files: ")
 
@@ -59,10 +71,6 @@ if not gzipped_files:
 # Ask the user for the output directory
 output_directory = input("Enter the output directory path: ")
 
-# Ensure the output directory exists; if not, create it
-if not os.path.exists(output_directory):
-    os.makedirs(output_directory)
-
 # Define the URL patterns and their corresponding output filenames
 url_patterns = {
     r'(\S+\.blogspot|\S*blogger)\.\S+': 'filtered_blogspot_blogger.txt',
@@ -87,4 +95,7 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
     for future in futures:
         future.result()
 
-print(f"\nAll files processed. URLs appended to corresponding output files.")
+# Check file size for any remaining files
+check_file_size(output_directory)
+
+print(f"\nAll files processed. URLs appended to corresponding output files.")
\ No newline at end of file
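
A note on the streaming change above: the patched loop still builds full_command with shell=True and an f-string, so a file path containing spaces or shell metacharacters can break the pipeline or be interpreted by the shell, and the exit status of the pipeline is never checked. A minimal sketch of the same decompress-and-grep streaming without a shell, assuming zcat/zstdcat-style decompressors as the script's extension handling suggests (the helper name and its arguments are hypothetical, not from the patch):

import subprocess

def stream_matches(decompress_argv, pattern, file_path, output_file_path):
    # Hypothetical helper, not from the patch: pipe a decompressor into
    # grep -E without invoking a shell, streaming matches to the output file.
    decomp = subprocess.Popen(decompress_argv + [file_path], stdout=subprocess.PIPE)
    grep = subprocess.Popen(["grep", "-E", pattern],
                            stdin=decomp.stdout, stdout=subprocess.PIPE, text=True)
    decomp.stdout.close()  # let the decompressor see SIGPIPE if grep exits first
    with open(output_file_path, "a") as output_file:
        for line in grep.stdout:
            output_file.write(line)
    grep.wait()
    decomp.wait()

# Example usage (command name assumed from the script's .gz handling):
# stream_matches(["zcat"], r'(\S+\.blogspot|\S*blogger)\.\S+',
#                "input.txt.gz", "filtered_blogspot_blogger.txt")

Passing argv lists to Popen keeps file paths and patterns out of the shell entirely, which is why no quoting or escaping is needed.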
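Separately, the new check_file_size takes only output_directory but reads processed_counter from module scope for its rotation suffix, and it runs once after all workers finish, so an output file can grow past 125 GB mid-run before it is ever renamed. A self-contained sketch of the same size-based rotation with its own sequence numbering, suitable for calling periodically (the _partNNN suffix is a hypothetical naming choice, not from the patch):

import os

SIZE_LIMIT = 125 * 1024 * 1024 * 1024  # 125 GB threshold, as in the patch

def rotate_large_outputs(output_directory):
    # Rename any .txt output at or over SIZE_LIMIT so that appends under the
    # original name start a fresh file; no shared counter is needed.
    for filename in os.listdir(output_directory):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(output_directory, filename)
        if os.path.getsize(path) < SIZE_LIMIT:
            continue
        stem = filename[:-4]  # strip ".txt"
        seq = 1  # first unused sequence number (hypothetical _partNNN scheme)
        while os.path.exists(os.path.join(output_directory, f"{stem}_part{seq:03d}.txt")):
            seq += 1
        os.rename(path, os.path.join(output_directory, f"{stem}_part{seq:03d}.txt"))

Note that on POSIX a rename does not redirect open file handles: a worker that still holds the old file open keeps appending to the renamed file, so rotation is safest between batches, as the patch's end-of-run call effectively does.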