From 9e6e5190d398db838b4ca1330d0f0a9ee45fc8f6 Mon Sep 17 00:00:00 2001
From: datechnoman <datechnoman@hotmail.com>
Date: Mon, 5 Feb 2024 02:19:15 +0000
Subject: [PATCH] Add support for .zst-compressed and plain .txt input files

---
 url_extractor.py | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/url_extractor.py b/url_extractor.py
index 0fe2dfb..b7de15e 100644
--- a/url_extractor.py
+++ b/url_extractor.py
@@ -8,37 +8,47 @@ def process_file(file, directory_path, output_file_path, keyword, counter_lock,
 
     print(f"\nProcessing {file_path}...")
 
-    # Run the command and append the output to the same output file
-    command = f"zcat {file_path} | grep '{keyword}'"
+    # Determine the appropriate command based on file extension
+    if file.endswith(".gz"):
+        command = f"zcat {file_path}"
+    elif file.endswith(".zst"):
+        command = f"zstdcat {file_path}"
+    elif file.endswith(".txt"):
+        command = f"cat {file_path}"
+    else:
+        print(f"Skipping {file_path}. Unsupported file extension.")
+        return
+
     process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, text=True)
 
     # Stream the output line by line and append to the output file
     with open(output_file_path, "a") as output_file:
         for line in process.stdout:
-            output_file.write(line)
+            if keyword in line:
+                output_file.write(line)
 
     process.wait()  # Wait for the process to finish
 
     # Update the processed files count
     with counter_lock:
         processed_counter[0] += 1
-        remaining_count = len(gzipped_files) - processed_counter[0]
+        remaining_count = len(files) - processed_counter[0]
         print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
 
-# Ask the user for the directory containing .txt.gz files
-directory_path = input("Enter the directory path containing .txt.gz files: ")
+# Ask the user for the directory containing the input files (.gz, .zst, or .txt)
+directory_path = input("Enter the directory path containing files: ")
 
 # Ensure the directory exists
 if not os.path.exists(directory_path):
     print(f"Error: The directory '{directory_path}' does not exist.")
     exit()
 
-# List all files in the directory that end with .txt.gz
-gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")]
+# List all files in the directory
+files = os.listdir(directory_path)
 
-# Check if there are any .txt.gz files in the directory
-if not gzipped_files:
-    print("Error: No .txt.gz files found in the specified directory.")
+# Check if there are any files in the directory
+if not files:
+    print("Error: No files found in the specified directory.")
     exit()
 
 # Ask the user for the output directory
@@ -65,10 +75,10 @@ processed_counter = [0]  # Using a list to store an integer (mutable) to pass by
 
 with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
     # Submit each file for processing
-    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files]
+    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in files]
 
     # Wait for all tasks to complete
     for future in futures:
         future.result()
 
-print(f"\nAll files processed. URLs appended to {output_file_path}")
\ No newline at end of file
+print(f"\nAll files processed. Matching lines appended to {output_file_path}")
\ No newline at end of file