Updated to support zst-compressed (.zst) files

2024-02-05 02:19:15 +00:00 · 2024-02-05 02:19:15 +00:00 · 9e6e5190d3
commit 9e6e5190d3
parent 95abf80bd1
1 changed file with 23 additions and 13 deletions
--- a/url_extractor.py
+++ b/url_extractor.py
@ -8,13 +8,23 @@ def process_file(file, directory_path, output_file_path, keyword, counter_lock,
    print(f"\nProcessing {file_path}...")
-    # Run the command and append the output to the same output file
+    # Determine the appropriate command based on file extension
-    command = f"zcat {file_path} | grep '{keyword}'"
+    if file.endswith(".gz"):
        command = f"zcat {file_path}"
    elif file.endswith(".zst"):
        command = f"zstdcat {file_path}"
    elif file.endswith(".txt"):
        command = f"cat {file_path}"
    else:
        print(f"Skipping {file_path}. Unsupported file extension.")
        return
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, text=True)
    # Stream the output line by line and append to the output file
    with open(output_file_path, "a") as output_file:
        for line in process.stdout:
            if keyword in line:
                output_file.write(line)
    process.wait()  # Wait for the process to finish
@ -22,23 +32,23 @@ def process_file(file, directory_path, output_file_path, keyword, counter_lock,
    # Update the processed files count
    with counter_lock:
        processed_counter[0] += 1
-        remaining_count = len(gzipped_files) - processed_counter[0]
+        remaining_count = len(files) - processed_counter[0]
        print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
-# Ask the user for the directory containing .txt.gz files
+# Ask the user for the directory containing files
-directory_path = input("Enter the directory path containing .txt.gz files: ")
+directory_path = input("Enter the directory path containing files: ")
 # Ensure the directory exists
 if not os.path.exists(directory_path):
    print(f"Error: The directory '{directory_path}' does not exist.")
    exit()
-# List all files in the directory that end with .txt.gz
+# List all files in the directory
-gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")]
+files = os.listdir(directory_path)
-# Check if there are any .txt.gz files in the directory
+# Check if there are any files in the directory
-if not gzipped_files:
+if not files:
-    print("Error: No .txt.gz files found in the specified directory.")
+    print("Error: No files found in the specified directory.")
    exit()
 # Ask the user for the output directory
@ -65,10 +75,10 @@ processed_counter = [0]  # Using a list to store an integer (mutable) to pass by
 with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
    # Submit each file for processing
-    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files]
+    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in files]
    # Wait for all tasks to complete
    for future in futures:
        future.result()
-print(f"\nAll files processed. URLs appended to {output_file_path}")
+print(f"\nAll files processed. Matching lines appended to {output_file_path}")