From 9e6e5190d398db838b4ca1330d0f0a9ee45fc8f6 Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Mon, 5 Feb 2024 02:19:15 +0000
Subject: [PATCH] Updated to support zst compressed files

---
 url_extractor.py | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/url_extractor.py b/url_extractor.py
index 0fe2dfb..b7de15e 100644
--- a/url_extractor.py
+++ b/url_extractor.py
@@ -8,37 +8,47 @@ def process_file(file, directory_path, output_file_path, keyword, counter_lock,
 
     print(f"\nProcessing {file_path}...")
 
-    # Run the command and append the output to the same output file
-    command = f"zcat {file_path} | grep '{keyword}'"
+    # Determine the appropriate command based on file extension
+    if file.endswith(".gz"):
+        command = f"zcat {file_path}"
+    elif file.endswith(".zst"):
+        command = f"zstdcat {file_path}"
+    elif file.endswith(".txt"):
+        command = f"cat {file_path}"
+    else:
+        print(f"Skipping {file_path}. Unsupported file extension.")
+        return
+
     process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, text=True)
 
     # Stream the output line by line and append to the output file
     with open(output_file_path, "a") as output_file:
         for line in process.stdout:
-            output_file.write(line)
+            if keyword in line:
+                output_file.write(line)
 
     process.wait() # Wait for the process to finish
 
     # Update the processed files count
     with counter_lock:
         processed_counter[0] += 1
-        remaining_count = len(gzipped_files) - processed_counter[0]
+        remaining_count = len(files) - processed_counter[0]
         print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
 
-# Ask the user for the directory containing .txt.gz files
-directory_path = input("Enter the directory path containing .txt.gz files: ")
+# Ask the user for the directory containing files
+directory_path = input("Enter the directory path containing files: ")
 
 # Ensure the directory exists
 if not os.path.exists(directory_path):
     print(f"Error: The directory '{directory_path}' does not exist.")
     exit()
 
-# List all files in the directory that end with .txt.gz
-gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")]
+# List all files in the directory
+files = os.listdir(directory_path)
 
-# Check if there are any .txt.gz files in the directory
-if not gzipped_files:
-    print("Error: No .txt.gz files found in the specified directory.")
+# Check if there are any files in the directory
+if not files:
+    print("Error: No files found in the specified directory.")
     exit()
 
 # Ask the user for the output directory
@@ -65,10 +75,10 @@ processed_counter = [0] # Using a list to store an integer (mutable) to pass by
 
 with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
     # Submit each file for processing
-    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files]
+    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in files]
 
     # Wait for all tasks to complete
     for future in futures:
         future.result()
 
-print(f"\nAll files processed. URLs appended to {output_file_path}")
\ No newline at end of file
+print(f"\nAll files processed. Matching lines appended to {output_file_path}")
\ No newline at end of file