From 429971ac695279ea574cfae323106a1ed915dad9 Mon Sep 17 00:00:00 2001 From: datechnoman Date: Tue, 12 Dec 2023 09:13:09 +0000 Subject: [PATCH] Upload files to "/" --- scrapper.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 scrapper.py diff --git a/scrapper.py b/scrapper.py new file mode 100644 index 0000000..c2b0a2d --- /dev/null +++ b/scrapper.py @@ -0,0 +1,71 @@ +import subprocess +import os +from concurrent.futures import ThreadPoolExecutor +from threading import Lock + +def process_file(file, directory_path, output_file_path, keyword, counter_lock, processed_counter): + file_path = os.path.join(directory_path, file) + + print(f"\nProcessing {file_path}...") + + # Run the command and append the output to the same output file + command = f"zcat {file_path} | grep '{keyword}'" + result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, text=True) + + # Append the output to the same output file + with open(output_file_path, "a") as output_file: + output_file.write(result.stdout) + + # Update the processed files count + with counter_lock: + processed_counter[0] += 1 + remaining_count = len(gzipped_files) - processed_counter[0] + print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}") + +# Ask the user for the directory containing .txt.gz files +directory_path = input("Enter the directory path containing .txt.gz files: ") + +# Ensure the directory exists +if not os.path.exists(directory_path): + print(f"Error: The directory '{directory_path}' does not exist.") + exit() + +# List all files in the directory that end with .txt.gz +gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")] + +# Check if there are any .txt.gz files in the directory +if not gzipped_files: + print("Error: No .txt.gz files found in the specified directory.") + exit() + +# Ask the user for the output directory +output_directory = input("Enter the output directory path: ") + +# Ensure the output directory exists; if not, create it +if not os.path.exists(output_directory): + os.makedirs(output_directory) + +# Ask the user for the keyword +keyword = input("Enter the keyword to search for: ") + +# Generate the output file name based on the directory name and keyword +directory_name = os.path.basename(directory_path) +output_file_name = f"{directory_name}_{keyword}_output.txt" +output_file_path = os.path.join(output_directory, output_file_name) + +# Ask the user for the number of concurrent instances +num_concurrent_instances = int(input("Enter the number of concurrent instances: ")) + +# Use ThreadPoolExecutor to run the specified number of concurrent instances +counter_lock = Lock() +processed_counter = [0] # Using a list to store an integer (mutable) to pass by reference + +with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor: + # Submit each file for processing + futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files] + + # Wait for all tasks to complete + for future in futures: + future.result() + +print(f"\nAll files processed. URLs appended to {output_file_path}")