From 11a9f30a29d1e90f7c6d01d4b304cbcfc09036da Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Sun, 14 Jan 2024 12:08:45 +0000
Subject: [PATCH] Add archiveteam_project_url_extractor.py

---
 archiveteam_project_url_extractor.py | 89 ++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 archiveteam_project_url_extractor.py

diff --git a/archiveteam_project_url_extractor.py b/archiveteam_project_url_extractor.py
new file mode 100644
index 0000000..088b844
--- /dev/null
+++ b/archiveteam_project_url_extractor.py
@@ -0,0 +1,89 @@
+import subprocess
+import os
+import shlex
+from concurrent.futures import ThreadPoolExecutor
+from threading import Lock
+
+def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
+    file_path = os.path.join(directory_path, file)
+
+    print(f"\nProcessing {file_path}...")
+
+    # Determine the appropriate command based on file extension
+    if file.endswith(".gz"):
+        command = f"zcat {shlex.quote(file_path)}"
+    elif file.endswith(".txt"):
+        command = f"cat {shlex.quote(file_path)}"
+    else:
+        print(f"Skipping {file_path}. Unsupported file extension.")
+        return
+
+    # Iterate through each pattern and process the file accordingly
+    for pattern, output_filename in patterns.items():
+        # Run the command and append the output to the corresponding output file
+        grep_command = f"grep -E '{pattern}'"
+        full_command = f"{command} | {grep_command}"
+
+        result = subprocess.run(full_command, shell=True, stdout=subprocess.PIPE, text=True)
+
+        # Generate the output file path based on the output filename
+        output_file_path = os.path.join(output_directory, output_filename)
+
+        # Append the output to the corresponding output file
+        with open(output_file_path, "a") as output_file:
+            output_file.write(result.stdout)
+
+    # Update the processed files count outside the inner loop
+    with counter_lock:
+        processed_counter[0] += 1
+        remaining_count = len(gzipped_files) - processed_counter[0]
+        print(f"{file_path} processed. Remaining files: {remaining_count}")
+
+# Ask the user for the directory containing .txt and .txt.gz files
+directory_path = input("Enter the directory path containing .txt and .txt.gz files: ")
+
+# Ensure the directory exists
+if not os.path.exists(directory_path):
+    print(f"Error: The directory '{directory_path}' does not exist.")
+    exit()
+
+# List all files in the directory that end with .txt or .txt.gz
+gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt") or file.endswith(".txt.gz")]
+
+# Check if there are any .txt or .txt.gz files in the directory
+if not gzipped_files:
+    print("Error: No .txt or .txt.gz files found in the specified directory.")
+    exit()
+
+# Ask the user for the output directory
+output_directory = input("Enter the output directory path: ")
+
+# Ensure the output directory exists; if not, create it
+if not os.path.exists(output_directory):
+    os.makedirs(output_directory)
+
+# Define the URL patterns and their corresponding output filenames
+url_patterns = {
+    r'(\S+\.blogspot|\S*blogger)\.\S+': 'filtered_blogspot_blogger.txt',
+    r'(mediafire\.com|mfi\.re)\S+': 'filtered_mediafire_mfi.re.txt',
+    r'\S*imgur\S*': 'filtered_imgur.txt',
+    r'http(s)?://(www\.)?pastebin\.com': 'filtered_pastebin.txt',
+    r'https://cdn\.discordapp\.com': 'filtered_cdn.discordapp.com.txt'
+}
+
+# Ask the user for the number of concurrent instances
+num_concurrent_instances = int(input("Enter the number of concurrent instances: "))
+
+# Use ThreadPoolExecutor to run the specified number of concurrent instances
+counter_lock = Lock()
+processed_counter = [0]  # Using a list to store an integer (mutable) to pass by reference
+
+with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
+    # Submit each file for processing
+    futures = [executor.submit(process_file, file, directory_path, output_directory, url_patterns, counter_lock, processed_counter) for file in gzipped_files]
+
+    # Wait for all tasks to complete
+    for future in futures:
+        future.result()
+
+print("\nAll files processed. URLs appended to corresponding output files.")
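
Note (not part of the patch above): the per-file "zcat/cat | grep -E" pipeline can also be expressed in pure Python with gzip and re from the standard library. The sketch below is only an illustration of that idea; filter_file is a hypothetical helper name, and it assumes the same url_patterns mapping and append-to-output-file layout as the script.

import gzip
import os
import re

def filter_file(file_path, output_directory, patterns):
    # Sketch only: a pure-Python stand-in for the shell pipeline in the patch.
    # Open transparently whether the input is gzip-compressed or plain text.
    opener = gzip.open if file_path.endswith(".gz") else open
    compiled = [(re.compile(p), name) for p, name in patterns.items()]
    # Collect matching lines per output file, then append each file once.
    matches = {name: [] for _, name in compiled}
    with opener(file_path, "rt", errors="replace") as infile:
        for line in infile:
            for regex, name in compiled:
                if regex.search(line):
                    matches[name].append(line)
    for name, lines in matches.items():
        if lines:
            with open(os.path.join(output_directory, name), "a") as out:
                out.writelines(lines)

Keeping the shell pipeline, as the patch does, lets zcat and grep stream large dumps outside the Python process; the pure-Python form trades that for having no external command dependencies.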