Add archiveteam_project_url_extractor.py
This commit is contained in:
commit
11a9f30a29
88
archiveteam_project_url_extractor.py
Normal file
88
archiveteam_project_url_extractor.py
Normal file
@ -0,0 +1,88 @@
import os
import shlex
import subprocess
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
    """Extract URLs matching each pattern from one input file.

    Streams the file's text (decompressing ``.gz`` via ``zcat``, plain
    ``cat`` for ``.txt``) through ``grep -E`` once per pattern and appends
    the matching lines to that pattern's output file.

    Parameters:
        file: file name (not a path) inside ``directory_path``.
        directory_path: directory holding the input files.
        output_directory: directory receiving the filtered output files.
        patterns: mapping of ``grep -E`` pattern -> output file name.
        counter_lock: Lock guarding ``processed_counter``.
        processed_counter: one-element list used as a mutable shared counter.

    NOTE(review): reads the module-level ``gzipped_files`` list to report
    the remaining-file count — confirm it is defined before workers start.
    """
    file_path = os.path.join(directory_path, file)
    print(f"\nProcessing {file_path}...")

    # Choose how to stream the file's text.  The path is shell-quoted so a
    # name containing spaces or shell metacharacters cannot break (or
    # inject commands into) the shell=True pipeline built below.
    quoted_path = shlex.quote(file_path)
    if file.endswith(".gz"):
        command = f"zcat {quoted_path}"
    elif file.endswith(".txt"):
        command = f"cat {quoted_path}"
    else:
        print(f"Skipping {file_path}. Unsupported file extension.")
        return

    # Run one grep pass per pattern and append its matches to the
    # pattern's output file.
    for pattern, output_filename in patterns.items():
        # Quote the pattern too — it is user-supplied text from the
        # caller's dict and must reach grep as a single argument.
        full_command = f"{command} | grep -E {shlex.quote(pattern)}"
        result = subprocess.run(full_command, shell=True, stdout=subprocess.PIPE, text=True)

        output_file_path = os.path.join(output_directory, output_filename)
        # Append mode: several patterns (and several threads) may target
        # distinct files; matches accumulate across input files.
        with open(output_file_path, "a") as output_file:
            output_file.write(result.stdout)

    # Count this file as done and report progress under the lock so the
    # counter stays consistent across worker threads.
    with counter_lock:
        processed_counter[0] += 1
        remaining_count = len(gzipped_files) - processed_counter[0]
        print(f"{file_path} processed. Remaining files: {remaining_count}")
# --- Script entry: gather inputs, then fan the files out across threads ---

# Ask the user for the directory containing .txt and .txt.gz files
directory_path = input("Enter the directory path containing .txt and .txt.gz files: ")

# Ensure the directory exists.  SystemExit is raised directly because the
# site-provided exit() helper is not guaranteed to exist in non-interactive
# or frozen runs.
if not os.path.exists(directory_path):
    print(f"Error: The directory '{directory_path}' does not exist.")
    raise SystemExit()

# List all files in the directory that end with .txt or .txt.gz
gzipped_files = [
    file
    for file in os.listdir(directory_path)
    if file.endswith((".txt", ".txt.gz"))
]

# Bail out early when there is nothing to process.
if not gzipped_files:
    print("Error: No .txt or .txt.gz files found in the specified directory.")
    raise SystemExit()

# Ask the user for the output directory
output_directory = input("Enter the output directory path: ")

# Create the output directory if needed; exist_ok avoids the race between
# a separate existence check and the makedirs call.
os.makedirs(output_directory, exist_ok=True)

# grep -E pattern -> output file name.  Literal dots in host names are
# escaped so '.' cannot match arbitrary characters (e.g. "pastebinXcom").
url_patterns = {
    r'(\S+\.blogspot|\S*blogger)\.\S+': 'filtered_blogspot_blogger.txt',
    r'(mediafire\.com|mfi\.re)\S+': 'filtered_mediafire_mfi.re.txt',
    r'\S*imgur\S*': 'filtered_imgur.txt',
    r'http(s)?://(www\.)?pastebin\.com': 'filtered_pastebin.txt',
    r'https://cdn\.discordapp\.com': 'filtered_cdn.discordapp.com.txt'
}

# Ask the user for the number of concurrent instances
num_concurrent_instances = int(input("Enter the number of concurrent instances: "))

# Shared progress state: a Lock plus a one-element list acting as a
# mutable integer so workers can update it by reference.
counter_lock = Lock()
processed_counter = [0]

with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
    # Submit each file for processing.
    futures = [
        executor.submit(
            process_file,
            file,
            directory_path,
            output_directory,
            url_patterns,
            counter_lock,
            processed_counter,
        )
        for file in gzipped_files
    ]

    # Wait for all tasks to complete; .result() re-raises any worker
    # exception instead of silently dropping it.
    for future in futures:
        future.result()

print("\nAll files processed. URLs appended to corresponding output files.")
|
Loading…
Reference in New Issue
Block a user