Upload files to "/"
This commit is contained in:
commit
429971ac69
71
scrapper.py
Normal file
71
scrapper.py
Normal file
@ -0,0 +1,71 @@
|
||||
import subprocess
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from threading import Lock
|
||||
|
||||
def process_file(file, directory_path, output_file_path, keyword, counter_lock, processed_counter):
|
||||
file_path = os.path.join(directory_path, file)
|
||||
|
||||
print(f"\nProcessing {file_path}...")
|
||||
|
||||
# Run the command and append the output to the same output file
|
||||
command = f"zcat {file_path} | grep '{keyword}'"
|
||||
result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, text=True)
|
||||
|
||||
# Append the output to the same output file
|
||||
with open(output_file_path, "a") as output_file:
|
||||
output_file.write(result.stdout)
|
||||
|
||||
# Update the processed files count
|
||||
with counter_lock:
|
||||
processed_counter[0] += 1
|
||||
remaining_count = len(gzipped_files) - processed_counter[0]
|
||||
print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
|
||||
|
||||
# Ask the user for the directory containing .txt.gz files
|
||||
directory_path = input("Enter the directory path containing .txt.gz files: ")
|
||||
|
||||
# Ensure the directory exists
|
||||
if not os.path.exists(directory_path):
|
||||
print(f"Error: The directory '{directory_path}' does not exist.")
|
||||
exit()
|
||||
|
||||
# List all files in the directory that end with .txt.gz
|
||||
gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")]
|
||||
|
||||
# Check if there are any .txt.gz files in the directory
|
||||
if not gzipped_files:
|
||||
print("Error: No .txt.gz files found in the specified directory.")
|
||||
exit()
|
||||
|
||||
# Ask the user for the output directory
|
||||
output_directory = input("Enter the output directory path: ")
|
||||
|
||||
# Ensure the output directory exists; if not, create it
|
||||
if not os.path.exists(output_directory):
|
||||
os.makedirs(output_directory)
|
||||
|
||||
# Ask the user for the keyword
|
||||
keyword = input("Enter the keyword to search for: ")
|
||||
|
||||
# Generate the output file name based on the directory name and keyword
|
||||
directory_name = os.path.basename(directory_path)
|
||||
output_file_name = f"{directory_name}_{keyword}_output.txt"
|
||||
output_file_path = os.path.join(output_directory, output_file_name)
|
||||
|
||||
# Ask the user for the number of concurrent instances
|
||||
num_concurrent_instances = int(input("Enter the number of concurrent instances: "))
|
||||
|
||||
# Use ThreadPoolExecutor to run the specified number of concurrent instances
|
||||
counter_lock = Lock()
|
||||
processed_counter = [0] # Using a list to store an integer (mutable) to pass by reference
|
||||
|
||||
with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
|
||||
# Submit each file for processing
|
||||
futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files]
|
||||
|
||||
# Wait for all tasks to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
|
||||
print(f"\nAll files processed. URLs appended to {output_file_path}")
|
Loading…
x
Reference in New Issue
Block a user