Upload files to "/"

2023-12-12 09:13:09 +00:00 · 2023-12-12 09:13:09 +00:00 · 429971ac69
commit 429971ac69
1 changed files with 71 additions and 0 deletions
--- a/scrapper.py
+++ b/scrapper.py
@ -0,0 +1,71 @@
+import subprocess
+import os
+from concurrent.futures import ThreadPoolExecutor
+from threading import Lock
+
+def process_file(file, directory_path, output_file_path, keyword, counter_lock, processed_counter):
+    file_path = os.path.join(directory_path, file)
+
+    print(f"\nProcessing {file_path}...")
+
+    # Run the command and append the output to the same output file
+    command = f"zcat {file_path} | grep '{keyword}'"
+    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, text=True)
+
+    # Append the output to the same output file
+    with open(output_file_path, "a") as output_file:
+        output_file.write(result.stdout)
+
+    # Update the processed files count
+    with counter_lock:
+        processed_counter[0] += 1
+        remaining_count = len(gzipped_files) - processed_counter[0]
+        print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
+
+# Ask the user for the directory containing .txt.gz files
+directory_path = input("Enter the directory path containing .txt.gz files: ")
+
+# Ensure the directory exists
+if not os.path.exists(directory_path):
+    print(f"Error: The directory '{directory_path}' does not exist.")
+    exit()
+
+# List all files in the directory that end with .txt.gz
+gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")]
+
+# Check if there are any .txt.gz files in the directory
+if not gzipped_files:
+    print("Error: No .txt.gz files found in the specified directory.")
+    exit()
+
+# Ask the user for the output directory
+output_directory = input("Enter the output directory path: ")
+
+# Ensure the output directory exists; if not, create it
+if not os.path.exists(output_directory):
+    os.makedirs(output_directory)
+
+# Ask the user for the keyword
+keyword = input("Enter the keyword to search for: ")
+
+# Generate the output file name based on the directory name and keyword
+directory_name = os.path.basename(directory_path)
+output_file_name = f"{directory_name}_{keyword}_output.txt"
+output_file_path = os.path.join(output_directory, output_file_name)
+
+# Ask the user for the number of concurrent instances
+num_concurrent_instances = int(input("Enter the number of concurrent instances: "))
+
+# Use ThreadPoolExecutor to run the specified number of concurrent instances
+counter_lock = Lock()
+processed_counter = [0]  # Using a list to store an integer (mutable) to pass by reference
+
+with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
+    # Submit each file for processing
+    futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files]
+
+    # Wait for all tasks to complete
+    for future in futures:
+        future.result()
+
+print(f"\nAll files processed. URLs appended to {output_file_path}")