Updated script to support .txt files

2023-12-13 11:32:17 +00:00 · 2023-12-13 11:32:17 +00:00 · 112814dd35
commit 112814dd35
parent 5f582ab369
1 changed file with 53 additions and 53 deletions
--- a/all_url_extractor.py
+++ b/all_url_extractor.py
@@ -33,15 +33,15 @@ def extract_urls_from_directory(directory_path, output_directory, concurrency_le
    # Create a multiprocessing Pool with the specified concurrency level
    pool = Pool(processes=concurrency_level)

-    # Map the file processing function to the list of files with '.txt.gz' and '.csv.gz' extensions
-    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt.gz', '.csv.gz'))])
+    # Map the file processing function to the list of files with '.txt', '.txt.gz', and '.csv.gz' extensions
+    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))])

    # Close the pool to free up resources
    pool.close()
    pool.join()

 # Prompt the user to input the directory containing the files
-directory_path = input("Enter the directory path containing the '.txt.gz' and '.csv.gz' files to extract URLs from: ")
+directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")

 # Prompt the user to input the output directory
 output_directory = input("Enter the output directory path to store the extracted URLs files: ")
@@ -49,5 +49,5 @@ output_directory = input("Enter the output directory path to store the extracted
 # Prompt the user to input the concurrency level
 concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))

-# Extract URLs from the '.txt.gz' and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
+# Extract URLs from the '.txt', '.txt.gz', and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
 extract_urls_from_directory(directory_path, output_directory, concurrency_level)