Updated script to support .txt files

This commit is contained in:
datechnoman 2023-12-13 11:32:17 +00:00
parent 5f582ab369
commit 112814dd35

View File

@ -1,53 +1,53 @@
import os import os
import gzip import gzip
import re import re
from multiprocessing import Pool from multiprocessing import Pool
# Compiled once at import time: this function is called once per input line,
# so recompiling (or re-looking-up the re cache) per call is wasted work.
# NOTE(review): the character class [$-_@.&+] is a range from '$' (0x24) to
# '_' (0x5F) and therefore matches many more characters than the author
# probably intended — kept byte-identical to preserve existing match behavior.
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def extract_urls_from_line(line):
    """Return a list of all http(s) URL substrings found in *line*.

    Args:
        line: A single line of text to scan.

    Returns:
        A list of matched URL strings, in order of appearance (empty list
        when the line contains no URLs).
    """
    return _URL_PATTERN.findall(line)
def process_file(file_path, output_directory):
    """Extract URLs from one input file and write them to the output directory.

    The output file is named after the input file's base name with a
    ``_urls.txt`` suffix. The ORIGINAL input file is deleted afterwards.

    Args:
        file_path: Path to a '.txt', '.txt.gz', or '.csv.gz' file.
        output_directory: Directory where the '<name>_urls.txt' file is written.

    Fix: the directory scanner now also feeds plain '.txt' files into this
    function, but the old code unconditionally used gzip.open, which raises
    gzip.BadGzipFile on uncompressed input. Pick the opener by extension.
    """
    output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')

    # Only '.gz' files go through gzip; plain text files use the builtin open.
    opener = gzip.open if file_path.lower().endswith('.gz') else open

    # Stream line by line so arbitrarily large inputs never load into memory.
    with opener(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:
        for line in file:
            line_urls = extract_urls_from_line(line)
            # Fix: the old code wrote an empty line for every input line with
            # no URLs; only write when something was actually found.
            if line_urls:
                output_file.write('\n'.join(line_urls) + '\n')

    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")

    # Remove the original file — this is destructive by design (the script's
    # purpose is to replace inputs with extracted-URL files).
    os.remove(file_path)
def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
    """Process every supported file in *directory_path* using a process pool.

    Args:
        directory_path: Directory scanned (non-recursively) for input files.
        output_directory: Directory that receives the '<name>_urls.txt' files.
        concurrency_level: Number of worker processes to run concurrently.
    """
    # Sort for deterministic processing order across runs.
    file_list = sorted(os.listdir(directory_path))

    # Fix: os.listdir also returns subdirectories; a directory named e.g.
    # 'foo.txt' would crash process_file. Keep regular files only.
    tasks = [
        (os.path.join(directory_path, filename), output_directory)
        for filename in file_list
        if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))
        and os.path.isfile(os.path.join(directory_path, filename))
    ]

    pool = Pool(processes=concurrency_level)
    try:
        # starmap unpacks each (file_path, output_directory) tuple into
        # a process_file(file_path, output_directory) call.
        pool.starmap(process_file, tasks)
    finally:
        # Fix: always release worker processes, even if a task raises.
        pool.close()
        pool.join()
def main():
    """Prompt for the input/output directories and concurrency level, then run."""
    # Prompt the user to input the directory containing the files
    directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")
    # Prompt the user to input the output directory
    output_directory = input("Enter the output directory path to store the extracted URLs files: ")
    # Prompt the user to input the concurrency level
    concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))
    # Extract URLs from the supported files in the specified directory and save
    # them to the output directory with the specified concurrency level
    extract_urls_from_directory(directory_path, output_directory, concurrency_level)


# Fix: a multiprocessing script MUST guard its entry point — under the
# 'spawn' start method (default on Windows and macOS) each worker re-imports
# this module, and without the guard every child would re-prompt and re-spawn
# workers recursively.
if __name__ == "__main__":
    main()