Updated script to support .txt files
parent 5f582ab369
commit 112814dd35
@@ -1,53 +1,53 @@
 import os
 import gzip
 import re
 from multiprocessing import Pool
 
 def extract_urls_from_line(line):
     # Extract URLs using regular expression
     url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
     return re.findall(url_pattern, line)
 
 def process_file(file_path, output_directory):
     # Create the output file path with '_urls.txt' extension in the specified output directory
     output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')
 
     # Process the file line by line and extract URLs
     with gzip.open(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:
         for line in file:
             # Extract URLs from the line
             line_urls = extract_urls_from_line(line)
 
             # Write the URLs to the output file
             output_file.write('\n'.join(line_urls) + '\n')
 
     print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")
 
     # Remove the original gzipped file
     os.remove(file_path)
 
 def extract_urls_from_directory(directory_path, output_directory, concurrency_level):
     # Get the list of files in the directory and sort them
     file_list = sorted(os.listdir(directory_path))
 
     # Create a multiprocessing Pool with the specified concurrency level
     pool = Pool(processes=concurrency_level)
 
-    # Map the file processing function to the list of files with '.txt.gz' and '.csv.gz' extensions
-    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt.gz', '.csv.gz'))])
+    # Map the file processing function to the list of files with '.txt', '.txt.gz', and '.csv.gz' extensions
+    pool.starmap(process_file, [(os.path.join(directory_path, filename), output_directory) for filename in file_list if filename.lower().endswith(('.txt', '.txt.gz', '.csv.gz'))])
 
     # Close the pool to free up resources
     pool.close()
     pool.join()
 
 # Prompt the user to input the directory containing the files
-directory_path = input("Enter the directory path containing the '.txt.gz' and '.csv.gz' files to extract URLs from: ")
+directory_path = input("Enter the directory path containing the '.txt', '.txt.gz', and '.csv.gz' files to extract URLs from: ")
 
 # Prompt the user to input the output directory
 output_directory = input("Enter the output directory path to store the extracted URLs files: ")
 
 # Prompt the user to input the concurrency level
 concurrency_level = int(input("Enter the concurrency level (number of processes running concurrently): "))
 
-# Extract URLs from the '.txt.gz' and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
+# Extract URLs from the '.txt', '.txt.gz', and '.csv.gz' files in the specified directory and save them to the output directory with the specified concurrency level
 extract_urls_from_directory(directory_path, output_directory, concurrency_level)
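
Note (not part of the commit): the filter now admits plain '.txt' files, but process_file still opens every input with gzip.open, which raises gzip.BadGzipFile on uncompressed text, and the final os.remove deletes plain '.txt' inputs as well as gzipped ones. Below is a minimal sketch of one way to reconcile process_file with the new filter, assuming the intent is to read non-'.gz' inputs as plain text; the extension-based opener selection is an assumption, not committed code.

import gzip
import os
import re

def extract_urls_from_line(line):
    # Same pattern as the script
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return re.findall(url_pattern, line)

def process_file(file_path, output_directory):
    output_file_path = os.path.join(output_directory, os.path.splitext(os.path.basename(file_path))[0] + '_urls.txt')

    # Assumption: pick the opener by extension so plain '.txt' inputs are
    # read with open() instead of gzip.open(), which would raise
    # gzip.BadGzipFile on uncompressed text.
    opener = gzip.open if file_path.lower().endswith('.gz') else open

    with opener(file_path, 'rt', encoding='latin-1') as file, open(output_file_path, 'w') as output_file:
        for line in file:
            line_urls = extract_urls_from_line(line)
            output_file.write('\n'.join(line_urls) + '\n')

    print(f"URLs extracted from '{file_path}' and saved to '{output_file_path}'")

    # Caution: this removes plain '.txt' inputs too, not only gzipped files.
    os.remove(file_path)

Separately, because the new filter matches any name ending in '.txt', pointing output_directory at the input directory would let a later run treat the generated '_urls.txt' files as inputs (and delete them); keeping the two directories distinct avoids that.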