From 06e3399861af0387cc3ab501243e4069558a0179 Mon Sep 17 00:00:00 2001 From: datechnoman Date: Sat, 20 Jan 2024 12:28:23 +0000 Subject: [PATCH] Update warc_wat_url_processor.py --- warc_wat_url_processor.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/warc_wat_url_processor.py b/warc_wat_url_processor.py index e29de7e..562e815 100644 --- a/warc_wat_url_processor.py +++ b/warc_wat_url_processor.py @@ -38,14 +38,30 @@ def process_file(file_path): print(f"URLs written to {output_file_path}") # Use zstd command-line tool for compression - command = f'zstd -T0 -18 --long {output_file_path} -o {output_file_path}.zst' - subprocess.run(command, shell=True) - print(f"Compressed file saved as '{output_file_path}.zst'") + compressed_file_path = f'{output_file_path}.zst' + command_compress = f'zstd -T0 -14 --long {output_file_path} -o {compressed_file_path}' + subprocess.run(command_compress, shell=True) + print(f"Compressed file saved as '{compressed_file_path}'") # Remove the original gzipped file os.remove(file_path) print(f"Original file removed: {file_path}") + # Remove the original _urls.txt file + os.remove(output_file_path) + print(f"Original file removed: {output_file_path}") + + # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt + filename = os.path.basename(output_file_path).replace('_urls.txt', '') + command = f'sed -i "/{filename}/d" "urls_to_download.txt"' + if subprocess.run(command, shell=True).returncode == 0: + print(f"File {filename} has been successfully removed from urls_to_download.txt") + with open('urls_to_download.txt', 'r') as file: + remaining_count = sum(1 for line in file) + print(f"URLs remaining to be processed: {remaining_count}") + else: + print(f"Failed to remove {filename} from urls_to_download.txt") + def extract_urls_from_directory(directory_path): file_list = sorted(os.listdir(directory_path)) pool = Pool(processes=7)