Update warc_wat_url_processor.py

This commit is contained in:
datechnoman 2024-01-20 12:28:23 +00:00
parent 3ce09f46d7
commit 06e3399861

View File

@@ -38,14 +38,30 @@ def process_file(file_path):
print(f"URLs written to {output_file_path}")
# Use zstd command-line tool for compression
command = f'zstd -T0 -18 --long {output_file_path} -o {output_file_path}.zst'
subprocess.run(command, shell=True)
print(f"Compressed file saved as '{output_file_path}.zst'")
compressed_file_path = f'{output_file_path}.zst'
command_compress = f'zstd -T0 -14 --long {output_file_path} -o {compressed_file_path}'
subprocess.run(command_compress, shell=True)
print(f"Compressed file saved as '{compressed_file_path}'")
# Remove the original gzipped file
os.remove(file_path)
print(f"Original file removed: {file_path}")
# Remove the original _urls.txt file
os.remove(output_file_path)
print(f"Original file removed: {output_file_path}")
# Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
filename = os.path.basename(output_file_path).replace('_urls.txt', '')
command = f'sed -i "/{filename}/d" "urls_to_download.txt"'
if subprocess.run(command, shell=True).returncode == 0:
print(f"File {filename} has been successfully removed from urls_to_download.txt")
with open('urls_to_download.txt', 'r') as file:
remaining_count = sum(1 for line in file)
print(f"URLs remaining to be processed: {remaining_count}")
else:
print(f"Failed to remove {filename} from urls_to_download.txt")
def extract_urls_from_directory(directory_path):
file_list = sorted(os.listdir(directory_path))
pool = Pool(processes=7)