From e98f80aec4ea3ba1cd3aed8d9e7e7999b4f3afbf Mon Sep 17 00:00:00 2001 From: datechnoman Date: Sat, 20 Jan 2024 11:29:57 +0000 Subject: [PATCH] Updated to use command line zstd --- warc_wat_url_processor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/warc_wat_url_processor.py b/warc_wat_url_processor.py index 6aade3b..e29de7e 100644 --- a/warc_wat_url_processor.py +++ b/warc_wat_url_processor.py @@ -1,7 +1,6 @@ import subprocess import os import gzip -import zstandard as zstd import re import traceback from multiprocessing import Pool @@ -20,7 +19,7 @@ def extract_urls_from_file(file_path): print(f"An unexpected error occurred while processing '{file_path}': {e}") print("Full traceback:") traceback.print_exc() - + return urls def process_file(file_path): @@ -38,10 +37,9 @@ def process_file(file_path): output_file.write('\n'.join(urls)) print(f"URLs written to {output_file_path}") - # Compress the output file using zstd with compression level -18 - with open(output_file_path, 'rb') as input_file, open(output_file_path + '.zst', 'wb') as output_zstd_file: - cctx = zstd.ZstdCompressor(level=18) - output_zstd_file.write(cctx.compress(input_file.read())) + # Use zstd command-line tool for compression + command = f'zstd -T0 -18 --long {output_file_path} -o {output_file_path}.zst' + subprocess.run(command, shell=True) print(f"Compressed file saved as '{output_file_path}.zst'") # Remove the original gzipped file