diff --git a/warc_wat_url_processor.py b/warc_wat_url_processor.py index 620af0b..cedf98c 100644 --- a/warc_wat_url_processor.py +++ b/warc_wat_url_processor.py @@ -42,12 +42,15 @@ def process_file(file_path): compressed_file_path = f'{output_file_path}.zst' command_compress = f'zstd -T0 -12 --long {output_file_path} -o {compressed_file_path}' - # Run the compression command synchronously and wait for it to complete - result = subprocess.run(command_compress, shell=True, check=True) - if result.returncode == 0: - print(f"Compressed file saved as '{compressed_file_path}'") - else: - print(f"Compression failed for '{output_file_path}'") + try: + # Run the compression command synchronously and wait for it to complete + result = subprocess.run(command_compress, shell=True, check=True) + if result.returncode == 0: + print(f"Compressed file saved as '{compressed_file_path}'") + else: + print(f"Compression failed for '{output_file_path}'") + except subprocess.CalledProcessError as compression_error: + print(f"Compression failed for '{output_file_path}': {compression_error}") # Remove the original gzipped file os.remove(file_path) @@ -92,7 +95,7 @@ def main(): urls = [url.strip() for url in urls] - download_concurrency_level = 4 + download_concurrency_level = 40 # Start downloading and processing files in parallel with ProcessPoolExecutor(max_workers=download_concurrency_level) as executor: