diff --git a/warc_wat_url_processor.py b/warc_wat_url_processor.py
index 7b7f177..620af0b 100644
--- a/warc_wat_url_processor.py
+++ b/warc_wat_url_processor.py
@@ -43,10 +43,11 @@ def process_file(file_path):
     command_compress = f'zstd -T0 -12 --long {output_file_path} -o {compressed_file_path}'
 
     # Run the compression command synchronously and wait for it to complete
-    compression_process = subprocess.Popen(command_compress, shell=True)
-    compression_process.communicate()
-
-    print(f"Compressed file saved as '{compressed_file_path}'")
+    result = subprocess.run(command_compress, shell=True)
+    if result.returncode == 0:
+        print(f"Compressed file saved as '{compressed_file_path}'")
+    else:
+        print(f"Compression failed for '{output_file_path}'")
 
     # Remove the original gzipped file
     os.remove(file_path)
@@ -59,7 +60,8 @@ def process_file(file_path):
     # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
     filename = os.path.basename(output_file_path).replace('_urls.txt', '')
     command = f'sed -i "/{filename}/d" "urls_to_download.txt"'
-    if subprocess.run(command, shell=True).returncode == 0:
+    result = subprocess.run(command, shell=True)
+    if result.returncode == 0:
         print(f"File {filename} has been successfully removed from urls_to_download.txt")
         with open('urls_to_download.txt', 'r') as file:
             remaining_count = sum(1 for line in file)
@@ -73,10 +75,14 @@ def process_file(file_path):
 
 def download_and_process_file(url):
     try:
-        command = f'axel -n 4 {url}'
-        subprocess.run(command, shell=True)
-        file_path = os.path.join(os.getcwd(), os.path.basename(url))
-        process_file(file_path)
+        command = f'axel -n 3 {url}'
+        result = subprocess.run(command, shell=True)
+        if result.returncode == 0:
+            file_path = os.path.join(os.getcwd(), os.path.basename(url))
+            process_file(file_path)
+        else:
+            print(f"Download failed for {url}")
+
     except Exception as e:
         print(f"Error during download and processing {url}: {e}")
 
@@ -93,7 +99,8 @@ def main():
             futures = [executor.submit(download_and_process_file, url) for url in urls]
 
             # Wait for all downloads and processing to complete before starting the next iteration
-            wait(futures)
+            for future in as_completed(futures):
+                future.result()
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
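
A note on the waiting pattern in the last hunk: concurrent.futures.wait(futures)
blocks until every future is done, while as_completed(futures) only returns an
iterator and blocks as that iterator is consumed, yielding each future when it
finishes. A minimal sketch of the consuming loop, assuming a ThreadPoolExecutor
and an illustrative work() stand-in for download_and_process_file (neither name
appears in the patch):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def work(n):
        # Illustrative stand-in for download_and_process_file
        return n * n

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(work, n) for n in range(8)]
        # The loop blocks until each future finishes; result() also re-raises
        # any exception the worker raised, rather than silently discarding it.
        for future in as_completed(futures):
            print(future.result())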