From b6a9c68140b4d756a7a72db7b25b3bd1e904fafb Mon Sep 17 00:00:00 2001 From: datechnoman Date: Sun, 28 Jan 2024 09:14:12 +0000 Subject: [PATCH] Update warc_wat_url_processor.py --- warc_wat_url_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warc_wat_url_processor.py b/warc_wat_url_processor.py index 3f2b6b3..a4fe03a 100644 --- a/warc_wat_url_processor.py +++ b/warc_wat_url_processor.py @@ -48,7 +48,6 @@ def process_file(file_path): # Extract URLs from the gzipped file urls = extract_urls_from_file(file_path) - print(f"Extracted {len(urls)} URLs from {file_path}") # Create the output file path with '_urls.txt' extension output_file_path = os.path.splitext(file_path)[0] + '_urls.txt' @@ -121,13 +120,18 @@ def main(): download_concurrency_level = 40 with ProcessPoolExecutor(max_workers=download_concurrency_level) as executor: + print("Submitting tasks to the ProcessPoolExecutor...") futures = [executor.submit(download_and_process_file, url) for url in urls] + print(f"Submitted {len(futures)} tasks.") + print("Waiting for tasks to complete...") completed_futures, _ = wait(futures) + print(f"{len(completed_futures)} tasks completed.") for completed_future in completed_futures: try: result = completed_future.result() + print(f"Task result: {result}") # Process the result if needed except Exception as e: print(f"Error in processing future: {e}")