From 182c58f1ce5d1a7100902ec9ef7417d070e70180 Mon Sep 17 00:00:00 2001 From: datechnoman Date: Wed, 27 Dec 2023 08:46:36 +0000 Subject: [PATCH] Implemented subprocess for running multiple json extractions at once --- mediafire_cdxprocessor.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mediafire_cdxprocessor.py b/mediafire_cdxprocessor.py index dd23444..fe0d677 100644 --- a/mediafire_cdxprocessor.py +++ b/mediafire_cdxprocessor.py @@ -1,8 +1,8 @@ import os import subprocess -from concurrent.futures import ThreadPoolExecutor import json from datetime import datetime +from concurrent.futures import ThreadPoolExecutor MEDIAFIRE_DIRECTORY = "/opt/MediaFire" CONCURRENCY = 4 # Set the desired concurrency for downloading multiple files @@ -62,13 +62,14 @@ def process_batch(urls, start_index, end_index): downloaded_files = download_files(batch_urls) # Move files and run cdxsummary for each downloaded file - for file_path in downloaded_files: - # Construct file paths - file_path = os.path.join(os.getcwd(), file_path) - json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) + with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor: + for file_path in downloaded_files: + # Construct file paths + file_path = os.path.join(os.getcwd(), file_path) + json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) - # Run cdxsummary and delete .cdx.gz file - run_cdxsummary(file_path, json_filepath) + # Run cdxsummary and delete .cdx.gz file + executor.submit(run_cdxsummary, file_path, json_filepath) def run_ia_command(): # Get the current date formatted as YYYY-MM-DD @@ -133,4 +134,3 @@ def main(): if __name__ == "__main__": main() -