From 42be737ee6164961e544bd10232df066e82810da Mon Sep 17 00:00:00 2001 From: datechnoman Date: Mon, 8 Jan 2024 11:05:07 +0000 Subject: [PATCH] Code fix --- mediafire_automated_cdx_processor.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mediafire_automated_cdx_processor.py b/mediafire_automated_cdx_processor.py index b31065f..95361a8 100644 --- a/mediafire_automated_cdx_processor.py +++ b/mediafire_automated_cdx_processor.py @@ -7,8 +7,8 @@ from urllib.parse import urlparse MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire" ROOT_DIRECTORY = "/root/mediafire_files" -CONCURRENCY = 10 -BATCH_SIZE = 10 +CONCURRENCY = 6 +BATCH_SIZE = 6 def run_cdxsummary(file_path, json_filepath): # Construct the cdxsummary command @@ -50,9 +50,13 @@ def run_cdxsummary(file_path, json_filepath): print(f"Error running cdxsummary command: {e}") def download_file(url): + # Strip newline characters from the URL + url = url.strip() + # Command to download the file using axel command = f'axel -n 1 {url}' # Set concurrency to 1 for each individual file subprocess.run(command, shell=True) + # Return the downloaded file path return os.path.basename(url) @@ -73,13 +77,22 @@ def process_batch(urls, start_index, end_index): # Move files and run cdxsummary for each downloaded file with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor: - for file_path in downloaded_files: + futures = [] + + for file_url in downloaded_files: + # Extracting filename from the URL + file_name = os.path.basename(file_url) + # Construct file paths - file_path = os.path.join(os.getcwd(), file_path) - json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) + file_path = os.path.join(os.getcwd(), file_name) + json_filepath = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json")) # Run cdxsummary and delete .cdx.gz file - executor.submit(run_cdxsummary, file_path, json_filepath) + futures.append(executor.submit(run_cdxsummary, file_path, json_filepath)) + + # Wait for all tasks to complete before proceeding to the next batch + for future in futures: + future.result() def run_ia_command(): # Get the current date formatted as YYYY-MM-DD