From 42be737ee6164961e544bd10232df066e82810da Mon Sep 17 00:00:00 2001
From: datechnoman <datechnoman@hotmail.com>
Date: Mon, 8 Jan 2024 11:05:07 +0000
Subject: [PATCH] Strip URL whitespace, fix JSON output path, wait for batch tasks, lower concurrency to 6

---
 mediafire_automated_cdx_processor.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/mediafire_automated_cdx_processor.py b/mediafire_automated_cdx_processor.py
index b31065f..95361a8 100644
--- a/mediafire_automated_cdx_processor.py
+++ b/mediafire_automated_cdx_processor.py
@@ -7,8 +7,8 @@ from urllib.parse import urlparse
 
 MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
 ROOT_DIRECTORY = "/root/mediafire_files"
-CONCURRENCY = 10
-BATCH_SIZE = 10
+CONCURRENCY = 6
+BATCH_SIZE = 6
 
 def run_cdxsummary(file_path, json_filepath):
     # Construct the cdxsummary command
@@ -50,9 +50,13 @@ def run_cdxsummary(file_path, json_filepath):
         print(f"Error running cdxsummary command: {e}")
 
 def download_file(url):
+    # Strip leading/trailing whitespace (including the trailing newline) from the URL
+    url = url.strip()
+
     # Command to download the file using axel
     command = f'axel -n 1 {url}'  # Set concurrency to 1 for each individual file
     subprocess.run(command, shell=True)
+
     # Return the downloaded file path
     return os.path.basename(url)
 
@@ -73,13 +77,22 @@ def process_batch(urls, start_index, end_index):
 
     # Move files and run cdxsummary for each downloaded file
     with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
-        for file_path in downloaded_files:
+        futures = []
+
+        for file_url in downloaded_files:
+            # Extract the filename from the URL
+            file_name = os.path.basename(file_url)
+
             # Construct file paths
-            file_path = os.path.join(os.getcwd(), file_path)
-            json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
+            file_path = os.path.join(os.getcwd(), file_name)
+            json_filepath = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
 
             # Run cdxsummary and delete .cdx.gz file
-            executor.submit(run_cdxsummary, file_path, json_filepath)
+            futures.append(executor.submit(run_cdxsummary, file_path, json_filepath))
+
+        # Wait for all tasks to complete before proceeding to the next batch
+        for future in futures:
+            future.result()
 
 def run_ia_command():
     # Get the current date formatted as YYYY-MM-DD