Update mediafire_automated_cdx_processor.py

2024-01-08 10:44:33 +00:00 · 2024-01-08 10:44:33 +00:00 · 2bf2e02275
commit 2bf2e02275
parent d56d7bb81f
1 changed files with 9 additions and 8 deletions
--- a/mediafire_automated_cdx_processor.py
+++ b/mediafire_automated_cdx_processor.py
@@ -6,8 +6,9 @@ from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse

 MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
-CONCURRENCY = 6
-BATCH_SIZE = 6
+ROOT_DIRECTORY = "/root/mediafire_files"
+CONCURRENCY = 10
+BATCH_SIZE = 10

 def run_cdxsummary(file_path, json_filepath):
    # Construct the cdxsummary command
@@ -75,7 +76,7 @@ def process_batch(urls, start_index, end_index):
        for file_path in downloaded_files:
            # Construct file paths
            file_path = os.path.join(os.getcwd(), file_path)
-            json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
+            json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))

            # Run cdxsummary and delete .cdx.gz file
            executor.submit(run_cdxsummary, file_path, json_filepath)
@@ -91,7 +92,7 @@ def run_ia_command():
    ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist"

    # Output file for ia search results
-    output_file = "ia_search_results.txt"
+    output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")

    try:
        # Run the ia search command and write the output to a text file
@@ -118,7 +119,7 @@ def create_mediafire_directory():

 def move_file(source_path):
    file_name = os.path.basename(source_path)
-    destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
+    destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))

    # Check if the file has a ".cdx.gz" extension before moving
    if file_name.endswith(".cdx.gz"):
@@ -141,7 +142,7 @@ def main():
    create_mediafire_directory()

    # Create directory_output.txt
-    directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt')
+    directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
    with open(directory_output_path, 'w') as directory_output_file:
        for filename in os.listdir(MEDIAFIRE_DIRECTORY):
            if filename.endswith(".cdx.json"):
@@ -150,7 +151,7 @@ def main():
    run_ia_command()  # Run IA command after creating directory_output.txt

    # Read the URLs from the file
-    with open('ia_search_results.txt', 'r') as ia_search_file:
+    with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
        ia_search_urls = ia_search_file.readlines()

    # Extract filenames from URLs
@@ -164,7 +165,7 @@ def main():
    filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)

    # Write filtered URLs to urls_to_download.txt
-    urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt')
+    urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
    with open(urls_to_download_path, 'w') as urls_to_download_file:
        urls_to_download_file.writelines(filtered_urls)