Add mediafire_automated_cdx_processor.py
This commit is contained in:
parent
b2a9c44c9e
commit
2e938624f9
146
mediafire_automated_cdx_processor.py
Normal file
@@ -0,0 +1,146 @@
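"""Automated CDX summary pipeline for the archiveteam_mediafire collection.

Searches the Internet Archive for items added to archiveteam_mediafire in the
last two days, downloads each item's .cdx.gz file with axel, summarizes it
with cdxsummary, writes the trimmed JSON summary to MEDIAFIRE_DIRECTORY, and
deletes the .cdx.gz file afterwards.
"""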
import os
import subprocess
import json
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor

MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
CONCURRENCY = 4  # Set the desired concurrency for downloading multiple files
BATCH_SIZE = 6  # Set the batch size for processing
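
# Note: the "cdxsummary" command invoked below is assumed to come from the
# cdx-summary package (pip install cdx-summary); it must be on PATH.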

def run_cdxsummary(file_path, json_filepath):
    # Construct the cdxsummary command
    cdxsummary_command = f"cdxsummary --json {file_path}"

    try:
        # Run the cdxsummary command and capture the output
        result = subprocess.run(cdxsummary_command, shell=True, capture_output=True, text=True, check=True)

        # Parse the JSON output
        json_output = json.loads(result.stdout)

        # Add a "cdxcount" entry with value 1
        json_output["cdxcount"] = 1

        # Add a "cdxsize" entry with the size of the .cdx.gz file in bytes
        json_output["cdxsize"] = os.path.getsize(file_path)

        # Remove the "pathquery", "samples", and "tophosts" keys if present
        for key in ("pathquery", "samples", "tophosts"):
            json_output.pop(key, None)

        # Write the trimmed JSON summary to a file
        with open(json_filepath, "w") as json_file:
            json.dump(json_output, json_file, indent=2)

        print(f"Created JSON file for '{file_path}': '{json_filepath}'")

        # Delete the .cdx.gz file now that it has been summarized
        os.remove(file_path)
        print(f"Deleted '{file_path}' after processing.")

    except subprocess.CalledProcessError as e:
        print(f"Error running cdxsummary command: {e}")

def download_file(url):
    # axel -n 1: one connection per file; parallelism comes from the thread pool
    command = f"axel -n 1 {url}"
    subprocess.run(command, shell=True)
    # Return the downloaded file name (axel saves to the URL's basename)
    return os.path.basename(url)

def download_files(urls):
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        # Map each URL to the name of its downloaded file
        downloaded_files = list(executor.map(download_file, urls))

    return downloaded_files
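
# For example (hypothetical identifier):
#   download_files(["https://archive.org/download/some_item/some_item.cdx.gz"])
# downloads the file into the working directory and returns ["some_item.cdx.gz"].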

def process_batch(urls, start_index, end_index):
    # Extract this batch of URLs
    batch_urls = urls[start_index:end_index]
    print("\nDownloading Batch...\n")

    # Download the files concurrently
    downloaded_files = download_files(batch_urls)

    # Run cdxsummary on each downloaded file
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        for file_name in downloaded_files:
            # Construct file paths; joining the basename (not an absolute path)
            # keeps the JSON summary inside MEDIAFIRE_DIRECTORY
            file_path = os.path.join(os.getcwd(), file_name)
            json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))

            # Summarize, then delete the .cdx.gz file
            executor.submit(run_cdxsummary, file_path, json_filepath)
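
# Leaving the ThreadPoolExecutor context above waits for every submitted
# run_cdxsummary task, so each batch finishes before the next one starts.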

def run_ia_command():
    # Get the current date formatted as YYYY-MM-DD
    current_date = datetime.now().strftime("%Y-%m-%d")

    # Calculate the date 2 days before the current date
    two_days_before = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")

    # Search the archiveteam_mediafire collection for items added in that range
    ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist"

    # Output file for the ia search results
    output_file = "ia_search_results.txt"

    try:
        # Run the ia search command and write the item list to a text file
        with open(output_file, "w") as output:
            subprocess.run(ia_search_command, shell=True, stdout=output, check=True)

        # Turn each item identifier into its .cdx.gz download URL
        with open(output_file, "r") as input_file:
            lines = input_file.readlines()
        lines = [f"https://archive.org/download/{line.strip()}/{line.strip()}.cdx.gz\n" for line in lines]

        # Write the rewritten URLs back to the file
        with open(output_file, "w") as output:
            output.writelines(lines)

        print(f"IA search results written to '{output_file}' as download URLs.")

    except subprocess.CalledProcessError as e:
        print(f"Error running IA search command: {e}")

def create_mediafire_directory():
    # Create the destination directory for the JSON summaries if it is missing
    if not os.path.exists(MEDIAFIRE_DIRECTORY):
        os.makedirs(MEDIAFIRE_DIRECTORY)

def move_file(source_path):
    file_name = os.path.basename(source_path)
    destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name)

    # Leave .cdx.gz files in place; run_cdxsummary deletes them after processing
    if file_name.endswith(".cdx.gz"):
        print(f"Skipping move for '{file_name}'.")
    else:
        os.rename(source_path, destination_path)
        print(f"Moved '{file_name}' to '{MEDIAFIRE_DIRECTORY}'.")

def main():
    run_ia_command()  # Run the IA search first
    create_mediafire_directory()

    # Read the URLs from the file
    with open("ia_search_results.txt", "r") as file:
        urls = file.readlines()

    # Remove leading/trailing whitespace from the URLs
    urls = [url.strip() for url in urls]

    # Process the URLs in batches
    for i in range(0, len(urls), BATCH_SIZE):
        start_index = i
        end_index = min(i + BATCH_SIZE, len(urls))
        process_batch(urls, start_index, end_index)
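
# Usage sketch: run the script directly, e.g.
#   python mediafire_automated_cdx_processor.py
# It expects the "ia", "axel", and "cdxsummary" CLIs on PATH and writes the
# JSON summaries to /opt/MediaFire.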

if __name__ == "__main__":
    main()