Added tar support for files older than 24 hours to keep cdx file count low

2024-01-09 00:19:22 +00:00 · 2024-01-09 00:19:22 +00:00 · 457f32ed61
commit 457f32ed61
parent f5928594a5
1 changed files with 26 additions and 1 deletions
--- a/mediafire_automated_cdx_processor.py
+++ b/mediafire_automated_cdx_processor.py
@@ -1,15 +1,19 @@
+# Import necessary libraries
 import os
 import subprocess
 import json
+import tarfile  # Use tarfile for creating tar archives
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse

+# Define constants
 MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
 ROOT_DIRECTORY = "/root/mediafire_files"
 CONCURRENCY = 10
 BATCH_SIZE = 10

+# Function to run cdxsummary command
 def run_cdxsummary(file_path, json_filepath):
    # Construct the cdxsummary command
    cdxsummary_command = f"cdxsummary --json {file_path}"
@@ -54,6 +58,7 @@ def run_cdxsummary(file_path, json_filepath):
    except subprocess.CalledProcessError as e:
        print(f"Error running cdxsummary command: {e}")

+# Function to download a file using axel
 def download_file(url):
    # Strip newline characters from the URL
    url = url.strip()
@@ -65,6 +70,7 @@ def download_file(url):
    # Return the downloaded file path
    return os.path.basename(url)

+# Function to download files concurrently
 def download_files(urls):
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        # Use map to get the file paths
@@ -72,6 +78,7 @@ def download_files(urls):

    return downloaded_files

+# Function to process a batch of URLs
 def process_batch(urls, start_index, end_index):
    # Extract batch of URLs
    batch_urls = urls[start_index:end_index]
@@ -99,6 +106,7 @@ def process_batch(urls, start_index, end_index):
        for future in futures:
            future.result()

+# Function to run the Internet Archive (IA) search command
 def run_ia_command():
    # Get the current date formatted as YYYY-MM-DD
    current_date = datetime.now().strftime("%Y-%m-%d")
@@ -131,10 +139,12 @@ def run_ia_command():
    except subprocess.CalledProcessError as e:
        print(f"Error running IA search command: {e}")

+# Function to create the mediafire directory if it doesn't exist
 def create_mediafire_directory():
    if not os.path.exists(MEDIAFIRE_DIRECTORY):
        os.makedirs(MEDIAFIRE_DIRECTORY)

+# Function to move a file to the root folder
 def move_file(source_path):
    file_name = os.path.basename(source_path)
    destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
@@ -146,16 +156,19 @@ def move_file(source_path):
        os.rename(source_path, destination_path)
        print(f"Moved '{file_name}' to the root folder.")

+# Function to filter URLs to download based on filenames to exclude
 def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
    filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
    return filtered_urls

+# Function to extract filename from URL
 def extract_filename_from_url(url):
    # Extract the filename part from the URL
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    return filename

+# Main function
 def main():
    create_mediafire_directory()

@@ -166,7 +179,18 @@ def main():
            if filename.endswith(".cdx.json"):
                directory_output_file.write(f"{filename}\n")

-    run_ia_command()  # Run IA command after creating directory_output.txt
+    # Process older files in /opt/cdxfiles/mediafire
+    older_than_24_hours = datetime.now() - timedelta(days=1)
+    for filename in os.listdir(MEDIAFIRE_DIRECTORY):
+        file_path = os.path.join(MEDIAFIRE_DIRECTORY, filename)
+        if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_24_hours.timestamp():
+            # Appending to the existing tar file
+            with tarfile.open(os.path.join(MEDIAFIRE_DIRECTORY, 'mediafire_cdxfiles_archive.tar'), 'a') as tar:
+                tar.add(file_path, arcname=filename)
+            os.remove(file_path)
+            print(f"Added '{filename}' to 'mediafire_cdxfiles_archive.tar' and removed the JSON file.")
+
+    run_ia_command()  # Run IA command after processing older files

    # Read the URLs from the file
    with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
@@ -193,5 +217,6 @@ def main():
        end_index = min(i + BATCH_SIZE, len(filtered_urls))
        process_batch(filtered_urls, start_index, end_index)

+# Entry point
 if __name__ == "__main__":
    main()