Add urls_automated_cdx_processor.py
This commit is contained in:
parent 9cb12880cc
commit 7106415581
242
urls_automated_cdx_processor.py
Normal file
@@ -0,0 +1,242 @@
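"""Automated CDX processor for archiveteam_urls items.

Prunes .cdx.json summaries older than 48 hours from /opt/cdxfiles/urls,
searches the Internet Archive for recently added items in the
archiveteam_urls collection, downloads each new item's .cdx.gz file in
batches with axel, summarizes it with cdxsummary, splits the "tophosts"
data into a separate JSON file, and moves the finished .cdx.json summary
to /opt/cdxfiles/urls.
"""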
# Import necessary libraries
import os
import subprocess
import json
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

# Define constants
URLS_DIRECTORY = "/opt/cdxfiles/urls"
ROOT_DIRECTORY = "/root/urls_files"
CONCURRENCY = 5
BATCH_SIZE = 5

# Function to run the cdxsummary command
def run_cdxsummary(file_path, json_filepath):
    # Construct the cdxsummary command
    cdxsummary_command = f"/usr/local/bin/cdxsummary -t 30 --json {file_path}"

    try:
        # Run the cdxsummary command and capture the output
        result = subprocess.run(cdxsummary_command, shell=True, capture_output=True, text=True, check=True)

        # Parse the JSON output
        json_output = json.loads(result.stdout)

        # Add "cdxcount" entry with value 1
        json_output["cdxcount"] = 1

        # Add "cdxsize" entry with the size of the .cdx.gz file in bytes
        cdx_size_bytes = os.path.getsize(file_path)
        json_output["cdxsize"] = cdx_size_bytes

        # Remove "pathquery" and "samples" keys
        if "pathquery" in json_output:
            del json_output["pathquery"]
        if "samples" in json_output:
            del json_output["samples"]

        # Write the JSON output to a file
        with open(json_filepath, "w") as json_file:
            json.dump(json_output, json_file, indent=2)

        print(f"Created JSON file for '{file_path}': '{json_filepath}'")

        # Write the tophosts section to a separate JSON file
        tophosts_output = {"tophosts": json_output.get("tophosts", {})}  # Extract tophosts section
        tophosts_filepath = os.path.join(ROOT_DIRECTORY, os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json"))
        with open(tophosts_filepath, "w") as tophosts_file:
            json.dump(tophosts_output, tophosts_file, indent=2)

        print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")

        # Remove the tophosts section from the JSON output
        if "tophosts" in json_output:
            del json_output["tophosts"]

        # Write the modified JSON output back to the original file
        with open(json_filepath, "w") as json_file:
            json.dump(json_output, json_file, indent=2)

        print(f"Removed 'tophosts' section from '{json_filepath}'")

        # Copy "first" and "last" timestamps to the tophosts JSON file
        if "first" in json_output and "last" in json_output:
            tophosts_output["first"] = json_output["first"]
            tophosts_output["last"] = json_output["last"]
            with open(tophosts_filepath, "w") as tophosts_file:
                json.dump(tophosts_output, tophosts_file, indent=2)
            print(f"Copied 'first' and 'last' timestamps to '{tophosts_filepath}'")

        # Move the JSON file to /opt/cdxfiles/urls
        destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath))
        os.rename(json_filepath, destination_path)
        print(f"Moved '{os.path.basename(json_filepath)}' to '{URLS_DIRECTORY}'.")

        # Delete the .cdx.gz file
        os.remove(file_path)
        print(f"Deleted '{file_path}' after processing.")

    except subprocess.CalledProcessError as e:
        print(f"Error running cdxsummary command: {e}")

# Function to download a file using axel
def download_file(url):
    # Strip newline characters from the URL
    url = url.strip()

    # Command to download the file using axel
    command = f'axel -n 1 {url}'  # Set concurrency to 1 for each individual file
    subprocess.run(command, shell=True)

    # Return the downloaded file name (axel saves it into the current working directory)
    return os.path.basename(url)

# Function to download files concurrently
def download_files(urls):
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        # Use map to get the file paths
        downloaded_files = list(executor.map(download_file, urls))

    return downloaded_files

# Function to process a batch of URLs
def process_batch(urls, start_index, end_index):
    # Extract batch of URLs
    batch_urls = urls[start_index:end_index]
    print("\nDownloading Batch...\n")

    # Download files concurrently
    downloaded_files = download_files(batch_urls)

    # Move files and run cdxsummary for each downloaded file
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        futures = []

        for file_url in downloaded_files:
            # Extracting filename from the URL
            file_name = os.path.basename(file_url)

            # Construct file paths
            file_path = os.path.join(os.getcwd(), file_name)
            json_filepath = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))

            # Run cdxsummary and delete the .cdx.gz file
            futures.append(executor.submit(run_cdxsummary, file_path, json_filepath))

        # Wait for all tasks to complete before proceeding to the next batch
        for future in futures:
            future.result()

# Function to run the Internet Archive (IA) search command
def run_ia_command():
    # Get the current date formatted as YYYY-MM-DD
    current_date = datetime.now().strftime("%Y-%m-%d")

    # Calculate 2 days before the current date
    two_days_before = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")

    # Search the archiveteam_urls collection for items added within the date range
    ia_search_command = f"/usr/local/bin/ia search 'collection:archiveteam_urls addeddate:[{two_days_before} TO {current_date}]' --itemlist"

    # Output file for ia search results
    output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")

    try:
        # Run the ia search command and write the output to a text file
        with open(output_file, "w") as output:
            subprocess.run(ia_search_command, shell=True, stdout=output, check=True)

        # Read the item names and turn each into a download URL for its .cdx.gz file
        with open(output_file, "r") as input_file:
            lines = input_file.readlines()
            lines = [f"https://archive.org/download/{line.strip()}/{line.strip()}.cdx.gz\n" for line in lines]

        # Write the modified lines back to the file
        with open(output_file, "w") as results_file:
            results_file.writelines(lines)

        print(f"IA search results written to '{output_file}' with URLs appended.")

    except subprocess.CalledProcessError as e:
        print(f"Error running IA search command: {e}")

# Function to create the urls directory if it doesn't exist
def create_urls_directory():
    if not os.path.exists(URLS_DIRECTORY):
        os.makedirs(URLS_DIRECTORY)

# Function to move a file to the root folder
def move_file(source_path):
    file_name = os.path.basename(source_path)
    destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))

    # Check if the file has a ".cdx.gz" extension before moving
    if file_name.endswith(".cdx.gz"):
        print(f"Skipping move for '.cdx.gz' file '{file_name}'.")
    else:
        os.rename(source_path, destination_path)
        print(f"Moved '{file_name}' to the root folder.")

# Function to filter URLs to download based on filenames to exclude
def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
    filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
    return filtered_urls

# Function to extract filename from URL
def extract_filename_from_url(url):
    # Extract the filename part from the URL
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    return filename

# Main function
def main():
    create_urls_directory()

    # Create directory_output.txt
    directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
    with open(directory_output_path, 'w') as directory_output_file:
        for filename in os.listdir(URLS_DIRECTORY):
            if filename.endswith(".cdx.json"):
                directory_output_file.write(f"{filename}\n")

    # Delete older files in /opt/cdxfiles/urls
    older_than_48_hours = datetime.now() - timedelta(days=2)
    for filename in os.listdir(URLS_DIRECTORY):
        file_path = os.path.join(URLS_DIRECTORY, filename)
        if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_48_hours.timestamp():
            os.remove(file_path)
            print(f"Deleted '{filename}' as it is older than 48 hours.")

    run_ia_command()  # Run IA command after processing older files

    # Read the URLs from the file
    with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
        ia_search_urls = ia_search_file.readlines()

    # Extract filenames from URLs
    ia_search_filenames = [extract_filename_from_url(url.strip()) for url in ia_search_urls]

    # Read the filenames from directory_output.txt and remove the .cdx.json extension
    with open(directory_output_path, 'r') as directory_output_file:
        directory_filenames = [line.strip().replace(".cdx.json", "") for line in directory_output_file.readlines()]

    # Filter out URLs whose filenames have already been processed
    filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)

    # Write filtered URLs to urls_to_download.txt
    urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
    with open(urls_to_download_path, 'w') as urls_to_download_file:
        urls_to_download_file.writelines(filtered_urls)

    # Process URLs in batches
    for i in range(0, len(filtered_urls), BATCH_SIZE):
        start_index = i
        end_index = min(i + BATCH_SIZE, len(filtered_urls))
        process_batch(filtered_urls, start_index, end_index)

# Entry point
if __name__ == "__main__":
    main()