diff --git a/mediafire_automated_cdx_processor.py b/mediafire_automated_cdx_processor.py
index 952f77f..b1be716 100644
--- a/mediafire_automated_cdx_processor.py
+++ b/mediafire_automated_cdx_processor.py
@@ -1,15 +1,19 @@
+# Import necessary libraries
 import os
 import subprocess
 import json
+import tarfile  # Use tarfile for creating tar archives
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse
 
+# Define constants
 MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
 ROOT_DIRECTORY = "/root/mediafire_files"
 CONCURRENCY = 10
 BATCH_SIZE = 10
 
+# Function to run the cdxsummary command
 def run_cdxsummary(file_path, json_filepath):
     # Construct the cdxsummary command
     cdxsummary_command = f"cdxsummary --json {file_path}"
@@ -54,6 +58,7 @@ def run_cdxsummary(file_path, json_filepath):
     except subprocess.CalledProcessError as e:
         print(f"Error running cdxsummary command: {e}")
 
+# Function to download a file using axel
 def download_file(url):
     # Strip newline characters from the URL
     url = url.strip()
@@ -65,6 +70,7 @@ def download_file(url):
     # Return the downloaded file path
     return os.path.basename(url)
 
+# Function to download files concurrently
 def download_files(urls):
     with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
         # Use map to get the file paths
@@ -72,6 +78,7 @@
 
     return downloaded_files
 
+# Function to process a batch of URLs
 def process_batch(urls, start_index, end_index):
     # Extract batch of URLs
     batch_urls = urls[start_index:end_index]
@@ -99,6 +106,7 @@ def process_batch(urls, start_index, end_index):
     for future in futures:
         future.result()
 
+# Function to run the Internet Archive (IA) search command
 def run_ia_command():
     # Get the current date formatted as YYYY-MM-DD
     current_date = datetime.now().strftime("%Y-%m-%d")
@@ -131,10 +139,12 @@ def run_ia_command():
     except subprocess.CalledProcessError as e:
         print(f"Error running IA search command: {e}")
 
+# Function to create the mediafire directory if it doesn't exist
 def create_mediafire_directory():
     if not os.path.exists(MEDIAFIRE_DIRECTORY):
         os.makedirs(MEDIAFIRE_DIRECTORY)
 
+# Function to move a file to the root folder
 def move_file(source_path):
     file_name = os.path.basename(source_path)
     destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
@@ -146,16 +156,19 @@ def move_file(source_path):
     os.rename(source_path, destination_path)
     print(f"Moved '{file_name}' to the root folder.")
 
+# Function to filter URLs to download based on filenames to exclude
 def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
     filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
     return filtered_urls
 
+# Function to extract the filename from a URL
 def extract_filename_from_url(url):
     # Extract the filename part from the URL
     parsed_url = urlparse(url)
     filename = os.path.basename(parsed_url.path)
     return filename
 
+# Main function
 def main():
     create_mediafire_directory()
 
@@ -166,7 +179,18 @@
         if filename.endswith(".cdx.json"):
             directory_output_file.write(f"{filename}\n")
 
-    run_ia_command()  # Run IA command after creating directory_output.txt
+    # Archive .cdx.json files in /opt/cdxfiles/mediafire older than 24 hours
+    older_than_24_hours = datetime.now() - timedelta(days=1)
+    for filename in os.listdir(MEDIAFIRE_DIRECTORY):
+        file_path = os.path.join(MEDIAFIRE_DIRECTORY, filename)
+        if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_24_hours.timestamp():
+            # Append to the existing tar file (mode 'a' creates it if absent)
+            with tarfile.open(os.path.join(MEDIAFIRE_DIRECTORY, 'mediafire_cdxfiles_archive.tar'), 'a') as tar:
+                tar.add(file_path, arcname=filename)
+            os.remove(file_path)
+            print(f"Added '{filename}' to 'mediafire_cdxfiles_archive.tar' and removed the JSON file.")
+
+    run_ia_command()  # Run IA command after processing older files
 
     # Read the URLs from the file
     with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
@@ -193,5 +217,6 @@ def main():
         end_index = min(i + BATCH_SIZE, len(filtered_urls))
         process_batch(filtered_urls, start_index, end_index)
 
+# Entry point
 if __name__ == "__main__":
     main()
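Reviewer note: the archiving step added to main() can be exercised in isolation before merging. Below is a minimal sketch of the same logic, assuming a hypothetical scratch directory /tmp/mediafire_test in place of the production MEDIAFIRE_DIRECTORY:

import os
import tarfile
from datetime import datetime, timedelta

# Hypothetical scratch directory for testing; the patched script uses MEDIAFIRE_DIRECTORY.
directory = "/tmp/mediafire_test"
archive_path = os.path.join(directory, "mediafire_cdxfiles_archive.tar")

# As in the patch, .cdx.json files modified more than 24 hours ago are archived and removed.
cutoff = (datetime.now() - timedelta(days=1)).timestamp()
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < cutoff:
        # tarfile mode 'a' appends to an uncompressed tar and creates it if absent
        with tarfile.open(archive_path, "a") as tar:
            tar.add(file_path, arcname=filename)
        os.remove(file_path)
        print(f"Archived and removed '{filename}'")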