diff --git a/mediafire_automated_cdx_processor.py b/mediafire_automated_cdx_processor.py
index f46619e..cc27b4c 100644
--- a/mediafire_automated_cdx_processor.py
+++ b/mediafire_automated_cdx_processor.py
@@ -3,10 +3,11 @@
 import subprocess
 import json
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
 
-MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
-CONCURRENCY = 6  # Set the desired concurrency for downloading multiple files
-BATCH_SIZE = 6  # Set the batch size for processing
+MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
+CONCURRENCY = 6
+BATCH_SIZE = 6
 def run_cdxsummary(file_path, json_filepath):
     # Construct the cdxsummary command
@@ -48,6 +49,7 @@ def run_cdxsummary(file_path, json_filepath):
         print(f"Error running cdxsummary command: {e}")
 
 def download_file(url):
+    # Command to download the file using axel
     command = f'axel -n 1 {url}'  # Set concurrency to 1 for each individual file
     subprocess.run(command, shell=True)
     # Return the downloaded file path
@@ -125,22 +127,52 @@ def move_file(source_path):
     os.rename(source_path, destination_path)
     print(f"Moved '{file_name}' to the root folder.")
 
+def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
+    filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
+    return filtered_urls
+
+def extract_filename_from_url(url):
+    # Extract the filename part from the URL
+    parsed_url = urlparse(url)
+    filename = os.path.basename(parsed_url.path)
+    return filename
+
 def main():
-    run_ia_command()  # Run IA command first
     create_mediafire_directory()
 
-    # Read the URLs from the file
-    with open('ia_search_results.txt', 'r') as file:
-        urls = file.readlines()
+    # Create directory_output.txt
+    directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt')
+    with open(directory_output_path, 'w') as directory_output_file:
+        for filename in os.listdir(MEDIAFIRE_DIRECTORY):
+            if filename.endswith(".cdx.json"):
+                directory_output_file.write(f"{filename}\n")
 
-    # Remove leading/trailing whitespace from the URLs
-    urls = [url.strip() for url in urls]
+    run_ia_command()  # Run IA command after creating directory_output.txt
+
+    # Read the URLs from the file
+    with open('ia_search_results.txt', 'r') as ia_search_file:
+        ia_search_urls = ia_search_file.readlines()
+
+    # Extract filenames from URLs
+    ia_search_filenames = [extract_filename_from_url(url.strip()) for url in ia_search_urls]
+
+    # Read the filenames from directory_output.txt and remove the .json extension
+    with open(directory_output_path, 'r') as directory_output_file:
+        directory_filenames = [line.strip().replace(".cdx.json", "") for line in directory_output_file.readlines()]
+
+    # Filter URLs that don't match filenames
+    filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
+
+    # Write filtered URLs to urls_to_download.txt
+    urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt')
+    with open(urls_to_download_path, 'w') as urls_to_download_file:
+        urls_to_download_file.writelines(filtered_urls)
 
     # Process URLs in batches
-    for i in range(0, len(urls), BATCH_SIZE):
+    for i in range(0, len(filtered_urls), BATCH_SIZE):
         start_index = i
-        end_index = min(i + BATCH_SIZE, len(urls))
-        process_batch(urls, start_index, end_index)
+        end_index = min(i + BATCH_SIZE, len(filtered_urls))
+        process_batch(filtered_urls, start_index, end_index)
 
 if __name__ == "__main__":
     main()
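
A quick sanity sketch of the new filtering flow added in this patch. It is a minimal, self-contained example under assumed inputs: the example.org URLs and item names below are hypothetical placeholders, not real archive entries. The URLs keep their trailing newlines, mirroring how main() passes raw readlines() output into the filter.

    from urllib.parse import urlparse
    import os.path

    def extract_filename_from_url(url):
        # Use only the path component so query strings never leak into the name
        return os.path.basename(urlparse(url).path)

    def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
        # Keep a URL only when none of the already-summarized names occur in it
        return [url for url in urls_to_filter
                if not any(name in url for name in filenames_to_exclude)]

    # Hypothetical inputs: two candidate URLs, plus one name recovered from
    # directory_output.txt after stripping the ".cdx.json" suffix.
    urls = [
        "https://example.org/download/mediafire_0001.cdx.gz\n",
        "https://example.org/download/mediafire_0002.cdx.gz\n",
    ]
    done = ["mediafire_0001"]

    print(filter_urls_to_download(urls, done))
    # ['https://example.org/download/mediafire_0002.cdx.gz\n']

Note the substring test is deliberately loose: a recorded name such as mediafire_0001 would also exclude a hypothetical mediafire_00012 URL, so it depends on the items using distinct, fixed-width names.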