import os
import subprocess
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
CONCURRENCY = 4  # Number of files to download/process in parallel
BATCH_SIZE = 6   # Number of URLs to process per batch


def run_cdxsummary(file_path, json_filepath):
    # Construct the cdxsummary command
    cdxsummary_command = f"cdxsummary --json {file_path}"

    try:
        # Run the cdxsummary command and capture the output
        result = subprocess.run(cdxsummary_command, shell=True, capture_output=True, text=True, check=True)

        # Parse the JSON output
        json_output = json.loads(result.stdout)

        # Add a "cdxcount" entry with value 1
        json_output["cdxcount"] = 1

        # Drop the "pathquery", "samples", and "tophosts" keys if present
        for key in ("pathquery", "samples", "tophosts"):
            json_output.pop(key, None)

        # Write the JSON output to the destination file
        with open(json_filepath, "w") as json_file:
            json.dump(json_output, json_file, indent=2)

        print(f"Created JSON file for '{file_path}': '{json_filepath}'")

        # Delete the .cdx.gz file after processing
        os.remove(file_path)
        print(f"Deleted '{file_path}' after processing.")

    except subprocess.CalledProcessError as e:
        print(f"Error running cdxsummary command: {e}")


def download_file(url):
    # Each file is downloaded with a single connection; overall concurrency
    # comes from downloading several files at once in download_files().
    command = f"axel -n 1 {url}"
    subprocess.run(command, shell=True)

    # Return the downloaded file name
    return os.path.basename(url)


def download_files(urls):
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        # map() preserves the order of the input URLs
        downloaded_files = list(executor.map(download_file, urls))
    return downloaded_files


def process_batch(urls, start_index, end_index):
    # Extract the batch of URLs
    batch_urls = urls[start_index:end_index]

    print("\nDownloading batch...\n")

    # Download the files concurrently
    downloaded_files = download_files(batch_urls)

    # Run cdxsummary for each downloaded file and write the JSON into MEDIAFIRE_DIRECTORY
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        for file_name in downloaded_files:
            file_path = os.path.join(os.getcwd(), file_name)
            json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))

            # Run cdxsummary and delete the .cdx.gz file
            executor.submit(run_cdxsummary, file_path, json_filepath)


def run_ia_command():
    # Get the current date formatted as YYYY-MM-DD
    current_date = datetime.now().strftime("%Y-%m-%d")

    # Search the archiveteam_mediafire collection for items added today
    ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{current_date}]' --itemlist"

    # Output file for the ia search results
    output_file = "ia_search_results.txt"

    try:
        # Run the ia search command and write the item list to a text file
        with open(output_file, "w") as output:
            subprocess.run(ia_search_command, shell=True, stdout=output, check=True)

        # Turn each item identifier into a download URL for its .cdx.gz file
        with open(output_file, "r") as input_file:
            lines = input_file.readlines()
            lines = [f"https://archive.org/download/{line.strip()}/{line.strip()}.cdx.gz\n" for line in lines]

        # Write the modified lines back to the file
        with open(output_file, "w") as output:
            output.writelines(lines)

        print(f"IA search results written to '{output_file}' with URLs appended.")

    except subprocess.CalledProcessError as e:
        print(f"Error running IA search command: {e}")


def create_mediafire_directory():
    if not os.path.exists(MEDIAFIRE_DIRECTORY):
        os.makedirs(MEDIAFIRE_DIRECTORY)


def move_file(source_path):
    file_name = os.path.basename(source_path)
    destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))

    # Skip files that still have a ".cdx.gz" extension; those are handled by run_cdxsummary
    if file_name.endswith(".cdx.gz"):
        print(f"Skipping move for '{file_name}'.")
    else:
        os.rename(source_path, destination_path)
        print(f"Moved '{file_name}' to '{MEDIAFIRE_DIRECTORY}'.")


def main():
    run_ia_command()  # Run the IA search first
    create_mediafire_directory()

    # Read the URLs from the file
    with open("ia_search_results.txt", "r") as file:
        urls = file.readlines()

    # Remove leading/trailing whitespace from the URLs
    urls = [url.strip() for url in urls]

    # Process the URLs in batches
    for i in range(0, len(urls), BATCH_SIZE):
        start_index = i
        end_index = min(i + BATCH_SIZE, len(urls))
        process_batch(urls, start_index, end_index)


if __name__ == "__main__":
    main()