"""Fetch and summarize MediaFire CDX archives from the Internet Archive.

Workflow:
  1. ``ia search`` lists today's items in the ``archiveteam_mediafire``
     collection and their names are turned into ``.cdx.gz`` download URLs.
  2. URLs are downloaded in batches (``axel``, ``CONCURRENCY`` parallel
     downloads per batch).
  3. Each downloaded file is summarized with ``cdxsummary --json``; the
     trimmed JSON summary is written under ``MEDIAFIRE_DIRECTORY`` and the
     source ``.cdx.gz`` is deleted.
"""

import os
import subprocess
from concurrent.futures import ThreadPoolExecutor
import json
from datetime import datetime

MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
CONCURRENCY = 4  # Number of files downloaded in parallel within a batch.
BATCH_SIZE = 6   # Number of URLs processed per batch.


def run_cdxsummary(file_path, json_filepath):
    """Summarize one ``.cdx.gz`` file and persist the result.

    Runs ``cdxsummary --json`` on *file_path*, strips the bulky
    ``pathquery``/``samples`` keys, writes the remaining JSON to
    *json_filepath*, then deletes *file_path*. Errors from the external
    command are reported but not re-raised (best-effort batch processing).
    """
    try:
        # List-form argv: no shell, so the path cannot be shell-interpolated.
        result = subprocess.run(
            ["cdxsummary", "--json", file_path],
            capture_output=True,
            text=True,
            check=True,
        )
        json_output = json.loads(result.stdout)

        # Drop keys we do not want in the stored summary (tolerate absence).
        json_output.pop("pathquery", None)
        json_output.pop("samples", None)

        with open(json_filepath, "w") as json_file:
            json.dump(json_output, json_file, indent=2)
        print(f"Created JSON file for '{file_path}': '{json_filepath}'")

        # The summary replaces the raw capture index; reclaim the space.
        os.remove(file_path)
        print(f"Deleted '{file_path}' after processing.")

    except subprocess.CalledProcessError as e:
        print(f"Error running cdxsummary command: {e}")


def download_file(url):
    """Download *url* into the CWD with axel (one connection) and return
    the local file name (basename of the URL)."""
    # URLs originate from an external search result — never pass them
    # through a shell; use list-form argv instead.
    subprocess.run(["axel", "-n", "1", url])
    return os.path.basename(url)


def download_files(urls):
    """Download *urls* concurrently; return the local file names in order."""
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        return list(executor.map(download_file, urls))


def process_batch(urls, start_index, end_index):
    """Download ``urls[start_index:end_index]`` and summarize each file."""
    batch_urls = urls[start_index:end_index]
    print("\nDownloading Batch...\n")

    downloaded_files = download_files(batch_urls)

    for file_name in downloaded_files:
        file_path = os.path.join(os.getcwd(), file_name)
        # BUG FIX: join with the bare file name. The original joined the
        # *absolute* download path, and os.path.join discards every earlier
        # component when a later one is absolute — so summaries were written
        # to the CWD instead of MEDIAFIRE_DIRECTORY.
        json_filepath = os.path.join(
            MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json")
        )
        run_cdxsummary(file_path, json_filepath)


def run_ia_command():
    """Write today's archiveteam_mediafire ``.cdx.gz`` URLs to a text file.

    Runs ``ia search`` for items added today, then rewrites the item list
    in ``ia_search_results.txt`` as full archive.org download URLs.
    """
    current_date = datetime.now().strftime("%Y-%m-%d")
    output_path = "ia_search_results.txt"

    try:
        # Stream the item list straight into the output file.
        with open(output_path, "w") as output:
            subprocess.run(
                [
                    "ia",
                    "search",
                    f"collection:archiveteam_mediafire addeddate:[{current_date}]",
                    "--itemlist",
                ],
                stdout=output,
                check=True,
            )

        # Each item name becomes <download-root>/<item>/<item>.cdx.gz.
        with open(output_path, "r") as input_file:
            items = [line.strip() for line in input_file]
        urls = [
            f"https://archive.org/download/{item}/{item}.cdx.gz\n"
            for item in items
        ]

        with open(output_path, "w") as output:
            output.writelines(urls)

        # BUG FIX: the original shadowed the filename with the open file
        # handle ("as output_file"), so this message printed a closed
        # file-object repr instead of the path.
        print(f"IA search results written to '{output_path}' with URLs appended.")

    except subprocess.CalledProcessError as e:
        print(f"Error running IA search command: {e}")


def create_mediafire_directory():
    """Ensure MEDIAFIRE_DIRECTORY exists (no-op if already present)."""
    os.makedirs(MEDIAFIRE_DIRECTORY, exist_ok=True)


def move_file(source_path):
    """Move a downloaded ``.cdx.gz`` file into MEDIAFIRE_DIRECTORY.

    NOTE(review): this helper is not called anywhere in this script; kept
    for backward compatibility. The original branch was inverted relative
    to its own comment and messages (it skipped ``.cdx.gz`` files while
    printing "Skipping move for JSON file") — fixed to match the stated
    intent: move ``.cdx.gz`` files, skip everything else.
    """
    file_name = os.path.basename(source_path)
    destination_path = os.path.join(
        MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json")
    )

    if file_name.endswith(".cdx.gz"):
        os.rename(source_path, destination_path)
        print(f"Moved '{file_name}' to the root folder.")
    else:
        print(f"Skipping move for JSON file '{file_name}'.")


def main():
    """Drive the full pipeline: search, then process URLs batch by batch."""
    run_ia_command()  # Must run first: it produces ia_search_results.txt.
    create_mediafire_directory()

    with open("ia_search_results.txt", "r") as file:
        urls = [url.strip() for url in file.readlines()]

    # Walk the URL list in fixed-size windows.
    for i in range(0, len(urls), BATCH_SIZE):
        process_batch(urls, i, min(i + BATCH_SIZE, len(urls)))


if __name__ == "__main__":
    main()