144 lines
5.1 KiB
Python
144 lines
5.1 KiB
Python
import os
|
|
import subprocess
|
|
import json
|
|
from datetime import datetime
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
# Directory created at startup and used when building the .cdx.json output paths.
MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
# Worker-thread count used for both the download pool and the cdxsummary pool.
CONCURRENCY = 4 # Set the desired concurrency for downloading multiple files
# Number of URLs handed to each download/summarise batch.
BATCH_SIZE = 6 # Set the batch size for processing
|
|
|
|
def run_cdxsummary(file_path, json_filepath):
    """Summarise a CDX file with ``cdxsummary`` and store the result as JSON.

    Runs ``cdxsummary --json <file_path>``, adds a ``cdxcount`` entry (always
    1) and a ``cdxsize`` entry (size of ``file_path`` in bytes), drops the
    ``pathquery``, ``samples`` and ``tophosts`` keys, and writes the result to
    ``json_filepath``, overwriting any existing file. ``file_path`` is deleted
    only after the JSON file has been written.

    Handled failures (tool missing or exiting non-zero, unparsable output,
    file-system errors while writing) are reported on stdout and leave
    ``file_path`` in place.

    Args:
        file_path: Path of the ``.cdx.gz`` file to summarise.
        json_filepath: Path of the JSON summary to write.
    """
    # An argument list (no shell) keeps paths that contain spaces or shell
    # metacharacters from being split or interpreted.
    cdxsummary_command = ["cdxsummary", "--json", file_path]

    try:
        result = subprocess.run(
            cdxsummary_command, capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the executable being missing or not runnable.
        print(f"Error running cdxsummary command: {e}")
        return

    try:
        json_output = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Error parsing cdxsummary output for '{file_path}': {e}")
        return

    try:
        json_output["cdxcount"] = 1
        json_output["cdxsize"] = os.path.getsize(file_path)

        # These sections are not kept in the stored summary.
        for unwanted_key in ("pathquery", "samples", "tophosts"):
            json_output.pop(unwanted_key, None)

        with open(json_filepath, "w") as json_file:
            json.dump(json_output, json_file, indent=2)
    except OSError as e:
        print(f"Error writing JSON file '{json_filepath}': {e}")
        return

    print(f"Created JSON file for '{file_path}': '{json_filepath}'")

    # The source file is only removed once its summary is safely on disk.
    os.remove(file_path)
    print(f"Deleted '{file_path}' after processing.")
|
|
|
|
def download_file(url):
    """Download ``url`` into the current working directory with ``axel``.

    A failure to start ``axel`` or a non-zero exit status is reported on
    stdout; it does not raise.

    Args:
        url: URL of the file to download.

    Returns:
        The last path component of ``url``. This is returned whether or not
        the download succeeded, so callers must be prepared for the file to
        be absent.
    """
    # One connection per file; the URL is passed as a single argument (no
    # shell) so characters such as spaces, '&' or ';' cannot alter the command.
    command = ["axel", "-n", "1", url]
    try:
        completed = subprocess.run(command)
    except OSError as e:
        # Raised when axel is missing or cannot be executed.
        print(f"Error running axel for '{url}': {e}")
    else:
        if completed.returncode != 0:
            print(f"axel exited with status {completed.returncode} for '{url}'")

    return os.path.basename(url)
|
|
|
|
def download_files(urls):
    """Download every URL in ``urls`` concurrently.

    Uses a thread pool of ``CONCURRENCY`` workers, each running
    ``download_file``.

    Args:
        urls: Iterable of URLs to fetch.

    Returns:
        A list with the value returned by ``download_file`` for each URL, in
        the same order as ``urls``.
    """
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
        # map() yields results in input order; unpacking drains it before
        # the pool is shut down.
        fetched = [*pool.map(download_file, urls)]
    return fetched
|
|
|
|
def process_batch(urls, start_index, end_index):
    """Download one slice of ``urls`` and summarise every downloaded file.

    The slice ``urls[start_index:end_index]`` is downloaded with
    ``download_files``; each resulting file is then passed to
    ``run_cdxsummary`` on a pool of ``CONCURRENCY`` threads. The JSON summary
    for ``<name>.cdx.gz`` is written to ``MEDIAFIRE_DIRECTORY/<name>.cdx.json``.

    Args:
        urls: Full list of URLs.
        start_index: Index of the first URL in the batch (inclusive).
        end_index: Index just past the last URL in the batch (exclusive).
    """
    batch_urls = urls[start_index:end_index]
    print("\nDownloading Batch...\n")

    downloaded_files = download_files(batch_urls)

    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        pending = {}
        for file_name in downloaded_files:
            file_path = os.path.join(os.getcwd(), file_name)
            # Join only the bare file name: os.path.join drops every earlier
            # component when a later one is absolute, which would silently
            # place the JSON next to the download instead of in
            # MEDIAFIRE_DIRECTORY.
            json_name = os.path.basename(file_name).replace(".cdx.gz", ".cdx.json")
            json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, json_name)

            future = executor.submit(run_cdxsummary, file_path, json_filepath)
            pending[future] = file_path

        # Collect results so an exception raised in a worker is reported
        # instead of being discarded with the future.
        for future, file_path in pending.items():
            try:
                future.result()
            except Exception as e:  # worker boundary: report and keep going
                print(f"Error processing '{file_path}': {e}")
|
|
|
|
def run_ia_command():
    """Build ``ia_search_results.txt`` with one CDX download URL per item.

    Runs ``ia search`` for items in the ``archiveteam_mediafire`` collection
    whose ``addeddate`` matches today's date, writing the item list to
    ``ia_search_results.txt`` in the current directory. Each non-blank item
    name is then rewritten in that same file as
    ``https://archive.org/download/<item>/<item>.cdx.gz``.

    If the ``ia`` command cannot be run or exits non-zero, the error is
    printed and the file is left as the command wrote it (no URL rewriting).
    """
    # NOTE(review): this is the local date; confirm whether the search field
    # expects a different timezone.
    current_date = datetime.now().strftime("%Y-%m-%d")

    # Argument list instead of a shell string: the query is passed to ia as a
    # single argument without relying on shell quoting.
    ia_search_command = [
        "ia",
        "search",
        f"collection:archiveteam_mediafire addeddate:[{current_date}]",
        "--itemlist",
    ]

    output_file = "ia_search_results.txt"

    try:
        with open(output_file, "w") as output:
            subprocess.run(ia_search_command, stdout=output, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the ia executable being missing or not runnable.
        print(f"Error running IA search command: {e}")
        return

    with open(output_file, "r") as input_file:
        item_names = [line.strip() for line in input_file]

    # Blank lines would otherwise turn into URLs with an empty item name.
    lines = [
        f"https://archive.org/download/{name}/{name}.cdx.gz\n"
        for name in item_names
        if name
    ]

    # A separate handle name keeps output_file bound to the file name, so the
    # message below reports the path rather than a file object.
    with open(output_file, "w") as url_file:
        url_file.writelines(lines)

    print(f"IA search results written to '{output_file}' with URLs appended.")
|
|
|
|
def create_mediafire_directory():
    """Ensure ``MEDIAFIRE_DIRECTORY`` exists, creating missing parents.

    Raises:
        OSError: If the directory cannot be created, including when the path
            already exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so there is no
    # window in which another process can create the directory in between.
    os.makedirs(MEDIAFIRE_DIRECTORY, exist_ok=True)
|
|
|
|
def move_file(source_path):
    """Rename ``source_path`` into ``MEDIAFIRE_DIRECTORY`` unless it is a ``.cdx.gz``.

    Files whose name ends with ``.cdx.gz`` are left where they are and a
    "skipping" message is printed. Any other file is renamed to
    ``MEDIAFIRE_DIRECTORY/<name>``, with any ``.cdx.gz`` occurrence in the
    name replaced by ``.cdx.json``.

    Args:
        source_path: Path of the file to consider.
    """
    # NOTE(review): the skip message mentions a "JSON file" although the
    # condition matches ".cdx.gz" names, and the move message says "root
    # folder" although the target is MEDIAFIRE_DIRECTORY -- confirm intent.
    base_name = os.path.split(source_path)[1]

    if base_name.endswith(".cdx.gz"):
        print(f"Skipping move for JSON file '{base_name}'.")
        return

    target_name = base_name.replace(".cdx.gz", ".cdx.json")
    target_path = os.path.join(MEDIAFIRE_DIRECTORY, target_name)
    os.rename(source_path, target_path)
    print(f"Moved '{base_name}' to the root folder.")
|
|
|
|
def main():
    """Run the whole pipeline: search, prepare the output directory, process.

    Calls ``run_ia_command`` to produce ``ia_search_results.txt``, ensures the
    output directory exists, then feeds the URLs from that file to
    ``process_batch`` in consecutive slices of ``BATCH_SIZE``.
    """
    run_ia_command()  # the URL list must exist before anything else runs
    create_mediafire_directory()

    # One URL per line; surrounding whitespace (including the newline) is dropped.
    with open('ia_search_results.txt', 'r') as url_file:
        urls = [raw_line.strip() for raw_line in url_file.readlines()]

    total = len(urls)
    for batch_start in range(0, total, BATCH_SIZE):
        # The final batch may be shorter than BATCH_SIZE.
        batch_end = min(batch_start + BATCH_SIZE, total)
        process_batch(urls, batch_start, batch_end)
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|