Update mediafire_automated_cdx_processor.py

This commit is contained in:
datechnoman 2024-01-08 10:44:33 +00:00
parent d56d7bb81f
commit 2bf2e02275

View File

@@ -6,8 +6,9 @@ from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
CONCURRENCY = 6
BATCH_SIZE = 6
ROOT_DIRECTORY = "/root/mediafire_files"
CONCURRENCY = 10
BATCH_SIZE = 10
def run_cdxsummary(file_path, json_filepath):
# Construct the cdxsummary command
@@ -75,7 +76,7 @@ def process_batch(urls, start_index, end_index):
for file_path in downloaded_files:
# Construct file paths
file_path = os.path.join(os.getcwd(), file_path)
json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
# Run cdxsummary and delete .cdx.gz file
executor.submit(run_cdxsummary, file_path, json_filepath)
@@ -91,7 +92,7 @@ def run_ia_command():
ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist"
# Output file for ia search results
output_file = "ia_search_results.txt"
output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")
try:
# Run the ia search command and write the output to a text file
@@ -118,7 +119,7 @@ def create_mediafire_directory():
def move_file(source_path):
file_name = os.path.basename(source_path)
destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
# Check if the file has a ".cdx.gz" extension before moving
if file_name.endswith(".cdx.gz"):
@@ -141,7 +142,7 @@ def main():
create_mediafire_directory()
# Create directory_output.txt
directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt')
directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
with open(directory_output_path, 'w') as directory_output_file:
for filename in os.listdir(MEDIAFIRE_DIRECTORY):
if filename.endswith(".cdx.json"):
@@ -150,7 +151,7 @@ def main():
run_ia_command() # Run IA command after creating directory_output.txt
# Read the URLs from the file
with open('ia_search_results.txt', 'r') as ia_search_file:
with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
ia_search_urls = ia_search_file.readlines()
# Extract filenames from URLs
@@ -164,7 +165,7 @@ def main():
filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
# Write filtered URLs to urls_to_download.txt
urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt')
urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
with open(urls_to_download_path, 'w') as urls_to_download_file:
urls_to_download_file.writelines(filtered_urls)