diff --git a/mediafire_automated_cdx_processor.py b/mediafire_automated_cdx_processor.py index cc27b4c..b31065f 100644 --- a/mediafire_automated_cdx_processor.py +++ b/mediafire_automated_cdx_processor.py @@ -6,8 +6,9 @@ from concurrent.futures import ThreadPoolExecutor from urllib.parse import urlparse MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire" -CONCURRENCY = 6 -BATCH_SIZE = 6 +ROOT_DIRECTORY = "/root/mediafire_files" +CONCURRENCY = 10 +BATCH_SIZE = 10 def run_cdxsummary(file_path, json_filepath): # Construct the cdxsummary command @@ -75,7 +76,7 @@ def process_batch(urls, start_index, end_index): for file_path in downloaded_files: # Construct file paths file_path = os.path.join(os.getcwd(), file_path) - json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) + json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) # Run cdxsummary and delete .cdx.gz file executor.submit(run_cdxsummary, file_path, json_filepath) @@ -91,7 +92,7 @@ def run_ia_command(): ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist" # Output file for ia search results - output_file = "ia_search_results.txt" + output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt") try: # Run the ia search command and write the output to a text file @@ -118,7 +119,7 @@ def create_mediafire_directory(): def move_file(source_path): file_name = os.path.basename(source_path) - destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json")) + destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json")) # Check if the file has a ".cdx.gz" extension before moving if file_name.endswith(".cdx.gz"): @@ -141,7 +142,7 @@ def main(): create_mediafire_directory() # Create directory_output.txt - directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt') + directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt') with open(directory_output_path, 'w') as directory_output_file: for filename in os.listdir(MEDIAFIRE_DIRECTORY): if filename.endswith(".cdx.json"): @@ -150,7 +151,7 @@ def main(): run_ia_command() # Run IA command after creating directory_output.txt # Read the URLs from the file - with open('ia_search_results.txt', 'r') as ia_search_file: + with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file: ia_search_urls = ia_search_file.readlines() # Extract filenames from URLs @@ -164,7 +165,7 @@ def main(): filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames) # Write filtered URLs to urls_to_download.txt - urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt') + urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt') with open(urls_to_download_path, 'w') as urls_to_download_file: urls_to_download_file.writelines(filtered_urls)