diff --git a/urls_automated_cdx_processor.py b/urls_automated_cdx_processor.py index 8cb88aa..6cfd4c9 100644 --- a/urls_automated_cdx_processor.py +++ b/urls_automated_cdx_processor.py @@ -9,8 +9,8 @@ from urllib.parse import urlparse # Define constants URLS_DIRECTORY = "/opt/cdxfiles/urls" ROOT_DIRECTORY = "/root/urls_files" -CONCURRENCY = 2 -BATCH_SIZE = 2 +CONCURRENCY = 10 +BATCH_SIZE = 10 # Function to run cdxsummary command def run_cdxsummary(file_path, json_filepath): @@ -45,12 +45,18 @@ def run_cdxsummary(file_path, json_filepath): # Write the tophosts section to a separate JSON file tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section - tophosts_filepath = os.path.join(ROOT_DIRECTORY, os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")) + tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json") + tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename) + tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename) with open(tophosts_filepath, "w") as tophosts_file: json.dump(tophosts_output, tophosts_file, indent=2) print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'") + # Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts + os.rename(tophosts_filepath, tophosts_final_filepath) + print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.") + # Remove the tophosts section from the JSON output if "tophosts" in json_output: del json_output["tophosts"] @@ -65,9 +71,9 @@ def run_cdxsummary(file_path, json_filepath): if "first" in json_output and "last" in json_output: tophosts_output["first"] = json_output["first"] tophosts_output["last"] = json_output["last"] - with open(tophosts_filepath, "w") as tophosts_file: + with open(tophosts_final_filepath, "w") as tophosts_file: json.dump(tophosts_output, tophosts_file, indent=2) - print(f"Copied 'first' and 'last' timestamps to '{tophosts_filepath}'") + print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'") # Move the JSON file to /opt/cdxfiles/urls destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath))