From 9793f0e9db41880c2d6648fab6bec5b4323f7a83 Mon Sep 17 00:00:00 2001 From: datechnoman Date: Wed, 13 Mar 2024 04:29:55 +0000 Subject: [PATCH] Updated to change JSON output for tophosts file --- urls_automated_cdx_processor.py | 70 +++++++++++++++------------------ 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/urls_automated_cdx_processor.py b/urls_automated_cdx_processor.py index 40f9ba1..b92a35c 100644 --- a/urls_automated_cdx_processor.py +++ b/urls_automated_cdx_processor.py @@ -13,6 +13,16 @@ ROOT_DIRECTORY = "/root/urls_files" CONCURRENCY = 10 BATCH_SIZE = 10 +# Function to process the tophosts section and generate the desired format +def process_tophosts(tophosts): + processed_tophosts = [] + for domain, hits in tophosts.items(): + processed_tophosts.append({ + "domain": domain, + "hits": hits + }) + return processed_tophosts + # Function to run cdxsummary command def run_cdxsummary(file_path, json_filepath): # Construct the cdxsummary command @@ -25,53 +35,37 @@ def run_cdxsummary(file_path, json_filepath): # Parse the JSON output json_output = json.loads(result.stdout) - # Add "cdxcount" entry with value 1 - json_output["cdxcount"] = 1 - - # Add "cdxsize" entry with the size of the cdx.gz file in bytes - cdx_size_bytes = os.path.getsize(file_path) - json_output["cdxsize"] = cdx_size_bytes - - # Remove "pathquery" and "samples" keys - if "pathquery" in json_output: - del json_output["pathquery"] - if "samples" in json_output: - del json_output["samples"] - - # Write the JSON output to a file if it doesn't exist - with open(json_filepath, "w") as json_file: - json.dump(json_output, json_file, indent=2) - - print(f"Created JSON file for '{file_path}': '{json_filepath}'") - - # Write the tophosts section to a separate JSON file - tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section - tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json") - tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename) - tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename) - with open(tophosts_filepath, "w") as tophosts_file: - json.dump(tophosts_output, tophosts_file, indent=2) - - print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'") - - # Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts - os.rename(tophosts_filepath, tophosts_final_filepath) - print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.") - - # Remove the tophosts section from the JSON output + # Process the tophosts section if "tophosts" in json_output: + processed_tophosts = process_tophosts(json_output["tophosts"]) + + # Write the processed tophosts to a separate JSON file + tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json") + tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename) + tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename) + with open(tophosts_final_filepath, "w") as tophosts_file: + json.dump(processed_tophosts, tophosts_file, indent=2) + + print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'") + + # Remove the tophosts section from the JSON output del json_output["tophosts"] + # Add additional fields or modify the JSON output as needed + # Write the modified JSON output to the original file with open(json_filepath, "w") as json_file: json.dump(json_output, json_file, indent=2) - print(f"Removed 'tophosts' section from '{json_filepath}'") + print(f"Modified JSON file written to '{json_filepath}'") - # Copy "first" and "last" timestamps to the tophosts JSON file + # Copy "first" and "last" timestamps to the tophosts JSON file if available if "first" in json_output and "last" in json_output: - tophosts_output["first"] = json_output["first"] - tophosts_output["last"] = json_output["last"] + tophosts_output = { + "first": json_output["first"], + "last": json_output["last"], + "tophosts": processed_tophosts # Include processed tophosts + } with open(tophosts_final_filepath, "w") as tophosts_file: json.dump(tophosts_output, tophosts_file, indent=2) print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")