diff --git a/urls_automated_cdx_processor.py b/urls_automated_cdx_processor.py index b92a35c..e89a165 100644 --- a/urls_automated_cdx_processor.py +++ b/urls_automated_cdx_processor.py @@ -35,6 +35,19 @@ def run_cdxsummary(file_path, json_filepath): # Parse the JSON output json_output = json.loads(result.stdout) + # Add "cdxcount" entry with value 1 + json_output["cdxcount"] = 1 + + # Add "cdxsize" entry with the size of the cdx.gz file in bytes + cdx_size_bytes = os.path.getsize(file_path) + json_output["cdxsize"] = cdx_size_bytes + + # Remove "pathquery" and "samples" keys + if "pathquery" in json_output: + del json_output["pathquery"] + if "samples" in json_output: + del json_output["samples"] + # Process the tophosts section if "tophosts" in json_output: processed_tophosts = process_tophosts(json_output["tophosts"])