Updated to change JSON output for tophosts file

This commit is contained in:
datechnoman 2024-03-13 04:29:55 +00:00
parent c1c0be47cf
commit 9793f0e9db

View File

@@ -13,6 +13,16 @@ ROOT_DIRECTORY = "/root/urls_files"
CONCURRENCY = 10
BATCH_SIZE = 10
# Function to process the tophosts section and generate the desired format
def process_tophosts(tophosts):
    """Convert a mapping of hosts to hit counts into a list of records.

    Args:
        tophosts: Mapping whose keys become the "domain" value and whose
            values become the "hits" value of each record.

    Returns:
        A list of ``{"domain": key, "hits": value}`` dicts, one per entry,
        in the mapping's iteration order. An empty mapping yields ``[]``.
    """
    return [
        {"domain": domain, "hits": hits}
        for domain, hits in tophosts.items()
    ]
# Function to run cdxsummary command
def run_cdxsummary(file_path, json_filepath):
# Construct the cdxsummary command
@@ -25,53 +35,37 @@ def run_cdxsummary(file_path, json_filepath):
# Parse the JSON output
json_output = json.loads(result.stdout)
# Add "cdxcount" entry with value 1
json_output["cdxcount"] = 1
# Process the tophosts section
if "tophosts" in json_output:
processed_tophosts = process_tophosts(json_output["tophosts"])
# Add "cdxsize" entry with the size of the cdx.gz file in bytes
cdx_size_bytes = os.path.getsize(file_path)
json_output["cdxsize"] = cdx_size_bytes
# Remove "pathquery" and "samples" keys
if "pathquery" in json_output:
del json_output["pathquery"]
if "samples" in json_output:
del json_output["samples"]
# Write the JSON output to a file if it doesn't exist
with open(json_filepath, "w") as json_file:
json.dump(json_output, json_file, indent=2)
print(f"Created JSON file for '{file_path}': '{json_filepath}'")
# Write the tophosts section to a separate JSON file
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
# Write the processed tophosts to a separate JSON file
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
with open(tophosts_filepath, "w") as tophosts_file:
json.dump(tophosts_output, tophosts_file, indent=2)
with open(tophosts_final_filepath, "w") as tophosts_file:
json.dump(processed_tophosts, tophosts_file, indent=2)
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
# Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts
os.rename(tophosts_filepath, tophosts_final_filepath)
print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.")
# Remove the tophosts section from the JSON output
if "tophosts" in json_output:
del json_output["tophosts"]
# Add additional fields or modify the JSON output as needed
# Write the modified JSON output to the original file
with open(json_filepath, "w") as json_file:
json.dump(json_output, json_file, indent=2)
print(f"Removed 'tophosts' section from '{json_filepath}'")
print(f"Modified JSON file written to '{json_filepath}'")
# Copy "first" and "last" timestamps to the tophosts JSON file
# Copy "first" and "last" timestamps to the tophosts JSON file if available
if "first" in json_output and "last" in json_output:
tophosts_output["first"] = json_output["first"]
tophosts_output["last"] = json_output["last"]
tophosts_output = {
"first": json_output["first"],
"last": json_output["last"],
"tophosts": processed_tophosts # Include processed tophosts
}
with open(tophosts_final_filepath, "w") as tophosts_file:
json.dump(tophosts_output, tophosts_file, indent=2)
print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")