Updated to change JSON output for tophosts file
This commit is contained in:
parent
c1c0be47cf
commit
9793f0e9db
@ -13,6 +13,16 @@ ROOT_DIRECTORY = "/root/urls_files"
|
||||
CONCURRENCY = 10
|
||||
BATCH_SIZE = 10
|
||||
|
||||
# Function to process the tophosts section and generate the desired format
|
||||
def process_tophosts(tophosts):
    """Convert a tophosts mapping into a list of per-domain records.

    Args:
        tophosts: A mapping whose keys are used as the "domain" value and
            whose values are used as the "hits" value. Only ``.items()``
            is called on it.

    Returns:
        A new list with one ``{"domain": ..., "hits": ...}`` dict per entry
        of ``tophosts``, in the mapping's iteration order. An empty mapping
        yields an empty list. Values are passed through unchanged.
    """
    return [
        {"domain": domain, "hits": hits}
        for domain, hits in tophosts.items()
    ]
|
||||
|
||||
# Function to run cdxsummary command
|
||||
def run_cdxsummary(file_path, json_filepath):
|
||||
# Construct the cdxsummary command
|
||||
@ -25,53 +35,37 @@ def run_cdxsummary(file_path, json_filepath):
|
||||
# Parse the JSON output
|
||||
json_output = json.loads(result.stdout)
|
||||
|
||||
# Add "cdxcount" entry with value 1
|
||||
json_output["cdxcount"] = 1
|
||||
|
||||
# Add "cdxsize" entry with the size of the cdx.gz file in bytes
|
||||
cdx_size_bytes = os.path.getsize(file_path)
|
||||
json_output["cdxsize"] = cdx_size_bytes
|
||||
|
||||
# Remove "pathquery" and "samples" keys
|
||||
if "pathquery" in json_output:
|
||||
del json_output["pathquery"]
|
||||
if "samples" in json_output:
|
||||
del json_output["samples"]
|
||||
|
||||
# Write the JSON output to a file if it doesn't exist
|
||||
with open(json_filepath, "w") as json_file:
|
||||
json.dump(json_output, json_file, indent=2)
|
||||
|
||||
print(f"Created JSON file for '{file_path}': '{json_filepath}'")
|
||||
|
||||
# Write the tophosts section to a separate JSON file
|
||||
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
|
||||
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
|
||||
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
|
||||
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
|
||||
with open(tophosts_filepath, "w") as tophosts_file:
|
||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||
|
||||
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
||||
|
||||
# Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts
|
||||
os.rename(tophosts_filepath, tophosts_final_filepath)
|
||||
print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.")
|
||||
|
||||
# Remove the tophosts section from the JSON output
|
||||
# Process the tophosts section
|
||||
if "tophosts" in json_output:
|
||||
processed_tophosts = process_tophosts(json_output["tophosts"])
|
||||
|
||||
# Write the processed tophosts to a separate JSON file
|
||||
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
|
||||
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
|
||||
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
|
||||
with open(tophosts_final_filepath, "w") as tophosts_file:
|
||||
json.dump(processed_tophosts, tophosts_file, indent=2)
|
||||
|
||||
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
||||
|
||||
# Remove the tophosts section from the JSON output
|
||||
del json_output["tophosts"]
|
||||
|
||||
# Add additional fields or modify the JSON output as needed
|
||||
|
||||
# Write the modified JSON output to the original file
|
||||
with open(json_filepath, "w") as json_file:
|
||||
json.dump(json_output, json_file, indent=2)
|
||||
|
||||
print(f"Removed 'tophosts' section from '{json_filepath}'")
|
||||
print(f"Modified JSON file written to '{json_filepath}'")
|
||||
|
||||
# Copy "first" and "last" timestamps to the tophosts JSON file
|
||||
# Copy "first" and "last" timestamps to the tophosts JSON file if available
|
||||
if "first" in json_output and "last" in json_output:
|
||||
tophosts_output["first"] = json_output["first"]
|
||||
tophosts_output["last"] = json_output["last"]
|
||||
tophosts_output = {
|
||||
"first": json_output["first"],
|
||||
"last": json_output["last"],
|
||||
"tophosts": processed_tophosts # Include processed tophosts
|
||||
}
|
||||
with open(tophosts_final_filepath, "w") as tophosts_file:
|
||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||
print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")
|
||||
|
Loading…
Reference in New Issue
Block a user