Updated to change JSON output for tophosts file
This commit is contained in:
parent
c1c0be47cf
commit
9793f0e9db
@ -13,6 +13,16 @@ ROOT_DIRECTORY = "/root/urls_files"
|
|||||||
CONCURRENCY = 10
|
CONCURRENCY = 10
|
||||||
BATCH_SIZE = 10
|
BATCH_SIZE = 10
|
||||||
|
|
||||||
|
# Function to process the tophosts section and generate the desired format
|
||||||
|
def process_tophosts(tophosts):
    """Convert a ``tophosts`` mapping into a list of per-domain records.

    Args:
        tophosts: Mapping of domain name to its hits value, as passed from
            the "tophosts" key of the parsed JSON output. ``None`` or an
            empty mapping yields an empty list.

    Returns:
        A list of ``{"domain": <key>, "hits": <value>}`` dicts, one per
        entry, in the mapping's iteration order.
    """
    # A JSON ``null`` parses to None; treat it as "no hosts" rather than
    # failing with AttributeError on ``.items()``.
    if not tophosts:
        return []
    return [
        {"domain": domain, "hits": hits}
        for domain, hits in tophosts.items()
    ]
|
||||||
|
|
||||||
# Function to run cdxsummary command
|
# Function to run cdxsummary command
|
||||||
def run_cdxsummary(file_path, json_filepath):
|
def run_cdxsummary(file_path, json_filepath):
|
||||||
# Construct the cdxsummary command
|
# Construct the cdxsummary command
|
||||||
@ -25,53 +35,37 @@ def run_cdxsummary(file_path, json_filepath):
|
|||||||
# Parse the JSON output
|
# Parse the JSON output
|
||||||
json_output = json.loads(result.stdout)
|
json_output = json.loads(result.stdout)
|
||||||
|
|
||||||
# Add "cdxcount" entry with value 1
|
# Process the tophosts section
|
||||||
json_output["cdxcount"] = 1
|
|
||||||
|
|
||||||
# Add "cdxsize" entry with the size of the cdx.gz file in bytes
|
|
||||||
cdx_size_bytes = os.path.getsize(file_path)
|
|
||||||
json_output["cdxsize"] = cdx_size_bytes
|
|
||||||
|
|
||||||
# Remove "pathquery" and "samples" keys
|
|
||||||
if "pathquery" in json_output:
|
|
||||||
del json_output["pathquery"]
|
|
||||||
if "samples" in json_output:
|
|
||||||
del json_output["samples"]
|
|
||||||
|
|
||||||
# Write the JSON output to a file if it doesn't exist
|
|
||||||
with open(json_filepath, "w") as json_file:
|
|
||||||
json.dump(json_output, json_file, indent=2)
|
|
||||||
|
|
||||||
print(f"Created JSON file for '{file_path}': '{json_filepath}'")
|
|
||||||
|
|
||||||
# Write the tophosts section to a separate JSON file
|
|
||||||
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
|
|
||||||
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
|
|
||||||
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
|
|
||||||
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
|
|
||||||
with open(tophosts_filepath, "w") as tophosts_file:
|
|
||||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
|
||||||
|
|
||||||
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
|
||||||
|
|
||||||
# Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts
|
|
||||||
os.rename(tophosts_filepath, tophosts_final_filepath)
|
|
||||||
print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.")
|
|
||||||
|
|
||||||
# Remove the tophosts section from the JSON output
|
|
||||||
if "tophosts" in json_output:
|
if "tophosts" in json_output:
|
||||||
|
processed_tophosts = process_tophosts(json_output["tophosts"])
|
||||||
|
|
||||||
|
# Write the processed tophosts to a separate JSON file
|
||||||
|
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
|
||||||
|
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
|
||||||
|
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
|
||||||
|
with open(tophosts_final_filepath, "w") as tophosts_file:
|
||||||
|
json.dump(processed_tophosts, tophosts_file, indent=2)
|
||||||
|
|
||||||
|
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
||||||
|
|
||||||
|
# Remove the tophosts section from the JSON output
|
||||||
del json_output["tophosts"]
|
del json_output["tophosts"]
|
||||||
|
|
||||||
|
# Add additional fields or modify the JSON output as needed
|
||||||
|
|
||||||
# Write the modified JSON output to the original file
|
# Write the modified JSON output to the original file
|
||||||
with open(json_filepath, "w") as json_file:
|
with open(json_filepath, "w") as json_file:
|
||||||
json.dump(json_output, json_file, indent=2)
|
json.dump(json_output, json_file, indent=2)
|
||||||
|
|
||||||
print(f"Removed 'tophosts' section from '{json_filepath}'")
|
print(f"Modified JSON file written to '{json_filepath}'")
|
||||||
|
|
||||||
# Copy "first" and "last" timestamps to the tophosts JSON file
|
# Copy "first" and "last" timestamps to the tophosts JSON file if available
|
||||||
if "first" in json_output and "last" in json_output:
|
if "first" in json_output and "last" in json_output:
|
||||||
tophosts_output["first"] = json_output["first"]
|
tophosts_output = {
|
||||||
tophosts_output["last"] = json_output["last"]
|
"first": json_output["first"],
|
||||||
|
"last": json_output["last"],
|
||||||
|
"tophosts": processed_tophosts # Include processed tophosts
|
||||||
|
}
|
||||||
with open(tophosts_final_filepath, "w") as tophosts_file:
|
with open(tophosts_final_filepath, "w") as tophosts_file:
|
||||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||||
print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")
|
print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")
|
||||||
|
Loading…
Reference in New Issue
Block a user