Updated to move the tophosts JSON file to /opt/cdxfiles/urls_tophosts

This commit is contained in:
datechnoman 2024-03-12 02:38:05 +00:00
parent eaa8278db2
commit 3fb177fb70

View File

@ -9,8 +9,8 @@ from urllib.parse import urlparse
# Define constants # Define constants
URLS_DIRECTORY = "/opt/cdxfiles/urls" URLS_DIRECTORY = "/opt/cdxfiles/urls"
ROOT_DIRECTORY = "/root/urls_files" ROOT_DIRECTORY = "/root/urls_files"
CONCURRENCY = 2 CONCURRENCY = 10
BATCH_SIZE = 2 BATCH_SIZE = 10
# Function to run cdxsummary command # Function to run cdxsummary command
def run_cdxsummary(file_path, json_filepath): def run_cdxsummary(file_path, json_filepath):
@ -45,12 +45,18 @@ def run_cdxsummary(file_path, json_filepath):
# Write the tophosts section to a separate JSON file # Write the tophosts section to a separate JSON file
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
tophosts_filepath = os.path.join(ROOT_DIRECTORY, os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")) tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
with open(tophosts_filepath, "w") as tophosts_file: with open(tophosts_filepath, "w") as tophosts_file:
json.dump(tophosts_output, tophosts_file, indent=2) json.dump(tophosts_output, tophosts_file, indent=2)
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'") print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
# Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts
os.rename(tophosts_filepath, tophosts_final_filepath)
print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.")
# Remove the tophosts section from the JSON output # Remove the tophosts section from the JSON output
if "tophosts" in json_output: if "tophosts" in json_output:
del json_output["tophosts"] del json_output["tophosts"]
@ -65,9 +71,9 @@ def run_cdxsummary(file_path, json_filepath):
if "first" in json_output and "last" in json_output: if "first" in json_output and "last" in json_output:
tophosts_output["first"] = json_output["first"] tophosts_output["first"] = json_output["first"]
tophosts_output["last"] = json_output["last"] tophosts_output["last"] = json_output["last"]
with open(tophosts_filepath, "w") as tophosts_file: with open(tophosts_final_filepath, "w") as tophosts_file:
json.dump(tophosts_output, tophosts_file, indent=2) json.dump(tophosts_output, tophosts_file, indent=2)
print(f"Copied 'first' and 'last' timestamps to '{tophosts_filepath}'") print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")
# Move the JSON file to /opt/cdxfiles/urls # Move the JSON file to /opt/cdxfiles/urls
destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath)) destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath))