Update to move the tophosts JSON file
This commit is contained in:
parent
eaa8278db2
commit
3fb177fb70
@ -9,8 +9,8 @@ from urllib.parse import urlparse
|
||||
# Define constants
|
||||
URLS_DIRECTORY = "/opt/cdxfiles/urls"
|
||||
ROOT_DIRECTORY = "/root/urls_files"
|
||||
CONCURRENCY = 2
|
||||
BATCH_SIZE = 2
|
||||
CONCURRENCY = 10
|
||||
BATCH_SIZE = 10
|
||||
|
||||
# Function to run cdxsummary command
|
||||
def run_cdxsummary(file_path, json_filepath):
|
||||
@ -45,12 +45,18 @@ def run_cdxsummary(file_path, json_filepath):
|
||||
|
||||
# Write the tophosts section to a separate JSON file
|
||||
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
|
||||
tophosts_filepath = os.path.join(ROOT_DIRECTORY, os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json"))
|
||||
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
|
||||
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
|
||||
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
|
||||
with open(tophosts_filepath, "w") as tophosts_file:
|
||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||
|
||||
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
||||
|
||||
# Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts
|
||||
os.rename(tophosts_filepath, tophosts_final_filepath)
|
||||
print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.")
|
||||
|
||||
# Remove the tophosts section from the JSON output
|
||||
if "tophosts" in json_output:
|
||||
del json_output["tophosts"]
|
||||
@ -65,9 +71,9 @@ def run_cdxsummary(file_path, json_filepath):
|
||||
if "first" in json_output and "last" in json_output:
|
||||
tophosts_output["first"] = json_output["first"]
|
||||
tophosts_output["last"] = json_output["last"]
|
||||
with open(tophosts_filepath, "w") as tophosts_file:
|
||||
with open(tophosts_final_filepath, "w") as tophosts_file:
|
||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||
print(f"Copied 'first' and 'last' timestamps to '{tophosts_filepath}'")
|
||||
print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")
|
||||
|
||||
# Move the JSON file to /opt/cdxfiles/urls
|
||||
destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath))
|
||||
|
Loading…
Reference in New Issue
Block a user