Updated to move the tophosts JSON file
This commit is contained in:
parent
eaa8278db2
commit
3fb177fb70
@ -9,8 +9,8 @@ from urllib.parse import urlparse
|
|||||||
# Define constants
|
# Define constants
|
||||||
URLS_DIRECTORY = "/opt/cdxfiles/urls"
|
URLS_DIRECTORY = "/opt/cdxfiles/urls"
|
||||||
ROOT_DIRECTORY = "/root/urls_files"
|
ROOT_DIRECTORY = "/root/urls_files"
|
||||||
CONCURRENCY = 2
|
CONCURRENCY = 10
|
||||||
BATCH_SIZE = 2
|
BATCH_SIZE = 10
|
||||||
|
|
||||||
# Function to run cdxsummary command
|
# Function to run cdxsummary command
|
||||||
def run_cdxsummary(file_path, json_filepath):
|
def run_cdxsummary(file_path, json_filepath):
|
||||||
@ -45,12 +45,18 @@ def run_cdxsummary(file_path, json_filepath):
|
|||||||
|
|
||||||
# Write the tophosts section to a separate JSON file
|
# Write the tophosts section to a separate JSON file
|
||||||
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
|
tophosts_output = {"tophosts": json_output.get("tophosts", {})} # Extract tophosts section
|
||||||
tophosts_filepath = os.path.join(ROOT_DIRECTORY, os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json"))
|
tophosts_filename = os.path.basename(json_filepath).replace(".cdx.json", "_tophosts.json")
|
||||||
|
tophosts_filepath = os.path.join(ROOT_DIRECTORY, tophosts_filename)
|
||||||
|
tophosts_final_filepath = os.path.join("/opt/cdxfiles/urls_tophosts", tophosts_filename)
|
||||||
with open(tophosts_filepath, "w") as tophosts_file:
|
with open(tophosts_filepath, "w") as tophosts_file:
|
||||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||||
|
|
||||||
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
print(f"Created separate JSON file for tophosts: '{tophosts_filepath}'")
|
||||||
|
|
||||||
|
# Move the top hosts JSON file to /opt/cdxfiles/urls_tophosts
|
||||||
|
os.rename(tophosts_filepath, tophosts_final_filepath)
|
||||||
|
print(f"Moved '{tophosts_filename}' to '/opt/cdxfiles/urls_tophosts'.")
|
||||||
|
|
||||||
# Remove the tophosts section from the JSON output
|
# Remove the tophosts section from the JSON output
|
||||||
if "tophosts" in json_output:
|
if "tophosts" in json_output:
|
||||||
del json_output["tophosts"]
|
del json_output["tophosts"]
|
||||||
@ -65,9 +71,9 @@ def run_cdxsummary(file_path, json_filepath):
|
|||||||
if "first" in json_output and "last" in json_output:
|
if "first" in json_output and "last" in json_output:
|
||||||
tophosts_output["first"] = json_output["first"]
|
tophosts_output["first"] = json_output["first"]
|
||||||
tophosts_output["last"] = json_output["last"]
|
tophosts_output["last"] = json_output["last"]
|
||||||
with open(tophosts_filepath, "w") as tophosts_file:
|
with open(tophosts_final_filepath, "w") as tophosts_file:
|
||||||
json.dump(tophosts_output, tophosts_file, indent=2)
|
json.dump(tophosts_output, tophosts_file, indent=2)
|
||||||
print(f"Copied 'first' and 'last' timestamps to '{tophosts_filepath}'")
|
print(f"Copied 'first' and 'last' timestamps to '{tophosts_final_filepath}'")
|
||||||
|
|
||||||
# Move the JSON file to /opt/cdxfiles/urls
|
# Move the JSON file to /opt/cdxfiles/urls
|
||||||
destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath))
|
destination_path = os.path.join(URLS_DIRECTORY, os.path.basename(json_filepath))
|
||||||
|
Loading…
Reference in New Issue
Block a user