Updated to delete older tophost json files

This commit is contained in:
datechnoman 2024-03-12 04:08:42 +00:00
parent 3fb177fb70
commit 7b4651b07e

View File

@ -8,6 +8,7 @@ from urllib.parse import urlparse
# Define constants
URLS_DIRECTORY = "/opt/cdxfiles/urls"
URLS_FILES_DIRECTORY = "/opt/cdxfiles/urls_files"
ROOT_DIRECTORY = "/root/urls_files"
CONCURRENCY = 10
BATCH_SIZE = 10
@ -208,13 +209,16 @@ def main():
if filename.endswith(".cdx.json"):
directory_output_file.write(f"{filename}\n")
# Process older files in /opt/cdxfiles/urls
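# Process older files in /opt/cdxfiles/urls (URLS_DIRECTORY) and URLS_FILES_DIRECTORY.
# NOTE(review): ROOT_DIRECTORY is not in directories_to_check below - add it there if it should be cleaned up too.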
older_than_48_hours = datetime.now() - timedelta(days=2)
for filename in os.listdir(URLS_DIRECTORY):
file_path = os.path.join(URLS_DIRECTORY, filename)
directories_to_check = [URLS_DIRECTORY, URLS_FILES_DIRECTORY]
for directory in directories_to_check:
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_48_hours.timestamp():
os.remove(file_path)
print(f"Deleted '{filename}' as it is older than 48 hours.")
print(f"Deleted '{filename}' in '{directory}' as it is older than 48 hours.")
run_ia_command() # Run IA command after processing older files