From 7b4651b07e8b652f8b346b9b3bdf648abe4ccee2 Mon Sep 17 00:00:00 2001 From: datechnoman Date: Tue, 12 Mar 2024 04:08:42 +0000 Subject: [PATCH] Updated to delete older tophost json files --- urls_automated_cdx_processor.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/urls_automated_cdx_processor.py b/urls_automated_cdx_processor.py index 6cfd4c9..008da3e 100644 --- a/urls_automated_cdx_processor.py +++ b/urls_automated_cdx_processor.py @@ -8,6 +8,7 @@ from urllib.parse import urlparse # Define constants URLS_DIRECTORY = "/opt/cdxfiles/urls" +URLS_FILES_DIRECTORY = "/opt/cdxfiles/urls_files" ROOT_DIRECTORY = "/root/urls_files" CONCURRENCY = 10 BATCH_SIZE = 10 @@ -208,13 +209,16 @@ def main(): if filename.endswith(".cdx.json"): directory_output_file.write(f"{filename}\n") - # Process older files in /opt/cdxfiles/urls + # Process older files in URLS_DIRECTORY and URLS_FILES_DIRECTORY older_than_48_hours = datetime.now() - timedelta(days=2) - for filename in os.listdir(URLS_DIRECTORY): - file_path = os.path.join(URLS_DIRECTORY, filename) - if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_48_hours.timestamp(): - os.remove(file_path) - print(f"Deleted '{filename}' as it is older than 48 hours.") + directories_to_check = [URLS_DIRECTORY, URLS_FILES_DIRECTORY] + + for directory in directories_to_check: + for filename in os.listdir(directory): + file_path = os.path.join(directory, filename) + if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_48_hours.timestamp(): + os.remove(file_path) + print(f"Deleted '{filename}' in '{directory}' as it is older than 48 hours.") run_ia_command() # Run IA command after processing older files