Added tar support for files older than 24 hours to keep cdx file count low

datechnoman 2024-01-09 00:19:22 +00:00
parent f5928594a5
commit 457f32ed61


@@ -1,15 +1,19 @@
+# Import necessary libraries
 import os
 import subprocess
 import json
+import tarfile # Use tarfile for creating tar archives
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse
+# Define constants
 MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
 ROOT_DIRECTORY = "/root/mediafire_files"
 CONCURRENCY = 10
 BATCH_SIZE = 10
+# Function to run cdxsummary command
 def run_cdxsummary(file_path, json_filepath):
     # Construct the cdxsummary command
     cdxsummary_command = f"cdxsummary --json {file_path}"
@@ -54,6 +58,7 @@ def run_cdxsummary(file_path, json_filepath):
     except subprocess.CalledProcessError as e:
         print(f"Error running cdxsummary command: {e}")
+# Function to download a file using axel
 def download_file(url):
     # Strip newline characters from the URL
     url = url.strip()
@@ -65,6 +70,7 @@ def download_file(url):
     # Return the downloaded file path
     return os.path.basename(url)
+# Function to download files concurrently
 def download_files(urls):
     with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
         # Use map to get the file paths
@@ -72,6 +78,7 @@ def download_files(urls):
     return downloaded_files
+# Function to process a batch of URLs
 def process_batch(urls, start_index, end_index):
     # Extract batch of URLs
     batch_urls = urls[start_index:end_index]
@@ -99,6 +106,7 @@ def process_batch(urls, start_index, end_index):
         for future in futures:
             future.result()
+# Function to run the Internet Archive (IA) search command
 def run_ia_command():
     # Get the current date formatted as YYYY-MM-DD
     current_date = datetime.now().strftime("%Y-%m-%d")
@@ -131,10 +139,12 @@ def run_ia_command():
     except subprocess.CalledProcessError as e:
         print(f"Error running IA search command: {e}")
+# Function to create the mediafire directory if it doesn't exist
 def create_mediafire_directory():
     if not os.path.exists(MEDIAFIRE_DIRECTORY):
         os.makedirs(MEDIAFIRE_DIRECTORY)
+# Function to move a file to the root folder
 def move_file(source_path):
     file_name = os.path.basename(source_path)
     destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
@@ -146,16 +156,19 @@ def move_file(source_path):
     os.rename(source_path, destination_path)
     print(f"Moved '{file_name}' to the root folder.")
+# Function to filter URLs to download based on filenames to exclude
 def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
     filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
     return filtered_urls
+# Function to extract filename from URL
 def extract_filename_from_url(url):
     # Extract the filename part from the URL
     parsed_url = urlparse(url)
     filename = os.path.basename(parsed_url.path)
     return filename
+# Main function
 def main():
     create_mediafire_directory()
@@ -166,7 +179,18 @@ def main():
             if filename.endswith(".cdx.json"):
                 directory_output_file.write(f"{filename}\n")
-    run_ia_command() # Run IA command after creating directory_output.txt
+    # Process older files in /opt/cdxfiles/mediafire
+    older_than_24_hours = datetime.now() - timedelta(days=1)
+    for filename in os.listdir(MEDIAFIRE_DIRECTORY):
+        file_path = os.path.join(MEDIAFIRE_DIRECTORY, filename)
+        if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_24_hours.timestamp():
+            # Appending to the existing tar file
+            with tarfile.open(os.path.join(MEDIAFIRE_DIRECTORY, 'mediafire_cdxfiles_archive.tar'), 'a') as tar:
+                tar.add(file_path, arcname=filename)
+            os.remove(file_path)
+            print(f"Added '{filename}' to 'mediafire_cdxfiles_archive.tar' and removed the JSON file.")
+    run_ia_command() # Run IA command after processing older files
     # Read the URLs from the file
     with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
@@ -193,5 +217,6 @@ def main():
         end_index = min(i + BATCH_SIZE, len(filtered_urls))
         process_batch(filtered_urls, start_index, end_index)
+# Entry point
 if __name__ == "__main__":
     main()
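
For context on the change above: the new block in main() walks /opt/cdxfiles/mediafire, appends any .cdx.json file whose mtime is more than 24 hours old to mediafire_cdxfiles_archive.tar, and then deletes the original, which is how the commit keeps the loose cdx file count low. Below is a minimal standalone sketch of that same pattern. It reuses the directory path and archive name from the diff, while archive_old_cdx_files and max_age_hours are hypothetical names introduced here for illustration and are not part of the committed script.

import os
import tarfile
from datetime import datetime, timedelta

# Same locations as in the script above; adjust as needed.
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
ARCHIVE_PATH = os.path.join(MEDIAFIRE_DIRECTORY, "mediafire_cdxfiles_archive.tar")

def archive_old_cdx_files(max_age_hours=24):
    """Append .cdx.json files older than max_age_hours to the tar archive, then delete them."""
    cutoff = (datetime.now() - timedelta(hours=max_age_hours)).timestamp()
    for filename in os.listdir(MEDIAFIRE_DIRECTORY):
        file_path = os.path.join(MEDIAFIRE_DIRECTORY, filename)
        if not (filename.endswith(".cdx.json") and os.path.isfile(file_path)):
            continue
        if os.path.getmtime(file_path) >= cutoff:
            continue
        # Mode 'a' appends to an existing uncompressed tar and creates it on first use.
        with tarfile.open(ARCHIVE_PATH, "a") as tar:
            tar.add(file_path, arcname=filename)
        os.remove(file_path)
        print(f"Archived and removed '{filename}'.")

if __name__ == "__main__":
    archive_old_cdx_files()

Note that tarfile only supports append mode ("a") for uncompressed archives, which is presumably why the commit writes a plain .tar rather than a .tar.gz.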