Added tar archiving for .cdx.json files older than 24 hours to keep the cdx file count low
commit 457f32ed61 (parent f5928594a5)
@@ -1,15 +1,19 @@
+# Import necessary libraries
 import os
 import subprocess
 import json
+import tarfile  # Use tarfile for creating tar archives
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse
 
+# Define constants
 MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
 ROOT_DIRECTORY = "/root/mediafire_files"
 CONCURRENCY = 10
 BATCH_SIZE = 10
 
+# Function to run cdxsummary command
 def run_cdxsummary(file_path, json_filepath):
     # Construct the cdxsummary command
     cdxsummary_command = f"cdxsummary --json {file_path}"
@@ -54,6 +58,7 @@ def run_cdxsummary(file_path, json_filepath):
     except subprocess.CalledProcessError as e:
         print(f"Error running cdxsummary command: {e}")
 
+# Function to download a file using axel
 def download_file(url):
     # Strip newline characters from the URL
     url = url.strip()
@@ -65,6 +70,7 @@ def download_file(url):
     # Return the downloaded file path
     return os.path.basename(url)
 
+# Function to download files concurrently
 def download_files(urls):
     with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
         # Use map to get the file paths
@@ -72,6 +78,7 @@ def download_files(urls):
 
     return downloaded_files
 
+# Function to process a batch of URLs
 def process_batch(urls, start_index, end_index):
     # Extract batch of URLs
     batch_urls = urls[start_index:end_index]
@@ -99,6 +106,7 @@ def process_batch(urls, start_index, end_index):
         for future in futures:
             future.result()
 
+# Function to run the Internet Archive (IA) search command
 def run_ia_command():
     # Get the current date formatted as YYYY-MM-DD
     current_date = datetime.now().strftime("%Y-%m-%d")
@@ -131,10 +139,12 @@ def run_ia_command():
     except subprocess.CalledProcessError as e:
         print(f"Error running IA search command: {e}")
 
+# Function to create the mediafire directory if it doesn't exist
 def create_mediafire_directory():
     if not os.path.exists(MEDIAFIRE_DIRECTORY):
         os.makedirs(MEDIAFIRE_DIRECTORY)
 
+# Function to move a file to the root folder
 def move_file(source_path):
     file_name = os.path.basename(source_path)
     destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
@@ -146,16 +156,19 @@ def move_file(source_path):
     os.rename(source_path, destination_path)
     print(f"Moved '{file_name}' to the root folder.")
 
+# Function to filter URLs to download based on filenames to exclude
 def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
     filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
     return filtered_urls
 
+# Function to extract filename from URL
 def extract_filename_from_url(url):
     # Extract the filename part from the URL
     parsed_url = urlparse(url)
     filename = os.path.basename(parsed_url.path)
     return filename
 
+# Main function
 def main():
     create_mediafire_directory()
 
@@ -166,7 +179,18 @@ def main():
         if filename.endswith(".cdx.json"):
             directory_output_file.write(f"{filename}\n")
 
-    run_ia_command()  # Run IA command after creating directory_output.txt
+    # Process older files in /opt/cdxfiles/mediafire
+    older_than_24_hours = datetime.now() - timedelta(days=1)
+    for filename in os.listdir(MEDIAFIRE_DIRECTORY):
+        file_path = os.path.join(MEDIAFIRE_DIRECTORY, filename)
+        if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_24_hours.timestamp():
+            # Appending to the existing tar file
+            with tarfile.open(os.path.join(MEDIAFIRE_DIRECTORY, 'mediafire_cdxfiles_archive.tar'), 'a') as tar:
+                tar.add(file_path, arcname=filename)
+            os.remove(file_path)
+            print(f"Added '{filename}' to 'mediafire_cdxfiles_archive.tar' and removed the JSON file.")
+
+    run_ia_command()  # Run IA command after processing older files
 
     # Read the URLs from the file
     with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
@@ -193,5 +217,6 @@ def main():
         end_index = min(i + BATCH_SIZE, len(filtered_urls))
         process_batch(filtered_urls, start_index, end_index)
 
+# Entry point
 if __name__ == "__main__":
     main()
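
For context, below is a minimal standalone sketch of the archiving step this commit adds, refactored to open the tar once per run rather than once per matching file (tarfile has to seek through the existing archive on every append-mode open, so a single open is cheaper when many files qualify). The function name and parameters are hypothetical, not part of the commit; the suffix check, archive name, and 24-hour cutoff mirror the committed code. tarfile only supports append mode ('a') on uncompressed archives, which is presumably why the commit writes a plain .tar rather than a .tar.gz.

import os
import tarfile
from datetime import datetime, timedelta

def archive_old_cdx_files(directory, archive_name="mediafire_cdxfiles_archive.tar", max_age_days=1):
    # Hypothetical refactor of the loop added in this commit, not the committed code.
    # Append .cdx.json files older than max_age_days to the tar, then delete them.
    cutoff = (datetime.now() - timedelta(days=max_age_days)).timestamp()
    archive_path = os.path.join(directory, archive_name)
    # Mode 'a' creates the archive if it is missing and appends otherwise;
    # tarfile rejects compressed append modes such as 'a:gz'.
    with tarfile.open(archive_path, "a") as tar:
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < cutoff:
                tar.add(file_path, arcname=filename)
                os.remove(file_path)
                print(f"Added '{filename}' to '{archive_name}' and removed the JSON file.")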
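
A quick way to sanity-check what has accumulated in the archive after a run, using standard tarfile read calls (the path is the commit's MEDIAFIRE_DIRECTORY constant plus the archive name):

import tarfile

# List every member the archive currently holds, with its size in bytes.
with tarfile.open("/opt/cdxfiles/mediafire/mediafire_cdxfiles_archive.tar", "r") as tar:
    for member in tar.getmembers():
        print(member.name, member.size)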