Add blogger_automated_cdx_processor.py
This commit is contained in:
parent
10b7b6f65a
commit
badcd7d432
218
blogger_automated_cdx_processor.py
Normal file
218
blogger_automated_cdx_processor.py
Normal file
@ -0,0 +1,218 @@
|
||||
# Import necessary libraries
|
||||
import os
|
||||
import subprocess
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Filesystem locations and batch-processing limits used throughout the script.

# Directory that holds the finished *.cdx.json summaries.
BLOGGER_DIRECTORY = "/opt/cdxfiles/blogger"

# Working directory for bookkeeping files and freshly written JSON summaries.
ROOT_DIRECTORY = "/root/blogger_files"

# Maximum number of worker threads in each thread pool.
CONCURRENCY = 10

# Number of URLs processed per batch.
BATCH_SIZE = 10
|
||||
|
||||
# Function to run cdxsummary command
|
||||
def run_cdxsummary(file_path, json_filepath):
    """Summarise one CDX file with ``cdxsummary`` and publish the result.

    Steps, in order:

    1. Run ``/usr/local/bin/cdxsummary --json <file_path>`` and parse its
       standard output as JSON.
    2. Add ``"cdxcount"`` (always 1) and ``"cdxsize"`` (size of *file_path*
       in bytes), and drop the ``"pathquery"``, ``"samples"`` and
       ``"tophosts"`` keys when present.
    3. Write the result to *json_filepath* (overwriting any existing file).
    4. Move that JSON file into ``BLOGGER_DIRECTORY``.
    5. Delete *file_path*.

    Args:
        file_path: Path of the ``.cdx.gz`` file to summarise.
        json_filepath: Path where the JSON summary is first written.

    Returns:
        None. If the command cannot be run, exits non-zero, or prints
        something that is not JSON, an error is printed and nothing is
        written, moved or deleted.

    Raises:
        OSError: If reading the size, writing, moving or deleting a file fails.
    """
    # Function-scope import: shutil is only needed here.
    import shutil

    # An argument list (no shell) keeps spaces or shell metacharacters in
    # file_path from being interpreted by a shell.
    cdxsummary_command = ["/usr/local/bin/cdxsummary", "--json", file_path]

    try:
        result = subprocess.run(
            cdxsummary_command, capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary, which the shell
        # used to report as a non-zero exit status.
        print(f"Error running cdxsummary command: {e}")
        return

    try:
        json_output = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"Error parsing cdxsummary output for '{file_path}': {e}")
        return

    # Each summary describes exactly one CDX file.
    json_output["cdxcount"] = 1
    json_output["cdxsize"] = os.path.getsize(file_path)

    # Drop the sections that are not kept in the published summary.
    for key in ("pathquery", "samples", "tophosts"):
        json_output.pop(key, None)

    with open(json_filepath, "w") as json_file:
        json.dump(json_output, json_file, indent=2)
    print(f"Created JSON file for '{file_path}': '{json_filepath}'")

    # shutil.move falls back to copy + delete when source and destination are
    # on different filesystems, where os.rename would raise OSError.
    destination_path = os.path.join(BLOGGER_DIRECTORY, os.path.basename(json_filepath))
    shutil.move(json_filepath, destination_path)
    print(f"Moved '{os.path.basename(json_filepath)}' to '{BLOGGER_DIRECTORY}'.")

    # The CDX file is only removed once its summary has been published.
    os.remove(file_path)
    print(f"Deleted '{file_path}' after processing.")
|
||||
|
||||
# Function to download a file using axel
|
||||
def download_file(url):
    """Download one URL with ``axel`` and return its local file name.

    Args:
        url: URL to fetch; surrounding whitespace (such as the trailing
            newline from a line read out of a file) is stripped first.

    Returns:
        ``os.path.basename`` of the stripped URL. The name is returned whether
        or not the download succeeded; failures are only reported on stdout.
        An empty URL returns ``""`` without running axel.
    """
    url = url.strip()
    if not url:
        # Nothing to fetch; os.path.basename("") is "" as well.
        return ""

    # "-n 1" requests a single connection for this file. An argument list
    # (no shell) keeps characters in the URL from being interpreted by a shell.
    command = ["axel", "-n", "1", url]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as e:
        # axel is missing or not executable; keep the best-effort behaviour.
        print(f"Error running axel for '{url}': {e}")
    else:
        if completed.returncode != 0:
            print(f"axel exited with status {completed.returncode} for '{url}'.")

    return os.path.basename(url)
|
||||
|
||||
# Function to download files concurrently
|
||||
def download_files(urls):
    """Download every URL in *urls* on a thread pool.

    Args:
        urls: Iterable of URL strings, each passed to ``download_file``.

    Returns:
        List of the values returned by ``download_file``, in the same order
        as *urls*.
    """
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
        # list() drains the lazy map so every download has finished here.
        return list(pool.map(download_file, urls))
|
||||
|
||||
# Function to process a batch of URLs
|
||||
def process_batch(urls, start_index, end_index):
    """Download one slice of *urls* and summarise each downloaded file.

    Args:
        urls: Full list of URL strings.
        start_index: Index of the first URL in the batch (inclusive).
        end_index: Index just past the last URL in the batch (exclusive).

    Returns:
        None. Any exception raised by a ``run_cdxsummary`` task is re-raised
        here when its result is collected.
    """
    selected = urls[start_index:end_index]
    print("\nDownloading Batch...\n")
    local_names = download_files(selected)

    def _paths_for(name):
        # Downloaded files are looked up in the current working directory;
        # the JSON summary is first written under ROOT_DIRECTORY.
        base = os.path.basename(name)
        cdx_path = os.path.join(os.getcwd(), base)
        json_path = os.path.join(ROOT_DIRECTORY, base.replace(".cdx.gz", ".cdx.json"))
        return cdx_path, json_path

    with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
        pending = [
            pool.submit(run_cdxsummary, *_paths_for(name)) for name in local_names
        ]

        # Block until the whole batch is done before the caller starts the next one.
        for task in pending:
            task.result()
|
||||
|
||||
# Function to run the Internet Archive (IA) search command
|
||||
def run_ia_command(days_back=1):
    """Query the Internet Archive for recent items and save their CDX URLs.

    Runs ``ia search`` for items in the ``archiveteam_blogger`` collection
    whose ``addeddate`` lies between ``days_back`` days ago and today, then
    rewrites ``ia_search_results.txt`` in ``ROOT_DIRECTORY`` so that every
    line is ``https://archive.org/download/<item>/<item>.cdx.gz``.

    Args:
        days_back: How many days before today the search window starts.
            NOTE(review): the previous comment and variable name said "two
            days" while the code subtracted one day; the default keeps the
            one-day behaviour. Confirm which was intended.

    Returns:
        None. If the ``ia`` command cannot be run or exits non-zero, an error
        is printed and the results file is left as the command wrote it.

    Raises:
        OSError: If the results file cannot be opened, read or written.
    """
    # Sample the clock once so both ends of the range come from the same instant.
    now = datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - timedelta(days=days_back)).strftime("%Y-%m-%d")

    # Argument list (no shell): the whole query is passed as one argument,
    # exactly as the single-quoted string was.
    ia_search_command = [
        "/usr/local/bin/ia",
        "search",
        f"collection:archiveteam_blogger addeddate:[{start_date} TO {end_date}]",
        "--itemlist",
    ]

    output_path = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")

    # The command's stdout (one item identifier per line) goes straight to the file.
    with open(output_path, "w") as output:
        try:
            subprocess.run(ia_search_command, stdout=output, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable ia binary.
            print(f"Error running IA search command: {e}")
            return

    with open(output_path, "r") as results:
        item_names = [line.strip() for line in results]

    # Skip blank lines, which would otherwise become ".../download//.cdx.gz".
    urls = [
        f"https://archive.org/download/{item}/{item}.cdx.gz\n"
        for item in item_names
        if item
    ]

    # A separate name for the handle keeps output_path usable in the message below.
    with open(output_path, "w") as results:
        results.writelines(urls)

    print(f"IA search results written to '{output_path}' with URLs appended.")
|
||||
|
||||
# Function to create the blogger directory if it doesn't exist
|
||||
def create_blogger_directory(directory=None):
    """Ensure the blogger output directory exists.

    Args:
        directory: Directory to create, including missing parents. Defaults
            to ``BLOGGER_DIRECTORY`` when omitted.

    Raises:
        OSError: If the path exists but is not a directory, or cannot be
            created.
    """
    if directory is None:
        directory = BLOGGER_DIRECTORY

    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(directory, exist_ok=True)
|
||||
|
||||
# Function to move a file to the root folder
|
||||
def move_file(source_path):
    """Move a file into ``ROOT_DIRECTORY`` unless its name ends in ``.cdx.gz``.

    Args:
        source_path: Path of the file to move.

    Returns:
        None. Files whose name ends in ``.cdx.gz`` are left in place and a
        message is printed instead.

    NOTE(review): the skip message says "JSON file" although the condition
    matches ``.cdx.gz`` names, and the ``.cdx.gz`` -> ``.cdx.json`` rename can
    only affect names with ``.cdx.gz`` in the middle. The intended behaviour
    is unclear from this file; confirm before relying on it.
    """
    file_name = os.path.basename(source_path)

    if file_name.endswith(".cdx.gz"):
        print(f"Skipping move for JSON file '{file_name}'.")
        return

    target_name = file_name.replace(".cdx.gz", ".cdx.json")
    os.rename(source_path, os.path.join(ROOT_DIRECTORY, target_name))
    print(f"Moved '{file_name}' to the root folder.")
|
||||
|
||||
# Function to filter URLs to download based on filenames to exclude
|
||||
def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
    """Return the URLs that do not refer to any excluded name.

    A URL is excluded when one of *filenames_to_exclude* equals a whole
    component of the URL's path, or equals the final component with its
    ``.cdx.gz`` suffix removed. Matching whole components (rather than
    substrings) keeps ``item_1`` from also excluding ``item_10``.

    Args:
        urls_to_filter: Iterable of URL strings. Surrounding whitespace is
            ignored for matching, but kept URLs are returned unchanged.
        filenames_to_exclude: Iterable of names to exclude. Empty or
            whitespace-only entries are ignored, since they would otherwise
            match every URL.

    Returns:
        List of the URLs from *urls_to_filter* that were kept, in order.
    """
    suffix = ".cdx.gz"
    # Build the lookup set once; membership tests are then O(1) per candidate.
    excluded = {name.strip() for name in filenames_to_exclude if name.strip()}
    if not excluded:
        return list(urls_to_filter)

    kept = []
    for url in urls_to_filter:
        segments = [part for part in urlparse(url.strip()).path.split("/") if part]
        candidates = set(segments)
        if segments and segments[-1].endswith(suffix):
            # Also compare the bare item name taken from "<item>.cdx.gz".
            candidates.add(segments[-1][: -len(suffix)])
        if candidates.isdisjoint(excluded):
            kept.append(url)
    return kept
|
||||
|
||||
# Function to extract filename from URL
|
||||
def extract_filename_from_url(url):
    """Return the final path component of *url*.

    Only the path part of the URL is considered, so a query string or
    fragment is not included in the result.

    Args:
        url: URL string to inspect.

    Returns:
        The base name of the URL's path; ``""`` when the path is empty or
        ends with a slash.
    """
    return os.path.basename(urlparse(url).path)
|
||||
|
||||
# Main function
|
||||
def main():
    """Run one full pass of the blogger CDX pipeline.

    Steps, in order:

    1. Make sure ``BLOGGER_DIRECTORY`` and ``ROOT_DIRECTORY`` exist.
    2. Record the ``*.cdx.json`` files currently in ``BLOGGER_DIRECTORY`` in
       ``directory_output.txt``.
    3. Delete ``*.cdx.json`` files there whose modification time is more than
       two days old.
    4. Run the Internet Archive search, which writes ``ia_search_results.txt``.
    5. Drop the URLs that match a name recorded in step 2 and save the rest
       to ``urls_to_download.txt``.
    6. Download and summarise the remaining URLs, ``BATCH_SIZE`` at a time.
    """
    create_blogger_directory()
    # The bookkeeping files below live in ROOT_DIRECTORY, so it must exist too.
    os.makedirs(ROOT_DIRECTORY, exist_ok=True)

    # The snapshot is taken before the clean-up below, so it still lists
    # summaries that are about to be deleted.
    directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
    with open(directory_output_path, 'w') as directory_output_file:
        for filename in os.listdir(BLOGGER_DIRECTORY):
            if filename.endswith(".cdx.json"):
                directory_output_file.write(f"{filename}\n")

    # Remove summaries whose modification time is more than 48 hours ago.
    cutoff_timestamp = (datetime.now() - timedelta(days=2)).timestamp()
    for filename in os.listdir(BLOGGER_DIRECTORY):
        if not filename.endswith(".cdx.json"):
            continue
        file_path = os.path.join(BLOGGER_DIRECTORY, filename)
        if os.path.getmtime(file_path) < cutoff_timestamp:
            os.remove(file_path)
            print(f"Deleted '{filename}' as it is older than 48 hours.")

    run_ia_command()  # Run IA command after processing older files

    # Candidate download URLs, one per line (trailing newlines are kept).
    with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
        ia_search_urls = ia_search_file.readlines()

    # Names from the snapshot with ".cdx.json" removed.
    with open(directory_output_path, 'r') as directory_output_file:
        directory_filenames = [
            line.strip().replace(".cdx.json", "") for line in directory_output_file
        ]

    # Keep only the URLs that do not match a recorded name.
    filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)

    urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
    with open(urls_to_download_path, 'w') as urls_to_download_file:
        urls_to_download_file.writelines(filtered_urls)

    # Process the URLs in fixed-size batches; the last one may be shorter.
    for start_index in range(0, len(filtered_urls), BATCH_SIZE):
        end_index = min(start_index + BATCH_SIZE, len(filtered_urls))
        process_batch(filtered_urls, start_index, end_index)
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user