From 49a9bc47102bd48dc32827da9694df9b33f8d332 Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Tue, 9 Jan 2024 05:15:31 +0000
Subject: [PATCH] Add pastebin_automated_cdx_processor.py

---
 pastebin_automated_cdx_processor.py | 222 ++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 pastebin_automated_cdx_processor.py

diff --git a/pastebin_automated_cdx_processor.py b/pastebin_automated_cdx_processor.py
new file mode 100644
index 0000000..81293d1
--- /dev/null
+++ b/pastebin_automated_cdx_processor.py
@@ -0,0 +1,222 @@
+# Import necessary libraries
+import os
+import subprocess
+import json
+import tarfile  # Use tarfile for creating/appending the tar archive
+from datetime import datetime, timedelta
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
+
+# Define constants
+PASTEBIN_DIRECTORY = "/opt/cdxfiles/pastebin"
+ROOT_DIRECTORY = "/root/pastebin_files"
+CONCURRENCY = 10
+BATCH_SIZE = 10
+
+# Function to run the cdxsummary command on a downloaded .cdx.gz file
+def run_cdxsummary(file_path, json_filepath):
+    # Construct the cdxsummary command
+    cdxsummary_command = f"cdxsummary --json {file_path}"
+
+    try:
+        # Run the cdxsummary command and capture the output
+        result = subprocess.run(cdxsummary_command, shell=True, capture_output=True, text=True, check=True)
+
+        # Parse the JSON output
+        json_output = json.loads(result.stdout)
+
+        # Add a "cdxcount" entry with value 1
+        json_output["cdxcount"] = 1
+
+        # Add a "cdxsize" entry with the size of the .cdx.gz file in bytes
+        cdx_size_bytes = os.path.getsize(file_path)
+        json_output["cdxsize"] = cdx_size_bytes
+
+        # Remove the "pathquery", "samples", and "tophosts" keys
+        if "pathquery" in json_output:
+            del json_output["pathquery"]
+        if "samples" in json_output:
+            del json_output["samples"]
+        if "tophosts" in json_output:
+            del json_output["tophosts"]
+
+        # Write the JSON output to a file
+        with open(json_filepath, "w") as json_file:
+            json.dump(json_output, json_file, indent=2)
+
+        print(f"Created JSON file for '{file_path}': '{json_filepath}'")
+
+        # Move the JSON file to /opt/cdxfiles/pastebin
+        destination_path = os.path.join(PASTEBIN_DIRECTORY, os.path.basename(json_filepath))
+        os.rename(json_filepath, destination_path)
+        print(f"Moved '{os.path.basename(json_filepath)}' to '{PASTEBIN_DIRECTORY}'.")
+
+        # Delete the .cdx.gz file
+        os.remove(file_path)
+        print(f"Deleted '{file_path}' after processing.")
+
+    except subprocess.CalledProcessError as e:
+        print(f"Error running cdxsummary command: {e}")
+
+# Function to download a file using axel
+def download_file(url):
+    # Strip newline characters from the URL
+    url = url.strip()
+
+    # Command to download the file using axel
+    command = f'axel -n 1 {url}'  # Use a single connection per file; concurrency comes from the thread pool
+    subprocess.run(command, shell=True)
+
+    # Return the downloaded file name (axel saves into the current working directory)
+    return os.path.basename(url)
+
+# Function to download files concurrently
+def download_files(urls):
+    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
+        # Use map to collect the downloaded file names
+        downloaded_files = list(executor.map(download_file, urls))
+
+    return downloaded_files
+
+# Function to process a batch of URLs
+def process_batch(urls, start_index, end_index):
+    # Extract the batch of URLs
+    batch_urls = urls[start_index:end_index]
+    print("\nDownloading Batch...\n")
+
+    # Download files concurrently
+    downloaded_files = download_files(batch_urls)
+
+    # Run cdxsummary for each downloaded file
+    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
+        futures = []
+
+        for file_url in downloaded_files:
+            # download_files() already returns the file name (basename of the URL)
+            file_name = os.path.basename(file_url)
+
+            # Construct file paths
+            file_path = os.path.join(os.getcwd(), file_name)
+            json_filepath = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
+
+            # Run cdxsummary and delete the .cdx.gz file
+            futures.append(executor.submit(run_cdxsummary, file_path, json_filepath))
+
+        # Wait for all tasks to complete before proceeding to the next batch
+        for future in futures:
+            future.result()
+
+# Function to run the Internet Archive (IA) search command
+def run_ia_command():
+    # Get the current date formatted as YYYY-MM-DD
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    # Calculate 2 days before the current date
+    two_days_before = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")
+
+    # Search the archiveteam_pastebin collection, restricted to the date range above
+    ia_search_command = f"ia search 'collection:archiveteam_pastebin addeddate:[{two_days_before} TO {current_date}]' --itemlist"
+
+    # Output file for ia search results
+    output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")
+
+    try:
+        # Run the ia search command and write the output to a text file
+        with open(output_file, "w") as output:
+            subprocess.run(ia_search_command, shell=True, stdout=output, check=True)
+
+        # Turn each item identifier into a direct download URL for its .cdx.gz file
+        with open(output_file, "r") as input_file:
+            lines = input_file.readlines()
+            lines = [f"https://archive.org/download/{line.strip()}/{line.strip()}.cdx.gz\n" for line in lines]
+
+        # Write the modified lines back to the file
+        with open(output_file, "w") as results_file:
+            results_file.writelines(lines)
+
+        print(f"IA search results written to '{output_file}' with URLs appended.")
+
+    except subprocess.CalledProcessError as e:
+        print(f"Error running IA search command: {e}")
+
+# Function to create the pastebin directory if it doesn't exist
+def create_pastebin_directory():
+    if not os.path.exists(PASTEBIN_DIRECTORY):
+        os.makedirs(PASTEBIN_DIRECTORY)
+
+# Function to move a file to the root folder
+def move_file(source_path):
+    file_name = os.path.basename(source_path)
+    destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
+
+    # Skip files that still have a ".cdx.gz" extension; move everything else
+    if file_name.endswith(".cdx.gz"):
+        print(f"Skipping move for CDX file '{file_name}'.")
+    else:
+        os.rename(source_path, destination_path)
+        print(f"Moved '{file_name}' to the root folder.")
+
+# Function to filter URLs to download based on filenames to exclude
+def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
+    filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
+    return filtered_urls
+
+# Function to extract the filename from a URL
+def extract_filename_from_url(url):
+    # Extract the filename part from the URL path
+    parsed_url = urlparse(url)
+    filename = os.path.basename(parsed_url.path)
+    return filename
+
+# Main function
+def main():
+    create_pastebin_directory()
+
+    # Create directory_output.txt listing the JSON files already present
+    directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
+    with open(directory_output_path, 'w') as directory_output_file:
+        for filename in os.listdir(PASTEBIN_DIRECTORY):
+            if filename.endswith(".cdx.json"):
+                directory_output_file.write(f"{filename}\n")
+
+    # Archive older files in /opt/cdxfiles/pastebin
+    older_than_24_hours = datetime.now() - timedelta(days=1)
+    for filename in os.listdir(PASTEBIN_DIRECTORY):
+        file_path = os.path.join(PASTEBIN_DIRECTORY, filename)
+        if filename.endswith(".cdx.json") and os.path.getmtime(file_path) < older_than_24_hours.timestamp():
+            # Append the JSON file to the existing tar archive (created on first use)
+            with tarfile.open(os.path.join(PASTEBIN_DIRECTORY, 'pastebin_cdxfiles_archive.tar'), 'a') as tar:
+                tar.add(file_path, arcname=filename)
+            os.remove(file_path)
+            print(f"Added '{filename}' to 'pastebin_cdxfiles_archive.tar' and removed the JSON file.")
+
+    run_ia_command()  # Run the IA search after processing older files
+
+    # Read the URLs from the file
+    with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
+        ia_search_urls = ia_search_file.readlines()
+
+    # Extract filenames from the URLs
+    ia_search_filenames = [extract_filename_from_url(url.strip()) for url in ia_search_urls]
+
+    # Read the filenames from directory_output.txt and remove the .cdx.json extension
+    with open(directory_output_path, 'r') as directory_output_file:
+        directory_filenames = [line.strip().replace(".cdx.json", "") for line in directory_output_file.readlines()]
+
+    # Drop URLs whose item names are already present locally
+    filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
+
+    # Write filtered URLs to urls_to_download.txt
+    urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
+    with open(urls_to_download_path, 'w') as urls_to_download_file:
+        urls_to_download_file.writelines(filtered_urls)
+
+    # Process URLs in batches
+    for i in range(0, len(filtered_urls), BATCH_SIZE):
+        start_index = i
+        end_index = min(i + BATCH_SIZE, len(filtered_urls))
+        process_batch(filtered_urls, start_index, end_index)
+
+# Entry point
+if __name__ == "__main__":
+    main()
\ No newline at end of file