Upload files to "/"
commit 38f8f98795

ia_metadata_to_download_links_archivebot.py (new file, 77 lines added)

@@ -0,0 +1,77 @@
import requests
import json
from concurrent.futures import ThreadPoolExecutor
import threading


# Download a URL and return its body, or None on any failure
def download_url(url):
    try:
        # A timeout keeps a stalled request from hanging a worker thread
        response = requests.get(url, timeout=60)
    except requests.RequestException as e:
        print(f"Failed to download URL: {url} ({e})")
        return None
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to download URL: {url}")
        return None


# Extract the "name" field of every file entry in the metadata JSON
def extract_names(json_content):
    try:
        data = json.loads(json_content)
        files = data.get("files", [])
        # Skip entries without a "name" so the .endswith() checks below
        # can never be called on None
        names = [file_data.get("name") for file_data in files if file_data.get("name")]
        return names
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON content: {e}")
        return []


# Process one metadata URL: collect WARC file names and write download links
def process_url(url, counter_lock):
    content = download_url(url)
    if content is not None:
        # Extract all "name" fields from the downloaded content
        names = extract_names(content)
        # Keep only real WARCs, not the item's -meta.warc.gz sidecar
        warc_names = [name for name in names
                      if name.endswith("warc.gz") and not name.endswith("-meta.warc.gz")]
        if warc_names:
            # Replace "metadata" with "download" in the URL
            url = url.replace("metadata", "download")
            # Write each WARC name with the modified URL to the output file, one per line
            with open(output_file_path, 'a') as output_file:
                for name in warc_names:
                    output_file.write(f"{url}/{name}\n")
            print(f"Processed URL {url}")
        else:
            print(f"No 'warc.gz' fields found in content from URL: {url}")

    with counter_lock:
        process_url.counter += 1
        remaining_lines = total_lines - process_url.counter
        print(f"{remaining_lines} lines remaining")


# Input and output file paths
input_file_path = "/tmp/archivebot/archiveteam_archivebot_items.txt"
output_file_path = "/tmp/archivebot/all_extracted_names.txt"

# Count the total number of lines in the input file
with open(input_file_path, 'r') as input_file:
    total_lines = sum(1 for line in input_file)

# Counter attribute tracking how many lines have been processed
process_url.counter = 0

# Lock guarding the shared counter across worker threads
counter_lock = threading.Lock()

# Set the concurrency level by adjusting max_workers
max_workers = 5

# Use ThreadPoolExecutor for concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    with open(input_file_path, 'r') as input_file:
        # Submit one task per input line (each line is a metadata URL)
        futures = [executor.submit(process_url, line.strip(), counter_lock) for line in input_file]
        # Wait for all tasks to complete (re-raises any worker exception)
        for future in futures:
            future.result()

print("Processing complete.")
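For context, a minimal sketch of how this script is presumably run. The input path and the metadata-to-download URL rewrite are taken from the code above; the item identifier in the sample lines is hypothetical:

# /tmp/archivebot/archiveteam_archivebot_items.txt should contain one
# archive.org metadata URL per line, e.g. (hypothetical item identifier):
#   https://archive.org/metadata/archiveteam_archivebot_go_example
#
# Each matching file entry then yields a download link such as:
#   https://archive.org/download/archiveteam_archivebot_go_example/example.warc.gz
python3 ia_metadata_to_download_links_archivebot.py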