diff --git a/ia_metadata_to_download_links.py b/ia_metadata_to_download_links.py
deleted file mode 100644
index 77bbd10..0000000
--- a/ia_metadata_to_download_links.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import os
-import requests
-import json
-from concurrent.futures import ThreadPoolExecutor
-import threading
-
-def download_url(url):
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.text
-    else:
-        print(f"Failed to download URL: {url}")
-        return None
-
-def extract_names(json_content):
-    try:
-        data = json.loads(json_content)
-        files = data.get("files", [])
-        names = [file_data.get("name") for file_data in files]
-        return names
-    except json.JSONDecodeError as e:
-        print(f"Error decoding JSON content: {e}")
-        return []
-
-def process_url(url, counter_lock):
-    content = download_url(url)
-    if content is not None:
-        # Extract all "name" fields from the downloaded content
-        names = extract_names(content)
-        if any("warc.gz" in name for name in names):
-            # Replace "metadata" with "download" in the URL
-            url = url.replace("metadata", "download")
-            # Write each extracted name and the modified URL to the output file on a new line
-            with open(output_file_path, 'a') as output_file:
-                for name in names:
-                    if "warc.gz" in name:
-                        output_file.write(f"{url}/{name}\n")
-            print(f"Processed URL {url}")
-        else:
-            print(f"No 'warc.gz' fields found in content from URL: {url}")
-
-    with counter_lock:
-        process_url.counter += 1
-        remaining_lines = total_lines - process_url.counter
-        print(f"{remaining_lines} lines remaining")
-
-input_file_path = "/tmp/twittertesting/twitteroutlink_items.txt"
-output_file_path = "/tmp/twittertesting/all_extracted_names.txt"
-
-# Count the total number of lines in the input file
-with open(input_file_path, 'r') as input_file:
-    total_lines = sum(1 for line in input_file)
-
-# Set a counter attribute to track the processed lines
-process_url.counter = 0
-
-# Create a lock for the counter
-counter_lock = threading.Lock()
-
-# Set the concurrency level by adjusting max_workers
-max_workers = 5
-
-# Use ThreadPoolExecutor for concurrent processing
-with ThreadPoolExecutor(max_workers=max_workers) as executor:
-    with open(input_file_path, 'r') as input_file:
-        # Submit tasks to the ThreadPoolExecutor
-        futures = [executor.submit(process_url, line.strip(), counter_lock) for line in input_file]
-        # Wait for all tasks to complete
-        for future in futures:
-            future.result()
-
-print("Processing complete.")
-
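
For reference, the removed script turned Internet Archive item metadata URLs into direct download links for `warc.gz` files by fetching the item's metadata JSON, reading its `files` list, and swapping `metadata` for `download` in the URL. A minimal sketch of that core transformation is below; it assumes metadata URLs of the form `https://archive.org/metadata/<identifier>`, and the item identifier shown in the usage comment is hypothetical.

```python
import requests


def warc_download_links(metadata_url: str) -> list[str]:
    """Return archive.org download URLs for every warc.gz file in an item.

    metadata_url is expected to look like https://archive.org/metadata/<identifier>;
    the matching download prefix is obtained by replacing "metadata" with "download".
    """
    response = requests.get(metadata_url, timeout=30)
    response.raise_for_status()
    files = response.json().get("files", [])
    download_base = metadata_url.replace("metadata", "download")
    return [
        f"{download_base}/{f['name']}"
        for f in files
        if "warc.gz" in f.get("name", "")
    ]


# Hypothetical item identifier, for illustration only:
# for link in warc_download_links("https://archive.org/metadata/some-item"):
#     print(link)
```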