diff --git a/ia_metadata_to_download_links.py b/ia_metadata_to_download_links.py
new file mode 100644
index 0000000..77bbd10
--- /dev/null
+++ b/ia_metadata_to_download_links.py
@@ -0,0 +1,78 @@
+import requests
+import json
+from concurrent.futures import ThreadPoolExecutor
+import threading
+
+def download_url(url):
+    # Fetch the URL; the timeout keeps a stalled request from hanging a worker thread
+    try:
+        response = requests.get(url, timeout=60)
+    except requests.RequestException as e:
+        print(f"Failed to download URL {url}: {e}")
+        return None
+    if response.status_code == 200:
+        return response.text
+    print(f"Failed to download URL: {url} (HTTP {response.status_code})")
+    return None
+
+def extract_names(json_content):
+    # Pull every "name" field out of the item's "files" list, skipping entries without one
+    try:
+        data = json.loads(json_content)
+        files = data.get("files", [])
+        return [file_data["name"] for file_data in files if file_data.get("name")]
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON content: {e}")
+        return []
+
+def process_url(url, counter_lock):
+    content = download_url(url)
+    if content is not None:
+        # Extract all "name" fields from the downloaded content
+        names = extract_names(content)
+        if any("warc.gz" in name for name in names):
+            # Replace "metadata" with "download" in the URL
+            url = url.replace("metadata", "download")
+            # Write each matching name and the modified URL to the output file on a new line;
+            # the lock keeps lines from different threads from interleaving
+            with counter_lock:
+                with open(output_file_path, 'a') as output_file:
+                    for name in names:
+                        if "warc.gz" in name:
+                            output_file.write(f"{url}/{name}\n")
+            print(f"Processed URL {url}")
+        else:
+            print(f"No 'warc.gz' names found in content from URL: {url}")
+
+    with counter_lock:
+        process_url.counter += 1
+        remaining_lines = total_lines - process_url.counter
+        print(f"{remaining_lines} lines remaining")
+
+input_file_path = "/tmp/twittertesting/twitteroutlink_items.txt"
+output_file_path = "/tmp/twittertesting/all_extracted_names.txt"
+
+# Count the total number of lines in the input file
+with open(input_file_path, 'r') as input_file:
+    total_lines = sum(1 for line in input_file)
+
+# Set a counter attribute to track the processed lines
+process_url.counter = 0
+
+# Create a lock shared by the progress counter and the output file
+counter_lock = threading.Lock()
+
+# Set the concurrency level by adjusting max_workers
+max_workers = 5
+
+# Use ThreadPoolExecutor for concurrent processing
+with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    with open(input_file_path, 'r') as input_file:
+        # Submit one task per metadata URL in the input file
+        futures = [executor.submit(process_url, line.strip(), counter_lock) for line in input_file]
+        # Wait for all tasks to complete
+        for future in futures:
+            future.result()
+
+print("Processing complete.")
+
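For reference, each line of twitteroutlink_items.txt is expected to be an Internet Archive metadata API URL (of the form https://archive.org/metadata/<identifier>), and the script rewrites matching items to their download endpoint. A minimal sketch of that per-item transformation, using a hypothetical identifier, might look like this:

    import requests

    # Hypothetical identifier; real ones come one per line from twitteroutlink_items.txt
    metadata_url = "https://archive.org/metadata/example-item"

    item = requests.get(metadata_url, timeout=60).json()
    for f in item.get("files", []):
        name = f.get("name", "")
        if "warc.gz" in name:
            # The same rewrite the script applies: metadata endpoint becomes download endpoint
            print(f"{metadata_url.replace('metadata', 'download')}/{name}")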