Upload files to "/"

2024-01-28 02:02:12 +00:00 · 2024-01-28 02:02:12 +00:00 · a87ce2476a
commit a87ce2476a
parent 13c3a8ff30
1 changed files with 73 additions and 0 deletions
--- a/ia_metadata_to_download_links.py
+++ b/ia_metadata_to_download_links.py
@@ -0,0 +1,73 @@
+import os
+import requests
+import json
+from concurrent.futures import ThreadPoolExecutor
+import threading
+
+def download_url(url, timeout=30):
+    """Fetch ``url`` with an HTTP GET and return the response body as text.
+
+    Args:
+        url: Address to request.
+        timeout: Seconds to wait for the server before giving up, so that a
+            stalled connection cannot block a worker thread indefinitely.
+
+    Returns:
+        The response text when the server answers with status 200; ``None``
+        for any other status or when the request itself fails. A failure
+        message is printed in both failure cases.
+    """
+    try:
+        response = requests.get(url, timeout=timeout)
+    except requests.RequestException as e:
+        # Connection errors, timeouts and malformed URLs are reported like a
+        # non-200 answer instead of propagating out of the worker thread.
+        print(f"Failed to download URL: {url} ({e})")
+        return None
+    if response.status_code == 200:
+        return response.text
+    print(f"Failed to download URL: {url}")
+    return None
+
+def extract_names(json_content):
+    """Return the "name" of every entry in the "files" list of a JSON document.
+
+    Args:
+        json_content: JSON text whose top level is expected to be an object
+            holding a "files" list of objects.
+
+    Returns:
+        A list with the string "name" value of each file entry, in document
+        order. Entries without a string "name" are skipped so callers can
+        safely run substring checks on every element. An empty list is
+        returned when the text is not valid JSON (the decode error is
+        printed) or when the document does not have the expected shape.
+    """
+    try:
+        data = json.loads(json_content)
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON content: {e}")
+        return []
+
+    # Valid JSON can still be a list, string, number or null at the top level.
+    if not isinstance(data, dict):
+        return []
+
+    files = data.get("files")
+    if not isinstance(files, list):
+        return []
+
+    return [
+        file_data["name"]
+        for file_data in files
+        if isinstance(file_data, dict) and isinstance(file_data.get("name"), str)
+    ]
+
+def process_url(url, counter_lock):
+    """Download one metadata URL and append download links for its WARC files.
+
+    The document at ``url`` is fetched with ``download_url`` and its file
+    names are read with ``extract_names``. For every name containing
+    "warc.gz", one line of the form ``<download url>/<name>`` is appended to
+    the module-level ``output_file_path``.
+
+    Args:
+        url: Metadata URL to process.
+        counter_lock: Lock guarding ``process_url.counter`` and appends to the
+            output file, both of which are shared between worker threads.
+
+    Side effects:
+        Appends to ``output_file_path``, prints progress messages, and
+        increments ``process_url.counter`` exactly once per call, including
+        when the download fails, so the remaining count (computed from the
+        module-level ``total_lines``) stays accurate.
+    """
+    try:
+        content = download_url(url)
+        if content is None:
+            return
+
+        # Keep only string names; a file entry without a "name" would
+        # otherwise make the substring test raise TypeError.
+        warc_names = [
+            name
+            for name in extract_names(content)
+            if isinstance(name, str) and "warc.gz" in name
+        ]
+        if not warc_names:
+            print(f"No 'warc.gz' fields found in content from URL: {url}")
+            return
+
+        # Replace only the first "metadata" so an item identifier that happens
+        # to contain the same word is left intact.
+        url = url.replace("metadata", "download", 1)
+
+        # Write all links for this item in one call while holding the lock so
+        # lines from concurrent workers cannot interleave.
+        links = "".join(f"{url}/{name}\n" for name in warc_names)
+        with counter_lock:
+            with open(output_file_path, 'a') as output_file:
+                output_file.write(links)
+        print(f"Processed URL {url}")
+    finally:
+        # Runs on every exit path; the lock is never held at this point.
+        with counter_lock:
+            process_url.counter += 1
+            remaining_lines = total_lines - process_url.counter
+            print(f"{remaining_lines} lines remaining")
+
+# Input: one metadata URL per line. Output: one download link per line.
+input_file_path = "/tmp/twittertesting/twitteroutlink_items.txt"
+output_file_path = "/tmp/twittertesting/all_extracted_names.txt"
+
+# Read the URL list once, dropping blank lines so they are neither requested
+# as empty URLs nor counted as pending work.
+with open(input_file_path, 'r') as input_file:
+    urls = [stripped for stripped in (line.strip() for line in input_file) if stripped]
+
+# Total number of URLs to process; process_url reads this to report progress
+total_lines = len(urls)
+
+# Set a counter attribute to track the processed lines
+process_url.counter = 0
+
+# Create a lock for the counter
+counter_lock = threading.Lock()
+
+# Set the concurrency level by adjusting max_workers
+max_workers = 5
+
+# Use ThreadPoolExecutor for concurrent processing
+with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    # Submit one task per URL, remembering which URL each future belongs to
+    submitted = [(url, executor.submit(process_url, url, counter_lock)) for url in urls]
+    # Wait for all tasks to complete; a failure in one task is reported and
+    # does not stop the results of the remaining tasks from being collected.
+    for url, future in submitted:
+        try:
+            future.result()
+        except Exception as e:
+            print(f"Error processing URL {url}: {e}")
+
+print("Processing complete.")
+