Upload files to "/"
commit 38f8f98795

ia_metadata_to_download_links_archivebot.py (new file, 77 lines added)

@@ -0,0 +1,77 @@
import requests
import json
from concurrent.futures import ThreadPoolExecutor
import threading


# Download a URL and return its body, or None on any failure
def download_url(url):
    try:
        # A timeout keeps a stalled request from hanging a worker thread
        response = requests.get(url, timeout=60)
    except requests.RequestException as e:
        print(f"Failed to download URL: {url} ({e})")
        return None
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to download URL: {url}")
        return None


# Extract the "name" field of every file entry in the metadata JSON
def extract_names(json_content):
    try:
        data = json.loads(json_content)
        files = data.get("files", [])
        # Skip entries without a "name" so the .endswith() checks below
        # can never be called on None
        names = [file_data.get("name") for file_data in files if file_data.get("name")]
        return names
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON content: {e}")
        return []


# Process one metadata URL: collect WARC file names and write download links
def process_url(url, counter_lock):
    content = download_url(url)
    if content is not None:
        # Extract all "name" fields from the downloaded content
        names = extract_names(content)
        # Keep only real WARCs, not the item's -meta.warc.gz sidecar
        warc_names = [name for name in names
                      if name.endswith("warc.gz") and not name.endswith("-meta.warc.gz")]
        if warc_names:
            # Replace "metadata" with "download" in the URL
            url = url.replace("metadata", "download")
            # Write each WARC name with the modified URL to the output file, one per line
            with open(output_file_path, 'a') as output_file:
                for name in warc_names:
                    output_file.write(f"{url}/{name}\n")
            print(f"Processed URL {url}")
        else:
            print(f"No 'warc.gz' fields found in content from URL: {url}")

    with counter_lock:
        process_url.counter += 1
        remaining_lines = total_lines - process_url.counter
        print(f"{remaining_lines} lines remaining")


# Input and output file paths
input_file_path = "/tmp/archivebot/archiveteam_archivebot_items.txt"
output_file_path = "/tmp/archivebot/all_extracted_names.txt"

# Count the total number of lines in the input file
with open(input_file_path, 'r') as input_file:
    total_lines = sum(1 for line in input_file)

# Counter attribute tracking how many lines have been processed
process_url.counter = 0

# Lock guarding the shared counter across worker threads
counter_lock = threading.Lock()

# Set the concurrency level by adjusting max_workers
max_workers = 5

# Use ThreadPoolExecutor for concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    with open(input_file_path, 'r') as input_file:
        # Submit one task per input line (each line is a metadata URL)
        futures = [executor.submit(process_url, line.strip(), counter_lock) for line in input_file]
        # Wait for all tasks to complete (re-raises any worker exception)
        for future in futures:
            future.result()

print("Processing complete.")
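For context, a minimal sketch of how this script is presumably run. The input path and the metadata-to-download URL rewrite are taken from the code above; the item identifier in the sample lines is hypothetical:

# /tmp/archivebot/archiveteam_archivebot_items.txt should contain one
# archive.org metadata URL per line, e.g. (hypothetical item identifier):
#   https://archive.org/metadata/archiveteam_archivebot_go_example
#
# Each matching file entry then yields a download link such as:
#   https://archive.org/download/archiveteam_archivebot_go_example/example.warc.gz
python3 ia_metadata_to_download_links_archivebot.py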