Upload files to "/"

This commit is contained in:
datechnoman 2024-01-28 02:02:12 +00:00
parent 13c3a8ff30
commit a87ce2476a

View File

@@ -0,0 +1,73 @@
import os
import requests
import json
from concurrent.futures import ThreadPoolExecutor
import threading
def download_url(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block its worker
            thread forever.

    Returns:
        The response text when the server answers with status 200.
        None when the request fails at the network level or the server
        answers with any other status; a diagnostic is printed in both cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, malformed URLs, etc. are reported the
        # same way as a bad status so callers only have to check for None.
        print(f"Failed to download URL: {url} ({exc})")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to download URL: {url}")
    return None
def extract_names(json_content):
    """Return the "name" value of every entry in the document's "files" list.

    Args:
        json_content: A JSON document as a string. The expected shape is an
            object with a "files" key holding a list of objects.

    Returns:
        A list of the string "name" values, in document order. Entries that
        are not objects, or whose "name" is missing or not a string, are
        skipped so callers can safely do substring checks on every element.
        An empty list is returned when the text is not valid JSON (a
        diagnostic is printed) or when the document does not have the
        expected shape.
    """
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON content: {e}")
        return []

    # Valid JSON is not necessarily an object (it may be a list, number, ...).
    if not isinstance(data, dict):
        return []

    # "files" may be absent or explicitly null; treat both as "no files".
    files = data.get("files")
    if not isinstance(files, list):
        return []

    return [
        entry["name"]
        for entry in files
        if isinstance(entry, dict) and isinstance(entry.get("name"), str)
    ]
def process_url(url, counter_lock):
    """Download one metadata URL and append download links for its WARC files.

    The content fetched from *url* is parsed for file names. For every name
    containing "warc.gz", a line "<download-url>/<name>" is appended to the
    module-level ``output_file_path``, where <download-url> is *url* with its
    first "metadata" replaced by "download".

    Args:
        url: The metadata URL to process. An empty string is skipped.
        counter_lock: Lock shared by all workers. It guards the progress
            counter and is also held while appending to the output file so
            that lines written by different threads cannot interleave.

    Side effects:
        Appends to ``output_file_path``, increments ``process_url.counter``
        and prints progress based on the module-level ``total_lines``. The
        counter is advanced even when processing raises, so the "remaining"
        figure stays accurate.
    """
    try:
        if not url:
            # Blank input line: nothing to fetch.
            return

        content = download_url(url)
        if content is None:
            return

        # Filter once; ignore non-string names so the substring test is safe.
        warc_names = [
            name
            for name in extract_names(content)
            if isinstance(name, str) and "warc.gz" in name
        ]
        if not warc_names:
            print(f"No 'warc.gz' fields found in content from URL: {url}")
            return

        # Replace only the first "metadata" so an identifier that happens to
        # contain the word is left intact.
        download_base = url.replace("metadata", "download", 1)
        lines = "".join(f"{download_base}/{name}\n" for name in warc_names)

        # Build the text first, then write it in one go under the lock.
        with counter_lock:
            with open(output_file_path, 'a') as output_file:
                output_file.write(lines)
        print(f"Processed URL {download_base}")
    finally:
        with counter_lock:
            process_url.counter += 1
            remaining_lines = total_lines - process_url.counter
        print(f"{remaining_lines} lines remaining")
input_file_path = "/tmp/twittertesting/twitteroutlink_items.txt"
output_file_path = "/tmp/twittertesting/all_extracted_names.txt"

# Read the input once, dropping blank lines so they are neither counted nor
# submitted as URLs.
with open(input_file_path, 'r') as input_file:
    urls = [line.strip() for line in input_file]
urls = [url for url in urls if url]

# Total number of URLs to process; read by process_url for progress output.
total_lines = len(urls)

# Set a counter attribute to track the processed lines
process_url.counter = 0

# Create a lock for the counter
counter_lock = threading.Lock()

# Set the concurrency level by adjusting max_workers
max_workers = 5

# Use ThreadPoolExecutor for concurrent processing
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Keep each URL next to its future so failures can be reported by URL.
    futures = [
        (url, executor.submit(process_url, url, counter_lock))
        for url in urls
    ]

    # Wait for all tasks to complete. One failing URL is reported and does
    # not abort the rest of the run.
    for url, future in futures:
        try:
            future.result()
        except Exception as exc:
            print(f"Error processing URL {url}: {exc}")

print("Processing complete.")