Update commoncrawl_url_processor.py

datechnoman 2023-12-12 10:05:46 +00:00
parent ba11c6af9f
commit ac0f299269


@@ -60,10 +60,10 @@ with open('urls_to_download.txt', 'r') as file:
     urls = [url.strip() for url in urls]
 # Define the batch size
-batch_size = 84
+batch_size = 48
 # Define the concurrency level (number of download processes running concurrently)
-concurrency_level = 8
+concurrency_level = 4
 # Split the URLs into batches
 batches = [urls[i:i+batch_size] for i in range(0, len(urls), batch_size)]
@@ -75,7 +75,7 @@ for batch in batches:
     for url in batch:
-        # Create the command to download the URL using axel with 3 connections
-        command = f'axel -n 3 {url}'
+        # Create the command to download the URL using axel with 2 connections
+        command = f'axel -n 2 {url}'
         # Start the subprocess in the background
         pool.apply_async(subprocess.run, args=(command,), kwds={'shell': True})
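
For context, here is a minimal sketch of the download loop these two hunks tune, reconstructed from the visible diff lines. The Pool construction and the closing close()/join() calls fall outside the hunks, so those parts are assumptions rather than the script's confirmed code.

import subprocess
from multiprocessing import Pool

# Read the URL list, one URL per line, as in the script's input file.
with open('urls_to_download.txt', 'r') as file:
    urls = [url.strip() for url in file.readlines()]

# Values as tuned in this commit: smaller batches, fewer workers.
batch_size = 48
concurrency_level = 4

# Split the URLs into fixed-size batches.
batches = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

# Assumed: the pool setup is not visible in the diff. On spawn-based
# platforms (Windows/macOS) this would need an `if __name__ == '__main__':`
# guard; the original script presumably runs on Linux.
pool = Pool(processes=concurrency_level)

for batch in batches:
    for url in batch:
        # Download each URL with axel using 2 connections per file.
        command = f'axel -n 2 {url}'
        # Queue the download; the pool caps concurrent axel processes at 4.
        pool.apply_async(subprocess.run, args=(command,), kwds={'shell': True})

# Assumed cleanup: wait for all queued downloads to finish.
pool.close()
pool.join()

The net effect of the commit is to lower all three throughput knobs at once: batch size (84 to 48), concurrent worker processes (8 to 4), and axel connections per download (3 to 2), reducing both the parallel process count and the per-file connection load on the CommonCrawl servers.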