Update commoncrawl_url_processor.py
parent ba11c6af9f
commit ac0f299269
@@ -60,10 +60,10 @@ with open('urls_to_download.txt', 'r') as file:
 urls = [url.strip() for url in urls]
 
 # Define the batch size
-batch_size = 84
+batch_size = 48
 
 # Define the concurrency level (number of download processes running concurrently)
-concurrency_level = 8
+concurrency_level = 4
 
 # Split the URLs into batches
 batches = [urls[i:i+batch_size] for i in range(0, len(urls), batch_size)]
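For context, a minimal sketch of how the touched lines fit together after this commit. The file name, the batching lines, and the new constants come straight from the diff; reading the URLs with `file.readlines()` is an assumption, since the diff only shows the `with open(...)` context line.

```python
# Read one URL per line (assumption: readlines(); the diff only shows the open() line).
with open('urls_to_download.txt', 'r') as file:
    urls = file.readlines()
urls = [url.strip() for url in urls]

# Define the batch size (lowered from 84 to 48 in this commit)
batch_size = 48

# Define the concurrency level, i.e. the number of download processes
# running concurrently (lowered from 8 to 4 in this commit)
concurrency_level = 4

# Split the URLs into batches; the last batch may be shorter than batch_size
batches = [urls[i:i+batch_size] for i in range(0, len(urls), batch_size)]
```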
@@ -75,7 +75,7 @@ for batch in batches:
 
     for url in batch:
-        # Create the command to download the URL using axel with 3 connections
-        command = f'axel -n 3 {url}'
+        # Create the command to download the URL using axel with 2 connections
+        command = f'axel -n 2 {url}'
 
         # Start the subprocess in the background
         pool.apply_async(subprocess.run, args=(command,), kwds={'shell': True})
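Continuing the sketch above: the hunk context implies a multiprocessing pool named `pool` and an outer loop over `batches`. One hedged variant of that download loop is shown below. Passing the command as an argument list instead of a `shell=True` string deviates from the committed code (noted in the comments); it sidesteps shell-quoting issues if a URL ever contains special characters.

```python
import subprocess
from multiprocessing import Pool

def download(url):
    # Run axel with 2 connections per file (matches the committed 'axel -n 2').
    # Using an argument list instead of shell=True is this sketch's choice,
    # not the committed code.
    subprocess.run(['axel', '-n', '2', url], check=False)

# Assumption: the script sizes the pool to concurrency_level; the diff only
# shows pool.apply_async being called inside the batch loop.
with Pool(concurrency_level) as pool:
    for batch in batches:
        results = [pool.apply_async(download, args=(url,)) for url in batch]
        # Wait for the current batch to finish before starting the next one
        # (assumption: the full script synchronizes per batch somewhere).
        for r in results:
            r.get()
```

Note that `apply_async` returns immediately; without the `get()` calls the batch boundary would have no throttling effect, and the pool would drain all batches as fast as its 4 workers allow.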