Update commoncrawl_url_processor.py

datechnoman 2023-12-12 10:05:46 +00:00
parent ba11c6af9f
commit ac0f299269


@@ -60,10 +60,10 @@ with open('urls_to_download.txt', 'r') as file:
     urls = [url.strip() for url in urls]
 # Define the batch size
-batch_size = 84
+batch_size = 48
 # Define the concurrency level (number of download processes running concurrently)
-concurrency_level = 8
+concurrency_level = 4
 # Split the URLs into batches
 batches = [urls[i:i+batch_size] for i in range(0, len(urls), batch_size)]
@@ -75,7 +75,7 @@ for batch in batches:
     for url in batch:
-        # Create the command to download the URL using axel with 3 connections
-        command = f'axel -n 3 {url}'
+        # Create the command to download the URL using axel with 2 connections
+        command = f'axel -n 2 {url}'
         # Start the subprocess in the background
         pool.apply_async(subprocess.run, args=(command,), kwds={'shell': True})
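
For context, here is a minimal sketch of the download loop these two hunks tune, reconstructed from the visible diff lines. The Pool construction and the closing close()/join() calls fall outside the hunks, so those parts are assumptions rather than the script's confirmed code.

import subprocess
from multiprocessing import Pool

# Read the URL list, one URL per line, as in the script's input file.
with open('urls_to_download.txt', 'r') as file:
    urls = [url.strip() for url in file.readlines()]

# Values as tuned in this commit: smaller batches, fewer workers.
batch_size = 48
concurrency_level = 4

# Split the URLs into fixed-size batches.
batches = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

# Assumed: the pool setup is not visible in the diff. On spawn-based
# platforms (Windows/macOS) this would need an `if __name__ == '__main__':`
# guard; the original script presumably runs on Linux.
pool = Pool(processes=concurrency_level)

for batch in batches:
    for url in batch:
        # Download each URL with axel using 2 connections per file.
        command = f'axel -n 2 {url}'
        # Queue the download; the pool caps concurrent axel processes at 4.
        pool.apply_async(subprocess.run, args=(command,), kwds={'shell': True})

# Assumed cleanup: wait for all queued downloads to finish.
pool.close()
pool.join()

The net effect of the commit is to lower all three throughput knobs at once: batch size (84 to 48), concurrent worker processes (8 to 4), and axel connections per download (3 to 2), reducing both the parallel process count and the per-file connection load on the CommonCrawl servers.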