diff --git a/commoncrawl_url_processor.py b/commoncrawl_url_processor.py
index 05310e3..d9ee210 100644
--- a/commoncrawl_url_processor.py
+++ b/commoncrawl_url_processor.py
@@ -60,10 +60,10 @@
 with open('urls_to_download.txt', 'r') as file:
     urls = [url.strip() for url in urls]
 
 # Define the batch size
-batch_size = 84
+batch_size = 48
 
 # Define the concurrency level (number of download processes running concurrently)
-concurrency_level = 8
+concurrency_level = 4
 # Split the URLs into batches
 batches = [urls[i:i+batch_size] for i in range(0, len(urls), batch_size)]
@@ -75,7 +75,7 @@
 for batch in batches:
     for url in batch:
-        # Create the command to download the URL using axel with 3 connections
-        command = f'axel -n 3 {url}'
+        # Create the command to download the URL using axel with 2 connections
+        command = f'axel -n 2 {url}'
 
         # Start the subprocess in the background
         pool.apply_async(subprocess.run, args=(command,), kwds={'shell': True})
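For context, a minimal runnable sketch of how these settings fit together in the surrounding script, using the new values. The Pool construction, the __main__ guard, the per-batch wait, and the corrected file-reading line do not appear in the hunks above; they are assumptions filled in for illustration only.

import subprocess
from multiprocessing import Pool

# Reduced settings introduced by this change.
batch_size = 48          # URLs per batch
concurrency_level = 4    # download processes running at once

if __name__ == '__main__':
    # Read the URL list (one URL per line); assumption: iterate the file
    # handle directly and strip whitespace from each entry.
    with open('urls_to_download.txt', 'r') as file:
        urls = [url.strip() for url in file]

    # Split the URLs into batches, as in the script above.
    batches = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

    # Assumption: the script creates the Pool before looping over batches
    # and waits for each batch to finish before starting the next one.
    with Pool(processes=concurrency_level) as pool:
        for batch in batches:
            results = []
            for url in batch:
                # Download each URL with axel, now limited to 2 connections.
                command = f'axel -n 2 {url}'
                results.append(
                    pool.apply_async(subprocess.run,
                                     args=(command,),
                                     kwds={'shell': True})
                )
            # Block until the current batch has finished downloading.
            for result in results:
                result.wait()

Assuming concurrency_level caps the number of worker processes, the change reduces the peak number of outbound connections from 24 (8 processes x 3 axel connections) to 8 (4 x 2).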