diff --git a/warc_wat_url_processor.py b/warc_wat_url_processor.py
index 90d506a..61f4619 100644
--- a/warc_wat_url_processor.py
+++ b/warc_wat_url_processor.py
@@ -4,6 +4,62 @@ import re
 import traceback
 from concurrent.futures import ProcessPoolExecutor, as_completed, wait
 import subprocess
+import time
+
+
+def _available_space_gb(path):
+    """Return the free space, in GiB, on the filesystem containing *path*.
+
+    Uses POSIX-format ``df -Pk``: every filesystem stays on one output line
+    and sizes are plain 1024-byte block counts, so nothing is lost to
+    line wrapping or to whole-gigabyte rounding.
+
+    Raises:
+        OSError: if ``df`` cannot be executed.
+        RuntimeError: if ``df`` fails or its output cannot be parsed.
+    """
+    result = subprocess.run(
+        ['df', '-Pk', path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    if result.returncode != 0:
+        detail = result.stderr.strip() or f"exit status {result.returncode}"
+        raise RuntimeError(f"df failed: {detail}")
+    try:
+        # The last line is the data row; its 4th column is "Available" (KiB).
+        available_kib = int(result.stdout.strip().splitlines()[-1].split()[3])
+    except (IndexError, ValueError) as e:
+        raise RuntimeError(f"unexpected df output: {result.stdout!r}") from e
+    return available_kib / (1024 * 1024)
+
+
+def check_disk_space(path, min_space_gb=20, check_interval=300):
+    """Block until the filesystem containing *path* has enough free space.
+
+    Args:
+        path: Path (or mounted device node) handed to ``df``.
+        min_space_gb: Free space, in GiB, required before returning.
+        check_interval: Seconds to sleep between checks.
+
+    A failed check is reported and retried after ``check_interval`` rather
+    than raised, so a transient ``df`` error does not abort the caller.
+    """
+    while True:
+        try:
+            available_space_gb = _available_space_gb(path)
+        except (OSError, RuntimeError) as e:
+            print(f"An error occurred while checking disk space: {e}")
+        else:
+            if available_space_gb >= min_space_gb:
+                return
+            print(
+                f"Waiting for at least {min_space_gb}GB free space on {path}. "
+                f"Current available space: {available_space_gb:.1f}GB"
+            )
+        time.sleep(check_interval)
+
 
 def extract_urls_from_file(file_path):
     urls = []
@@ -78,7 +134,7 @@ def process_file(file_path):
 
 def download_and_process_file(url):
     try:
-        command = f'axel -n 4 {url}'
+        command = f'axel -n 3 {url}'
         result = subprocess.run(command, shell=True, check=True)
         if result.returncode == 0:
             file_path = os.path.join(os.getcwd(), os.path.basename(url))
@@ -90,6 +146,8 @@ def download_and_process_file(url):
         print(f"Error during download and processing {url}: {e}")
 
 def main():
+    check_disk_space('/dev/sda1')
+
     with open('urls_to_download.txt', 'r') as file:
         urls = file.readlines()
 
@@ -97,14 +155,11 @@ def main():
 
     download_concurrency_level = 40
 
-    # Start downloading and processing files in parallel
     with ProcessPoolExecutor(max_workers=download_concurrency_level) as executor:
         futures = [executor.submit(download_and_process_file, url) for url in urls]
 
-        # Wait for all downloads and processing to complete before starting the next iteration
         completed_futures, _ = wait(futures)
 
-        # Process results from completed futures
         for completed_future in completed_futures:
             try:
                 result = completed_future.result()
@@ -113,4 +168,4 @@ def main():
                 print(f"Error in processing future: {e}")
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file