Update warc_wat_url_processor.py

This commit is contained in:
datechnoman 2024-01-23 10:45:32 +00:00
parent 6edffba451
commit 295c3daba4

View File

@ -43,10 +43,11 @@ def process_file(file_path):
command_compress = f'zstd -T0 -12 --long {output_file_path} -o {compressed_file_path}'
# Run the compression command synchronously and wait for it to complete
compression_process = subprocess.Popen(command_compress, shell=True)
compression_process.communicate()
print(f"Compressed file saved as '{compressed_file_path}'")
result = subprocess.run(command_compress, shell=True, check=True)
if result.returncode == 0:
print(f"Compressed file saved as '{compressed_file_path}'")
else:
print(f"Compression failed for '{output_file_path}'")
# Remove the original gzipped file
os.remove(file_path)
@ -59,7 +60,8 @@ def process_file(file_path):
# Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
filename = os.path.basename(output_file_path).replace('_urls.txt', '')
command = f'sed -i "/{filename}/d" "urls_to_download.txt"'
if subprocess.run(command, shell=True).returncode == 0:
result = subprocess.run(command, shell=True)
if result.returncode == 0:
print(f"File {filename} has been successfully removed from urls_to_download.txt")
with open('urls_to_download.txt', 'r') as file:
remaining_count = sum(1 for line in file)
@ -73,10 +75,14 @@ def process_file(file_path):
def download_and_process_file(url):
    """Download one URL with axel and pass the resulting file to process_file.

    The download runs exactly once. The downloaded file is looked up in the
    current working directory under the URL's basename, and process_file is
    called only after axel has exited with status 0.

    Args:
        url: Address of the file to fetch.

    Returns:
        None. Failures are reported on stdout instead of being raised, so a
        single bad URL does not propagate an exception to the caller.
    """
    # Pass an argument list (no shell) so characters such as '&', ';' or
    # spaces inside the URL are never interpreted by a shell.
    command = ['axel', '-n', '3', url]
    try:
        try:
            # check=True turns a non-zero exit status into CalledProcessError,
            # so a separate returncode test afterwards would be dead code.
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError:
            print(f"Download failed for {url}")
            return

        file_path = os.path.join(os.getcwd(), os.path.basename(url))
        process_file(file_path)
    except Exception as e:
        # Best-effort boundary: covers a missing axel binary (OSError) as well
        # as anything raised while processing the downloaded file.
        print(f"Error during download and processing {url}: {e}")
@ -93,7 +99,7 @@ def main():
futures = [executor.submit(download_and_process_file, url) for url in urls]
# Wait for all downloads and processing to complete before starting the next iteration
wait(futures)
as_completed(futures)
# Run the pipeline once, and only when the file is executed as a script
# (not when it is imported as a module).
if __name__ == "__main__":
    main()