Update warc_wat_url_processor.py
This commit is contained in:
parent
3ce09f46d7
commit
06e3399861
@ -38,14 +38,30 @@ def process_file(file_path):
|
||||
print(f"URLs written to {output_file_path}")
|
||||
|
||||
# Use zstd command-line tool for compression
|
||||
command = f'zstd -T0 -18 --long {output_file_path} -o {output_file_path}.zst'
|
||||
subprocess.run(command, shell=True)
|
||||
print(f"Compressed file saved as '{output_file_path}.zst'")
|
||||
compressed_file_path = f'{output_file_path}.zst'
|
||||
command_compress = f'zstd -T0 -14 --long {output_file_path} -o {compressed_file_path}'
|
||||
subprocess.run(command_compress, shell=True)
|
||||
print(f"Compressed file saved as '{compressed_file_path}'")
|
||||
|
||||
# Remove the original gzipped file
|
||||
os.remove(file_path)
|
||||
print(f"Original file removed: {file_path}")
|
||||
|
||||
# Remove the original _urls.txt file
|
||||
os.remove(output_file_path)
|
||||
print(f"Original file removed: {output_file_path}")
|
||||
|
||||
# Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
|
||||
filename = os.path.basename(output_file_path).replace('_urls.txt', '')
|
||||
command = f'sed -i "/{filename}/d" "urls_to_download.txt"'
|
||||
if subprocess.run(command, shell=True).returncode == 0:
|
||||
print(f"File {filename} has been successfully removed from urls_to_download.txt")
|
||||
with open('urls_to_download.txt', 'r') as file:
|
||||
remaining_count = sum(1 for line in file)
|
||||
print(f"URLs remaining to be processed: {remaining_count}")
|
||||
else:
|
||||
print(f"Failed to remove {filename} from urls_to_download.txt")
|
||||
|
||||
def extract_urls_from_directory(directory_path):
|
||||
file_list = sorted(os.listdir(directory_path))
|
||||
pool = Pool(processes=7)
|
||||
|
Loading…
Reference in New Issue
Block a user