Updated to support zst compressed files
This commit is contained in:
parent
95abf80bd1
commit
9e6e5190d3
@ -8,13 +8,23 @@ def process_file(file, directory_path, output_file_path, keyword, counter_lock,
|
|||||||
|
|
||||||
print(f"\nProcessing {file_path}...")
|
print(f"\nProcessing {file_path}...")
|
||||||
|
|
||||||
# Run the command and append the output to the same output file
|
# Determine the appropriate command based on file extension
|
||||||
command = f"zcat {file_path} | grep '{keyword}'"
|
if file.endswith(".gz"):
|
||||||
|
command = f"zcat {file_path}"
|
||||||
|
elif file.endswith(".zst"):
|
||||||
|
command = f"zstdcat {file_path}"
|
||||||
|
elif file.endswith(".txt"):
|
||||||
|
command = f"cat {file_path}"
|
||||||
|
else:
|
||||||
|
print(f"Skipping {file_path}. Unsupported file extension.")
|
||||||
|
return
|
||||||
|
|
||||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, text=True)
|
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, text=True)
|
||||||
|
|
||||||
# Stream the output line by line and append to the output file
|
# Stream the output line by line and append only the lines containing the keyword to the output file
|
||||||
with open(output_file_path, "a") as output_file:
|
with open(output_file_path, "a") as output_file:
|
||||||
for line in process.stdout:
|
for line in process.stdout:
|
||||||
|
if keyword in line:
|
||||||
output_file.write(line)
|
output_file.write(line)
|
||||||
|
|
||||||
process.wait() # Wait for the process to finish
|
process.wait() # Wait for the process to finish
|
||||||
@ -22,23 +32,23 @@ def process_file(file, directory_path, output_file_path, keyword, counter_lock,
|
|||||||
# Update the processed files count
|
# Update the processed files count
|
||||||
with counter_lock:
|
with counter_lock:
|
||||||
processed_counter[0] += 1
|
processed_counter[0] += 1
|
||||||
remaining_count = len(gzipped_files) - processed_counter[0]
|
remaining_count = len(files) - processed_counter[0]
|
||||||
print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
|
print(f"{file_path} processed. URLs appended to {output_file_path}. Remaining files: {remaining_count}")
|
||||||
|
|
||||||
# Ask the user for the directory containing .txt.gz files
|
# Ask the user for the directory containing the input files (.gz, .zst, or .txt)
|
||||||
directory_path = input("Enter the directory path containing .txt.gz files: ")
|
directory_path = input("Enter the directory path containing files: ")
|
||||||
|
|
||||||
# Ensure the directory exists
|
# Ensure the directory exists
|
||||||
if not os.path.exists(directory_path):
|
if not os.path.exists(directory_path):
|
||||||
print(f"Error: The directory '{directory_path}' does not exist.")
|
print(f"Error: The directory '{directory_path}' does not exist.")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# List all files in the directory that end with .txt.gz
|
# List all files in the directory
|
||||||
gzipped_files = [file for file in os.listdir(directory_path) if file.endswith(".txt.gz")]
|
files = os.listdir(directory_path)
|
||||||
|
|
||||||
# Check if there are any .txt.gz files in the directory
|
# Check if there are any files in the directory
|
||||||
if not gzipped_files:
|
if not files:
|
||||||
print("Error: No .txt.gz files found in the specified directory.")
|
print("Error: No files found in the specified directory.")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# Ask the user for the output directory
|
# Ask the user for the output directory
|
||||||
@ -65,10 +75,10 @@ processed_counter = [0] # Using a list to store an integer (mutable) to pass by
|
|||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
|
with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
|
||||||
# Submit each file for processing
|
# Submit each file for processing
|
||||||
futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in gzipped_files]
|
futures = [executor.submit(process_file, file, directory_path, output_file_path, keyword, counter_lock, processed_counter) for file in files]
|
||||||
|
|
||||||
# Wait for all tasks to complete
|
# Wait for all tasks to complete
|
||||||
for future in futures:
|
for future in futures:
|
||||||
future.result()
|
future.result()
|
||||||
|
|
||||||
print(f"\nAll files processed. URLs appended to {output_file_path}")
|
print(f"\nAll files processed. Matching lines appended to {output_file_path}")
|
Loading…
Reference in New Issue
Block a user