Update archiveteam_project_url_extractor.py

parent fd15770812
commit 3649ea306f
@@ -3,9 +3,9 @@ import os
 from concurrent.futures import ThreadPoolExecutor
 from threading import Lock
 
 # Function to process each file
 def process_file(file, directory_path, output_directory, patterns, counter_lock, processed_counter):
     file_path = os.path.join(directory_path, file)
 
     print(f"\nProcessing {file_path}...")
 
     # Determine the appropriate command based on file extension
@@ -21,18 +21,18 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
     # Iterate through each pattern and process the file accordingly
     for pattern, output_filename in patterns.items():
         # Run the command and append the output to the corresponding output file
         grep_command = f"grep -E '{pattern}'"
         full_command = f"{command} | {grep_command}"
 
-        result = subprocess.run(full_command, shell=True, stdout=subprocess.PIPE, text=True)
         # Generate the output file path based on the output filename
         # Open the output file in append mode
         output_file_path = os.path.join(output_directory, output_filename)
         if not os.path.exists(output_file_path):
             open(output_file_path, 'a').close()  # Create the file if it doesn't exist
-        # Append the output to the corresponding output file
-        with open(output_file_path, "a") as output_file:
-            output_file.write(result.stdout)
+        with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
+            with open(output_file_path, "a") as output_file:
+                for line in proc.stdout:
+                    output_file.write(line)
 
     # Update the processed files count outside the inner loop
     with counter_lock:
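Note on the change above: subprocess.run collects all of grep's stdout in memory before anything is written, while iterating over proc.stdout streams each matching line to disk as it arrives, which matters when a single archive yields gigabytes of matches. A minimal standalone sketch of the streaming pattern, with a placeholder command and output path (neither is from this commit):

import subprocess

full_command = "zcat example.txt.gz | grep -E 'blogspot'"  # placeholder pipeline

with subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, text=True) as proc:
    with open("filtered_example.txt", "a") as output_file:
        for line in proc.stdout:   # lines arrive as grep emits them
            output_file.write(line)

One caveat worth knowing: grep exits with status 1 when nothing matches, so proc.returncode should not be treated as an error just because it is non-zero.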
@@ -40,6 +40,18 @@ def process_file(file, directory_path, output_directory, patterns, counter_lock,
         remaining_count = len(gzipped_files) - processed_counter[0]
         print(f"{file_path} processed. Remaining files: {remaining_count}")
 
+# Function to check the size of the output files
+def check_file_size(output_directory):
+    for filename in os.listdir(output_directory):
+        output_file_path = os.path.join(output_directory, filename)
+        output_file_size = os.path.getsize(output_file_path)
+
+        # If the output file size is greater than or equal to 125GB (125 * 1024 * 1024 * 1024 bytes),
+        # create a new output file with a suffix indicating its sequence number
+        if output_file_size >= 125 * 1024 * 1024 * 1024:
+            new_filename = f"{filename[:-4]}_{processed_counter[0]}.txt"
+            os.rename(output_file_path, os.path.join(output_directory, new_filename))
+
 # Ask the user for the directory containing .txt, .txt.gz, and .zst files
 directory_path = input("Enter the directory path containing .txt, .txt.gz, and .zst files: ")
 
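As committed, check_file_size takes its rename suffix from the final value of the shared, module-level processed_counter, so a rotation in a later run that ends on the same count would silently overwrite the earlier rotated file (os.rename replaces an existing destination on POSIX). A self-contained variant that picks the next free sequence number per file might look like the sketch below; the function name, threshold parameter, and suffix scheme are illustrative assumptions, not part of the commit:

import os

def rotate_large_outputs(output_directory, max_bytes=125 * 1024**3):
    # Hypothetical helper: rotates any .txt output at or above max_bytes.
    for filename in os.listdir(output_directory):
        path = os.path.join(output_directory, filename)
        if not filename.endswith(".txt") or os.path.getsize(path) < max_bytes:
            continue
        stem = filename[:-4]  # drop the ".txt" extension
        seq = 1
        while os.path.exists(os.path.join(output_directory, f"{stem}_{seq}.txt")):
            seq += 1  # skip suffixes already taken by earlier rotations
        os.rename(path, os.path.join(output_directory, f"{stem}_{seq}.txt"))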
@@ -59,10 +71,6 @@ if not gzipped_files:
 # Ask the user for the output directory
 output_directory = input("Enter the output directory path: ")
 
-# Ensure the output directory exists; if not, create it
-if not os.path.exists(output_directory):
-    os.makedirs(output_directory)
-
 # Define the URL patterns and their corresponding output filenames
 url_patterns = {
     r'(\S+\.blogspot|\S*blogger)\.\S+': 'filtered_blogspot_blogger.txt',
@@ -87,4 +95,7 @@ with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
     for future in futures:
         future.result()
 
-print(f"\nAll files processed. URLs appended to corresponding output files.")
+# Check file size for any remaining files
+check_file_size(output_directory)
+
+print(f"\nAll files processed. URLs appended to corresponding output files.")
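For context, the driver section truncated from this last hunk submits one process_file call per input file and then blocks on each future. A minimal sketch of that wiring, using the names visible in the diff (the submit call itself falls outside the hunks shown, so its exact argument order, and the value of num_concurrent_instances, are assumptions):

from concurrent.futures import ThreadPoolExecutor
from threading import Lock

counter_lock = Lock()
processed_counter = [0]        # one-element list so worker threads can mutate the count in place
num_concurrent_instances = 4   # assumed; the real script sets this elsewhere

with ThreadPoolExecutor(max_workers=num_concurrent_instances) as executor:
    futures = [
        executor.submit(process_file, f, directory_path, output_directory,
                        url_patterns, counter_lock, processed_counter)
        for f in gzipped_files
    ]
    for future in futures:
        future.result()        # re-raises any exception from the worker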