Migrated_Blogger_URL_Extrac.../blogger_remove_img_lines.py
2023-12-12 09:11:22 +00:00

58 lines
2.3 KiB
Python

import os
import re
from concurrent.futures import ThreadPoolExecutor
def filter_image_lines(input_file_path, output_file_path):
    """Copy a text file, dropping every line that references an image file.

    A line is dropped when it contains a dot followed by one of the
    extensions ``jpg``, ``jpeg``, ``png`` or ``gif`` (case-insensitive),
    e.g. ``photo.JPG`` or ``img.png?w=100``. Words that merely contain those
    letters without the leading dot (``/best-gif-makers.html``) are kept.

    Args:
        input_file_path: Path of the text file to read, one entry per line.
        output_file_path: Path of the file to write the kept lines to. It is
            opened in ``'w'`` mode, so it is truncated if it already exists
            and must therefore differ from ``input_file_path``.

    Returns:
        None. If ``input_file_path`` does not exist a message is printed and
        no output file is created.
    """
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # Anchor on the literal dot so only real file extensions match, not any
    # stand-alone word such as "gif" inside a post slug.
    image_extensions_pattern = re.compile(r'\.(?:jpe?g|png|gif)\b', re.IGNORECASE)

    # surrogateescape lets bytes that are not valid UTF-8 pass through
    # unchanged instead of aborting the whole file with UnicodeDecodeError.
    with open(input_file_path, 'r', encoding='utf-8', errors='surrogateescape') as input_file, \
            open(output_file_path, 'w', encoding='utf-8', errors='surrogateescape') as output_file:
        output_file.writelines(
            line for line in input_file
            if not image_extensions_pattern.search(line)
        )

    print(f"Filtered lines with image file extensions in {input_file_path}. Filtered URLs saved to {output_file_path}.")
def process_directory(directory_path, output_directory, concurrency):
    """Filter every ``.txt`` file of a directory on a thread pool.

    Each regular file directly inside ``directory_path`` whose name ends with
    ``.txt`` is handed to ``filter_image_lines``; the result for ``name.txt``
    is written to ``output_directory/filtered_name.txt``. Sub-directories are
    not descended into.

    Args:
        directory_path: Directory to scan for ``.txt`` files.
        output_directory: Directory that receives the filtered files. It is
            created (including parents) when it does not exist yet.
        concurrency: Maximum number of worker threads, passed to
            ``ThreadPoolExecutor(max_workers=...)``.

    Returns:
        None. If no matching file is found a message is printed and nothing
        else happens.

    Raises:
        Any exception raised while filtering a file is re-raised here when
        that task's result is collected.
    """
    # Keep regular files only: a sub-directory that happens to be called
    # "something.txt" cannot be opened as a text file.
    text_files = sorted(
        name for name in os.listdir(directory_path)
        if name.endswith('.txt') and os.path.isfile(os.path.join(directory_path, name))
    )
    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Without this every worker would fail with FileNotFoundError on a fresh
    # output location. An empty string means "current directory", which
    # os.makedirs rejects, so it is skipped.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(
                filter_image_lines,
                os.path.join(directory_path, text_file),
                os.path.join(output_directory, f"filtered_{text_file}"),
            )
            for text_file in text_files
        ]
        # result() blocks until the task finishes and surfaces its exception,
        # so failures are not silently lost inside the pool.
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")
def main():
    """Prompt for the directories and thread count, then filter the files.

    Reads three answers from standard input: the directory holding the
    ``.txt`` files, the directory for the filtered output, and the number of
    worker threads. Exits with an explanatory message when the thread count
    is not a positive whole number.
    """
    directory_path = input("Enter the path to the directory containing text files: ")
    output_directory = input("Enter the path to the directory for saving filtered URLs: ")
    raw_concurrency = input("Enter the level of concurrency (e.g., 2, 4, 8): ")

    try:
        concurrency = int(raw_concurrency)
    except ValueError:
        # Exit with a readable message instead of a traceback on a typo.
        raise SystemExit(
            f"Concurrency must be a whole number, got {raw_concurrency!r}."
        ) from None
    if concurrency < 1:
        # ThreadPoolExecutor rejects max_workers <= 0; fail early and clearly.
        raise SystemExit(f"Concurrency must be at least 1, got {concurrency}.")

    process_directory(directory_path, output_directory, concurrency)


# Only prompt when run as a script, so the module can be imported (for reuse
# or testing) without blocking on input().
if __name__ == "__main__":
    main()