commit 24547dd98ddd97f9288b821a3ce8a01ac05f0a33
Author: datechnoman
Date:   Tue Dec 12 09:11:22 2023 +0000

    Upload files to "/"

diff --git a/blogger_remove_img_lines.py b/blogger_remove_img_lines.py
new file mode 100644
index 0000000..84e84e3
--- /dev/null
+++ b/blogger_remove_img_lines.py
@@ -0,0 +1,57 @@
+import os
+import re
+from concurrent.futures import ThreadPoolExecutor
+
+def filter_image_lines(input_file_path, output_file_path):
+    # Check if the input file exists
+    if not os.path.exists(input_file_path):
+        print(f"The file {input_file_path} does not exist.")
+        return
+
+    # Regular expression pattern for image file extensions; requiring the
+    # leading dot avoids false positives on words such as "jpg" in plain text
+    image_extensions_pattern = re.compile(r'\.(?:jpe?g|png|gif)\b', re.IGNORECASE)
+
+    with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
+        for line in input_file:
+            # Write the line to the output file only if it does not
+            # contain an image file extension
+            if not image_extensions_pattern.search(line):
+                output_file.write(line)
+
+    print(f"Filtered lines with image file extensions in {input_file_path}. Filtered URLs saved to {output_file_path}.")
+
+def process_directory(directory_path, output_directory, concurrency):
+    # Get a list of all text files in the specified directory
+    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
+
+    if not text_files:
+        print(f"No text files found in the directory {directory_path}.")
+        return
+
+    # Create the output directory if it does not already exist
+    os.makedirs(output_directory, exist_ok=True)
+
+    # Process text files concurrently with the specified concurrency level
+    with ThreadPoolExecutor(max_workers=concurrency) as executor:
+        # Submit each text file for processing
+        futures = [executor.submit(
+            filter_image_lines,
+            os.path.join(directory_path, text_file),
+            os.path.join(output_directory, f"filtered_{text_file}")
+        ) for text_file in text_files]
+
+        # Wait for all tasks to complete
+        for future in futures:
+            future.result()
+
+    print("Concurrent processing of text files completed.")
+
+if __name__ == "__main__":
+    # Get user input for the input directory, output directory, and concurrency level
+    directory_path = input("Enter the path to the directory containing text files: ")
+    output_directory = input("Enter the path to the directory for saving filtered URLs: ")
+    concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))
+
+    # Process all text files in the directory concurrently
+    process_directory(directory_path, output_directory, concurrency)

diff --git a/blogger_url_cleaner.py b/blogger_url_cleaner.py
new file mode 100644
index 0000000..a82aeaf
--- /dev/null
+++ b/blogger_url_cleaner.py
@@ -0,0 +1,66 @@
+import os
+import re
+from concurrent.futures import ThreadPoolExecutor
+
+def filter_and_extract_urls(input_file_path, output_file_path):
+    # Check if the input file exists
+    if not os.path.exists(input_file_path):
+        print(f"The file {input_file_path} does not exist.")
+        return
+
+    # Regular expression pattern for URLs
+    url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)
+
+    unique_urls = set()
+
+    with open(input_file_path, 'r') as input_file:
+        for line in input_file:
+            # Find all URLs in the line
+            urls = url_pattern.findall(line)
+            for url in urls:
+                # Keep only the host name (the third '/'-separated field)
+                formatted_url = url.split('/')[2]
+                # Add the host name to the set of unique URLs
+                unique_urls.add(formatted_url)
+
+    # Write the unique host names to the output file
+    with open(output_file_path, 'w') as output_file:
+        for unique_url in unique_urls:
+            output_file.write(unique_url + '\n')
+
+    print(f"Extracted and filtered URLs in {input_file_path}. Unique filtered URLs saved to {output_file_path}.")
+
+def process_directory(directory_path, output_directory, concurrency):
+    # Get a list of all text files in the specified directory
+    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
+
+    if not text_files:
+        print(f"No text files found in the directory {directory_path}.")
+        return
+
+    # Create the output directory if it does not already exist
+    os.makedirs(output_directory, exist_ok=True)
+
+    # Process text files concurrently with the specified concurrency level
+    with ThreadPoolExecutor(max_workers=concurrency) as executor:
+        # Submit each text file for processing
+        futures = [executor.submit(
+            filter_and_extract_urls,
+            os.path.join(directory_path, text_file),
+            os.path.join(output_directory, f"filtered_{text_file}")
+        ) for text_file in text_files]
+
+        # Wait for all tasks to complete
+        for future in futures:
+            future.result()
+
+    print("Concurrent processing of text files completed.")
+
+if __name__ == "__main__":
+    # Get user input for the input directory, output directory, and concurrency level
+    directory_path = input("Enter the path to the directory containing text files: ")
+    output_directory = input("Enter the path to the directory for saving filtered URLs: ")
+    concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))
+
+    # Process all text files in the directory concurrently
+    process_directory(directory_path, output_directory, concurrency)
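
Note: both scripts read their inputs interactively and write one "filtered_" output file per input file. A minimal example session for blogger_url_cleaner.py is sketched below; the directory and file paths are illustrative, not part of the repository, and the messages are the scripts' own print statements:

    $ python3 blogger_url_cleaner.py
    Enter the path to the directory containing text files: /data/blogger/urls
    Enter the path to the directory for saving filtered URLs: /data/blogger/hosts
    Enter the level of concurrency (e.g., 2, 4, 8): 4
    Extracted and filtered URLs in /data/blogger/urls/part1.txt. Unique filtered URLs saved to /data/blogger/hosts/filtered_part1.txt.
    Concurrent processing of text files completed.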