# Migrated_Blogger_URL_Extrac.../blogger_url_cleaner.py

import os
import re
from concurrent.futures import ThreadPoolExecutor


def filter_and_extract_urls(input_file_path, output_file_path):
    # Check if the input file exists; bail out early if it does not
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # Regular expression pattern for http(s) URLs, matched case-insensitively
    url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)
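    # For example, a (hypothetical) input line such as
    #   "see https://example.blogspot.com/2020/01/post.html for details"
    # yields the single match "https://example.blogspot.com/2020/01/post.html".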
    unique_urls = set()

    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            # Find all URLs in the line
            urls = url_pattern.findall(line)
            for url in urls:
                # Keep only the hostname, i.e. the third '/'-separated field
                formatted_url = url.split('/')[2]
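                # e.g. the (hypothetical) "https://example.blogspot.com/2020/01/post.html"
                # splits into ['https:', '', 'example.blogspot.com', '2020', '01', 'post.html'],
                # so index 2 is the bare hostname "example.blogspot.com"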
                # Add the hostname to the set of unique URLs
                unique_urls.add(formatted_url)

    # Write the unique hostnames to the output file, one per line
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for unique_url in unique_urls:
            output_file.write(unique_url + '\n')

    print(f"Extracted and filtered URLs in {input_file_path}. Unique filtered URLs saved to {output_file_path}.")


def process_directory(directory_path, output_directory, concurrency):
    # Get a list of all text files in the specified directory
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Make sure the output directory exists before the workers try to write to it
    os.makedirs(output_directory, exist_ok=True)

    # Process the text files concurrently with the specified concurrency level
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Submit one task per text file
        futures = [executor.submit(
            filter_and_extract_urls,
            os.path.join(directory_path, text_file),
            os.path.join(output_directory, f"filtered_{text_file}")
        ) for text_file in text_files]
        # Wait for all tasks to complete; result() re-raises any exception
        # raised inside a worker instead of swallowing it
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")
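
# Design note: the work per file is dominated by disk I/O, which releases the
# GIL, so ThreadPoolExecutor is a reasonable fit here; ProcessPoolExecutor
# would only pay off if the per-line regex matching became CPU-bound.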


if __name__ == "__main__":
    # Gather the input directory, output directory, and concurrency level from the user
    directory_path = input("Enter the path to the directory containing text files: ")
    output_directory = input("Enter the path to the directory for saving filtered URLs: ")
    concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))

    # Process all text files in the directory concurrently
    process_directory(directory_path, output_directory, concurrency)
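
# Example session (the paths shown are hypothetical):
#   Enter the path to the directory containing text files: ./blogger_exports
#   Enter the path to the directory for saving filtered URLs: ./filtered
#   Enter the level of concurrency (e.g., 2, 4, 8): 4
# Each blogger_exports/foo.txt then produces filtered/filtered_foo.txt holding
# the unique hostnames found in foo.txt, one per line.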