Upload files to "/"
This commit is contained in:
commit 24547dd98d
blogger_remove_img_lines.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor

def filter_image_lines(input_file_path, output_file_path):
    # Check if the input file exists
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # Regular expression pattern for image file extensions
    image_extensions_pattern = re.compile(r'\b(?:jpg|png|gif|jpeg)\b', re.IGNORECASE)

    with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
        for line in input_file:
            # Check if the line contains any of the specified image file extensions using regex
            if not image_extensions_pattern.search(line):
                # Write the line to the output file if it doesn't contain an image file extension
                output_file.write(line)

    print(f"Filtered lines with image file extensions in {input_file_path}. Filtered URLs saved to {output_file_path}.")

def process_directory(directory_path, output_directory, concurrency):
    # Get a list of all text files in the specified directory
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]

    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Process text files concurrently with the specified concurrency level
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Submit each text file for processing
        futures = [executor.submit(
            filter_image_lines,
            os.path.join(directory_path, text_file),
            os.path.join(output_directory, f"filtered_{text_file}")
        ) for text_file in text_files]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")

# Get user input for the directory
directory_path = input("Enter the path to the directory containing text files: ")

# Get user input for the output directory
output_directory = input("Enter the path to the directory for saving filtered URLs: ")

# Get user input for the concurrency level
concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))

# Call the function to process all text files in the directory concurrently
process_directory(directory_path, output_directory, concurrency)
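A note on the filter above: since image_extensions_pattern is searched anywhere in the line, a line is dropped whenever an image extension appears at any position, including inside a query string, not only at the end of a URL. Below is a minimal, self-contained sketch of that behavior; the sample lines are invented for illustration:

import re

image_extensions_pattern = re.compile(r'\b(?:jpg|png|gif|jpeg)\b', re.IGNORECASE)

samples = [
    "https://example.com/photo.JPG",       # dropped: extension at the end of the URL
    "https://example.com/page?img=a.png",  # dropped: extension inside a query string
    "https://example.com/article",         # kept: no image extension anywhere
]

for line in samples:
    verdict = "keep" if not image_extensions_pattern.search(line) else "drop"
    print(f"{verdict}: {line}")

One related caveat for both scripts in this commit: the output files are opened with open(..., 'w') without creating output_directory first, so that directory must already exist when the script runs (os.makedirs(output_directory, exist_ok=True) would be the usual guard).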
blogger_url_cleaner.py (new file, 67 lines)
@@ -0,0 +1,67 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor

def filter_and_extract_urls(input_file_path, output_file_path):
    # Check if the input file exists
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # Regular expression pattern for URLs
    url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)

    unique_urls = set()

    with open(input_file_path, 'r') as input_file:
        for line in input_file:
            # Find all URLs in the line
            urls = url_pattern.findall(line)
            for url in urls:
                # Extract the host portion of the URL (the part between '//' and the next '/')
                formatted_url = url.split('/')[2]
                # Add the formatted URL to the set of unique URLs
                unique_urls.add(formatted_url)

    # Write unique URLs to the output file
    with open(output_file_path, 'w') as output_file:
        for unique_url in unique_urls:
            output_file.write(unique_url + '\n')

    print(f"Extracted and filtered URLs in {input_file_path}. Unique filtered URLs saved to {output_file_path}.")

def process_directory(directory_path, output_directory, concurrency):
    # Get a list of all text files in the specified directory
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]

    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Process text files concurrently with the specified concurrency level
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Submit each text file for processing
        futures = [executor.submit(
            filter_and_extract_urls,
            os.path.join(directory_path, text_file),
            os.path.join(output_directory, f"filtered_{text_file}")
        ) for text_file in text_files]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")

# Get user input for the directory
directory_path = input("Enter the path to the directory containing text files: ")

# Get user input for the output directory
output_directory = input("Enter the path to the directory for saving filtered URLs: ")

# Get user input for the concurrency level
concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))

# Call the function to process all text files in the directory concurrently
process_directory(directory_path, output_directory, concurrency)
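For context on the extraction step: url.split('/')[2] takes the authority component of the URL (everything between "//" and the next "/"), so the result keeps any port number and preserves the original letter case. The sketch below, with invented sample URLs, contrasts it with urllib.parse.urlparse from the standard library, the usual alternative when normalization matters:

from urllib.parse import urlparse

samples = [
    "https://Example.blogspot.com/2023/01/post.html",
    "http://example.com:8080/feed",
]

for url in samples:
    naive = url.split('/')[2]          # authority as written: 'Example.blogspot.com', 'example.com:8080'
    hostname = urlparse(url).hostname  # normalized: 'example.blogspot.com', 'example.com'
    print(f"{naive} -> {hostname}")

Since the script deduplicates with a set, hosts that differ only in case or port would be written as separate entries; whether that matters depends on the data being cleaned.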