Upload files to "/"
commit 24547dd98d
blogger_remove_img_lines.py (normal file, 57 lines added)
@@ -0,0 +1,57 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor


def filter_image_lines(input_file_path, output_file_path):
    # Check if the input file exists
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # Regular expression pattern for image file extensions
    image_extensions_pattern = re.compile(r'\b(?:jpg|png|gif|jpeg)\b', re.IGNORECASE)

    with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
        for line in input_file:
            # Keep the line only if it does not contain an image file extension
            if not image_extensions_pattern.search(line):
                output_file.write(line)

    print(f"Filtered lines with image file extensions in {input_file_path}. Filtered URLs saved to {output_file_path}.")


def process_directory(directory_path, output_directory, concurrency):
    # Get a list of all text files in the specified directory
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]

    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Create the output directory if it does not already exist,
    # so that opening the output files cannot fail
    os.makedirs(output_directory, exist_ok=True)

    # Process text files concurrently with the specified concurrency level
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Submit each text file for processing
        futures = [executor.submit(
            filter_image_lines,
            os.path.join(directory_path, text_file),
            os.path.join(output_directory, f"filtered_{text_file}")
        ) for text_file in text_files]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")


# Get user input for the directory
directory_path = input("Enter the path to the directory containing text files: ")

# Get user input for the output directory
output_directory = input("Enter the path to the directory for saving filtered URLs: ")

# Get user input for the concurrency level
concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))

# Call the function to process all text files in the directory concurrently
process_directory(directory_path, output_directory, concurrency)
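
A quick way to sanity-check the filter is to run its regex against a few made-up lines. This is a minimal smoke-test sketch, not part of the commit; the sample URLs are invented, and only the pattern comes from the file above:

# Smoke test for the image-extension filter; the sample lines are
# hypothetical, only the regex is taken from blogger_remove_img_lines.py.
import re

image_extensions_pattern = re.compile(r'\b(?:jpg|png|gif|jpeg)\b', re.IGNORECASE)

sample_lines = [
    "https://example.blogspot.com/2020/01/post.html\n",
    "https://example.blogspot.com/images/photo.JPG\n",
    "https://example.blogspot.com/feeds/posts/default\n",
]

kept = [line for line in sample_lines if not image_extensions_pattern.search(line)]
print("".join(kept))
# Prints the first and third lines; the .JPG line is dropped because
# re.IGNORECASE makes the extension match case-insensitively.
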
blogger_url_cleaner.py (normal file, 67 lines added)
@@ -0,0 +1,67 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor


def filter_and_extract_urls(input_file_path, output_file_path):
    # Check if the input file exists
    if not os.path.exists(input_file_path):
        print(f"The file {input_file_path} does not exist.")
        return

    # Regular expression pattern for URLs
    url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)

    unique_urls = set()

    with open(input_file_path, 'r') as input_file:
        for line in input_file:
            # Find all URLs in the line
            urls = url_pattern.findall(line)
            for url in urls:
                # Extract the host name: 'https://host/path'.split('/')
                # yields ['https:', '', 'host', 'path'], so index 2 is the host
                formatted_url = url.split('/')[2]
                # Add the host to the set of unique URLs
                unique_urls.add(formatted_url)

    # Write unique URLs to the output file
    with open(output_file_path, 'w') as output_file:
        for unique_url in unique_urls:
            output_file.write(unique_url + '\n')

    print(f"Extracted and filtered URLs in {input_file_path}. Unique filtered URLs saved to {output_file_path}.")


def process_directory(directory_path, output_directory, concurrency):
    # Get a list of all text files in the specified directory
    text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]

    if not text_files:
        print(f"No text files found in the directory {directory_path}.")
        return

    # Create the output directory if it does not already exist,
    # so that opening the output files cannot fail
    os.makedirs(output_directory, exist_ok=True)

    # Process text files concurrently with the specified concurrency level
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Submit each text file for processing
        futures = [executor.submit(
            filter_and_extract_urls,
            os.path.join(directory_path, text_file),
            os.path.join(output_directory, f"filtered_{text_file}")
        ) for text_file in text_files]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

    print("Concurrent processing of text files completed.")


# Get user input for the directory
directory_path = input("Enter the path to the directory containing text files: ")

# Get user input for the output directory
output_directory = input("Enter the path to the directory for saving filtered URLs: ")

# Get user input for the concurrency level
concurrency = int(input("Enter the level of concurrency (e.g., 2, 4, 8): "))

# Call the function to process all text files in the directory concurrently
process_directory(directory_path, output_directory, concurrency)
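
Because filter_and_extract_urls reduces every match to url.split('/')[2], the output file contains just the deduplicated host name of each URL, not the full address. A minimal sketch of that step follows; the input string and URLs are invented for illustration, and only the regex and the split come from the commit:

# Illustration of the host-extraction step; the input text is hypothetical.
import re

url_pattern = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)

text = ("see https://myblog.blogspot.com/2021/05/hello.html and "
        "http://myblog.blogspot.com/about plus https://other.example.com/x")

unique_urls = set()
for url in url_pattern.findall(text):
    # 'https://host/path'.split('/') -> ['https:', '', 'host', 'path'],
    # so index 2 is the host
    unique_urls.add(url.split('/')[2])

print(sorted(unique_urls))
# ['myblog.blogspot.com', 'other.example.com']

Both matches on myblog.blogspot.com collapse to a single entry, which is the deduplication the set provides.
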