CommonCrawl_URL_Processor/urlextractor_archiveteam.sh

#!/bin/bash
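#
# Gzips the *_urls.txt lists extracted from Common Crawl WAT files and, for
# each list it compresses, removes the matching entry from
# urls_to_download.txt so remaining work can be tracked. Waits first until
# enough disk space is free.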
check_disk_space() {
    while true; do
        # Free space in GB on the filesystem behind /dev/sda1
        available_space=$(df -BG /dev/sda1 | awk 'NR==2 {print $4}' | tr -d 'G')
        if [ "$available_space" -ge 20 ]; then
            break
        else
            echo "Waiting for at least 20GB free space on /dev/sda1. Current available space: ${available_space}GB"
            sleep 300 # Sleep for 5 minutes
        fi
    done
}
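
# NOTE: /dev/sda1 is assumed here to be the filesystem this script works on;
# if the layout differs, point df at the script's own directory instead, e.g.
# df -BG "$(dirname "$0")".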
# Check for free disk space before proceeding
check_disk_space
directory=$(dirname "$0")

# Compress a single file in place. Exported so the subshells spawned by
# GNU parallel can call it.
gzip_file() {
    file="$1"
    gzip "$file"
}
export -f gzip_file

for file in "$directory"/*_urls.txt; do
    filename=$(basename "$file")
    # Commented out the lines that extract URLs
    # grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
    # grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
    # grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
    # grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"

    # Compress each list in the background so the loop keeps moving; the
    # mediafire, t.me, telegram.me and sitemap lists are left uncompressed
    if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
        parallel gzip_file ::: "$file" &

        # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
        base="${filename%_urls.txt}"
        if sed -i "/$base/d" "/opt/CommonCrawl_URL_Processor/urls_to_download.txt"; then
            echo "File $base has been successfully removed from urls_to_download.txt"
            remaining_count=$(wc -l < "/opt/CommonCrawl_URL_Processor/urls_to_download.txt")
            echo "WAT files remaining to be processed: $remaining_count"
        else
            echo "Failed to remove $filename from urls_to_download.txt"
        fi
    fi
done
# Wait for all gzip processes to finish
wait
# Move compressed files to /opt/CommonCrawl_URL_Processor
# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/
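
# Usage sketch (an assumption, not from the repo): run after a WAT extraction
# pass has written its *_urls.txt files, e.g.:
#   cd /opt/CommonCrawl_URL_Processor && ./urlextractor_archiveteam.sh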