2023-12-04 10:39:04 +00:00
|
|
|
#!/bin/bash
|
|
|
|
|
|
|
|
# Folder that holds this script; derived from the path the script was invoked as.
directory="$(dirname "$0")"
|
|
|
|
|
|
|
|
#######################################
# Compress one file with gzip.
# Arguments: $1 - path of the file to compress
# Returns:   the exit status of gzip
#######################################
gzip_file() {
  # `local` keeps the assignment from overwriting a caller's own $file.
  local file="$1"
  # `--` ends option parsing so a path that starts with '-' is taken as a file.
  gzip -- "$file"
}

# Export the function so that shells spawned by `parallel` can invoke it.
export -f gzip_file
|
|
|
|
|
|
|
|
# Directory that receives the per-service URL extracts.
export_dir="/opt/commoncrawl/export"

# Input files queued for compression once extraction is done.
to_compress=()

for file in "$directory"/*_urls.txt; do
  # An unmatched glob is left as the literal pattern; skip it rather than
  # handing a nonexistent path to grep and gzip.
  [[ -e "$file" ]] || continue

  filename=${file##*/}

  # Append this file's de-duplicated matches to the per-service lists.
  # Dots in host names are escaped so "." only matches a literal dot.
  grep -E "http(s)?://(www\.)?mediafire\.com" -- "$file" \
    | sort -u >> "$export_dir/mediafire_urls.txt"
  grep -E "http(s)?://(www\.)?i\.imgur\.com" -- "$file" \
    | sort -u >> "$export_dir/imgur_urls.txt"
  # -F: the Discord CDN prefix is a fixed string, not a regular expression.
  grep -F "https://cdn.discordapp.com/" -- "$file" \
    | sort -u >> "$export_dir/discord_urls.txt"

  # Every input is compressed except these four, which stay as plain text.
  case "$filename" in
    mediafire_urls.txt | t.me_urls.txt | telegram.me_urls.txt | sitemap_urls.txt) ;;
    *) to_compress+=("$file") ;;
  esac
done

# Run a single `parallel` over the whole queue so it can throttle the jobs
# itself, instead of forking one backgrounded `parallel` per file. Paths are
# fed NUL-delimited on stdin to avoid any command-line length limit.
if (( ${#to_compress[@]} > 0 )); then
  printf '%s\0' "${to_compress[@]}" | parallel -0 gzip_file \
    || printf 'warning: gzip failed for at least one file\n' >&2
fi

# Wait for any remaining background jobs to finish
wait
|