CommonCrawl_URL_Processor/urlextractor_archiveteam.sh

#!/bin/bash
directory=$(dirname "$0")
# Compress a single URL list in place.
gzip_file() {
    file="$1"
    gzip "$file"
}
# Export the function so the shells spawned by GNU parallel can call it.
export -f gzip_file
for file in "$directory"/*_urls.txt; do
    filename=$(basename "$file")
    # Extract the URLs for each target site and append them to the aggregated export lists.
    grep -E "https?://(www\.)?mediafire\.com" "$file" | sort -u >> "/opt/commoncrawl/export/mediafire_urls.txt"
    grep -E "https?://(www\.)?i\.imgur\.com" "$file" | sort -u >> "/opt/commoncrawl/export/imgur_urls.txt"
    grep "https://cdn\.discordapp\.com/" "$file" | sort -u >> "/opt/commoncrawl/export/discord_urls.txt"
    # Compress the list in the background unless it is one of the excluded lists.
    if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
        parallel gzip_file ::: "$file" &
    fi
done
# Wait for all gzip processes to finish
wait
# Move compressed files to /opt/commoncrawl
mv "$directory"/*.gz /opt/commoncrawl/