#!/bin/bash
#
# Categorize CommonCrawl URL-list exports and compress the processed inputs.
#
# For every "*_urls.txt" file next to this script:
#   - extract mediafire / imgur / discord-CDN links into per-site files
#     under /opt/commoncrawl/export/ (deduplicated per input file),
#   - then gzip the input in the background (a few special files excluded).
# Before doing any work, blocks until /dev/sda1 has enough free space.
#
# NOTE: 'set -e' is intentionally omitted — grep exits non-zero when a file
# contains no matches, which is a normal condition here, not an error.
set -uo pipefail

# Block until /dev/sda1 has at least $1 GB free (default 20).
# Re-checks every 5 minutes; prints progress while waiting.
check_disk_space() {
  local min_free_gb="${1:-20}"
  local available_space
  while true; do
    # df -BG reports sizes in whole gigabytes; row 2, column 4 is "Avail".
    available_space=$(df -BG /dev/sda1 | awk 'NR==2 {print $4}' | tr -d 'G')
    if [ "$available_space" -ge "$min_free_gb" ]; then
      break
    fi
    echo "Waiting for more than ${min_free_gb}GB free space on /dev/sda1. Current available space: ${available_space}GB"
    sleep 300  # Sleep for 5 minutes
  done
}

# Check for free disk space before proceeding
check_disk_space

directory=$(dirname "$0")

# Compress one file in place ("file" -> "file.gz").
gzip_file() {
  local file="$1"
  gzip -- "$file"
}
export -f gzip_file

for file in "$directory"/*_urls.txt; do
  # If the glob matched nothing, bash leaves the literal pattern; skip it.
  [ -e "$file" ] || continue
  filename=$(basename "$file")

  # Dots are escaped so the patterns match the literal hostnames only
  # (the originals' unescaped '.' matched any character).
  grep -E 'https?://(www\.)?mediafire\.com' "$file" | sort -u >> "/opt/commoncrawl/export/mediafire_urls.txt"
  grep -E 'https?://(www\.)?i\.imgur\.com' "$file" | sort -u >> "/opt/commoncrawl/export/imgur_urls.txt"
  # Fixed-string match: the pattern contains no regex metacharacters we need.
  grep -F 'https://cdn.discordapp.com/' "$file" | sort -u >> "/opt/commoncrawl/export/discord_urls.txt"

  # Skip compressing the aggregate/special exports; everything else is
  # gzipped in the background (the '&' already gives us the parallelism —
  # the original's GNU 'parallel' call on a single argument was redundant).
  if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
    gzip_file "$file" &
  fi
done

# Wait for all gzip processes to finish
wait

# Move compressed files to /opt/commoncrawl
# mv "$directory"/*.gz /opt/commoncrawl/