diff --git a/urlextractor_archiveteam.sh b/urlextractor_archiveteam.sh
index 581c4f2..03bb0fd 100644
--- a/urlextractor_archiveteam.sh
+++ b/urlextractor_archiveteam.sh
@@ -3,10 +3,10 @@
 check_disk_space() {
     while true; do
         available_space=$(df -BG /dev/sda1 | awk 'NR==2 {print $4}' | tr -d 'G')
-        if [ "$available_space" -ge 30 ]; then
+        if [ "$available_space" -ge 20 ]; then
             break
         else
-            echo "Waiting for more than 30GB free space on /dev/sda1. Current available space: ${available_space}GB"
+            echo "Waiting for more than 20GB free space on /dev/sda1. Current available space: ${available_space}GB"
             sleep 300 # Sleep for 5 minutes
         fi
     done
@@ -26,17 +26,17 @@ export -f gzip_file
 for file in "$directory"/*_urls.txt; do
     filename=$(basename "$file")
 
-    grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/commoncrawl/export/mediafire_urls.txt"
-    grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/commoncrawl/export/imgur_urls.txt"
-    grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/commoncrawl/export/discord_urls.txt"
+    grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
+    grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
+    grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
 
     if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
         parallel gzip_file ::: "$file" &
 
         # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
-        if sed -i "/$(echo "$filename" | sed 's/_urls.txt//')/d" "/opt/commoncrawl/urls_to_download.txt"; then
+        if sed -i "/$(echo "$filename" | sed 's/_urls.txt//')/d" "/opt/CommonCrawl_URL_Processor/urls_to_download.txt"; then
             echo "File $(echo "$filename" | sed 's/_urls.txt//') has been successfully removed from urls_to_download.txt"
-            remaining_count=$(wc -l "/opt/commoncrawl/urls_to_download.txt" | awk '{print $1}')
+            remaining_count=$(wc -l "/opt/CommonCrawl_URL_Processor/urls_to_download.txt" | awk '{print $1}')
             echo "WAT files remaining to be processed $remaining_count"
         else
             echo "Failed to remove $filename from urls_to_download.txt"
@@ -47,5 +47,5 @@ done
 # Wait for all gzip processes to finish
 wait
 
-# Move compressed files to /opt/commoncrawl
-# mv "$directory"/*.gz /opt/commoncrawl/
\ No newline at end of file
+# Move compressed files to /opt/CommonCrawl_URL_Processor
+# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/
\ No newline at end of file
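
For anyone who wants to exercise the retargeted loop in isolation, here is a minimal standalone sketch of the filtering and work-list bookkeeping the diff touches. It assumes the `/opt/CommonCrawl_URL_Processor` layout introduced by this change; the input file name, the `\.`-escaped patterns, and the sed metacharacter escaping are illustrative hardening, not part of the repository script.

```bash
#!/usr/bin/env bash
# Hedged sketch, not the repository script: a standalone version of the
# per-file filtering and work-list update performed in the loop above.
set -eu

export_dir="/opt/CommonCrawl_URL_Processor/export"
work_list="/opt/CommonCrawl_URL_Processor/urls_to_download.txt"
file="CC-MAIN-example_urls.txt"   # hypothetical input, one URL per line

mkdir -p "$export_dir"

# Escaping the dots (\.) makes these stricter than the patterns in the
# diff, which would also match hosts such as "mediafireXcom".
grep -E 'https?://(www\.)?mediafire\.com' "$file" | sort -u >> "$export_dir/mediafire_urls.txt"
grep -E 'https?://(www\.)?i\.imgur\.com'  "$file" | sort -u >> "$export_dir/imgur_urls.txt"
grep -F 'https://cdn.discordapp.com/'     "$file" | sort -u >> "$export_dir/discord_urls.txt"

# Remove the processed stem from the work list. Unlike the raw
# interpolation in the diff, regex metacharacters in the stem are
# escaped so sed matches it literally.
stem="${file%_urls.txt}"
escaped=$(printf '%s' "$stem" | sed 's/[][\.*^$/]/\\&/g')
sed -i "/$escaped/d" "$work_list"
echo "WAT files remaining to be processed: $(wc -l < "$work_list")"
```

The diff itself interpolates the unescaped stem straight into the sed address, which works for typical WAT file names but would misbehave if a stem ever contained `/` or other regex metacharacters; the escaping above is one way to make that step literal-safe.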