Delete urlextractor_archiveteam.sh

2024-03-31 11:39:22 +00:00 · 2024-03-31 11:39:22 +00:00 · c2d5ad43b8
commit c2d5ad43b8
parent 8b7607c7e6
1 changed file with 0 additions and 54 deletions
--- a/urlextractor_archiveteam.sh
+++ b/urlextractor_archiveteam.sh
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-check_disk_space() {
-    while true; do
-        available_space=$(df -BG /dev/sda1 | awk 'NR==2 {print $4}' | tr -d 'G')
-        if [ "$available_space" -ge 20 ]; then
-            break
-        else
-            echo "Waiting for more than 20GB free space on /dev/sda1. Current available space: ${available_space}GB"
-            sleep 300  # Sleep for 5 minutes
-        fi
-    done
-}
-
-# Check for free disk space before proceeding
-check_disk_space
-
-directory=$(dirname "$0")
-
-gzip_file() {
-    file="$1"
-    gzip "$file"
-}
-
-export -f gzip_file
-
-for file in "$directory"/*_urls.txt; do
-    filename=$(basename "$file")
-    
-    # Commented out the lines that extract URLs
-    # grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
-    # grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
-    # grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
-    # grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
-
-    if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
-        parallel gzip_file ::: "$file" &
-        
-        # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
-        if sed -i "/$(echo "$filename" | sed 's/_urls.txt//')/d" "/opt/CommonCrawl_URL_Processor/urls_to_download.txt"; then
-            echo "File $(echo "$filename" | sed 's/_urls.txt//') has been successfully removed from urls_to_download.txt"
-            remaining_count=$(wc -l "/opt/CommonCrawl_URL_Processor/urls_to_download.txt" | awk '{print $1}')
-            echo "WAT files remaining to be processed $remaining_count"
-        else
-            echo "Failed to remove $filename from urls_to_download.txt"
-        fi
-    fi
-done
-
-# Wait for all gzip processes to finish
-wait
-
-# Move compressed files to /opt/CommonCrawl_URL_Processor
-# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/