CommonCrawl_URL_Processor/urlextractor_archiveteam.sh

#!/bin/bash

#######################################
# Block until the filesystem reported by df for DEVICE has enough free space.
# Arguments:
#   $1 - device or path handed to df            (default: /dev/sda1)
#   $2 - required free space in whole gigabytes (default: 20)
#   $3 - seconds to sleep between two checks    (default: 300, i.e. 5 minutes)
# Outputs:
#   A progress line on stdout for every check that finds too little space;
#   a diagnostic on stderr when the df output cannot be parsed.
# Returns:
#   0 once the available space is at least the required amount.
#######################################
check_disk_space() {
    local device="${1:-/dev/sda1}"
    local min_gb="${2:-20}"
    local interval="${3:-300}"
    local available_space

    while true; do
        # -BG makes df print sizes in gigabytes with a trailing "G";
        # row 2 / column 4 of its output is the "Available" figure.
        available_space=$(df -BG "$device" | awk 'NR==2 {print $4}')
        available_space=${available_space%G}

        if [[ ! "$available_space" =~ ^[0-9]+$ ]]; then
            # df failed or printed something unexpected: say so explicitly
            # instead of feeding a non-number to the integer comparison.
            echo "Could not determine free space on ${device}; retrying in ${interval}s" >&2
        elif [ "$available_space" -ge "$min_gb" ]; then
            break
        else
            echo "Waiting for more than ${min_gb}GB free space on ${device}. Current available space: ${available_space}GB"
        fi

        sleep "$interval"
    done
}

# Hold off until the disk has enough free space to work with
check_disk_space

# Directory part of the path this script was invoked with
directory="$(dirname "$0")"

# Compress one file with gzip.
# Arguments: $1 - path of the file to compress
# Returns:   the exit status of gzip
gzip_file() {
    # local keeps a caller's own "file" variable from being overwritten
    local file="$1"
    # "--" ends option parsing so a name starting with "-" is taken as a file
    gzip -- "$file"
}

# Make gzip_file callable from the shells GNU parallel starts for its jobs.
export -f gzip_file

# List from which an entry is dropped for every file handled below.
pending_list="/opt/CommonCrawl_URL_Processor/urls_to_download.txt"

# Pass 1: decide which *_urls.txt files are to be compressed.
to_compress=()
for file in "$directory"/*_urls.txt; do
    # An unmatched glob is left as the literal pattern; do not treat it as a file.
    [[ -e "$file" ]] || continue
    filename=$(basename "$file")

    # Commented out the lines that extract URLs
    # grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
    # grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
    # grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
    # grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"

    # These files are never compressed or removed from the pending list.
    case "$filename" in
        mediafire_urls.txt | t.me_urls.txt | telegram.me_urls.txt | sitemap_urls.txt)
            continue
            ;;
    esac

    to_compress+=("$file")
done

# Pass 2: compress in the background with ONE parallel invocation, so that
# parallel itself limits how many gzip jobs run at once (forking a separate
# backgrounded parallel per file would start them all simultaneously).
# Names are passed NUL-delimited on stdin to survive odd characters and to
# avoid command-line length limits.
gzip_pid=""
if (( ${#to_compress[@]} > 0 )); then
    printf '%s\0' "${to_compress[@]}" | parallel -0 gzip_file &
    gzip_pid=$!
fi

# Pass 3: while compression runs, drop each handled file from the pending list.
for file in "${to_compress[@]}"; do
    filename=$(basename "$file")
    # Strip the trailing "_urls.txt" suffix only.
    stem=${filename%_urls.txt}
    # Backslash-escape BRE metacharacters so the stem is matched literally.
    pattern=$(printf '%s' "$stem" | sed 's|[][\\.*^$/]|\\&|g')

    # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
    # An empty stem would yield the invalid sed address "//", so treat it as a failure.
    if [[ -n "$stem" ]] && sed -i "/$pattern/d" "$pending_list"; then
        echo "File $stem has been successfully removed from urls_to_download.txt"
        remaining_count=$(wc -l "$pending_list" | awk '{print $1}')
        echo "WAT files remaining to be processed $remaining_count"
    else
        echo "Failed to remove $filename from urls_to_download.txt"
    fi
done

# Wait for all gzip processes to finish
if [[ -n "$gzip_pid" ]] && ! wait "$gzip_pid"; then
    # parallel exits non-zero when at least one job failed; report it but keep
    # the script's exit status unchanged.
    echo "Warning: one or more files could not be compressed" >&2
fi

# Move compressed files to /opt/CommonCrawl_URL_Processor
# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/
Upload files to "/" 2023-12-04 10:39:04 +00:00			`#!/bin/bash`

Update urlextractor_archiveteam.sh 2023-12-05 11:34:29 +00:00			`check_disk_space() {`
			`while true; do`
			`available_space=$(df -BG /dev/sda1 \| awk 'NR==2 {print $4}' \| tr -d 'G')`
Update urlextractor_archiveteam.sh 2023-12-12 10:22:22 +00:00			`if [ "$available_space" -ge 20 ]; then`
Update urlextractor_archiveteam.sh 2023-12-05 11:34:29 +00:00			`break`
			`else`
Update urlextractor_archiveteam.sh 2023-12-12 10:22:22 +00:00			`echo "Waiting for more than 20GB free space on /dev/sda1. Current available space: ${available_space}GB"`
Update urlextractor_archiveteam.sh 2023-12-05 11:34:29 +00:00			`sleep 300 # Sleep for 5 minutes`
			`fi`
			`done`
			`}`

			`# Check for free disk space before proceeding`
			`check_disk_space`

Upload files to "/" 2023-12-04 10:39:04 +00:00			`directory=$(dirname "$0")`

			`gzip_file() {`
			`file="$1"`
			`gzip "$file"`
			`}`

			`export -f gzip_file`

			`for file in "$directory"/*_urls.txt; do`
			`filename=$(basename "$file")`
Commented out URL Extractions (to be done post downloading of files) 2024-01-12 04:02:06 +00:00
			`# Commented out the lines that extract URLs`
			`# grep -E "http(s)?://(www\.)?mediafire.com" "$file" \| sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"`
			`# grep -E "http(s)?://(www\.)?i.imgur.com" "$file" \| sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"`
			`# grep -E "http(s)?://(www\.)?pastebin.com" "$file" \| sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"`
			`# grep "https://cdn.discordapp.com/" "$file" \| sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"`
Upload files to "/" 2023-12-04 10:39:04 +00:00
			`if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then`
			`parallel gzip_file ::: "$file" &`
Update urlextractor_archiveteam.sh 2023-12-10 05:10:10 +00:00
			`# Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt`
Update urlextractor_archiveteam.sh 2023-12-12 10:22:22 +00:00			`if sed -i "/$(echo "$filename" \| sed 's/_urls.txt//')/d" "/opt/CommonCrawl_URL_Processor/urls_to_download.txt"; then`
Update urlextractor_archiveteam.sh 2023-12-10 05:10:10 +00:00			`echo "File $(echo "$filename" \| sed 's/_urls.txt//') has been successfully removed from urls_to_download.txt"`
Update urlextractor_archiveteam.sh 2023-12-12 10:22:22 +00:00			`remaining_count=$(wc -l "/opt/CommonCrawl_URL_Processor/urls_to_download.txt" \| awk '{print $1}')`
Update urlextractor_archiveteam.sh 2023-12-10 05:10:10 +00:00			`echo "WAT files remaining to be processed $remaining_count"`
			`else`
			`echo "Failed to remove $filename from urls_to_download.txt"`
			`fi`
Upload files to "/" 2023-12-04 10:39:04 +00:00			`fi`
			`done`

			`# Wait for all gzip processes to finish`
Update urlextractor_archiveteam.sh 2023-12-05 10:20:47 +00:00			`wait`

Update urlextractor_archiveteam.sh 2023-12-12 10:22:22 +00:00			`# Move compressed files to /opt/CommonCrawl_URL_Processor`
Commented out URL Extractions (to be done post downloading of files) 2024-01-12 04:02:06 +00:00			`# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/`