Update urlextractor_archiveteam.sh
parent ac0f299269
commit 7f9480bc40
@@ -3,10 +3,10 @@
 check_disk_space() {
     while true; do
         available_space=$(df -BG /dev/sda1 | awk 'NR==2 {print $4}' | tr -d 'G')
-        if [ "$available_space" -ge 30 ]; then
+        if [ "$available_space" -ge 20 ]; then
             break
         else
-            echo "Waiting for more than 30GB free space on /dev/sda1. Current available space: ${available_space}GB"
+            echo "Waiting for more than 20GB free space on /dev/sda1. Current available space: ${available_space}GB"
             sleep 300 # Sleep for 5 minutes
         fi
     done
@@ -26,17 +26,17 @@ export -f gzip_file
 
 for file in "$directory"/*_urls.txt; do
     filename=$(basename "$file")
-    grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/commoncrawl/export/mediafire_urls.txt"
-    grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/commoncrawl/export/imgur_urls.txt"
-    grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/commoncrawl/export/discord_urls.txt"
+    grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
+    grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
+    grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
 
     if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
         parallel gzip_file ::: "$file" &
 
         # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt
-        if sed -i "/$(echo "$filename" | sed 's/_urls.txt//')/d" "/opt/commoncrawl/urls_to_download.txt"; then
+        if sed -i "/$(echo "$filename" | sed 's/_urls.txt//')/d" "/opt/CommonCrawl_URL_Processor/urls_to_download.txt"; then
             echo "File $(echo "$filename" | sed 's/_urls.txt//') has been successfully removed from urls_to_download.txt"
-            remaining_count=$(wc -l "/opt/commoncrawl/urls_to_download.txt" | awk '{print $1}')
+            remaining_count=$(wc -l "/opt/CommonCrawl_URL_Processor/urls_to_download.txt" | awk '{print $1}')
             echo "WAT files remaining to be processed $remaining_count"
         else
             echo "Failed to remove $filename from urls_to_download.txt"
@@ -47,5 +47,5 @@ done
 # Wait for all gzip processes to finish
 wait
 
-# Move compressed files to /opt/commoncrawl
-# mv "$directory"/*.gz /opt/commoncrawl/
+# Move compressed files to /opt/CommonCrawl_URL_Processor
+# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/