From d97491b4f05f76adef04904ab682a5aa5c8eb1c8 Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Sun, 10 Dec 2023 05:10:10 +0000
Subject: [PATCH] Update urlextractor_archiveteam.sh

---
 urlextractor_archiveteam.sh | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/urlextractor_archiveteam.sh b/urlextractor_archiveteam.sh
index cc000a6..581c4f2 100644
--- a/urlextractor_archiveteam.sh
+++ b/urlextractor_archiveteam.sh
@@ -32,6 +32,21 @@ for file in "$directory"/*_urls.txt; do
 
     if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
         parallel gzip_file ::: "$file" &
+
+        # Remove the line containing the filename (without "_urls.txt") from urls_to_download.txt.
+        # The name is escaped first so that BRE metacharacters (".", "*", "[", "\", "^", "$")
+        # and the "/" address delimiter are matched literally instead of being interpreted by sed.
+        wat_name=${filename%_urls.txt}
+        url_list="/opt/commoncrawl/urls_to_download.txt"
+        wat_pattern=$(printf '%s' "$wat_name" | sed 's|[[\\/.^$*]|\\&|g')
+        # An empty name is rejected up front: it can never identify a single entry.
+        if [[ -n $wat_name ]] && sed -i "/$wat_pattern/d" "$url_list"; then
+            echo "File $wat_name has been successfully removed from urls_to_download.txt"
+            remaining_count=$(wc -l < "$url_list")
+            echo "WAT files remaining to be processed $remaining_count"
+        else
+            echo "Failed to remove $filename from urls_to_download.txt" >&2
+        fi
     fi
 done
 