Update urlextractor_archiveteam.sh
This commit is contained in:
parent
6a90fc7f5e
commit
d97491b4f0
@ -32,6 +32,15 @@ for file in "$directory"/*_urls.txt; do
|
|||||||
|
|
||||||
if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
|
if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
|
||||||
parallel gzip_file ::: "$file" &
|
parallel gzip_file ::: "$file" &
|
||||||
|
|
||||||
|
# Remove every line containing the filename (without "_urls.txt") from
# urls_to_download.txt, then report how many lines are left in that list.
wat_list="/opt/commoncrawl/urls_to_download.txt"
# Strip only the trailing suffix; the previous unanchored sed pattern could
# match earlier in the name and its "." matched any character.
wat_stem="${filename%_urls.txt}"
# Escape BRE metacharacters and the "/" address delimiter so the stem is
# matched literally instead of being interpreted as a regular expression.
wat_pattern=$(printf '%s' "$wat_stem" | sed 's|[][\.*^$/]|\\&|g')
# An empty stem would produce the empty regex "//", which sed rejects, so it
# is routed to the failure branch explicitly.
# NOTE: sed exits 0 even when no line matched, so "successfully removed"
# only means the edit ran without error.
if [[ -n "$wat_stem" ]] && sed -i "/${wat_pattern}/d" "$wat_list"; then
  echo "File $wat_stem has been successfully removed from urls_to_download.txt"
  # Redirecting into wc yields just the count, so no awk post-processing.
  remaining_count=$(wc -l < "$wat_list")
  echo "WAT files remaining to be processed $remaining_count"
else
  echo "Failed to remove $filename from urls_to_download.txt"
fi
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user