Commented out URL Extractions (to be done post downloading of files)

2024-01-12 04:02:06 +00:00 · 2024-01-12 04:02:06 +00:00 · b881641c69
commit b881641c69
parent cba96e96e7
1 changed files with 7 additions and 5 deletions
--- a/urlextractor_archiveteam.sh
+++ b/urlextractor_archiveteam.sh
@ -26,10 +26,12 @@ export -f gzip_file
 for file in "$directory"/*_urls.txt; do
    filename=$(basename "$file")
-    grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
+    
-    grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
+    # Commented out the lines that extract URLs
-    grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
+    # grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
-    grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
+    # grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
    # grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
    # grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
    if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
        parallel gzip_file ::: "$file" &