From b881641c69375ccf8d3d53d172bd26980d5b1e2e Mon Sep 17 00:00:00 2001
From: datechnoman
Date: Fri, 12 Jan 2024 04:02:06 +0000
Subject: [PATCH] Commented out URL Extractions (to be done post downloading of files)

---
 urlextractor_archiveteam.sh | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/urlextractor_archiveteam.sh b/urlextractor_archiveteam.sh
index 5878834..b8289fb 100644
--- a/urlextractor_archiveteam.sh
+++ b/urlextractor_archiveteam.sh
@@ -26,10 +26,12 @@ export -f gzip_file
 for file in "$directory"/*_urls.txt; do
     filename=$(basename "$file")
 
-    grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
-    grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
-    grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
-    grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
+
+    # Commented out the lines that extract URLs
+    # grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
+    # grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
+    # grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
+    # grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
 
     if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
         parallel gzip_file ::: "$file" &
@@ -49,4 +51,4 @@
 done
 wait
 
 # Move compressed files to /opt/CommonCrawl_URL_Processor
-# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/
\ No newline at end of file
+# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/