Comment out URL extractions (to be done after the files have been downloaded)
This commit is contained in:
parent
cba96e96e7
commit
b881641c69
@ -26,10 +26,12 @@ export -f gzip_file
|
|||||||
|
|
||||||
for file in "$directory"/*_urls.txt; do
|
for file in "$directory"/*_urls.txt; do
|
||||||
filename=$(basename "$file")
|
filename=$(basename "$file")
|
||||||
grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
|
|
||||||
grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
|
# URL extraction is disabled here; it is to be done after the files have been downloaded.
|
||||||
grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
|
# grep -E "http(s)?://(www\.)?mediafire.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/mediafire_urls.txt"
|
||||||
grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
|
# grep -E "http(s)?://(www\.)?i.imgur.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/imgur_urls.txt"
|
||||||
|
# grep -E "http(s)?://(www\.)?pastebin.com" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/pastebin_urls.txt"
|
||||||
|
# grep "https://cdn.discordapp.com/" "$file" | sort -u >> "/opt/CommonCrawl_URL_Processor/export/discord_urls.txt"
|
||||||
|
|
||||||
if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
|
if [[ $filename != "mediafire_urls.txt" && $filename != "t.me_urls.txt" && $filename != "telegram.me_urls.txt" && $filename != "sitemap_urls.txt" ]]; then
|
||||||
parallel gzip_file ::: "$file" &
|
parallel gzip_file ::: "$file" &
|
||||||
@ -49,4 +51,4 @@ done
|
|||||||
wait
|
wait
|
||||||
|
|
||||||
# Move compressed files to /opt/CommonCrawl_URL_Processor
|
# Move compressed files to /opt/CommonCrawl_URL_Processor
|
||||||
# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/
|
# mv "$directory"/*.gz /opt/CommonCrawl_URL_Processor/
|
||||||
|
Loading…
Reference in New Issue
Block a user