Update mediafire_automated_cdx_processor.py
This commit is contained in:
parent
d56d7bb81f
commit
2bf2e02275
@ -6,8 +6,9 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
|
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
|
||||||
CONCURRENCY = 6
|
ROOT_DIRECTORY = "/root/mediafire_files"
|
||||||
BATCH_SIZE = 6
|
CONCURRENCY = 10
|
||||||
|
BATCH_SIZE = 10
|
||||||
|
|
||||||
def run_cdxsummary(file_path, json_filepath):
|
def run_cdxsummary(file_path, json_filepath):
|
||||||
# Construct the cdxsummary command
|
# Construct the cdxsummary command
|
||||||
@ -75,7 +76,7 @@ def process_batch(urls, start_index, end_index):
|
|||||||
for file_path in downloaded_files:
|
for file_path in downloaded_files:
|
||||||
# Construct file paths
|
# Construct file paths
|
||||||
file_path = os.path.join(os.getcwd(), file_path)
|
file_path = os.path.join(os.getcwd(), file_path)
|
||||||
json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
|
json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
|
||||||
|
|
||||||
# Run cdxsummary and delete .cdx.gz file
|
# Run cdxsummary and delete .cdx.gz file
|
||||||
executor.submit(run_cdxsummary, file_path, json_filepath)
|
executor.submit(run_cdxsummary, file_path, json_filepath)
|
||||||
@ -91,7 +92,7 @@ def run_ia_command():
|
|||||||
ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist"
|
ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist"
|
||||||
|
|
||||||
# Output file for ia search results
|
# Output file for ia search results
|
||||||
output_file = "ia_search_results.txt"
|
output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Run the ia search command and write the output to a text file
|
# Run the ia search command and write the output to a text file
|
||||||
@ -118,7 +119,7 @@ def create_mediafire_directory():
|
|||||||
|
|
||||||
def move_file(source_path):
|
def move_file(source_path):
|
||||||
file_name = os.path.basename(source_path)
|
file_name = os.path.basename(source_path)
|
||||||
destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
|
destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
|
||||||
|
|
||||||
# Check if the file has a ".cdx.gz" extension before moving
|
# Check if the file has a ".cdx.gz" extension before moving
|
||||||
if file_name.endswith(".cdx.gz"):
|
if file_name.endswith(".cdx.gz"):
|
||||||
@ -141,7 +142,7 @@ def main():
|
|||||||
create_mediafire_directory()
|
create_mediafire_directory()
|
||||||
|
|
||||||
# Create directory_output.txt
|
# Create directory_output.txt
|
||||||
directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt')
|
directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
|
||||||
with open(directory_output_path, 'w') as directory_output_file:
|
with open(directory_output_path, 'w') as directory_output_file:
|
||||||
for filename in os.listdir(MEDIAFIRE_DIRECTORY):
|
for filename in os.listdir(MEDIAFIRE_DIRECTORY):
|
||||||
if filename.endswith(".cdx.json"):
|
if filename.endswith(".cdx.json"):
|
||||||
@ -150,7 +151,7 @@ def main():
|
|||||||
run_ia_command() # Run IA command after creating directory_output.txt
|
run_ia_command() # Run IA command after creating directory_output.txt
|
||||||
|
|
||||||
# Read the URLs from the file
|
# Read the URLs from the file
|
||||||
with open('ia_search_results.txt', 'r') as ia_search_file:
|
with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
|
||||||
ia_search_urls = ia_search_file.readlines()
|
ia_search_urls = ia_search_file.readlines()
|
||||||
|
|
||||||
# Extract filenames from URLs
|
# Extract filenames from URLs
|
||||||
@ -164,7 +165,7 @@ def main():
|
|||||||
filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
|
filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
|
||||||
|
|
||||||
# Write filtered URLs to urls_to_download.txt
|
# Write filtered URLs to urls_to_download.txt
|
||||||
urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt')
|
urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
|
||||||
with open(urls_to_download_path, 'w') as urls_to_download_file:
|
with open(urls_to_download_path, 'w') as urls_to_download_file:
|
||||||
urls_to_download_file.writelines(filtered_urls)
|
urls_to_download_file.writelines(filtered_urls)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user