Update mediafire_automated_cdx_processor.py
This commit is contained in:
parent
d56d7bb81f
commit
2bf2e02275
@@ -6,8 +6,9 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from urllib.parse import urlparse
|
||||
|
||||
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
|
||||
-CONCURRENCY = 6
|
||||
-BATCH_SIZE = 6
|
||||
+ROOT_DIRECTORY = "/root/mediafire_files"
|
||||
+CONCURRENCY = 10
|
||||
+BATCH_SIZE = 10
|
||||
|
||||
def run_cdxsummary(file_path, json_filepath):
|
||||
# Construct the cdxsummary command
|
||||
@@ -75,7 +76,7 @@ def process_batch(urls, start_index, end_index):
|
||||
for file_path in downloaded_files:
|
||||
# Construct file paths
|
||||
file_path = os.path.join(os.getcwd(), file_path)
|
||||
-json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
|
||||
+json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
|
||||
|
||||
# Run cdxsummary and delete .cdx.gz file
|
||||
executor.submit(run_cdxsummary, file_path, json_filepath)
|
||||
@@ -91,7 +92,7 @@ def run_ia_command():
|
||||
ia_search_command = f"ia search 'collection:archiveteam_mediafire addeddate:[{two_days_before} TO {current_date}]' --itemlist"
|
||||
|
||||
# Output file for ia search results
|
||||
-output_file = "ia_search_results.txt"
|
||||
+output_file = os.path.join(ROOT_DIRECTORY, "ia_search_results.txt")
|
||||
|
||||
try:
|
||||
# Run the ia search command and write the output to a text file
|
||||
@@ -118,7 +119,7 @@ def create_mediafire_directory():
|
||||
|
||||
def move_file(source_path):
|
||||
file_name = os.path.basename(source_path)
|
||||
-destination_path = os.path.join(MEDIAFIRE_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
|
||||
+destination_path = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
|
||||
|
||||
# Check if the file has a ".cdx.gz" extension before moving
|
||||
if file_name.endswith(".cdx.gz"):
|
||||
@@ -141,7 +142,7 @@ def main():
|
||||
create_mediafire_directory()
|
||||
|
||||
# Create directory_output.txt
|
||||
-directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt')
|
||||
+directory_output_path = os.path.join(ROOT_DIRECTORY, 'directory_output.txt')
|
||||
with open(directory_output_path, 'w') as directory_output_file:
|
||||
for filename in os.listdir(MEDIAFIRE_DIRECTORY):
|
||||
if filename.endswith(".cdx.json"):
|
||||
@@ -150,7 +151,7 @@ def main():
|
||||
run_ia_command() # Run IA command after creating directory_output.txt
|
||||
|
||||
# Read the URLs from the file
|
||||
-with open('ia_search_results.txt', 'r') as ia_search_file:
|
||||
+with open(os.path.join(ROOT_DIRECTORY, 'ia_search_results.txt'), 'r') as ia_search_file:
|
||||
ia_search_urls = ia_search_file.readlines()
|
||||
|
||||
# Extract filenames from URLs
|
||||
@@ -164,7 +165,7 @@ def main():
|
||||
filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
|
||||
|
||||
# Write filtered URLs to urls_to_download.txt
|
||||
-urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt')
|
||||
+urls_to_download_path = os.path.join(ROOT_DIRECTORY, 'urls_to_download.txt')
|
||||
with open(urls_to_download_path, 'w') as urls_to_download_file:
|
||||
urls_to_download_file.writelines(filtered_urls)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user