Added support for comparing against already existing files so they are skipped instead of being reprocessed
parent 61f5a4ddbf
commit d56d7bb81f
@@ -3,10 +3,11 @@ import subprocess
 import json
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
 
-MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
-CONCURRENCY = 6 # Set the desired concurrency for downloading multiple files
-BATCH_SIZE = 6 # Set the batch size for processing
+MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
+CONCURRENCY = 6
+BATCH_SIZE = 6
 
 def run_cdxsummary(file_path, json_filepath):
     # Construct the cdxsummary command
@@ -48,6 +49,7 @@ def run_cdxsummary(file_path, json_filepath):
         print(f"Error running cdxsummary command: {e}")
 
 def download_file(url):
+    # Command to download the file using axel
     command = f'axel -n 1 {url}' # Set concurrency to 1 for each individual file
     subprocess.run(command, shell=True)
     # Return the downloaded file path
@@ -125,22 +127,52 @@ def move_file(source_path):
     os.rename(source_path, destination_path)
     print(f"Moved '{file_name}' to the root folder.")
 
+def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
+    filtered_urls = [url for url in urls_to_filter if not any(filename in url for filename in filenames_to_exclude)]
+    return filtered_urls
+
+def extract_filename_from_url(url):
+    # Extract the filename part from the URL
+    parsed_url = urlparse(url)
+    filename = os.path.basename(parsed_url.path)
+    return filename
 
 def main():
-    run_ia_command()  # Run IA command first
     create_mediafire_directory()
 
-    # Read the URLs from the file
-    with open('ia_search_results.txt', 'r') as file:
-        urls = file.readlines()
-
-    # Remove leading/trailing whitespace from the URLs
-    urls = [url.strip() for url in urls]
+    # Create directory_output.txt
+    directory_output_path = os.path.join(MEDIAFIRE_DIRECTORY, 'directory_output.txt')
+    with open(directory_output_path, 'w') as directory_output_file:
+        for filename in os.listdir(MEDIAFIRE_DIRECTORY):
+            if filename.endswith(".cdx.json"):
+                directory_output_file.write(f"{filename}\n")
+
+    run_ia_command()  # Run IA command after creating directory_output.txt
+
+    # Read the URLs from the file
+    with open('ia_search_results.txt', 'r') as ia_search_file:
+        ia_search_urls = ia_search_file.readlines()
+
+    # Extract filenames from URLs
+    ia_search_filenames = [extract_filename_from_url(url.strip()) for url in ia_search_urls]
+
+    # Read the filenames from directory_output.txt and remove the .json extension
+    with open(directory_output_path, 'r') as directory_output_file:
+        directory_filenames = [line.strip().replace(".cdx.json", "") for line in directory_output_file.readlines()]
+
+    # Filter URLs that don't match filenames
+    filtered_urls = filter_urls_to_download(ia_search_urls, directory_filenames)
+
+    # Write filtered URLs to urls_to_download.txt
+    urls_to_download_path = os.path.join(MEDIAFIRE_DIRECTORY, 'urls_to_download.txt')
+    with open(urls_to_download_path, 'w') as urls_to_download_file:
+        urls_to_download_file.writelines(filtered_urls)
 
     # Process URLs in batches
-    for i in range(0, len(urls), BATCH_SIZE):
+    for i in range(0, len(filtered_urls), BATCH_SIZE):
         start_index = i
-        end_index = min(i + BATCH_SIZE, len(urls))
-        process_batch(urls, start_index, end_index)
+        end_index = min(i + BATCH_SIZE, len(filtered_urls))
+        process_batch(filtered_urls, start_index, end_index)
 
 if __name__ == "__main__":
     main()
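The skip logic rests on two of the helpers added in this commit: extract_filename_from_url() reduces a URL to its basename, and filter_urls_to_download() drops any URL whose name already appears among the downloaded .cdx.json files. Below is a minimal, self-contained sketch of that flow; the directory path and sample URLs are placeholder values for illustration, not taken from this repository.

import os
from urllib.parse import urlparse

def extract_filename_from_url(url):
    # Keep only the path component so query strings never end up in the name
    return os.path.basename(urlparse(url).path)

def filter_urls_to_download(urls_to_filter, filenames_to_exclude):
    # A URL survives only if none of the already-processed names occur in it
    return [url for url in urls_to_filter
            if not any(name in url for name in filenames_to_exclude)]

if __name__ == "__main__":
    existing_dir = "/opt/cdxfiles/mediafire"  # placeholder for MEDIAFIRE_DIRECTORY
    # Names of CDX files that were already summarized, minus the .cdx.json suffix
    already_done = ([f.replace(".cdx.json", "") for f in os.listdir(existing_dir)
                     if f.endswith(".cdx.json")]
                    if os.path.isdir(existing_dir) else [])
    # Hypothetical search results; real runs read these from ia_search_results.txt
    urls = ["https://example.org/item-a.cdx.gz", "https://example.org/item-b.cdx.gz"]
    print(filter_urls_to_download(urls, already_done))

Note that the comparison is a plain substring test against the URL, so a previously processed name such as item-a would also match item-a-v2; the filtering introduced in the diff behaves the same way.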