This commit is contained in:
datechnoman 2024-01-08 11:05:07 +00:00
parent 2bf2e02275
commit 42be737ee6

View File

@ -7,8 +7,8 @@ from urllib.parse import urlparse
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire" MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
ROOT_DIRECTORY = "/root/mediafire_files" ROOT_DIRECTORY = "/root/mediafire_files"
CONCURRENCY = 10 CONCURRENCY = 6
BATCH_SIZE = 10 BATCH_SIZE = 6
def run_cdxsummary(file_path, json_filepath): def run_cdxsummary(file_path, json_filepath):
# Construct the cdxsummary command # Construct the cdxsummary command
@ -50,9 +50,13 @@ def run_cdxsummary(file_path, json_filepath):
print(f"Error running cdxsummary command: {e}") print(f"Error running cdxsummary command: {e}")
def download_file(url):
    """Download ``url`` with axel and return the expected local file name.

    Args:
        url: The URL to download. Leading/trailing whitespace (such as the
            newline left over from reading a URL list line by line) is
            stripped before use.

    Returns:
        The last path segment of the URL, i.e. the file name the download
        is expected to have in the current working directory. Any query
        string or fragment is excluded from the name.
        NOTE(review): assumes axel saves under the URL's path basename in
        the current directory -- confirm against the axel configuration.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlparse

    # Strip newline characters from the URL
    url = url.strip()

    # Command to download the file using axel. An argument list (no shell)
    # is used so that characters such as '&', ';' or spaces inside the URL
    # are passed to axel verbatim instead of being interpreted by a shell.
    command = ["axel", "-n", "1", url]  # Set concurrency to 1 for each individual file
    try:
        # The exit status is intentionally not checked (best-effort download).
        subprocess.run(command)
    except OSError as e:
        # Without a shell, a missing/unrunnable axel binary raises instead of
        # just yielding a non-zero exit status; report it and carry on.
        print(f"Error running axel command: {e}")

    # Return the downloaded file path (basename of the URL's path component,
    # so "?query" and "#fragment" parts do not end up in the file name)
    return os.path.basename(urlparse(url).path)
@ -73,13 +77,22 @@ def process_batch(urls, start_index, end_index):
# Move files and run cdxsummary for each downloaded file # Move files and run cdxsummary for each downloaded file
with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor: with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
for file_path in downloaded_files: futures = []
for file_url in downloaded_files:
# Extracting filename from the URL
file_name = os.path.basename(file_url)
# Construct file paths # Construct file paths
file_path = os.path.join(os.getcwd(), file_path) file_path = os.path.join(os.getcwd(), file_name)
json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) json_filepath = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
# Run cdxsummary and delete .cdx.gz file # Run cdxsummary and delete .cdx.gz file
executor.submit(run_cdxsummary, file_path, json_filepath) futures.append(executor.submit(run_cdxsummary, file_path, json_filepath))
# Wait for all tasks to complete before proceeding to the next batch
for future in futures:
future.result()
def run_ia_command(): def run_ia_command():
# Get the current date formatted as YYYY-MM-DD # Get the current date formatted as YYYY-MM-DD