Code fix
This commit is contained in:
parent
2bf2e02275
commit
42be737ee6
@ -7,8 +7,8 @@ from urllib.parse import urlparse
|
|||||||
|
|
||||||
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
|
MEDIAFIRE_DIRECTORY = "/opt/cdxfiles/mediafire"
|
||||||
ROOT_DIRECTORY = "/root/mediafire_files"
|
ROOT_DIRECTORY = "/root/mediafire_files"
|
||||||
CONCURRENCY = 10
|
CONCURRENCY = 6
|
||||||
BATCH_SIZE = 10
|
BATCH_SIZE = 6
|
||||||
|
|
||||||
def run_cdxsummary(file_path, json_filepath):
|
def run_cdxsummary(file_path, json_filepath):
|
||||||
# Construct the cdxsummary command
|
# Construct the cdxsummary command
|
||||||
@ -50,9 +50,13 @@ def run_cdxsummary(file_path, json_filepath):
|
|||||||
print(f"Error running cdxsummary command: {e}")
|
print(f"Error running cdxsummary command: {e}")
|
||||||
|
|
||||||
def download_file(url):
|
def download_file(url):
|
||||||
|
# Strip newline characters from the URL
|
||||||
|
url = url.strip()
|
||||||
|
|
||||||
# Command to download the file using axel
|
# Command to download the file using axel
|
||||||
command = f'axel -n 1 {url}' # Set concurrency to 1 for each individual file
|
command = f'axel -n 1 {url}' # Set concurrency to 1 for each individual file
|
||||||
subprocess.run(command, shell=True)
|
subprocess.run(command, shell=True)
|
||||||
|
|
||||||
# Return the downloaded file path
|
# Return the downloaded file path
|
||||||
return os.path.basename(url)
|
return os.path.basename(url)
|
||||||
|
|
||||||
@ -73,13 +77,22 @@ def process_batch(urls, start_index, end_index):
|
|||||||
|
|
||||||
# Move files and run cdxsummary for each downloaded file
|
# Move files and run cdxsummary for each downloaded file
|
||||||
with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
|
with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
|
||||||
for file_path in downloaded_files:
|
futures = []
|
||||||
|
|
||||||
|
for file_url in downloaded_files:
|
||||||
|
# Extracting filename from the URL
|
||||||
|
file_name = os.path.basename(file_url)
|
||||||
|
|
||||||
# Construct file paths
|
# Construct file paths
|
||||||
file_path = os.path.join(os.getcwd(), file_path)
|
file_path = os.path.join(os.getcwd(), file_name)
|
||||||
json_filepath = os.path.join(ROOT_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
|
json_filepath = os.path.join(ROOT_DIRECTORY, file_name.replace(".cdx.gz", ".cdx.json"))
|
||||||
|
|
||||||
# Run cdxsummary and delete .cdx.gz file
|
# Run cdxsummary and delete .cdx.gz file
|
||||||
executor.submit(run_cdxsummary, file_path, json_filepath)
|
futures.append(executor.submit(run_cdxsummary, file_path, json_filepath))
|
||||||
|
|
||||||
|
# Wait for all tasks to complete before proceeding to the next batch
|
||||||
|
for future in futures:
|
||||||
|
future.result()
|
||||||
|
|
||||||
def run_ia_command():
|
def run_ia_command():
|
||||||
# Get the current date formatted as YYYY-MM-DD
|
# Get the current date formatted as YYYY-MM-DD
|
||||||
|
Loading…
Reference in New Issue
Block a user