Implemented concurrent subprocesses for running multiple JSON extractions at once

This commit is contained in:
datechnoman 2023-12-27 08:46:36 +00:00
parent 8496391064
commit 182c58f1ce

View File

@@ -1,8 +1,8 @@
import os import os
import subprocess import subprocess
from concurrent.futures import ThreadPoolExecutor
import json import json
from datetime import datetime from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
MEDIAFIRE_DIRECTORY = "/opt/MediaFire" MEDIAFIRE_DIRECTORY = "/opt/MediaFire"
CONCURRENCY = 4 # Set the desired concurrency for downloading multiple files CONCURRENCY = 4 # Set the desired concurrency for downloading multiple files
@@ -62,13 +62,14 @@ def process_batch(urls, start_index, end_index):
downloaded_files = download_files(batch_urls) downloaded_files = download_files(batch_urls)
# Move files and run cdxsummary for each downloaded file # Move files and run cdxsummary for each downloaded file
for file_path in downloaded_files: with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
# Construct file paths for file_path in downloaded_files:
file_path = os.path.join(os.getcwd(), file_path) # Construct file paths
json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json")) file_path = os.path.join(os.getcwd(), file_path)
json_filepath = os.path.join(MEDIAFIRE_DIRECTORY, file_path.replace(".cdx.gz", ".cdx.json"))
# Run cdxsummary and delete .cdx.gz file # Run cdxsummary and delete .cdx.gz file
run_cdxsummary(file_path, json_filepath) executor.submit(run_cdxsummary, file_path, json_filepath)
def run_ia_command(): def run_ia_command():
# Get the current date formatted as YYYY-MM-DD # Get the current date formatted as YYYY-MM-DD
@@ -133,4 +134,3 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()