"""Report which entries of a CommonCrawl ``wat.paths`` listing are missing.

The script asks for the directory holding ``wat.paths`` and for the name of a
folder below ``WAT_URLS_BASE_DIRECTORY``.  Every regular file in that folder
is reduced to its name with ``_urls.txt`` removed; any path from ``wat.paths``
that contains none of those names is written, prefixed with the CommonCrawl
download URL, to ``missing_wat_files.txt`` in the current working directory.
"""
# Provenance: commoncrawl_wat_path_comparer.py, added by commit
# 7d5b3653c6edebee0ba289ce587fa4229eca6428 ('Upload files to "/"',
# datechnoman, 2023-12-15).

import os

# Parent directory of the per-crawl folders that are compared against.
WAT_URLS_BASE_DIRECTORY = "/media/ArchiveTeam/CommonCrawl_Files/WAT_URLs/"
# Prefix that turns a wat.paths entry into a download URL.
COMMONCRAWL_URL_PREFIX = "https://data.commoncrawl.org/"
# Text removed from file names in the comparison folder before matching.
URLS_FILE_SUFFIX = "_urls.txt"
# Output file, created in the current working directory.
OUTPUT_FILE_NAME = "missing_wat_files.txt"


def get_user_input(message, default_value=""):
    """Prompt on stdin and return the answer, or *default_value* if empty.

    The prompt shown is ``"<message> (<default_value>): "``.
    """
    user_input = input(f"{message} ({default_value}): ")
    return user_input or default_value


def read_paths_from_file(file_path):
    """Return the non-blank lines of *file_path*, stripped of whitespace.

    Blank lines are skipped so they cannot end up in the output as a bare
    URL prefix with no path behind it.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]


def list_files_in_directory(directory):
    """Return the names of regular files in *directory*, minus ``_urls.txt``.

    Sub-directories are ignored.  Names that are empty once ``_urls.txt`` has
    been removed are dropped: an empty string is a substring of every path
    and would make every path look as if it were already present.

    Raises:
        OSError: if *directory* cannot be listed.
    """
    names = []
    for entry in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, entry)):
            continue
        name = entry.replace(URLS_FILE_SUFFIX, "")
        if name:
            names.append(name)
    return names


def filter_missing_paths(paths, directory_files):
    """Return the entries of *paths* that contain none of *directory_files*.

    Matching is by substring, so a name matches wherever it occurs in a path.
    Empty names are ignored because they would match everything.
    """
    names = [name for name in directory_files if name]
    exact_names = set(names)
    missing = []
    for path in paths:
        # Fast path: a name equal to the last path component is necessarily a
        # substring of the path, so the linear scan below can be skipped.
        if path.rsplit("/", 1)[-1] in exact_names:
            continue
        if any(name in path for name in names):
            continue
        missing.append(path)
    return missing


def write_download_urls(paths, output_file_path):
    """Write one CommonCrawl download URL per entry of *paths*.

    URLs are separated by newlines with no trailing newline; an empty
    *paths* produces an empty file.
    """
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(
            "\n".join(f"{COMMONCRAWL_URL_PREFIX}{path}" for path in paths)
        )


def main():
    """Run the interactive comparison and write ``missing_wat_files.txt``."""
    # Get the directory containing the wat.paths file.
    wat_paths_directory = get_user_input(
        'Enter the directory containing the "wat.paths" file'
    )
    wat_paths_file_path = os.path.join(wat_paths_directory, "wat.paths")

    if not os.path.isfile(wat_paths_file_path):
        print("Error: wat.paths file not found in the specified directory.")
        return

    # Get the last level of the directory to compare against.
    compare_last_level_directory = get_user_input(
        "Enter the CommonCrawl Folder directory to compare against "
        "(Eg; CC-MAIN-2023-40_September_October_2023):"
    )
    if not compare_last_level_directory:
        # An empty answer would compare against the base directory itself and
        # report every path as missing.
        print("Error: no CommonCrawl folder name was entered.")
        return

    compare_directory = os.path.join(
        WAT_URLS_BASE_DIRECTORY, compare_last_level_directory
    )
    if not os.path.isdir(compare_directory):
        print(f"Error: directory not found: {compare_directory}")
        return

    directory_files = list_files_in_directory(compare_directory)
    paths = read_paths_from_file(wat_paths_file_path)
    filtered_paths = filter_missing_paths(paths, directory_files)

    output_file_path = OUTPUT_FILE_NAME
    write_download_urls(filtered_paths, output_file_path)

    print(f"List of modified files in {compare_directory} written to {output_file_path}")


if __name__ == "__main__":
    main()