"""Build a download list of Common Crawl WAT files that have not been processed yet.

The script reads a ``wat.paths`` file, compares the base name of every entry
against the files found in a comparison directory, and writes the download URL
of each entry that has no counterpart there to ``missing_wat_files.txt``.
"""

import os

# Directory under which the user-supplied crawl folder name is looked up.
WAT_URLS_ROOT = "/media/ArchiveTeam/CommonCrawl_Files/WAT_URLs/"

# Prefix prepended to every missing wat.paths entry to form the output line.
COMMONCRAWL_BASE_URL = "https://data.commoncrawl.org/"


def get_user_input(message, default_value=""):
    """Prompt the user and return the answer, or the default when it is empty.

    Args:
        message: Text shown to the user.
        default_value: Value returned when the user just presses Enter. It is
            shown in parentheses after the message only when it is non-empty,
            so prompts without a default do not end in a stray "()".

    Returns:
        The text entered by the user, or ``default_value`` if nothing was typed.
    """
    if default_value:
        prompt = f"{message} ({default_value}): "
    else:
        prompt = f"{message}: "
    user_input = input(prompt)
    return user_input if user_input else default_value


def read_paths_from_file(file_path):
    """Return the non-blank lines of ``file_path`` with whitespace stripped.

    Blank lines are skipped: an empty entry would otherwise end up in the
    output as a bare base URL with no path behind it.

    Args:
        file_path: Path of the text file to read (one entry per line).

    Returns:
        A list of stripped, non-empty lines in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]


def list_files_in_directory(directory):
    """List regular files in ``directory``, mapped to their ``.gz`` names.

    Every occurrence of ``"_urls.txt.zst"`` in a file name is replaced with
    ``".gz"`` so the result can be compared with base names from wat.paths.
    Sub-directories are ignored.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        A list of (possibly rewritten) file names, in ``os.listdir`` order.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    return [
        name.replace("_urls.txt.zst", ".gz")
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]


def main():
    """Ask for the input locations and write the list of missing WAT URLs."""
    # Get the directory containing the wat.paths file.
    wat_paths_directory = get_user_input("Enter the directory containing the wat.paths file")
    wat_paths_file_path = os.path.join(wat_paths_directory, 'wat.paths')

    if not os.path.isfile(wat_paths_file_path):
        print("Error: wat.paths file not found in the specified directory.")
        return

    # Get the last path component of the directory to compare against; it is
    # appended to the fixed WAT_URLS_ROOT.
    compare_last_level_directory = get_user_input(
        "Enter the CommonCrawl Folder directory to compare against "
        "(Eg; CC-MAIN-2023-40_September_October_2023)"
    )
    compare_directory = os.path.join(WAT_URLS_ROOT, compare_last_level_directory)

    # Fail with a message instead of an os.listdir traceback, mirroring the
    # wat.paths check above.
    if not os.path.isdir(compare_directory):
        print(f"Error: directory {compare_directory} not found.")
        return

    # A set gives O(1) membership tests while filtering the paths below.
    existing_names = set(list_files_in_directory(compare_directory))

    paths = read_paths_from_file(wat_paths_file_path)

    # Keep only the paths whose file name has no counterpart in the directory.
    missing_paths = [
        path for path in paths if os.path.basename(path) not in existing_names
    ]

    # Write the download URLs in a single pass, one per line.
    output_file_path = 'missing_wat_files.txt'
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(
            "\n".join(f"{COMMONCRAWL_BASE_URL}{path}" for path in missing_paths)
        )

    print(f"List of modified files in {compare_directory} written to {output_file_path}")


if __name__ == "__main__":
    main()