Update commoncrawl_wat_path_comparer.py
This commit is contained in:
parent
7d5b3653c6
commit
10c2658bff
@ -1,57 +1,57 @@
|
||||
import os
|
||||
|
||||
def get_user_input(message, default_value=""):
    """Prompt the user on standard input and return the answer or a default.

    Args:
        message: Prompt text shown to the user.
        default_value: Value returned when the user enters nothing. It is
            shown in parentheses after the prompt when it is not empty.

    Returns:
        The entered text with surrounding whitespace removed, or
        ``default_value`` when the answer is empty or whitespace-only.
    """
    # Only advertise a default when there is one; an empty "()" is noise.
    hint = f" ({default_value})" if default_value != "" else ""
    # Strip so stray spaces from a pasted path never reach the file system,
    # and so a whitespace-only answer means "use the default".
    answer = input(f"{message}{hint}: ").strip()
    return answer if answer else default_value
|
||||
|
||||
def read_paths_from_file(file_path):
    """Read a path listing and return its non-empty lines.

    Args:
        file_path: Path of a text file containing one path per line.

    Returns:
        A list of the file's lines with surrounding whitespace removed.
        Lines that are empty after stripping are skipped so that callers
        never receive an empty path.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline at end of file) are not paths.
        return [path for path in stripped_lines if path]
|
||||
|
||||
def list_files_in_directory(directory):
    """Return the names of the regular files in a directory.

    A trailing ``_urls.txt`` is removed from each name. Sub-directories are
    ignored, and so are names that become empty once the suffix is removed.

    Args:
        directory: Path of the directory to list.

    Returns:
        A sorted list of file names with the ``_urls.txt`` suffix removed.

    Raises:
        OSError: If the directory cannot be listed.
    """
    suffix = "_urls.txt"
    names = []
    for entry in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, entry)):
            continue
        # Remove the marker only when it is a real suffix; str.replace would
        # also cut it out of the middle of a name.
        name = entry[:-len(suffix)] if entry.endswith(suffix) else entry
        # An empty name is a substring of every path, so it must never be
        # handed to a caller that does substring matching.
        if name:
            names.append(name)
    # os.listdir order is arbitrary; sort for reproducible results.
    return sorted(names)
|
||||
|
||||
def main():
    """Write download URLs for WAT paths that have no matching local file.

    Prompts for the directory holding ``wat.paths`` and for a crawl folder
    name below ``/media/ArchiveTeam/CommonCrawl_Files/WAT_URLs/``. Every path
    listed in ``wat.paths`` that does not contain the name of any file in
    that folder (with ``_urls.txt`` removed) is written to
    ``missing_wat_files.txt`` in the current working directory, prefixed
    with ``https://data.commoncrawl.org/``.

    Prints an error and returns early when ``wat.paths`` or the crawl folder
    cannot be found.
    """
    # Get the directory containing the wat.paths file. The outer quotes are
    # single so the double quotes around the file name are part of the text.
    wat_paths_directory = get_user_input('Enter the directory containing the "wat.paths" file')

    # Construct the full path to the wat.paths file
    wat_paths_file_path = os.path.join(wat_paths_directory, 'wat.paths')

    # Check if the wat.paths file exists
    if not os.path.isfile(wat_paths_file_path):
        print("Error: wat.paths file not found in the specified directory.")
        return

    # Get the last level of the directory to compare against
    compare_last_level_directory = get_user_input("Enter the CommonCrawl Folder directory to compare against (Eg; CC-MAIN-2023-40_September_October_2023):")

    # Combine the pre-filled path with the user-provided last level directory
    compare_directory = os.path.join("/media/ArchiveTeam/CommonCrawl_Files/WAT_URLs/", compare_last_level_directory)

    # Fail with a readable message instead of a traceback from os.listdir
    if not os.path.isdir(compare_directory):
        print(f"Error: directory {compare_directory} not found.")
        return

    # Names of the files already present. An empty name is a substring of
    # every path and would hide all results, so it is dropped.
    directory_files = [name for name in list_files_in_directory(compare_directory) if name]

    # Paths from wat.paths. Blank entries would become bare base URLs.
    paths = [path for path in read_paths_from_file(wat_paths_file_path) if path]

    # Keep only the paths that match none of the existing file names
    filtered_paths = [
        path for path in paths
        if not any(name in path for name in directory_files)
    ]

    # Write the prefixed URLs in a single pass
    output_file_path = 'missing_wat_files.txt'
    with open(output_file_path, 'w') as output_file:
        output_file.write("\n".join(f"https://data.commoncrawl.org/{path}" for path in filtered_paths))

    print(f"List of modified files in {compare_directory} written to {output_file_path}")
|
||||
|
||||
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
import os
|
||||
|
||||
def get_user_input(message, default_value=""):
    """Prompt the user on standard input and return the answer or a default.

    Args:
        message: Prompt text shown to the user.
        default_value: Value returned when the user enters nothing. It is
            shown in parentheses after the prompt when it is not empty.

    Returns:
        The entered text with surrounding whitespace removed, or
        ``default_value`` when the answer is empty or whitespace-only.
    """
    # Only advertise a default when there is one; an empty "()" is noise.
    hint = f" ({default_value})" if default_value != "" else ""
    # Strip so stray spaces from a pasted path never reach the file system,
    # and so a whitespace-only answer means "use the default".
    answer = input(f"{message}{hint}: ").strip()
    return answer if answer else default_value
|
||||
|
||||
def read_paths_from_file(file_path):
    """Read a path listing and return its non-empty lines.

    Args:
        file_path: Path of a text file containing one path per line.

    Returns:
        A list of the file's lines with surrounding whitespace removed.
        Lines that are empty after stripping are skipped so that callers
        never receive an empty path.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline at end of file) are not paths.
        return [path for path in stripped_lines if path]
|
||||
|
||||
def list_files_in_directory(directory):
    """Return the names of the regular files in a directory.

    A trailing ``_urls.txt`` is removed from each name. Sub-directories are
    ignored, and so are names that become empty once the suffix is removed.

    Args:
        directory: Path of the directory to list.

    Returns:
        A sorted list of file names with the ``_urls.txt`` suffix removed.

    Raises:
        OSError: If the directory cannot be listed.
    """
    suffix = "_urls.txt"
    names = []
    for entry in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, entry)):
            continue
        # Remove the marker only when it is a real suffix; str.replace would
        # also cut it out of the middle of a name.
        name = entry[:-len(suffix)] if entry.endswith(suffix) else entry
        # An empty name is a substring of every path, so it must never be
        # handed to a caller that does substring matching.
        if name:
            names.append(name)
    # os.listdir order is arbitrary; sort for reproducible results.
    return sorted(names)
|
||||
|
||||
def main():
    """Write download URLs for WAT paths that have no matching local file.

    Prompts for the directory holding ``wat.paths`` and for a crawl folder
    name below ``/media/ArchiveTeam/CommonCrawl_Files/WAT_URLs/``. Every path
    listed in ``wat.paths`` that does not contain the name of any file in
    that folder (with ``_urls.txt`` removed) is written to
    ``missing_wat_files.txt`` in the current working directory, prefixed
    with ``https://data.commoncrawl.org/``.

    Prints an error and returns early when ``wat.paths`` or the crawl folder
    cannot be found.
    """
    # Get the directory containing the wat.paths file
    wat_paths_directory = get_user_input("Enter the directory containing the wat.paths file")

    # Construct the full path to the wat.paths file
    wat_paths_file_path = os.path.join(wat_paths_directory, 'wat.paths')

    # Check if the wat.paths file exists
    if not os.path.isfile(wat_paths_file_path):
        print("Error: wat.paths file not found in the specified directory.")
        return

    # Get the last level of the directory to compare against
    compare_last_level_directory = get_user_input("Enter the CommonCrawl Folder directory to compare against (Eg; CC-MAIN-2023-40_September_October_2023):")

    # Combine the pre-filled path with the user-provided last level directory
    compare_directory = os.path.join("/media/ArchiveTeam/CommonCrawl_Files/WAT_URLs/", compare_last_level_directory)

    # Fail with a readable message instead of a traceback from os.listdir
    if not os.path.isdir(compare_directory):
        print(f"Error: directory {compare_directory} not found.")
        return

    # Names of the files already present. An empty name is a substring of
    # every path and would hide all results, so it is dropped.
    directory_files = [name for name in list_files_in_directory(compare_directory) if name]

    # Paths from wat.paths. Blank entries would become bare base URLs.
    paths = [path for path in read_paths_from_file(wat_paths_file_path) if path]

    # Keep only the paths that match none of the existing file names
    filtered_paths = [
        path for path in paths
        if not any(name in path for name in directory_files)
    ]

    # Write the prefixed URLs in a single pass
    output_file_path = 'missing_wat_files.txt'
    with open(output_file_path, 'w') as output_file:
        output_file.write("\n".join(f"https://data.commoncrawl.org/{path}" for path in filtered_paths))

    print(f"List of modified files in {compare_directory} written to {output_file_path}")
|
||||
|
||||
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
|
Loading…
Reference in New Issue
Block a user