I have set up the Docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a Python script that merges them all and splits the content into 16 files called /tmp/merged_%d for i in 0..15, where each is approximately the same size. To do this, sort the files by size (largest first) and always append the next file to the currently smallest output file. "/tmp/files" is a list of file paths, one per line.
Okay, here is a Python script that accomplishes the task. It reads the file list, gets the size of each file, sorts them largest first, and then iteratively appends the content of the next largest file to the currently smallest output file.
import os
import sys

# --- Configuration ---
FILE_LIST_PATH = '/tmp/files'
OUTPUT_DIR = '/tmp'
OUTPUT_PREFIX = 'merged_'
NUM_OUTPUT_FILES = 16
CHUNK_SIZE = 1024 * 1024  # Read/write in 1MB chunks for efficiency with large files
# --- End Configuration ---


def get_files_with_sizes(list_path):
    """Reads the list file and returns a list of (filepath, size) tuples."""
    files_with_sizes = []
    print(f"Reading file list from: {list_path}")
    try:
        with open(list_path, 'r') as f_list:
            for line in f_list:
                filepath = line.strip()
                if not filepath:
                    continue
                if os.path.isfile(filepath):
                    try:
                        size = os.path.getsize(filepath)
                        files_with_sizes.append((filepath, size))
                    except OSError as e:
                        print(f"Warning: Could not get size for '{filepath}'. Skipping. Error: {e}", file=sys.stderr)
                else:
                    print(f"Warning: Path '{filepath}' listed in {list_path} is not a file or does not exist. Skipping.", file=sys.stderr)
    except FileNotFoundError:
        print(f"Error: File list '{list_path}' not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file list '{list_path}': {e}", file=sys.stderr)
        sys.exit(1)

    if not files_with_sizes:
        print("Error: No valid files found from the list.", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(files_with_sizes)} valid files.")
    return files_with_sizes


def merge_and_split_files(files_with_sizes, num_output, out_dir, out_prefix):
    """Merges files and splits them according to the specified strategy."""
    # Sort files by size, largest first
    files_with_sizes.sort(key=lambda item: item[1], reverse=True)
    print("Sorted files by size (largest first).")

    # Initialize output file sizes and handles
    output_file_paths = [os.path.join(out_dir, f"{out_prefix}{i}") for i in range(num_output)]
    output_file_sizes = [0] * num_output
    output_file_handles = []

    try:
        # Ensure output directory exists (though /tmp usually does)
        os.makedirs(out_dir, exist_ok=True)

        # Open all output files in append binary mode ('ab').
        # Opening them once is more efficient than open/close for each input file.
        print("Opening output files...")
        for i in range(num_output):
            path = output_file_paths[i]
            # Clear the file if it exists from a previous run
            try:
                with open(path, 'wb') as f_out_init:
                    pass  # Just truncate the file
                # Now open in append mode
                output_file_handles.append(open(path, 'ab'))
            except IOError as e:
                print(f"Error opening output file '{path}' for writing: {e}", file=sys.stderr)
                # Close any already opened handles before exiting
                for handle in output_file_handles:
                    handle.close()
                sys.exit(1)

        print(f"Processing {len(files_with_sizes)} input files...")
        processed_count = 0

        # Iterate through sorted files (largest first)
        for filepath, size in files_with_sizes:
            # Find the index of the currently smallest output file
            min_size = float('inf')
            min_index = -1
            for i in range(num_output):
                if output_file_sizes[i] < min_size:
                    min_size = output_file_sizes[i]
                    min_index = i

            # Select the target output file handle
            target_handle = output_file_handles[min_index]
            target_path = output_file_paths[min_index]

            # print(f"  Adding '{os.path.basename(filepath)}' ({size} bytes) to '{os.path.basename(target_path)}' (current size: {output_file_sizes[min_index]})")

            # Append the content of the input file to the target output file
            try:
                with open(filepath, 'rb') as f_in:
                    while True:
                        chunk = f_in.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        target_handle.write(chunk)
                # Update the size tracker for the target output file
                output_file_sizes[min_index] += size
                processed_count += 1
                if processed_count % 100 == 0:
                    print(f"  Processed {processed_count}/{len(files_with_sizes)} files...")
            except IOError as e:
                print(f"Warning: Error reading '{filepath}' or writing to '{target_path}'. Skipping file. Error: {e}", file=sys.stderr)
            except Exception as e:
                print(f"Warning: Unexpected error processing '{filepath}'. Skipping file. Error: {e}", file=sys.stderr)

    finally:
        # Ensure all output files are closed
        print("Closing output files...")
        for handle in output_file_handles:
            if handle and not handle.closed:
                handle.close()

    print("\n--- Processing Complete ---")
    print("Final approximate sizes of output files:")
    for i in range(num_output):
        final_size_mb = output_file_sizes[i] / (1024 * 1024)
        print(f"  {output_file_paths[i]}: {output_file_sizes[i]} bytes (~{final_size_mb:.2f} MB)")


# --- Main Execution ---
if __name__ == "__main__":
    # 1. Get the list of files and their sizes
    files_to_process = get_files_with_sizes(FILE_LIST_PATH)

    # 2. Perform the merge and split operation
    merge_and_split_files(
        files_to_process,
        NUM_OUTPUT_FILES,
        OUTPUT_DIR,
        OUTPUT_PREFIX
    )

    print("\nScript finished successfully.")
Run it with:

python merge_split.py
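A note on the balancing step: the script above finds the smallest output file with a linear scan over the 16 size counters, which is perfectly adequate at this scale. The same "always append the next largest file to the currently smallest output" rule can also be expressed with a min-heap. The following is only an illustrative sketch of that variant; the function name greedy_partition and its defaults are illustrative and not part of the answer above.

# Hedged sketch: same greedy strategy as the script above ("largest file goes to
# the currently smallest output"), but selecting the target with a min-heap
# instead of a linear scan. Assumes /tmp/files lists one path per line.
import heapq
import os
import shutil

def greedy_partition(list_path='/tmp/files', num_output=16, out_dir='/tmp', prefix='merged_'):
    # Collect (size, path) for every readable file in the list.
    files = []
    with open(list_path) as f:
        for line in f:
            p = line.strip()
            if p and os.path.isfile(p):
                files.append((os.path.getsize(p), p))
    files.sort(reverse=True)  # largest first

    out_paths = [os.path.join(out_dir, f"{prefix}{i}") for i in range(num_output)]
    handles = [open(p, 'wb') for p in out_paths]
    # Heap of (current_size, output_index); the smallest output is always on top.
    heap = [(0, i) for i in range(num_output)]
    heapq.heapify(heap)

    try:
        for size, path in files:
            current, idx = heapq.heappop(heap)
            with open(path, 'rb') as src:
                shutil.copyfileobj(src, handles[idx])
            heapq.heappush(heap, (current + size, idx))
    finally:
        for h in handles:
            h.close()
    return sorted(heap)

if __name__ == '__main__':
    for total, idx in greedy_partition():
        print(f"/tmp/merged_{idx}: {total} bytes")

With only 16 outputs the heap buys little in practice, but it keeps the selection logic to two lines and scales gracefully if the number of outputs grows.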
I extracted the code from that output; it is identical to the script shown above.
Running that program produced the following output:
Reading file list from: /tmp/files
Found 1000 valid files.
Sorted files by size (largest first).
Opening output files...
Processing 1000 input files...
Processed 100/1000 files...
Processed 200/1000 files...
Processed 300/1000 files...
Processed 400/1000 files...
Processed 500/1000 files...
Processed 600/1000 files...
Processed 700/1000 files...
Processed 800/1000 files...
Processed 900/1000 files...
Processed 1000/1000 files...
Closing output files...
--- Processing Complete ---
Final approximate sizes of output files:
/tmp/merged_0: 37512 bytes (~0.04 MB)
/tmp/merged_1: 37511 bytes (~0.04 MB)
/tmp/merged_2: 37511 bytes (~0.04 MB)
/tmp/merged_3: 37516 bytes (~0.04 MB)
/tmp/merged_4: 37513 bytes (~0.04 MB)
/tmp/merged_5: 37511 bytes (~0.04 MB)
/tmp/merged_6: 37511 bytes (~0.04 MB)
/tmp/merged_7: 37511 bytes (~0.04 MB)
/tmp/merged_8: 37510 bytes (~0.04 MB)
/tmp/merged_9: 37511 bytes (~0.04 MB)
/tmp/merged_10: 37516 bytes (~0.04 MB)
/tmp/merged_11: 37510 bytes (~0.04 MB)
/tmp/merged_12: 37514 bytes (~0.04 MB)
/tmp/merged_13: 37510 bytes (~0.04 MB)
/tmp/merged_14: 37510 bytes (~0.04 MB)
/tmp/merged_15: 37512 bytes (~0.04 MB)
Script finished successfully.
final: True
Resulting in output: True
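As an independent sanity check, a minimal sketch along the lines below could confirm the result. It assumes /tmp/files and the /tmp/merged_* outputs still exist, sums the bytes on both sides, and reports how far apart the largest and smallest outputs are.

# Hedged sketch: verify that the 16 merged files together contain exactly as many
# bytes as the input files, and report the spread between largest and smallest.
import os

with open('/tmp/files') as f:
    inputs = [p.strip() for p in f if p.strip()]
total_in = sum(os.path.getsize(p) for p in inputs if os.path.isfile(p))

merged_sizes = [os.path.getsize(f"/tmp/merged_{i}") for i in range(16)]
total_out = sum(merged_sizes)

print(f"input bytes:  {total_in}")
print(f"output bytes: {total_out}")
print(f"spread: {max(merged_sizes) - min(merged_sizes)} bytes "
      f"(min {min(merged_sizes)}, max {max(merged_sizes)})")
assert total_in == total_out, "merged outputs do not account for all input bytes"

On the run shown above, the sizes range from 37510 to 37516 bytes, so the spread is only a few bytes, consistent with the greedy largest-first placement.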