I have set up the Docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a Python script that merges their contents into 16 files called /tmp/merged_%d for i in 0..15, where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
Okay, here's a Python script that accomplishes this. It reads the list of file paths, picks the target for each input by choosing whichever of the 16 output files is currently smallest, and copies the content chunk by chunk to keep memory usage bounded.
```python
#!/usr/bin/env python3
import os
import sys

# --- Configuration ---
INPUT_LIST_FILE = "/tmp/files"
OUTPUT_DIR = "/tmp"
OUTPUT_PREFIX = "merged_"
NUM_OUTPUT_FILES = 16
CHUNK_SIZE = 1024 * 1024  # Read/write in 1 MB chunks

# --- Helper Function ---
def get_file_size(filepath):
    """Safely gets the size of a file."""
    try:
        return os.path.getsize(filepath)
    except OSError as e:
        print(f"Warning: Could not get size for {filepath}: {e}", file=sys.stderr)
        return None

# --- Main Logic ---
def merge_files():
    # 1. Read the list of input file paths
    try:
        with open(INPUT_LIST_FILE, 'r') as f:
            # Read lines, strip whitespace, filter out empty lines
            input_file_paths = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: Input list file not found: {INPUT_LIST_FILE}", file=sys.stderr)
        sys.exit(1)
    except OSError as e:
        print(f"Error: Could not read input list file {INPUT_LIST_FILE}: {e}", file=sys.stderr)
        sys.exit(1)

    if not input_file_paths:
        print("Warning: Input list file is empty. No files to merge.", file=sys.stderr)
        # Create empty output files anyway and fall through to the
        # output-file opening logic below.

    print(f"Found {len(input_file_paths)} file paths to process.")

    # 2. Prepare output files and size tracking
    output_file_handles = []
    output_file_paths = []
    output_file_sizes = [0] * NUM_OUTPUT_FILES  # Current size of each output file

    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)  # Ensure output directory exists
        for i in range(NUM_OUTPUT_FILES):
            path = os.path.join(OUTPUT_DIR, f"{OUTPUT_PREFIX}{i}")
            output_file_paths.append(path)
            # 'wb' gives a clean slate on reruns; use 'ab' instead if you
            # need to append across runs.
            handle = open(path, 'wb')
            output_file_handles.append(handle)
    except OSError as e:
        print(f"Error: Could not open output files in {OUTPUT_DIR}: {e}", file=sys.stderr)
        # Clean up any already opened files
        for handle in output_file_handles:
            handle.close()
        sys.exit(1)

    # 3. Process each input file and distribute content
    files_processed = 0
    bytes_processed = 0
    skipped_files = 0

    print("Starting merge process...")
    for input_path in input_file_paths:
        input_size = get_file_size(input_path)
        if input_size is None:
            print(f"Skipping file due to size error: {input_path}", file=sys.stderr)
            skipped_files += 1
            continue

        # Find the index of the output file that is currently smallest
        target_index = output_file_sizes.index(min(output_file_sizes))
        target_handle = output_file_handles[target_index]
        target_path = output_file_paths[target_index]

        try:
            with open(input_path, 'rb') as infile:
                while True:
                    chunk = infile.read(CHUNK_SIZE)
                    if not chunk:
                        break
                    target_handle.write(chunk)
            # Update the size tracker for the target file
            output_file_sizes[target_index] += input_size
            files_processed += 1
            bytes_processed += input_size
        except FileNotFoundError:
            print(f"Warning: Input file disappeared or is not accessible: {input_path}", file=sys.stderr)
            skipped_files += 1
        except OSError as e:
            print(f"Warning: Error reading {input_path} or writing to {target_path}: {e}", file=sys.stderr)
            skipped_files += 1
        except Exception as e:
            print(f"Warning: Unexpected error processing {input_path}: {e}", file=sys.stderr)
            skipped_files += 1

    # 4. Clean up: close all output files
    print("Closing output files...")
    for handle in output_file_handles:
        try:
            handle.close()
        except OSError as e:
            print(f"Warning: Error closing file {handle.name}: {e}", file=sys.stderr)

    # 5. Final report
    print("\n--- Merge Complete ---")
    print(f"Successfully processed {files_processed} input files.")
    print(f"Skipped {skipped_files} input files due to errors.")
    print(f"Total bytes written: {bytes_processed}")
    print("Final approximate sizes of output files:")
    for i in range(NUM_OUTPUT_FILES):
        # Get the actual final size for more accuracy than our tracker
        final_size = get_file_size(output_file_paths[i])
        size_str = f"{final_size} bytes" if final_size is not None else "Error getting size"
        print(f"  {output_file_paths[i]}: {size_str}")

if __name__ == "__main__":
    merge_files()
```
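One caveat with this greedy assignment: files are placed in list order, so a few very large files arriving late can leave the bins uneven. A minimal alternative sketch (an editorial addition, not part of the answer above) sorts the inputs largest-first and tracks bin sizes with a min-heap, which typically tightens the balance:

```python
#!/usr/bin/env python3
# LPT-style greedy sketch: assign the largest remaining file to the
# currently smallest bin. Paths and constants mirror the script above.
import heapq
import os

NUM_OUTPUT_FILES = 16

with open("/tmp/files") as f:
    paths = [line.strip() for line in f if line.strip()]

# Pair each input with its size, largest first; skip unreadable entries.
sized = sorted(
    ((os.path.getsize(p), p) for p in paths if os.path.isfile(p)),
    reverse=True,
)

# Min-heap of (current_size, bin_index): the root is always the smallest bin.
bins = [(0, i) for i in range(NUM_OUTPUT_FILES)]
heapq.heapify(bins)
assignment = [[] for _ in range(NUM_OUTPUT_FILES)]

for size, path in sized:
    bin_size, i = heapq.heappop(bins)
    assignment[i].append(path)
    heapq.heappush(bins, (bin_size + size, i))

# assignment[i] now lists the inputs destined for /tmp/merged_<i>;
# writing them out works exactly as in the chunked copy loop above.
```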
I extracted the code from that output; it is identical to the script above.
Running the extracted program produced the following output:
Found 1000 file paths to process.
Starting merge process...
Closing output files...
--- Merge Complete ---
Successfully processed 1000 input files.
Skipped 0 input files due to errors.
Total bytes written: 581964
Final approximate sizes of output files:
/tmp/merged_0: 36070 bytes
/tmp/merged_1: 36056 bytes
/tmp/merged_2: 36477 bytes
/tmp/merged_3: 36281 bytes
/tmp/merged_4: 36502 bytes
/tmp/merged_5: 36354 bytes
/tmp/merged_6: 36612 bytes
/tmp/merged_7: 36175 bytes
/tmp/merged_8: 36461 bytes
/tmp/merged_9: 36373 bytes
/tmp/merged_10: 36943 bytes
/tmp/merged_11: 36387 bytes
/tmp/merged_12: 36457 bytes
/tmp/merged_13: 36104 bytes
/tmp/merged_14: 36453 bytes
/tmp/merged_15: 36259 bytes
final: False
Resulting in output: False
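The harness's actual pass/fail criterion is not shown in this log. For reference, a minimal, hypothetical sketch of how one might sanity-check the merged outputs against the run above:

```python
#!/usr/bin/env python3
# Hypothetical consistency check -- not the harness's real verifier.
import os

with open("/tmp/files") as f:
    input_total = sum(
        os.path.getsize(p) for p in (line.strip() for line in f) if p
    )

sizes = [os.path.getsize(f"/tmp/merged_{i}") for i in range(16)]

print("input bytes:", input_total)              # should match "Total bytes written"
print("output bytes:", sum(sizes))              # 581964 in the run above
print("size spread:", max(sizes) - min(sizes))  # 887 bytes in the run above
```

In the run above the spread (887 bytes) is about 2.4% of the mean output size (581964 / 16 ≈ 36373 bytes), so the failing verdict presumably stems from whatever stricter or different criterion the harness applies.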