I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
Here's a Python script that will merge the contents of files listed in /tmp/files into 16 approximately equal-sized output files:
import os
from pathlib import Path

NUM_OUTPUT_FILES = 16

# Read the list of file paths
with open('/tmp/files', 'r') as f:
    file_paths = [line.strip() for line in f if line.strip()]

# Get total size of all files
total_size = 0
file_sizes = []
for path in file_paths:
    try:
        size = os.path.getsize(path)
        total_size += size
        file_sizes.append((path, size))
    except OSError:
        print(f"Warning: Could not access file {path}")
        continue

# Calculate target size per output file
target_size = total_size / NUM_OUTPUT_FILES

# Distribute files across output files
current_output = 0
current_size = 0
output_files = [open(f'/tmp/merged_{i}', 'wb') for i in range(NUM_OUTPUT_FILES)]

try:
    for filepath, size in file_sizes:
        try:
            with open(filepath, 'rb') as infile:
                data = infile.read()
                output_files[current_output].write(data)
                current_size += size

                # If current output file is approximately at target size, move to next
                if current_size >= target_size and current_output < NUM_OUTPUT_FILES - 1:
                    current_output += 1
                    current_size = 0
        except IOError:
            print(f"Warning: Could not read file {filepath}")
            continue
finally:
    # Close all output files
    for f in output_files:
        f.close()

print(f"Merged files into {NUM_OUTPUT_FILES} files of approximately {target_size/1024/1024:.2f} MB each")
I extracted that code from the output and ran it; the program printed:
Merged files into 16 files of approximately 0.03 MB each
final: False Resulting in output: False
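The transcript doesn't show the harness's pass/fail test, but the weakness of this first attempt is visible in the algorithm: files are written in list order, each bucket is closed as soon as it crosses the target (however far past it the last file pushed it), and the final bucket just absorbs whatever remains. A quick balance check along these lines (a hypothetical sketch, not the actual grader) makes the imbalance easy to see:

import os

# Hypothetical sanity check, not the evaluation harness's real test:
# compare the sizes of the 16 merged files and report the spread.
sizes = [os.path.getsize(f"/tmp/merged_{i}") for i in range(16)]
print(f"total: {sum(sizes)} bytes, min: {min(sizes)}, max: {max(sizes)}")
if min(sizes) > 0:
    # Roughly equal buckets should give a ratio close to 1.
    print(f"max/min ratio: {max(sizes) / min(sizes):.2f}")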
I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
Here's a Python script that will distribute the contents of files listed in /tmp/files into 16 merged files of approximately equal size:
import os

NUM_OUTPUT_FILES = 16

# Read the list of file paths
with open('/tmp/files', 'r') as f:
    file_paths = [line.strip() for line in f if line.strip()]

# Get file sizes and total size
file_sizes = []
total_size = 0
for path in file_paths:
    try:
        size = os.path.getsize(path)
        file_sizes.append((path, size))
        total_size += size
    except OSError:
        print(f"Warning: Could not access file {path}")
        continue

# Calculate target size per output file
target_size = total_size / NUM_OUTPUT_FILES

# Sort files by size (largest first) for better distribution
file_sizes.sort(key=lambda x: x[1], reverse=True)

# Initialize output files with size tracking
output_files = [(f"/tmp/merged_{i}", 0) for i in range(NUM_OUTPUT_FILES)]

# Distribute files to output buckets
for file_path, file_size in file_sizes:
    # Find the output file with smallest current size
    output_file, current_size = min(output_files, key=lambda x: x[1])

    # Add file to the selected output
    output_idx = int(output_file.split('_')[1])

    try:
        with open(file_path, 'rb') as input_file:
            with open(output_file, 'ab') as output:
                output.write(input_file.read())
                output.write(b'\n')  # Add newline between files
    except IOError as e:
        print(f"Error processing file {file_path}: {e}")
        continue

    # Update size tracking
    output_files[output_idx] = (output_file, current_size + file_size)

# Print final size distribution
for output_file, size in output_files:
    print(f"{output_file}: {size / 1024 / 1024:.2f} MB")
I extracted that code from the output and ran it; the program printed:
/tmp/merged_0: 0.04 MB
/tmp/merged_1: 0.04 MB
/tmp/merged_2: 0.04 MB
/tmp/merged_3: 0.04 MB
/tmp/merged_4: 0.04 MB
/tmp/merged_5: 0.04 MB
/tmp/merged_6: 0.04 MB
/tmp/merged_7: 0.04 MB
/tmp/merged_8: 0.04 MB
/tmp/merged_9: 0.04 MB
/tmp/merged_10: 0.04 MB
/tmp/merged_11: 0.04 MB
/tmp/merged_12: 0.04 MB
/tmp/merged_13: 0.04 MB
/tmp/merged_14: 0.04 MB
/tmp/merged_15: 0.04 MB
final: True Resulting in output: True
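The min(output_files, key=...) scan in this attempt is perfectly fine for 16 buckets; the same largest-first greedy placement is usually written with a heap when the bucket count grows. A minimal sketch of that variant (assign_to_buckets and its signature are mine, not from the transcript):

import heapq

def assign_to_buckets(file_sizes, num_buckets=16):
    """Greedy largest-first placement; file_sizes is a list of (path, size)."""
    buckets = [[] for _ in range(num_buckets)]
    heap = [(0, i) for i in range(num_buckets)]  # (accumulated size, bucket index)
    heapq.heapify(heap)
    for path, size in sorted(file_sizes, key=lambda x: x[1], reverse=True):
        bucket_size, idx = heapq.heappop(heap)   # emptiest bucket so far
        buckets[idx].append(path)
        heapq.heappush(heap, (bucket_size + size, idx))
    return buckets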
I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
Here's a Python script that will merge the contents of files into 16 approximately equal-sized output files:
import os
from pathlib import Path

NUM_OUTPUT_FILES = 16

def get_file_sizes(file_list_path):
    """Get sizes of all input files."""
    sizes = []
    with open(file_list_path, 'r') as f:
        for filepath in f:
            filepath = filepath.strip()
            try:
                sizes.append((filepath, os.path.getsize(filepath)))
            except OSError:
                print(f"Warning: Could not get size for {filepath}")
    return sizes

def distribute_files(files_with_sizes):
    """Distribute files among output buckets to balance total sizes."""
    # Sort files by size in descending order
    sorted_files = sorted(files_with_sizes, key=lambda x: x[1], reverse=True)

    # Initialize buckets
    buckets = [[] for _ in range(NUM_OUTPUT_FILES)]
    bucket_sizes = [0] * NUM_OUTPUT_FILES

    # Distribute files using a greedy approach
    for filepath, size in sorted_files:
        # Find bucket with minimum current size
        min_bucket_idx = bucket_sizes.index(min(bucket_sizes))
        buckets[min_bucket_idx].append(filepath)
        bucket_sizes[min_bucket_idx] += size

    return buckets

def merge_files(buckets):
    """Merge files according to the calculated distribution."""
    for i, bucket in enumerate(buckets):
        output_path = f"/tmp/merged_{i}"
        print(f"Creating {output_path}")
        with open(output_path, 'wb') as outfile:
            for filepath in bucket:
                try:
                    with open(filepath, 'rb') as infile:
                        outfile.write(infile.read())
                except IOError as e:
                    print(f"Error processing {filepath}: {e}")

def main():
    file_list_path = "/tmp/files"

    # Get sizes of all input files
    print("Calculating file sizes...")
    files_with_sizes = get_file_sizes(file_list_path)

    # Distribute files among buckets
    print("Distributing files...")
    buckets = distribute_files(files_with_sizes)

    # Merge files according to distribution
    print("Merging files...")
    merge_files(buckets)

    print("Done!")

if __name__ == "__main__":
    main()
The answer also included a buffered variant of merge_files that copies each input in chunks rather than reading whole files into memory:

def merge_files(buckets):
    """Merge files according to the calculated distribution."""
    buffer_size = 1024 * 1024  # 1MB buffer

    for i, bucket in enumerate(buckets):
        output_path = f"/tmp/merged_{i}"
        print(f"Creating {output_path}")
        with open(output_path, 'wb') as outfile:
            for filepath in bucket:
                try:
                    with open(filepath, 'rb') as infile:
                        while True:
                            buffer = infile.read(buffer_size)
                            if not buffer:
                                break
                            outfile.write(buffer)
                except IOError as e:
                    print(f"Error processing {filepath}: {e}")
I extracted the combined code from that output (the full script above with the buffered merge_files substituted in) and ran it; the program printed:
Calculating file sizes...
Distributing files...
Merging files...
Creating /tmp/merged_0
Creating /tmp/merged_1
Creating /tmp/merged_2
Creating /tmp/merged_3
Creating /tmp/merged_4
Creating /tmp/merged_5
Creating /tmp/merged_6
Creating /tmp/merged_7
Creating /tmp/merged_8
Creating /tmp/merged_9
Creating /tmp/merged_10
Creating /tmp/merged_11
Creating /tmp/merged_12
Creating /tmp/merged_13
Creating /tmp/merged_14
Creating /tmp/merged_15
Done!
final: True Resulting in output: True
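One side note on the buffered merge_files above: the standard library's shutil.copyfileobj performs the same chunked copy, so the inner read/write loop could be replaced with a single call. A sketch, not part of the model's answer:

import shutil

def merge_files(buckets):
    """Same behavior as the buffered version: copy each input file in chunks."""
    for i, bucket in enumerate(buckets):
        with open(f"/tmp/merged_{i}", 'wb') as outfile:
            for filepath in bucket:
                try:
                    with open(filepath, 'rb') as infile:
                        shutil.copyfileobj(infile, outfile)  # chunked copy internally
                except IOError as e:
                    print(f"Error processing {filepath}: {e}")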
I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
Here's a Python script that will merge the contents of files listed in /tmp/files into 16 evenly-sized output files:
#!/usr/bin/env python3
import os
from pathlib import Path

NUM_OUTPUT_FILES = 16

# Read the list of input files
with open('/tmp/files', 'r') as f:
    input_files = [line.strip() for line in f if line.strip()]

# Calculate total size of all input files
total_size = 0
file_sizes = []
for filepath in input_files:
    try:
        size = os.path.getsize(filepath)
        total_size += size
        file_sizes.append((filepath, size))
    except OSError:
        print(f"Warning: Could not access file {filepath}")
        continue

# Calculate target size for each output file
target_size_per_file = total_size / NUM_OUTPUT_FILES

# Initialize output files and their current sizes
output_files = {i: {'size': 0, 'path': f'/tmp/merged_{i}'} for i in range(NUM_OUTPUT_FILES)}

# Sort files by size in descending order for better distribution
file_sizes.sort(key=lambda x: x[1], reverse=True)

# Distribute files to output buckets
for filepath, size in file_sizes:
    # Find the output file with the smallest current size
    target_output = min(output_files.items(), key=lambda x: x[1]['size'])[0]

    try:
        # Read input file
        with open(filepath, 'rb') as infile:
            content = infile.read()

        # Append to output file
        with open(output_files[target_output]['path'], 'ab') as outfile:
            outfile.write(content)

        # Update size tracking
        output_files[target_output]['size'] += size

    except IOError as e:
        print(f"Error processing file {filepath}: {e}")
        continue

# Print statistics
print("\nMerge complete. Output file sizes:")
for i in range(NUM_OUTPUT_FILES):
    size_mb = output_files[i]['size'] / (1024 * 1024)
    print(f"/tmp/merged_{i}: {size_mb:.2f} MB")
I extracted that code from the output and ran it; the program printed:
Merge complete. Output file sizes:
/tmp/merged_0: 0.04 MB
/tmp/merged_1: 0.04 MB
/tmp/merged_2: 0.04 MB
/tmp/merged_3: 0.04 MB
/tmp/merged_4: 0.04 MB
/tmp/merged_5: 0.04 MB
/tmp/merged_6: 0.04 MB
/tmp/merged_7: 0.04 MB
/tmp/merged_8: 0.04 MB
/tmp/merged_9: 0.04 MB
/tmp/merged_10: 0.04 MB
/tmp/merged_11: 0.04 MB
/tmp/merged_12: 0.04 MB
/tmp/merged_13: 0.04 MB
/tmp/merged_14: 0.04 MB
/tmp/merged_15: 0.04 MB
final: True Resulting in output: True
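This attempt re-opens the chosen output in append mode ('ab') for every input file, which also means a second run would append onto whatever a previous run left in /tmp/merged_*. A variant that opens each output once in 'wb' (truncating it) avoids both issues; a sketch assuming the same file_sizes list of (path, size) built above:

# Sketch only: open the 16 outputs once, truncate them, and track bucket sizes.
handles = [open(f"/tmp/merged_{i}", 'wb') for i in range(NUM_OUTPUT_FILES)]
bucket_sizes = [0] * NUM_OUTPUT_FILES
try:
    for filepath, size in sorted(file_sizes, key=lambda x: x[1], reverse=True):
        idx = bucket_sizes.index(min(bucket_sizes))  # emptiest bucket so far
        with open(filepath, 'rb') as infile:
            handles[idx].write(infile.read())
        bucket_sizes[idx] += size
finally:
    for handle in handles:
        handle.close()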
I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
Here's a Python script that will distribute the contents of files into 16 merged files of approximately equal size:
import os

NUM_OUTPUT_FILES = 16

def get_file_sizes(file_list_path):
    """Get sizes of all files from the list."""
    sizes = []
    with open(file_list_path, 'r') as f:
        for line in f:
            filepath = line.strip()
            try:
                sizes.append((filepath, os.path.getsize(filepath)))
            except (OSError, IOError):
                print(f"Warning: Could not get size for {filepath}")
    return sizes

def distribute_files(files_with_sizes):
    """Distribute files into NUM_OUTPUT_FILES buckets trying to balance total size."""
    # Sort files by size in descending order
    sorted_files = sorted(files_with_sizes, key=lambda x: x[1], reverse=True)

    # Initialize buckets
    buckets = [[] for _ in range(NUM_OUTPUT_FILES)]
    bucket_sizes = [0] * NUM_OUTPUT_FILES

    # Distribute files using greedy approach
    for filepath, size in sorted_files:
        # Find bucket with minimum current size
        min_bucket_idx = bucket_sizes.index(min(bucket_sizes))
        buckets[min_bucket_idx].append(filepath)
        bucket_sizes[min_bucket_idx] += size

    return buckets

def merge_files(buckets):
    """Merge files according to the calculated distribution."""
    for i, bucket in enumerate(buckets):
        output_path = f"/tmp/merged_{i}"
        with open(output_path, 'wb') as outfile:
            for filepath in bucket:
                try:
                    with open(filepath, 'rb') as infile:
                        outfile.write(infile.read())
                except (OSError, IOError) as e:
                    print(f"Warning: Error processing {filepath}: {e}")

def main():
    file_list_path = "/tmp/files"

    # Get file sizes
    print("Getting file sizes...")
    files_with_sizes = get_file_sizes(file_list_path)

    # Distribute files
    print("Calculating distribution...")
    buckets = distribute_files(files_with_sizes)

    # Merge files
    print("Merging files...")
    merge_files(buckets)

    # Print statistics
    print("\nDistribution statistics:")
    for i, bucket in enumerate(buckets):
        total_size = sum(os.path.getsize(f) for f in bucket)
        print(f"merged_{i}: {len(bucket)} files, {total_size/1024/1024:.2f} MB")

if __name__ == "__main__":
    main()
I extracted that code from the output and ran it; the program printed:
Getting file sizes...
Calculating distribution...
Merging files...
Distribution statistics:
merged_0: 45 files, 0.04 MB
merged_1: 48 files, 0.04 MB
merged_2: 48 files, 0.04 MB
merged_3: 53 files, 0.04 MB
merged_4: 53 files, 0.04 MB
merged_5: 68 files, 0.04 MB
merged_6: 68 files, 0.04 MB
merged_7: 69 files, 0.04 MB
merged_8: 68 files, 0.04 MB
merged_9: 68 files, 0.04 MB
merged_10: 68 files, 0.04 MB
merged_11: 69 files, 0.04 MB
merged_12: 69 files, 0.04 MB
merged_13: 68 files, 0.04 MB
merged_14: 69 files, 0.04 MB
merged_15: 69 files, 0.04 MB
final: True Resulting in output: True
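A final small point on this last attempt: the statistics loop calls os.path.getsize again for every input file, even though distribute_files already accumulated the per-bucket totals. Returning those totals alongside the buckets avoids the second pass; a sketch of that change (not from the original output):

def distribute_files(files_with_sizes):
    """As above, but also return the accumulated size of each bucket."""
    sorted_files = sorted(files_with_sizes, key=lambda x: x[1], reverse=True)
    buckets = [[] for _ in range(NUM_OUTPUT_FILES)]
    bucket_sizes = [0] * NUM_OUTPUT_FILES
    for filepath, size in sorted_files:
        min_bucket_idx = bucket_sizes.index(min(bucket_sizes))
        buckets[min_bucket_idx].append(filepath)
        bucket_sizes[min_bucket_idx] += size
    return buckets, bucket_sizes

main() would then unpack buckets, bucket_sizes = distribute_files(...) and print the statistics from bucket_sizes directly.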