I have set up the Docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
Here’s a Python script that accomplishes your task:
import os
from contextlib import ExitStack

# Stream inputs in fixed-size chunks so arbitrarily large files never have
# to fit in memory at once.
_CHUNK_SIZE = 1 << 20  # 1 MiB


def main():
    """Merge the files listed in /tmp/files into 16 size-balanced outputs.

    Reads one path per line from /tmp/files, sorts the files by on-disk
    size (largest first), and greedily appends each file to whichever of
    the 16 output files /tmp/merged_0 .. /tmp/merged_15 currently holds
    the fewest bytes, so the outputs end up approximately equal in size.
    """
    # Read the list of file paths, skipping blank lines so a trailing
    # newline in /tmp/files cannot crash os.path.getsize('').
    with open('/tmp/files', 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]

    # Pair each path with its size and sort largest-first; the greedy
    # "add to smallest bin" heuristic needs this order to balance well.
    file_sizes = [(path, os.path.getsize(path)) for path in file_paths]
    file_sizes.sort(key=lambda item: item[1], reverse=True)

    output_files = [f'/tmp/merged_{i}' for i in range(16)]
    output_sizes = [0] * 16

    # ExitStack opens all 16 outputs without 16 hand-written nested
    # `with` clauses and still guarantees every handle is closed on exit.
    # Binary mode keeps byte counts consistent with os.path.getsize.
    with ExitStack() as stack:
        file_handles = [stack.enter_context(open(path, 'wb'))
                        for path in output_files]

        for path, size in file_sizes:
            # First bin with the minimum accumulated size — same
            # deterministic tie-breaking as list.index(min(...)).
            min_index = output_sizes.index(min(output_sizes))

            # Stream this input into the chosen bin chunk by chunk.
            with open(path, 'rb') as input_file:
                for chunk in iter(lambda: input_file.read(_CHUNK_SIZE), b''):
                    file_handles[min_index].write(chunk)

            output_sizes[min_index] += size


if __name__ == '__main__':
    main()
I extracted the following code from that output:
import os


def main():
    """Distribute the files listed in /tmp/files across /tmp/merged_0..15.

    The listing holds one input path per line.  Inputs are sorted by size,
    largest first, and each one is appended to whichever of the 16 output
    files currently holds the fewest bytes, so the outputs come out
    roughly equal in size.
    """
    # One input path per line.
    with open('/tmp/files', 'r') as listing:
        paths = [entry.strip() for entry in listing.readlines()]

    # Largest files first so the greedy placement below balances well.
    sized = sorted(((p, os.path.getsize(p)) for p in paths),
                   key=lambda pair: pair[1], reverse=True)

    targets = [f'/tmp/merged_{i}' for i in range(16)]
    bin_sizes = [0] * 16

    # Open every output for writing up front; close them all when done.
    handles = []
    try:
        for name in targets:
            handles.append(open(name, 'w'))

        for path, size in sized:
            # Pick the first bin whose accumulated size is smallest.
            slot = min(range(16), key=bin_sizes.__getitem__)

            with open(path, 'r') as source:
                handles[slot].write(source.read())

            bin_sizes[slot] += size
    finally:
        for handle in handles:
            handle.close()


if __name__ == '__main__':
    main()
Running the following program:
import os


def _least_loaded(byte_counts):
    """Return the index of the first output bin holding the fewest bytes."""
    return byte_counts.index(min(byte_counts))


def main():
    """Merge every file named in /tmp/files into 16 size-balanced outputs.

    /tmp/files holds one path per line.  Inputs are processed from largest
    to smallest, and each is concatenated onto whichever of
    /tmp/merged_0 ... /tmp/merged_15 is smallest at that moment.
    """
    with open('/tmp/files', 'r') as listing:
        entries = [raw.strip() for raw in listing.readlines()]

    # Attach sizes, then order the work list biggest-first so the greedy
    # placement below keeps the bins balanced.
    work = [(name, os.path.getsize(name)) for name in entries]
    work.sort(key=lambda item: item[1], reverse=True)

    destinations = [f'/tmp/merged_{idx}' for idx in range(16)]
    written = [0] * 16

    sinks = []
    try:
        # All 16 destinations stay open for the duration of the run.
        for destination in destinations:
            sinks.append(open(destination, 'w'))

        for name, size in work:
            bucket = _least_loaded(written)
            # Append this input's full text to the emptiest bucket.
            with open(name, 'r') as source:
                sinks[bucket].write(source.read())
            written[bucket] += size
    finally:
        for sink in sinks:
            sink.close()


if __name__ == '__main__':
    main()
final: True Resulting in output: True