I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
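The requested strategy is a greedy largest-first assignment: sort the inputs by size, descending, and always hand the next file to whichever of the 16 outputs is currently smallest. As a point of reference (illustrative names, no file I/O, not taken from either answer below), the core of that assignment could be sketched as:

```python
import heapq

def assign_to_buckets(sizes, n_buckets=16):
    # sizes: list of (size, label) pairs; labels identify each input file
    heap = [(0, i) for i in range(n_buckets)]   # (current total, bucket index)
    heapq.heapify(heap)
    buckets = [[] for _ in range(n_buckets)]
    for size, label in sorted(sizes, reverse=True):   # largest first
        total, idx = heapq.heappop(heap)              # currently smallest bucket
        buckets[idx].append(label)
        heapq.heappush(heap, (total + size, idx))
    return buckets
```

Both answers below implement this idea with a heap; they differ mainly in how the merged bytes are written out.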
# Python Script to Merge Files with Balanced Distribution
```python
#!/usr/bin/env python3
import os
import heapq

def main():
    # Read the list of file paths
    with open('/tmp/files', 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]

    # Get size of each file and sort (largest first)
    files_with_sizes = []
    for path in file_paths:
        try:
            size = os.path.getsize(path)
            files_with_sizes.append((size, path))
        except (OSError, IOError) as e:
            print(f"Warning: Could not get size of {path}: {e}")

    # Sort by size (largest first)
    files_with_sizes.sort(reverse=True)

    # Initialize 16 output files (clean or create them)
    for i in range(16):
        with open(f'/tmp/merged_{i}', 'wb') as f:
            pass

    # Initialize min-heap to track output file sizes
    output_files = [(0, i) for i in range(16)]  # (size, index)
    heapq.heapify(output_files)

    # Process each input file
    for file_size, file_path in files_with_sizes:
        # Get the smallest output file
        curr_size, idx = heapq.heappop(output_files)
        output_path = f'/tmp/merged_{idx}'

        try:
            # Read the content of the input file as binary
            with open(file_path, 'rb') as in_f:
                content = in_f.read()

            # Append content to the output file
            with open(output_path, 'ab') as out_f:
                out_f.write(content)

            # Update the size of the output file and put it back in the heap
            new_size = curr_size + file_size
            heapq.heappush(output_files, (new_size, idx))
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            # Put the output file back with its original size
            heapq.heappush(output_files, (curr_size, idx))

    # Print final size distribution
    sizes = [size for size, _ in output_files]
    print(f"Files merged successfully into 16 files.")
    print(f"Size distribution: min={min(sizes)}, max={max(sizes)}, diff={max(sizes)-min(sizes)}")

if __name__ == "__main__":
    main()
```
I extracted the code from that output; it is identical to the listing above. Running that program produced:
Files merged successfully into 16 files.
Size distribution: min=36803, max=36809, diff=6
final: True
Resulting in output: True
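The distribution reported above is computed from the sizes tracked in the heap; it can also be cross-checked against the files actually written to disk. A minimal sketch of that check (assuming the 16 outputs exist at /tmp/merged_0 through /tmp/merged_15):

```python
import os

# Re-measure the merged outputs on disk and report the spread
sizes = [os.path.getsize(f'/tmp/merged_{i}') for i in range(16)]
print(f"min={min(sizes)}, max={max(sizes)}, diff={max(sizes) - min(sizes)}")
```

Because this first script copies bytes in binary mode, the on-disk sizes should agree with the tracked ones (min=36803, max=36809, diff=6).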
I have set up the docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
# File Merger and Distributor

Here's a Python script that reads file paths from `/tmp/files`, merges their contents, and distributes them across 16 output files with approximately equal sizes:
```python
#!/usr/bin/env python3
import os
import heapq
import sys

def merge_and_distribute_files():
    # Read the list of file paths
    try:
        with open('/tmp/files', 'r') as f:
            file_paths = [line.strip() for line in f.readlines()]
    except Exception as e:
        print(f"Error reading input file: {e}")
        sys.exit(1)

    # Get file sizes and sort by size (largest first)
    file_sizes = []
    for file_path in file_paths:
        try:
            size = os.path.getsize(file_path)
            file_sizes.append((size, file_path))
        except Exception as e:
            print(f"Warning: Couldn't get size of {file_path}: {e}")

    if not file_sizes:
        print("No valid files found")
        return

    file_sizes.sort(reverse=True)

    # Initialize 16 output buckets
    output_files = [f'/tmp/merged_{i}' for i in range(16)]
    output_sizes = [0] * 16

    # Min heap to always get the smallest output file
    size_heap = [(0, i) for i in range(16)]
    heapq.heapify(size_heap)

    # Track which files go to which output
    file_distribution = [[] for _ in range(16)]

    # Distribute files to the output buckets
    for file_size, file_path in file_sizes:
        # Get the smallest output bucket
        current_size, index = heapq.heappop(size_heap)

        # Add file to this bucket
        file_distribution[index].append(file_path)

        # Update the size and put back in heap
        new_size = current_size + file_size
        output_sizes[index] = new_size
        heapq.heappush(size_heap, (new_size, index))

    # Now merge the files into each output file
    for i, files in enumerate(file_distribution):
        try:
            with open(output_files[i], 'w') as out_file:
                for file_path in files:
                    try:
                        with open(file_path, 'r') as in_file:
                            out_file.write(in_file.read())
                    except Exception as e:
                        print(f"Warning: Could not read {file_path}: {e}")
            print(f"Created {output_files[i]} with size {output_sizes[i]} bytes")
        except Exception as e:
            print(f"Error creating {output_files[i]}: {e}")

if __name__ == "__main__":
    merge_and_distribute_files()
```
I extracted the code from that output; it is identical to the listing above. Running that program produced:
Created /tmp/merged_0 with size 36998 bytes
Created /tmp/merged_1 with size 37004 bytes
Created /tmp/merged_2 with size 36999 bytes
Created /tmp/merged_3 with size 37004 bytes
Created /tmp/merged_4 with size 37003 bytes
Created /tmp/merged_5 with size 37003 bytes
Created /tmp/merged_6 with size 36997 bytes
Created /tmp/merged_7 with size 36998 bytes
Created /tmp/merged_8 with size 37002 bytes
Created /tmp/merged_9 with size 36996 bytes
Created /tmp/merged_10 with size 37002 bytes
Created /tmp/merged_11 with size 37000 bytes
Created /tmp/merged_12 with size 36999 bytes
Created /tmp/merged_13 with size 36997 bytes
Created /tmp/merged_14 with size 37004 bytes
Created /tmp/merged_15 with size 36995 bytes
final: True
Resulting in output: True
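A further sanity check one could run by hand, not part of the harness output above: confirm that the 16 merged files together contain as many bytes as the inputs. This is a hedged sketch; it assumes the inputs are plain text that survives the text-mode reads and writes used by this second script unchanged, since binary or oddly encoded files could shift the totals.

```python
import os

# Total bytes across the listed input files (skipping any that are missing)
with open('/tmp/files') as f:
    paths = [line.strip() for line in f if line.strip()]
total_in = sum(os.path.getsize(p) for p in paths if os.path.exists(p))

# Total bytes across the 16 merged outputs
total_out = sum(os.path.getsize(f'/tmp/merged_{i}') for i in range(16))

print(total_in, total_out, total_in == total_out)
```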