I have set up the Docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
Sure! Below is a Python script that accomplishes the task you described. This script:

1. **Reads** a list of file paths from `/tmp/files`.
2. **Sorts** the files by size in descending order.
3. **Distributes** the files across 16 output files (`/tmp/merged_0` to `/tmp/merged_15`) so that each merged file has approximately the same total size, always assigning the next largest file to the currently smallest merged file.
4. **Merges** the assigned files into their respective output files.

### Python Script: `merge_and_split.py`
```python
#!/usr/bin/env python3
import os
import heapq
import sys

def main():
    input_list_path = "/tmp/files"
    output_dir = "/tmp"
    num_outputs = 16
    output_prefix = "merged_"

    # Step 1: Read all file paths
    try:
        with open(input_list_path, 'r') as f:
            file_paths = [line.strip() for line in f if line.strip()]
    except Exception as e:
        print(f"Error reading {input_list_path}: {e}", file=sys.stderr)
        sys.exit(1)

    if not file_paths:
        print(f"No file paths found in {input_list_path}.", file=sys.stderr)
        sys.exit(1)

    # Step 2: Get file sizes and sort descending
    file_sizes = []
    for path in file_paths:
        try:
            size = os.path.getsize(path)
            file_sizes.append((-size, path))  # Negative size for descending sort
        except Exception as e:
            print(f"Error getting size for {path}: {e}", file=sys.stderr)
            # Skip files whose size cannot be determined
            continue

    # Sort files by size descending
    file_sizes.sort()

    # Step 3: Initialize heap for output files
    # Heap elements are tuples: (current_total_size, output_index)
    heap = [(0, i) for i in range(num_outputs)]
    heapq.heapify(heap)

    # Mapping from output index to list of file paths
    assignments = {i: [] for i in range(num_outputs)}

    # Step 4: Assign each file (largest first) to the currently smallest output
    for neg_size, path in file_sizes:
        size = -neg_size  # Convert back to positive
        current_total, out_idx = heapq.heappop(heap)
        assignments[out_idx].append(path)
        new_total = current_total + size
        heapq.heappush(heap, (new_total, out_idx))

    # Optional: Print assignment sizes
    print("Assignment of files to output merged files:")
    for i in range(num_outputs):
        total_size = sum(os.path.getsize(p) for p in assignments[i] if os.path.exists(p))
        print(f"  /tmp/{output_prefix}{i}: {len(assignments[i])} files, {total_size} bytes")

    # Step 5: Merge files
    for i in range(num_outputs):
        output_path = os.path.join(output_dir, f"{output_prefix}{i}")
        try:
            with open(output_path, 'wb') as outfile:
                for infile_path in assignments[i]:
                    try:
                        with open(infile_path, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except Exception as e:
                        print(f"Error reading {infile_path}: {e}", file=sys.stderr)
                        # Skip unreadable inputs and keep merging
                        continue
        except Exception as e:
            print(f"Error writing to {output_path}: {e}", file=sys.stderr)
            continue

    print("Merging completed successfully.")

if __name__ == "__main__":
    main()
```
```bash
chmod +x merge_and_split.py
./merge_and_split.py
python3 merge_and_split.py
```
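Once the script has run, a quick way to confirm the 16 outputs are balanced is to compare their sizes directly. This is a minimal sketch, assuming the merged files were written to `/tmp` as above:

```python
import os

# Sizes of the 16 merged files (assumes /tmp/merged_0 .. /tmp/merged_15 exist)
sizes = [os.path.getsize(f"/tmp/merged_{i}") for i in range(16)]
print(f"min: {min(sizes)}, max: {max(sizes)}, spread: {max(sizes) - min(sizes)} bytes")
```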
I extracted the code above from that output.

Running that program:
Assignment of files to output merged files:
/tmp/merged_0: 35 files, 37727 bytes
/tmp/merged_1: 35 files, 37718 bytes
/tmp/merged_2: 38 files, 37722 bytes
/tmp/merged_3: 56 files, 37722 bytes
/tmp/merged_4: 60 files, 37726 bytes
/tmp/merged_5: 70 files, 37723 bytes
/tmp/merged_6: 70 files, 37722 bytes
/tmp/merged_7: 71 files, 37727 bytes
/tmp/merged_8: 71 files, 37721 bytes
/tmp/merged_9: 70 files, 37723 bytes
/tmp/merged_10: 71 files, 37718 bytes
/tmp/merged_11: 71 files, 37727 bytes
/tmp/merged_12: 70 files, 37721 bytes
/tmp/merged_13: 71 files, 37726 bytes
/tmp/merged_14: 70 files, 37719 bytes
/tmp/merged_15: 71 files, 37721 bytes
Merging completed successfully.
final: True

Resulting in output: True
I have set up the Docker container to run the model evaluation.
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
Sure! Below is a Python script that accomplishes the task you described. This script:

1. **Reads** the list of file paths from `/tmp/files`.
2. **Retrieves** the size of each file.
3. **Sorts** the files in descending order based on their size.
4. **Distributes** the files across 16 merged output files (`/tmp/merged_0` to `/tmp/merged_15`) using a greedy algorithm to ensure each merged file has approximately the same total size.
5. **Writes** the contents of each file to the assigned merged file.

### Python Script: `merge_files.py`
```python
import os
import heapq

def get_file_list(file_list_path):
    """
    Reads the list of file paths from the given file.

    :param file_list_path: Path to the file containing list of file paths
    :return: List of file paths
    """
    with open(file_list_path, 'r') as f:
        files = [line.strip() for line in f if line.strip()]
    return files

def get_file_sizes(file_paths):
    """
    Gets the sizes of the files.

    :param file_paths: List of file paths
    :return: List of tuples (file_path, size)
    """
    file_sizes = []
    for path in file_paths:
        try:
            size = os.path.getsize(path)
            file_sizes.append((path, size))
        except OSError as e:
            print(f"Error accessing {path}: {e}")
    return file_sizes

def assign_files_to_bins(file_sizes, num_bins=16):
    """
    Assigns files to bins to balance the total size.

    :param file_sizes: List of tuples (file_path, size), sorted descending by size
    :param num_bins: Number of bins/merged files
    :return: List where index represents bin number and contains list of file paths
    """
    # Initialize a heap with (current_total_size, bin_index)
    heap = [(0, i) for i in range(num_bins)]
    heapq.heapify(heap)

    # Initialize list of files for each bin
    bins = [[] for _ in range(num_bins)]

    for file_path, size in file_sizes:
        current_size, bin_index = heapq.heappop(heap)
        bins[bin_index].append(file_path)
        new_size = current_size + size
        heapq.heappush(heap, (new_size, bin_index))

    return bins

def merge_bins(bins, output_dir="/tmp", output_prefix="merged_"):
    """
    Merges the files in each bin into a single output file.

    :param bins: List where each element is a list of file paths
    :param output_dir: Directory to store merged files
    :param output_prefix: Prefix for merged files
    """
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for i, bin_files in enumerate(bins):
        output_path = os.path.join(output_dir, f"{output_prefix}{i}")
        try:
            with open(output_path, 'wb') as outfile:
                for file_path in bin_files:
                    try:
                        with open(file_path, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in chunks of 1MB
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Error reading {file_path}: {e}")
            # Optionally, print the total size of each merged file
            total_size = sum(os.path.getsize(fp) for fp in bin_files if os.path.exists(fp))
            print(f"Merged {len(bin_files)} files into {output_path} (Total size: {total_size} bytes)")
        except OSError as e:
            print(f"Error writing to {output_path}: {e}")

def main():
    file_list_path = "/tmp/files"

    print("Reading file list...")
    file_paths = get_file_list(file_list_path)
    print(f"Total files found: {len(file_paths)}")

    print("Getting file sizes...")
    file_sizes = get_file_sizes(file_paths)
    print(f"Total accessible files: {len(file_sizes)}")

    print("Sorting files by size (largest first)...")
    file_sizes.sort(key=lambda x: x[1], reverse=True)

    print("Assigning files to bins...")
    bins = assign_files_to_bins(file_sizes, num_bins=16)

    print("Merging files into output bins...")
    merge_bins(bins, output_dir="/tmp", output_prefix="merged_")

    print("Merging completed.")

if __name__ == "__main__":
    main()
```
```bash
chmod +x merge_files.py
python3 merge_files.py
```
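For intuition about why the greedy assignment keeps the bins balanced, here is a tiny self-contained demo of the same `heapq` pattern, using made-up sizes and 3 bins instead of 16 (the sizes are hypothetical, purely for illustration):

```python
import heapq

# Hypothetical file sizes, already sorted largest first
sizes = [9, 7, 6, 5, 4, 3]

# Heap of (current_total, bin_index): the smallest bin is always on top
heap = [(0, b) for b in range(3)]
heapq.heapify(heap)
bins = [[] for _ in range(3)]

for s in sizes:
    total, b = heapq.heappop(heap)  # take the currently smallest bin
    bins[b].append(s)
    heapq.heappush(heap, (total + s, b))

print(bins)  # [[9, 3], [7, 4], [6, 5]] -> totals 12, 11, 11
```

This is the longest-processing-time (LPT) heuristic for multiway partitioning; sorting largest-first matters because the small files placed last smooth out whatever imbalance the large files created.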
I extracted the code above from that output.

Running that program:
Reading file list...
Total files found: 1000
Getting file sizes...
Total accessible files: 1000
Sorting files by size (largest first)...
Assigning files to bins...
Merging files into output bins...
Merged 33 files into /tmp/merged_0 (Total size: 37141 bytes)
Merged 42 files into /tmp/merged_1 (Total size: 37140 bytes)
Merged 50 files into /tmp/merged_2 (Total size: 37146 bytes)
Merged 51 files into /tmp/merged_3 (Total size: 37143 bytes)
Merged 58 files into /tmp/merged_4 (Total size: 37145 bytes)
Merged 69 files into /tmp/merged_5 (Total size: 37140 bytes)
Merged 69 files into /tmp/merged_6 (Total size: 37141 bytes)
Merged 70 files into /tmp/merged_7 (Total size: 37143 bytes)
Merged 70 files into /tmp/merged_8 (Total size: 37142 bytes)
Merged 70 files into /tmp/merged_9 (Total size: 37143 bytes)
Merged 70 files into /tmp/merged_10 (Total size: 37142 bytes)
Merged 70 files into /tmp/merged_11 (Total size: 37142 bytes)
Merged 70 files into /tmp/merged_12 (Total size: 37140 bytes)
Merged 70 files into /tmp/merged_13 (Total size: 37144 bytes)
Merged 69 files into /tmp/merged_14 (Total size: 37139 bytes)
Merged 69 files into /tmp/merged_15 (Total size: 37139 bytes)
Merging completed.
final: True

Resulting in output: True
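One further check, sketched here under the assumption that `/tmp/files` and the merged outputs are still in place: the 16 merged files together should contain exactly as many bytes as the accessible input files.

```python
import os

# Total bytes across the inputs listed in /tmp/files (skipping missing files)
with open("/tmp/files") as f:
    paths = [line.strip() for line in f if line.strip()]
total_in = sum(os.path.getsize(p) for p in paths if os.path.exists(p))

# Total bytes across the 16 merged outputs
total_out = sum(os.path.getsize(f"/tmp/merged_{i}") for i in range(16))

print(total_in, total_out, total_in == total_out)
```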