Docker Setup

I have set up the Docker container to run the model evaluation.

Initial Query

I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15, where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.

LLM Generation

Query

I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15, where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.

Output

Certainly! Below is a Python script that accomplishes the task of merging several thousand files listed in `/tmp/files` into 16 merged files (`/tmp/merged_0` through `/tmp/merged_15`) with approximately equal total sizes.

### Overview of the Script

1. **Read File Paths**: Reads all file paths from `/tmp/files`, assuming each line contains one file path.
2. **Retrieve File Sizes**: Determines the size of each file to aid in balanced distribution.
3. **Sort Files**: Sorts the files in descending order based on their sizes to optimize the balancing process.
4. **Assign Files to Merged Outputs**: Uses a greedy algorithm to assign each file to the merged file with the current smallest total size (a heap-based variant of this step is sketched just after the script).
5. **Merge Files**: Reads the content of each input file and appends it to the appropriate merged output file.

### Python Script

#!/usr/bin/env python3

import os
import sys
from collections import defaultdict

def read_file_paths(file_list_path):
    """Reads file paths from the given file."""
    with open(file_list_path, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]
    return file_paths

def get_file_sizes(file_paths):
    """Returns a list of tuples (file_path, size)."""
    files_with_sizes = []
    for path in file_paths:
        try:
            size = os.path.getsize(path)
            files_with_sizes.append((path, size))
        except OSError as e:
            print(f"Warning: Could not access {path}. Skipping. Error: {e}", file=sys.stderr)
    return files_with_sizes

def assign_files_to_merges(files_with_sizes, num_merges=16):
    """
    Assigns each file to one of the merged files aiming for balanced total sizes.
    
    Returns a dictionary mapping merge index to list of file paths.
    """
    # Sort files by size descending
    sorted_files = sorted(files_with_sizes, key=lambda x: x[1], reverse=True)
    
    # Initialize merge assignments and their current total sizes
    merge_assignments = defaultdict(list)
    merge_sizes = [0] * num_merges
    
    for file_path, size in sorted_files:
        # Find the merge index with the smallest current size
        min_index = merge_sizes.index(min(merge_sizes))
        merge_assignments[min_index].append(file_path)
        merge_sizes[min_index] += size
    
    return merge_assignments

def merge_files(merge_assignments, output_dir='/tmp', output_prefix='merged_', num_merges=16):
    """
    Merges the assigned files into the respective merged output files.
    """
    for merge_index in range(num_merges):
        output_file_path = os.path.join(output_dir, f"{output_prefix}{merge_index}")
        assigned_files = merge_assignments.get(merge_index, [])
        
        if not assigned_files:
            print(f"No files assigned to {output_file_path}. Skipping.")
            continue
        
        try:
            with open(output_file_path, 'wb') as outfile:
                for file_path in assigned_files:
                    try:
                        with open(file_path, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Warning: Could not read {file_path}. Skipping. Error: {e}", file=sys.stderr)
            print(f"Merged {len(assigned_files)} files into {output_file_path}")
        except OSError as e:
            print(f"Error: Could not write to {output_file_path}. Error: {e}", file=sys.stderr)

def main():
    file_list_path = '/tmp/files'
    num_merges = 16
    output_dir = '/tmp'
    output_prefix = 'merged_'
    
    print("Reading file paths...")
    file_paths = read_file_paths(file_list_path)
    print(f"Total files read: {len(file_paths)}")
    
    print("Retrieving file sizes...")
    files_with_sizes = get_file_sizes(file_paths)
    print(f"Total accessible files: {len(files_with_sizes)}")
    
    print("Assigning files to merged outputs...")
    merge_assignments = assign_files_to_merges(files_with_sizes, num_merges)
    
    # Optional: Print the total size assigned to each merge
    merge_sizes = [0] * num_merges
    for merge_index, files in merge_assignments.items():
        for file_path in files:
            size = os.path.getsize(file_path)
            merge_sizes[merge_index] += size
    for i, size in enumerate(merge_sizes):
        print(f"Merged_{i}: {size} bytes across {len(merge_assignments[i])} files")
    
    print("Merging files...")
    merge_files(merge_assignments, output_dir, output_prefix, num_merges)
    print("Merging completed.")

if __name__ == "__main__":
    main()
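One aside on efficiency: `merge_sizes.index(min(merge_sizes))` scans all 16 running totals for every file, which is fine here but grows linearly with the number of outputs. A min-heap makes the same greedy choice in logarithmic time. A minimal sketch of that variant (the name `assign_files_heap` is illustrative, not from the generated script):

import heapq

def assign_files_heap(files_with_sizes, num_merges=16):
    """Same greedy choice as assign_files_to_merges, but O(log num_merges) per file."""
    heap = [(0, i) for i in range(num_merges)]      # (current total size, merge index)
    heapq.heapify(heap)
    assignments = {i: [] for i in range(num_merges)}
    # Largest files first, matching the script above.
    for path, size in sorted(files_with_sizes, key=lambda x: x[1], reverse=True):
        total, idx = heapq.heappop(heap)            # output with the smallest running total
        assignments[idx].append(path)
        heapq.heappush(heap, (total + size, idx))
    return assignments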
### How to Use the Script

1. **Save the Script**: Save the script to a file, for example, `merge_files.py`.
2. **Make It Executable** (Optional): If you're on a UNIX-like system, you can make the script executable:

    chmod +x merge_files.py

3. **Run the Script**:

    ./merge_files.py

Or, if not executable:

    python3 merge_files.py
### Explanation of Key Components

- **Reading File Paths**: The `read_file_paths` function reads all non-empty lines from `/tmp/files` and strips any leading/trailing whitespace.
- **Handling File Sizes**: The `get_file_sizes` function attempts to get the size of each file. If a file cannot be accessed (due to permissions, being deleted, etc.), it logs a warning and skips that file.
- **Assigning Files**: The `assign_files_to_merges` function uses a greedy algorithm: sorting the files in descending order by size and always assigning the next largest file to the merged file with the current smallest total size. This approach helps in balancing the total sizes across all merged files.
- **Merging Process**: The `merge_files` function iterates over each merged file's assigned files, reading them in chunks (1MB in this case) to efficiently handle large files without consuming excessive memory (see the `shutil` sketch after these notes).
- **Progress and Error Reporting**: The script prints out informative messages about its progress and any issues encountered, such as inaccessible files.

### Notes and Considerations

- **Performance**: Reading and writing files in chunks optimizes memory usage, especially important when dealing with large files.
- **Error Handling**: The script includes basic error handling, logging warnings for files that can't be accessed and continuing with the rest.
- **Customization**:
  - **Number of Merges**: You can adjust the `num_merges` variable in the `main` function if you need a different number of merged output files.
  - **Output Directory and Prefix**: Modify `output_dir` and `output_prefix` in the `main` function if you want the merged files to be stored elsewhere or have a different naming convention.
- **Dependencies**: The script uses only standard Python libraries, so no additional installations are necessary.
- **Python Version**: Ensure you're using Python 3.x, as the script utilizes Python 3 features.

This script should efficiently distribute and merge your files into 16 balanced merged files. If you encounter any issues or have specific requirements, feel free to modify the script accordingly or reach out for further assistance.
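The 1MB chunk loop in `merge_files` can equivalently be written with the standard library's `shutil.copyfileobj`, which performs the same bounded-memory copy. A sketch (the helper `concat_files` is illustrative, not part of the generated script):

import shutil

def concat_files(paths, output_path, chunk_size=1024 * 1024):
    """Concatenate the given files into output_path without loading any file whole."""
    with open(output_path, 'wb') as outfile:
        for path in paths:
            with open(path, 'rb') as infile:
                # copyfileobj streams in chunks of the given length.
                shutil.copyfileobj(infile, outfile, chunk_size)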

Extract Code

I extracted the following code from that output:

#!/usr/bin/env python3

import os
import sys
from collections import defaultdict

def read_file_paths(file_list_path):
    """Reads file paths from the given file."""
    with open(file_list_path, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]
    return file_paths

def get_file_sizes(file_paths):
    """Returns a list of tuples (file_path, size)."""
    files_with_sizes = []
    for path in file_paths:
        try:
            size = os.path.getsize(path)
            files_with_sizes.append((path, size))
        except OSError as e:
            print(f"Warning: Could not access {path}. Skipping. Error: {e}", file=sys.stderr)
    return files_with_sizes

def assign_files_to_merges(files_with_sizes, num_merges=16):
    """
    Assigns each file to one of the merged files aiming for balanced total sizes.
    
    Returns a dictionary mapping merge index to list of file paths.
    """
    # Sort files by size descending
    sorted_files = sorted(files_with_sizes, key=lambda x: x[1], reverse=True)
    
    # Initialize merge assignments and their current total sizes
    merge_assignments = defaultdict(list)
    merge_sizes = [0] * num_merges
    
    for file_path, size in sorted_files:
        # Find the merge index with the smallest current size
        min_index = merge_sizes.index(min(merge_sizes))
        merge_assignments[min_index].append(file_path)
        merge_sizes[min_index] += size
    
    return merge_assignments

def merge_files(merge_assignments, output_dir='/tmp', output_prefix='merged_', num_merges=16):
    """
    Merges the assigned files into the respective merged output files.
    """
    for merge_index in range(num_merges):
        output_file_path = os.path.join(output_dir, f"{output_prefix}{merge_index}")
        assigned_files = merge_assignments.get(merge_index, [])
        
        if not assigned_files:
            print(f"No files assigned to {output_file_path}. Skipping.")
            continue
        
        try:
            with open(output_file_path, 'wb') as outfile:
                for file_path in assigned_files:
                    try:
                        with open(file_path, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Warning: Could not read {file_path}. Skipping. Error: {e}", file=sys.stderr)
            print(f"Merged {len(assigned_files)} files into {output_file_path}")
        except OSError as e:
            print(f"Error: Could not write to {output_file_path}. Error: {e}", file=sys.stderr)

def main():
    file_list_path = '/tmp/files'
    num_merges = 16
    output_dir = '/tmp'
    output_prefix = 'merged_'
    
    print("Reading file paths...")
    file_paths = read_file_paths(file_list_path)
    print(f"Total files read: {len(file_paths)}")
    
    print("Retrieving file sizes...")
    files_with_sizes = get_file_sizes(file_paths)
    print(f"Total accessible files: {len(files_with_sizes)}")
    
    print("Assigning files to merged outputs...")
    merge_assignments = assign_files_to_merges(files_with_sizes, num_merges)
    
    # Optional: Print the total size assigned to each merge
    merge_sizes = [0] * num_merges
    for merge_index, files in merge_assignments.items():
        for file_path in files:
            size = os.path.getsize(file_path)
            merge_sizes[merge_index] += size
    for i, size in enumerate(merge_sizes):
        print(f"Merged_{i}: {size} bytes across {len(merge_assignments[i])} files")
    
    print("Merging files...")
    merge_files(merge_assignments, output_dir, output_prefix, num_merges)
    print("Merging completed.")

if __name__ == "__main__":
    main()

Run Code Interpreter

Running the following program:

#!/usr/bin/env python3

import os
import sys
from collections import defaultdict

def read_file_paths(file_list_path):
    """Reads file paths from the given file."""
    with open(file_list_path, 'r') as f:
        file_paths = [line.strip() for line in f if line.strip()]
    return file_paths

def get_file_sizes(file_paths):
    """Returns a list of tuples (file_path, size)."""
    files_with_sizes = []
    for path in file_paths:
        try:
            size = os.path.getsize(path)
            files_with_sizes.append((path, size))
        except OSError as e:
            print(f"Warning: Could not access {path}. Skipping. Error: {e}", file=sys.stderr)
    return files_with_sizes

def assign_files_to_merges(files_with_sizes, num_merges=16):
    """
    Assigns each file to one of the merged files aiming for balanced total sizes.
    
    Returns a dictionary mapping merge index to list of file paths.
    """
    # Sort files by size descending
    sorted_files = sorted(files_with_sizes, key=lambda x: x[1], reverse=True)
    
    # Initialize merge assignments and their current total sizes
    merge_assignments = defaultdict(list)
    merge_sizes = [0] * num_merges
    
    for file_path, size in sorted_files:
        # Find the merge index with the smallest current size
        min_index = merge_sizes.index(min(merge_sizes))
        merge_assignments[min_index].append(file_path)
        merge_sizes[min_index] += size
    
    return merge_assignments

def merge_files(merge_assignments, output_dir='/tmp', output_prefix='merged_', num_merges=16):
    """
    Merges the assigned files into the respective merged output files.
    """
    for merge_index in range(num_merges):
        output_file_path = os.path.join(output_dir, f"{output_prefix}{merge_index}")
        assigned_files = merge_assignments.get(merge_index, [])
        
        if not assigned_files:
            print(f"No files assigned to {output_file_path}. Skipping.")
            continue
        
        try:
            with open(output_file_path, 'wb') as outfile:
                for file_path in assigned_files:
                    try:
                        with open(file_path, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Warning: Could not read {file_path}. Skipping. Error: {e}", file=sys.stderr)
            print(f"Merged {len(assigned_files)} files into {output_file_path}")
        except OSError as e:
            print(f"Error: Could not write to {output_file_path}. Error: {e}", file=sys.stderr)

def main():
    file_list_path = '/tmp/files'
    num_merges = 16
    output_dir = '/tmp'
    output_prefix = 'merged_'
    
    print("Reading file paths...")
    file_paths = read_file_paths(file_list_path)
    print(f"Total files read: {len(file_paths)}")
    
    print("Retrieving file sizes...")
    files_with_sizes = get_file_sizes(file_paths)
    print(f"Total accessible files: {len(files_with_sizes)}")
    
    print("Assigning files to merged outputs...")
    merge_assignments = assign_files_to_merges(files_with_sizes, num_merges)
    
    # Optional: Print the total size assigned to each merge
    merge_sizes = [0] * num_merges
    for merge_index, files in merge_assignments.items():
        for file_path in files:
            size = os.path.getsize(file_path)
            merge_sizes[merge_index] += size
    for i, size in enumerate(merge_sizes):
        print(f"Merged_{i}: {size} bytes across {len(merge_assignments[i])} files")
    
    print("Merging files...")
    merge_files(merge_assignments, output_dir, output_prefix, num_merges)
    print("Merging completed.")

if __name__ == "__main__":
    main()
And got the output:

Reading file paths...
Total files read: 1000
Retrieving file sizes...
Total accessible files: 1000
Assigning files to merged outputs...
Merged_0: 38388 bytes across 40 files
Merged_1: 38381 bytes across 43 files
Merged_2: 38392 bytes across 43 files
Merged_3: 38392 bytes across 45 files
Merged_4: 38379 bytes across 59 files
Merged_5: 38387 bytes across 70 files
Merged_6: 38392 bytes across 70 files
Merged_7: 38391 bytes across 70 files
Merged_8: 38385 bytes across 70 files
Merged_9: 38389 bytes across 70 files
Merged_10: 38390 bytes across 70 files
Merged_11: 38390 bytes across 70 files
Merged_12: 38381 bytes across 70 files
Merged_13: 38388 bytes across 70 files
Merged_14: 38388 bytes across 70 files
Merged_15: 38387 bytes across 70 files
Merging files...
Merged 40 files into /tmp/merged_0
Merged 43 files into /tmp/merged_1
Merged 43 files into /tmp/merged_2
Merged 45 files into /tmp/merged_3
Merged 59 files into /tmp/merged_4
Merged 70 files into /tmp/merged_5
Merged 70 files into /tmp/merged_6
Merged 70 files into /tmp/merged_7
Merged 70 files into /tmp/merged_8
Merged 70 files into /tmp/merged_9
Merged 70 files into /tmp/merged_10
Merged 70 files into /tmp/merged_11
Merged 70 files into /tmp/merged_12
Merged 70 files into /tmp/merged_13
Merged 70 files into /tmp/merged_14
Merged 70 files into /tmp/merged_15
Merging completed.

PyFunc

final: True

Resulting in output: True
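The PyFunc grader itself is not reproduced in this transcript. A check in the same spirit (purely a guess at what it might verify, not the actual test) could confirm that the merged outputs are close to equal in size:

import os

def roughly_balanced(num_merges=16, tolerance=0.05):
    """Hypothetical check: every /tmp/merged_i is within `tolerance` of the mean size."""
    sizes = [os.path.getsize(f"/tmp/merged_{i}") for i in range(num_merges)]
    mean = sum(sizes) / len(sizes)
    return all(abs(s - mean) <= tolerance * mean for s in sizes)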

Docker Setup

I have set up the Docker container to run the model evaluation.

Initial Query

I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15, where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.

LLM Generation

Query

I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content into 16 files called /tmp/merged_%d for i in 0..15, where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.

Output

Certainly! Below is a Python script that reads a list of file paths from `/tmp/files`, assigns each file to one of 16 merged output files (`/tmp/merged_0` to `/tmp/merged_15`) to ensure that each merged file is approximately the same size, and then concatenates the contents accordingly.

### Explanation:

1. **Read File Paths and Sizes:**
   - The script reads each file path from `/tmp/files` and retrieves its size in bytes.
2. **Sort Files by Size (Descending):**
   - To distribute the files more evenly, it sorts them in descending order based on their sizes. This helps in balancing the total sizes across the merged files using a greedy approach (a toy illustration follows the script).
3. **Assign Files to Merged Outputs:**
   - The script maintains a list of 16 merged files and keeps track of the current total size for each.
   - For each file, it assigns it to the merged file with the smallest current total size.
4. **Concatenate File Contents:**
   - After assignment, the script opens each merged file and writes the contents of its assigned files sequentially.

### Python Script:

#!/usr/bin/env python3

import os
import sys
from collections import defaultdict

def main():
    input_list = '/tmp/files'
    output_dir = '/tmp'
    num_outputs = 16
    output_files = [os.path.join(output_dir, f'merged_{i}') for i in range(num_outputs)]
    
    # Step 1: Read file paths and sizes
    files = []
    try:
        with open(input_list, 'r') as f:
            for line in f:
                filepath = line.strip()
                if not filepath:
                    continue
                try:
                    size = os.path.getsize(filepath)
                    files.append((filepath, size))
                except OSError as e:
                    print(f"Warning: Could not access {filepath}: {e}", file=sys.stderr)
    except FileNotFoundError:
        print(f"Error: Input list {input_list} not found.", file=sys.stderr)
        sys.exit(1)
    
    if not files:
        print("No valid files to process.", file=sys.stderr)
        sys.exit(1)
    
    # Step 2: Sort files by size descending
    files.sort(key=lambda x: x[1], reverse=True)
    
    # Step 3: Assign files to merged outputs
    # Initialize merged file sizes
    merged_sizes = [0] * num_outputs
    merged_assignments = defaultdict(list)  # key: output index, value: list of file paths
    
    for filepath, size in files:
        # Find the merged file with the smallest current size
        min_index = merged_sizes.index(min(merged_sizes))
        merged_assignments[min_index].append(filepath)
        merged_sizes[min_index] += size
    
    # Step 4: Write to merged files
    for i in range(num_outputs):
        output_path = output_files[i]
        print(f"Writing to {output_path} with {len(merged_assignments[i])} files, total size {merged_sizes[i]} bytes.")
        try:
            with open(output_path, 'wb') as outfile:
                for filepath in merged_assignments[i]:
                    try:
                        with open(filepath, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Warning: Could not read {filepath}: {e}", file=sys.stderr)
        except OSError as e:
            print(f"Error: Could not write to {output_path}: {e}", file=sys.stderr)
            sys.exit(1)
    
    print("Merging completed successfully.")

if __name__ == '__main__':
    main()
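As a quick illustration of the greedy choice in steps 2 and 3, here is the same logic run on five toy sizes and two bins (a sketch only, not part of the generated script):

sizes = [9, 7, 6, 5, 4]        # already sorted descending, as the script does
bins = [0, 0]                  # running totals for two outputs
for s in sizes:
    i = bins.index(min(bins))  # pick the bin with the smallest total so far
    bins[i] += s
print(bins)                    # [14, 17]: files 9+5 in one bin, 7+6+4 in the other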
### How to Use the Script:

1. **Save the Script:**
   - Save the above script to a file, for example, `merge_files.py`.
2. **Make the Script Executable:**

   chmod +x merge_files.py

3. **Ensure the Input List Exists:**
   - Make sure that `/tmp/files` exists and contains the list of file paths you want to merge, one per line.
4. **Run the Script:**

   ./merge_files.py
5. **Output:**
   - The script will create 16 files named `/tmp/merged_0` through `/tmp/merged_15`, each containing the concatenated contents of the assigned files.
   - It will also print progress messages indicating the number of files and total size assigned to each merged file.

### Notes:

- **Error Handling:**
  - The script includes basic error handling. It will skip files that cannot be accessed and continue processing the rest. Warnings are printed to `stderr`.
- **Performance Considerations:**
  - The script reads and writes files in chunks (1MB) to handle large files efficiently without consuming excessive memory.
- **Customization:**
  - If you need to change the number of merged files or the input/output directories, you can modify the `num_outputs`, `input_list`, and `output_dir` variables in the script accordingly (a command-line sketch follows these notes).
- **Dependencies:**
  - The script uses only standard Python libraries and should work with Python 3.x.

If you have any further questions or need additional features, feel free to ask!
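If editing variables in source is inconvenient, the same settings could be exposed on the command line. A minimal sketch using `argparse` (hypothetical; the generated script does not include this):

import argparse

def parse_args():
    """Hypothetical CLI for the hard-coded settings in main()."""
    parser = argparse.ArgumentParser(description="Merge listed files into N balanced outputs.")
    parser.add_argument("--input-list", default="/tmp/files",
                        help="file containing one input path per line")
    parser.add_argument("--output-dir", default="/tmp",
                        help="directory for the merged_* outputs")
    parser.add_argument("--num-outputs", type=int, default=16,
                        help="number of merged files to produce")
    return parser.parse_args()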

Extract Code

I extracted the following code from that output:

#!/usr/bin/env python3

import os
import sys
from collections import defaultdict

def main():
    input_list = '/tmp/files'
    output_dir = '/tmp'
    num_outputs = 16
    output_files = [os.path.join(output_dir, f'merged_{i}') for i in range(num_outputs)]
    
    # Step 1: Read file paths and sizes
    files = []
    try:
        with open(input_list, 'r') as f:
            for line in f:
                filepath = line.strip()
                if not filepath:
                    continue
                try:
                    size = os.path.getsize(filepath)
                    files.append((filepath, size))
                except OSError as e:
                    print(f"Warning: Could not access {filepath}: {e}", file=sys.stderr)
    except FileNotFoundError:
        print(f"Error: Input list {input_list} not found.", file=sys.stderr)
        sys.exit(1)
    
    if not files:
        print("No valid files to process.", file=sys.stderr)
        sys.exit(1)
    
    # Step 2: Sort files by size descending
    files.sort(key=lambda x: x[1], reverse=True)
    
    # Step 3: Assign files to merged outputs
    # Initialize merged file sizes
    merged_sizes = [0] * num_outputs
    merged_assignments = defaultdict(list)  # key: output index, value: list of file paths
    
    for filepath, size in files:
        # Find the merged file with the smallest current size
        min_index = merged_sizes.index(min(merged_sizes))
        merged_assignments[min_index].append(filepath)
        merged_sizes[min_index] += size
    
    # Step 4: Write to merged files
    for i in range(num_outputs):
        output_path = output_files[i]
        print(f"Writing to {output_path} with {len(merged_assignments[i])} files, total size {merged_sizes[i]} bytes.")
        try:
            with open(output_path, 'wb') as outfile:
                for filepath in merged_assignments[i]:
                    try:
                        with open(filepath, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Warning: Could not read {filepath}: {e}", file=sys.stderr)
        except OSError as e:
            print(f"Error: Could not write to {output_path}: {e}", file=sys.stderr)
            sys.exit(1)
    
    print("Merging completed successfully.")

if __name__ == '__main__':
    main()

Run Code Interpreter

Running the following program:

#!/usr/bin/env python3

import os
import sys
from collections import defaultdict

def main():
    input_list = '/tmp/files'
    output_dir = '/tmp'
    num_outputs = 16
    output_files = [os.path.join(output_dir, f'merged_{i}') for i in range(num_outputs)]
    
    # Step 1: Read file paths and sizes
    files = []
    try:
        with open(input_list, 'r') as f:
            for line in f:
                filepath = line.strip()
                if not filepath:
                    continue
                try:
                    size = os.path.getsize(filepath)
                    files.append((filepath, size))
                except OSError as e:
                    print(f"Warning: Could not access {filepath}: {e}", file=sys.stderr)
    except FileNotFoundError:
        print(f"Error: Input list {input_list} not found.", file=sys.stderr)
        sys.exit(1)
    
    if not files:
        print("No valid files to process.", file=sys.stderr)
        sys.exit(1)
    
    # Step 2: Sort files by size descending
    files.sort(key=lambda x: x[1], reverse=True)
    
    # Step 3: Assign files to merged outputs
    # Initialize merged file sizes
    merged_sizes = [0] * num_outputs
    merged_assignments = defaultdict(list)  # key: output index, value: list of file paths
    
    for filepath, size in files:
        # Find the merged file with the smallest current size
        min_index = merged_sizes.index(min(merged_sizes))
        merged_assignments[min_index].append(filepath)
        merged_sizes[min_index] += size
    
    # Step 4: Write to merged files
    for i in range(num_outputs):
        output_path = output_files[i]
        print(f"Writing to {output_path} with {len(merged_assignments[i])} files, total size {merged_sizes[i]} bytes.")
        try:
            with open(output_path, 'wb') as outfile:
                for filepath in merged_assignments[i]:
                    try:
                        with open(filepath, 'rb') as infile:
                            while True:
                                chunk = infile.read(1024 * 1024)  # Read in 1MB chunks
                                if not chunk:
                                    break
                                outfile.write(chunk)
                    except OSError as e:
                        print(f"Warning: Could not read {filepath}: {e}", file=sys.stderr)
        except OSError as e:
            print(f"Error: Could not write to {output_path}: {e}", file=sys.stderr)
            sys.exit(1)
    
    print("Merging completed successfully.")

if __name__ == '__main__':
    main()
And got the output:

Writing to /tmp/merged_0 with 51 files, total size 35331 bytes.
Writing to /tmp/merged_1 with 52 files, total size 35327 bytes.
Writing to /tmp/merged_2 with 53 files, total size 35331 bytes.
Writing to /tmp/merged_3 with 54 files, total size 35326 bytes.
Writing to /tmp/merged_4 with 55 files, total size 35326 bytes.
Writing to /tmp/merged_5 with 67 files, total size 35330 bytes.
Writing to /tmp/merged_6 with 67 files, total size 35325 bytes.
Writing to /tmp/merged_7 with 67 files, total size 35330 bytes.
Writing to /tmp/merged_8 with 67 files, total size 35330 bytes.
Writing to /tmp/merged_9 with 66 files, total size 35325 bytes.
Writing to /tmp/merged_10 with 67 files, total size 35332 bytes.
Writing to /tmp/merged_11 with 67 files, total size 35331 bytes.
Writing to /tmp/merged_12 with 67 files, total size 35331 bytes.
Writing to /tmp/merged_13 with 67 files, total size 35329 bytes.
Writing to /tmp/merged_14 with 67 files, total size 35330 bytes.
Writing to /tmp/merged_15 with 66 files, total size 35325 bytes.
Merging completed successfully.

PyFunc

final: True

Resulting in output: True