I have set up the Docker container to run the model evaluation.
Write a self-contained Python program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout: {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}. The program should be run by calling /path/to/python_word_count with no arguments. Make it run in parallel on 4 cores at the same time, each taking 1/4 of the file. Print out 4 different JSON dicts, one per line. I'll do the merging.
#!/usr/bin/env python3
"""
This is a self-contained Python program that reads the file /tmp/file.txt,
splits it into four (roughly) equal chunks (by bytes) and processes each chunk
in parallel (using 4 cores). Each process computes a word-count dictionary
(based on whitespace splitting) for the text in its chunk. To avoid counting
words twice at the boundaries, each worker adjusts its start/end boundaries so
that no word is split into two parts. Finally, the program prints 4 JSON
dictionaries (one per line) to stdout.

Run the program with no arguments:
    /path/to/python_word_count
"""

import os
import json
import multiprocessing
from collections import Counter

# We'll work in binary mode so that f.seek() works with file positions.
# After reading the raw bytes, we decode to text (assuming UTF-8).
# When splitting into words, we split on whitespace.

# Characters considered whitespace (in bytes)
WHITESPACE = b" \t\r\n"


def adjust_start(f, start):
    """
    Adjust the start position so that we don't start in the middle of a word.
    If start > 0 and the preceding byte is not whitespace, skip until the next
    whitespace. Return the new starting position.
    """
    if start != 0:
        # Move back one byte and check
        f.seek(start - 1)
        prev = f.read(1)
        # If the previous byte is not whitespace, we are in the middle of a word.
        if prev not in WHITESPACE:
            # Read until we hit whitespace (this will toss out the partial word).
            f.readline()
    return f.tell()


def adjust_end(f, end, filesize):
    """
    Adjust the end position so that we don't cut a word in half.
    If end < filesize, read forward until after the current word boundary.
    Return the new end position.
    """
    f.seek(end)
    if end < filesize:
        while True:
            ch = f.read(1)
            if not ch or ch in WHITESPACE:
                break
        return f.tell()
    else:
        return end


def process_chunk(args):
    """
    Process a chunk of the file.
    Receives a tuple (start, end, filesize) indicating the byte boundaries.
    Returns a dictionary with word counts from that chunk.
    """
    start, end, filesize = args
    counts = Counter()
    with open("/tmp/file.txt", "rb") as f:
        # Adjust the starting position so as not to start in the middle of a word.
        new_start = adjust_start(f, start)
        # Adjust the end position so as not to cut a word in half.
        new_end = adjust_end(f, end, filesize)
        # Compute how many bytes to read.
        to_read = new_end - new_start
        f.seek(new_start)
        raw_data = f.read(to_read)
    # Decode the bytes (assuming UTF-8 encoding)
    try:
        text_data = raw_data.decode("utf-8")
    except UnicodeDecodeError:
        # In case of decoding error, decode with replacement characters.
        text_data = raw_data.decode("utf-8", errors="replace")
    # Split by whitespace; note that punctuation is not removed.
    words = text_data.split()
    counts.update(words)
    return dict(counts)


def main():
    filepath = "/tmp/file.txt"
    if not os.path.exists(filepath):
        print("File /tmp/file.txt does not exist.", flush=True)
        return

    filesize = os.stat(filepath).st_size

    # We will split the file into 4 chunks (by bytes).
    # For the first 3 chunks, we use equal sizes; the last chunk takes the remainder.
    num_chunks = 4
    chunk_size = filesize // num_chunks

    # Prepare a list of (start, end, filesize) for each chunk.
    chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        # For the last chunk, end at the filesize.
        if i == num_chunks - 1:
            end = filesize
        else:
            end = (i + 1) * chunk_size
        chunks.append((start, end, filesize))

    # Use multiprocessing Pool to process the 4 chunks in parallel.
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.map(process_chunk, chunks)

    # Print each word count dict as a JSON object (one per line).
    for word_count in results:
        print(json.dumps(word_count, ensure_ascii=False))


if __name__ == "__main__":
    main()
I extracted the following code from that output:
#!/usr/bin/env python3
"""
This is a self-contained Python program that reads the file /tmp/file.txt,
splits it into four (roughly) equal chunks (by bytes) and processes each chunk
in parallel (using 4 cores). Each process computes a word-count dictionary
(based on whitespace splitting) for the text in its chunk. To avoid counting
words twice at the boundaries, each worker adjusts its start/end boundaries so
that no word is split into two parts. Finally, the program prints 4 JSON
dictionaries (one per line) to stdout.

Run the program with no arguments:
    /path/to/python_word_count
"""

import os
import json
import multiprocessing
from collections import Counter

# We'll work in binary mode so that f.seek() works with file positions.
# After reading the raw bytes, we decode to text (assuming UTF-8).
# When splitting into words, we split on whitespace.

# Characters considered whitespace (in bytes)
WHITESPACE = b" \t\r\n"


def adjust_start(f, start):
    """
    Adjust the start position so that we don't start in the middle of a word.
    If start > 0 and the preceding byte is not whitespace, skip until the next
    whitespace. Return the new starting position.
    """
    if start != 0:
        # Move back one byte and check
        f.seek(start - 1)
        prev = f.read(1)
        # If the previous byte is not whitespace, we are in the middle of a word.
        if prev not in WHITESPACE:
            # Read until we hit whitespace (this will toss out the partial word).
            f.readline()
    return f.tell()


def adjust_end(f, end, filesize):
    """
    Adjust the end position so that we don't cut a word in half.
    If end < filesize, read forward until after the current word boundary.
    Return the new end position.
    """
    f.seek(end)
    if end < filesize:
        while True:
            ch = f.read(1)
            if not ch or ch in WHITESPACE:
                break
        return f.tell()
    else:
        return end


def process_chunk(args):
    """
    Process a chunk of the file.
    Receives a tuple (start, end, filesize) indicating the byte boundaries.
    Returns a dictionary with word counts from that chunk.
    """
    start, end, filesize = args
    counts = Counter()
    with open("/tmp/file.txt", "rb") as f:
        # Adjust the starting position so as not to start in the middle of a word.
        new_start = adjust_start(f, start)
        # Adjust the end position so as not to cut a word in half.
        new_end = adjust_end(f, end, filesize)
        # Compute how many bytes to read.
        to_read = new_end - new_start
        f.seek(new_start)
        raw_data = f.read(to_read)
    # Decode the bytes (assuming UTF-8 encoding)
    try:
        text_data = raw_data.decode("utf-8")
    except UnicodeDecodeError:
        # In case of decoding error, decode with replacement characters.
        text_data = raw_data.decode("utf-8", errors="replace")
    # Split by whitespace; note that punctuation is not removed.
    words = text_data.split()
    counts.update(words)
    return dict(counts)


def main():
    filepath = "/tmp/file.txt"
    if not os.path.exists(filepath):
        print("File /tmp/file.txt does not exist.", flush=True)
        return

    filesize = os.stat(filepath).st_size

    # We will split the file into 4 chunks (by bytes).
    # For the first 3 chunks, we use equal sizes; the last chunk takes the remainder.
    num_chunks = 4
    chunk_size = filesize // num_chunks

    # Prepare a list of (start, end, filesize) for each chunk.
    chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        # For the last chunk, end at the filesize.
        if i == num_chunks - 1:
            end = filesize
        else:
            end = (i + 1) * chunk_size
        chunks.append((start, end, filesize))

    # Use multiprocessing Pool to process the 4 chunks in parallel.
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.map(process_chunk, chunks)

    # Print each word count dict as a JSON object (one per line).
    for word_count in results:
        print(json.dumps(word_count, ensure_ascii=False))


if __name__ == "__main__":
    main()
Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Just tell me if they tried to parallelize it.

#!/usr/bin/env python3
"""
This is a self-contained Python program that reads the file /tmp/file.txt,
splits it into four (roughly) equal chunks (by bytes) and processes each chunk
in parallel (using 4 cores). Each process computes a word-count dictionary
(based on whitespace splitting) for the text in its chunk. To avoid counting
words twice at the boundaries, each worker adjusts its start/end boundaries so
that no word is split into two parts. Finally, the program prints 4 JSON
dictionaries (one per line) to stdout.

Run the program with no arguments:
    /path/to/python_word_count
"""

import os
import json
import multiprocessing
from collections import Counter

# We'll work in binary mode so that f.seek() works with file positions.
# After reading the raw bytes, we decode to text (assuming UTF-8).
# When splitting into words, we split on whitespace.

# Characters considered whitespace (in bytes)
WHITESPACE = b" \t\r\n"


def adjust_start(f, start):
    """
    Adjust the start position so that we don't start in the middle of a word.
    If start > 0 and the preceding byte is not whitespace, skip until the next
    whitespace. Return the new starting position.
    """
    if start != 0:
        # Move back one byte and check
        f.seek(start - 1)
        prev = f.read(1)
        # If the previous byte is not whitespace, we are in the middle of a word.
        if prev not in WHITESPACE:
            # Read until we hit whitespace (this will toss out the partial word).
            f.readline()
    return f.tell()


def adjust_end(f, end, filesize):
    """
    Adjust the end position so that we don't cut a word in half.
    If end < filesize, read forward until after the current word boundary.
    Return the new end position.
    """
    f.seek(end)
    if end < filesize:
        while True:
            ch = f.read(1)
            if not ch or ch in WHITESPACE:
                break
        return f.tell()
    else:
        return end


def process_chunk(args):
    """
    Process a chunk of the file.
    Receives a tuple (start, end, filesize) indicating the byte boundaries.
    Returns a dictionary with word counts from that chunk.
    """
    start, end, filesize = args
    counts = Counter()
    with open("/tmp/file.txt", "rb") as f:
        # Adjust the starting position so as not to start in the middle of a word.
        new_start = adjust_start(f, start)
        # Adjust the end position so as not to cut a word in half.
        new_end = adjust_end(f, end, filesize)
        # Compute how many bytes to read.
        to_read = new_end - new_start
        f.seek(new_start)
        raw_data = f.read(to_read)
    # Decode the bytes (assuming UTF-8 encoding)
    try:
        text_data = raw_data.decode("utf-8")
    except UnicodeDecodeError:
        # In case of decoding error, decode with replacement characters.
        text_data = raw_data.decode("utf-8", errors="replace")
    # Split by whitespace; note that punctuation is not removed.
    words = text_data.split()
    counts.update(words)
    return dict(counts)


def main():
    filepath = "/tmp/file.txt"
    if not os.path.exists(filepath):
        print("File /tmp/file.txt does not exist.", flush=True)
        return

    filesize = os.stat(filepath).st_size

    # We will split the file into 4 chunks (by bytes).
    # For the first 3 chunks, we use equal sizes; the last chunk takes the remainder.
    num_chunks = 4
    chunk_size = filesize // num_chunks

    # Prepare a list of (start, end, filesize) for each chunk.
    chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        # For the last chunk, end at the filesize.
        if i == num_chunks - 1:
            end = filesize
        else:
            end = (i + 1) * chunk_size
        chunks.append((start, end, filesize))

    # Use multiprocessing Pool to process the 4 chunks in parallel.
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.map(process_chunk, chunks)

    # Print each word count dict as a JSON object (one per line).
    for word_count in results:
        print(json.dumps(word_count, ensure_ascii=False))


if __name__ == "__main__":
    main()

Answer either 'The student passes' or 'The student fails'.
The student passes.
Testing if the previous output contains the string student passes
: True
Running the following program:
#!/usr/bin/env python3
"""
This is a self-contained Python program that reads the file /tmp/file.txt,
splits it into four (roughly) equal chunks (by bytes) and processes each chunk
in parallel (using 4 cores). Each process computes a word-count dictionary
(based on whitespace splitting) for the text in its chunk. To avoid counting
words twice at the boundaries, each worker adjusts its start/end boundaries so
that no word is split into two parts. Finally, the program prints 4 JSON
dictionaries (one per line) to stdout.

Run the program with no arguments:
    /path/to/python_word_count
"""

import os
import json
import multiprocessing
from collections import Counter

# We'll work in binary mode so that f.seek() works with file positions.
# After reading the raw bytes, we decode to text (assuming UTF-8).
# When splitting into words, we split on whitespace.

# Characters considered whitespace (in bytes)
WHITESPACE = b" \t\r\n"


def adjust_start(f, start):
    """
    Adjust the start position so that we don't start in the middle of a word.
    If start > 0 and the preceding byte is not whitespace, skip until the next
    whitespace. Return the new starting position.
    """
    if start != 0:
        # Move back one byte and check
        f.seek(start - 1)
        prev = f.read(1)
        # If the previous byte is not whitespace, we are in the middle of a word.
        if prev not in WHITESPACE:
            # Read until we hit whitespace (this will toss out the partial word).
            f.readline()
    return f.tell()


def adjust_end(f, end, filesize):
    """
    Adjust the end position so that we don't cut a word in half.
    If end < filesize, read forward until after the current word boundary.
    Return the new end position.
    """
    f.seek(end)
    if end < filesize:
        while True:
            ch = f.read(1)
            if not ch or ch in WHITESPACE:
                break
        return f.tell()
    else:
        return end


def process_chunk(args):
    """
    Process a chunk of the file.
    Receives a tuple (start, end, filesize) indicating the byte boundaries.
    Returns a dictionary with word counts from that chunk.
    """
    start, end, filesize = args
    counts = Counter()
    with open("/tmp/file.txt", "rb") as f:
        # Adjust the starting position so as not to start in the middle of a word.
        new_start = adjust_start(f, start)
        # Adjust the end position so as not to cut a word in half.
        new_end = adjust_end(f, end, filesize)
        # Compute how many bytes to read.
        to_read = new_end - new_start
        f.seek(new_start)
        raw_data = f.read(to_read)
    # Decode the bytes (assuming UTF-8 encoding)
    try:
        text_data = raw_data.decode("utf-8")
    except UnicodeDecodeError:
        # In case of decoding error, decode with replacement characters.
        text_data = raw_data.decode("utf-8", errors="replace")
    # Split by whitespace; note that punctuation is not removed.
    words = text_data.split()
    counts.update(words)
    return dict(counts)


def main():
    filepath = "/tmp/file.txt"
    if not os.path.exists(filepath):
        print("File /tmp/file.txt does not exist.", flush=True)
        return

    filesize = os.stat(filepath).st_size

    # We will split the file into 4 chunks (by bytes).
    # For the first 3 chunks, we use equal sizes; the last chunk takes the remainder.
    num_chunks = 4
    chunk_size = filesize // num_chunks

    # Prepare a list of (start, end, filesize) for each chunk.
    chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        # For the last chunk, end at the filesize.
        if i == num_chunks - 1:
            end = filesize
        else:
            end = (i + 1) * chunk_size
        chunks.append((start, end, filesize))

    # Use multiprocessing Pool to process the 4 chunks in parallel.
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.map(process_chunk, chunks)

    # Print each word count dict as a JSON object (one per line).
    for word_count in results:
        print(json.dumps(word_count, ensure_ascii=False))


if __name__ == "__main__":
    main()
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
^^^^^^^^^^^^^^^^
File "/usr/src/app/main.py", line 74, in process_chunk
raw_data = f.read(to_read)
^^^^^^^^^^^^^^^
ValueError: read length must be non-negative or -1
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/src/app/main.py", line 119, in <module>
main()
File "/usr/src/app/main.py", line 112, in main
results = pool.map(process_chunk, chunks)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/multiprocessing/pool.py", line 367, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/multiprocessing/pool.py", line 774, in get
raise self._value
ValueError: read length must be non-negative or -1
Resulting in output: {}
Testing if the previous output equals the string {'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}
: False
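For context on the failure above: the ValueError appears to come from the boundary adjustment in the program. When a chunk's start falls in the middle of a word, adjust_start() calls f.readline(), which can skip past the chunk's adjusted end, so to_read = new_end - new_start goes negative and f.read() rejects it. Below is a minimal sketch of one possible guard; the helper name safe_read_chunk and the clamp-to-zero behavior are illustrative assumptions, not part of the program that was evaluated.

# Hypothetical guard for the negative read length seen in the traceback above.
# Assumption: boundary adjustment may leave new_start past new_end; clamping the
# length to zero makes that worker contribute an empty dict instead of raising.
def safe_read_chunk(path, new_start, new_end):
    to_read = max(0, new_end - new_start)  # never pass a negative length to f.read()
    with open(path, "rb") as f:
        f.seek(new_start)
        return f.read(to_read)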
I have set up the Docker container to run the model evaluation.
Write a self-contained Python program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout: {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}. The program should be run by calling /path/to/python_word_count with no arguments. Make it run in parallel on 4 cores at the same time, each taking 1/4 of the file. Print out 4 different JSON dicts, one per line. I'll do the merging.
#!/usr/bin/env python3
"""
A self-contained Python program to count how often each word appears in a
text file located at /tmp/file.txt, in parallel using 4 processes.

The file is split into 4 chunks (by lines) and each process counts the words
in its chunk. The program then prints out 4 different JSON dictionaries (one
per line), one for each quarter of the file. (You can later merge the 4
dictionaries.)

For example, if /tmp/file.txt contains the text:
    This is is is, an example
then one of the printed JSON lines might be:
    {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}

Run this program without any command-line arguments.
"""

import json
import sys
import multiprocessing
from collections import Counter


def count_words(lines):
    """
    Count words in a list of text lines.
    Splitting is done using the default str.split() which splits on any
    whitespace. Note that punctuation is not removed, so "is" and "is,"
    are counted as different words.
    """
    counter = Counter()
    for line in lines:
        # Splitting on whitespace
        words = line.split()
        counter.update(words)
    # Convert Counter to a regular dict for a cleaner JSON output.
    return dict(counter)


def main():
    filepath = "/tmp/file.txt"
    try:
        with open(filepath, "r") as f:
            lines = f.readlines()
    except Exception as e:
        sys.exit(f"Error reading file {filepath}: {e}")

    n_cores = 4
    total_lines = len(lines)

    # Divide lines into approx equal chunks (by lines).
    # This approach works well for files that have multiple lines.
    # (If the file is one huge line, then splitting by byte offset would be needed.)
    chunks = []
    base_chunk_size = total_lines // n_cores
    remainder = total_lines % n_cores
    start = 0
    for i in range(n_cores):
        # Distribute the remainder over the first few chunks.
        extra = 1 if i < remainder else 0
        end = start + base_chunk_size + extra
        chunks.append(lines[start:end])
        start = end

    # Use a multiprocessing pool to count words in parallel.
    with multiprocessing.Pool(processes=n_cores) as pool:
        results = pool.map(count_words, chunks)

    # Print each resulting dictionary as a JSON object on a separate line.
    for result in results:
        print(json.dumps(result))


if __name__ == "__main__":
    main()
I extracted the following code from that output:
#!/usr/bin/env python3
"""
A self-contained Python program to count how often each word appears in a
text file located at /tmp/file.txt, in parallel using 4 processes.

The file is split into 4 chunks (by lines) and each process counts the words
in its chunk. The program then prints out 4 different JSON dictionaries (one
per line), one for each quarter of the file. (You can later merge the 4
dictionaries.)

For example, if /tmp/file.txt contains the text:
    This is is is, an example
then one of the printed JSON lines might be:
    {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}

Run this program without any command-line arguments.
"""

import json
import sys
import multiprocessing
from collections import Counter


def count_words(lines):
    """
    Count words in a list of text lines.
    Splitting is done using the default str.split() which splits on any
    whitespace. Note that punctuation is not removed, so "is" and "is,"
    are counted as different words.
    """
    counter = Counter()
    for line in lines:
        # Splitting on whitespace
        words = line.split()
        counter.update(words)
    # Convert Counter to a regular dict for a cleaner JSON output.
    return dict(counter)


def main():
    filepath = "/tmp/file.txt"
    try:
        with open(filepath, "r") as f:
            lines = f.readlines()
    except Exception as e:
        sys.exit(f"Error reading file {filepath}: {e}")

    n_cores = 4
    total_lines = len(lines)

    # Divide lines into approx equal chunks (by lines).
    # This approach works well for files that have multiple lines.
    # (If the file is one huge line, then splitting by byte offset would be needed.)
    chunks = []
    base_chunk_size = total_lines // n_cores
    remainder = total_lines % n_cores
    start = 0
    for i in range(n_cores):
        # Distribute the remainder over the first few chunks.
        extra = 1 if i < remainder else 0
        end = start + base_chunk_size + extra
        chunks.append(lines[start:end])
        start = end

    # Use a multiprocessing pool to count words in parallel.
    with multiprocessing.Pool(processes=n_cores) as pool:
        results = pool.map(count_words, chunks)

    # Print each resulting dictionary as a JSON object on a separate line.
    for result in results:
        print(json.dumps(result))


if __name__ == "__main__":
    main()
Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Just tell me if they tried to parallelize it.

#!/usr/bin/env python3
"""
A self-contained Python program to count how often each word appears in a
text file located at /tmp/file.txt, in parallel using 4 processes.

The file is split into 4 chunks (by lines) and each process counts the words
in its chunk. The program then prints out 4 different JSON dictionaries (one
per line), one for each quarter of the file. (You can later merge the 4
dictionaries.)

For example, if /tmp/file.txt contains the text:
    This is is is, an example
then one of the printed JSON lines might be:
    {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}

Run this program without any command-line arguments.
"""

import json
import sys
import multiprocessing
from collections import Counter


def count_words(lines):
    """
    Count words in a list of text lines.
    Splitting is done using the default str.split() which splits on any
    whitespace. Note that punctuation is not removed, so "is" and "is,"
    are counted as different words.
    """
    counter = Counter()
    for line in lines:
        # Splitting on whitespace
        words = line.split()
        counter.update(words)
    # Convert Counter to a regular dict for a cleaner JSON output.
    return dict(counter)


def main():
    filepath = "/tmp/file.txt"
    try:
        with open(filepath, "r") as f:
            lines = f.readlines()
    except Exception as e:
        sys.exit(f"Error reading file {filepath}: {e}")

    n_cores = 4
    total_lines = len(lines)

    # Divide lines into approx equal chunks (by lines).
    # This approach works well for files that have multiple lines.
    # (If the file is one huge line, then splitting by byte offset would be needed.)
    chunks = []
    base_chunk_size = total_lines // n_cores
    remainder = total_lines % n_cores
    start = 0
    for i in range(n_cores):
        # Distribute the remainder over the first few chunks.
        extra = 1 if i < remainder else 0
        end = start + base_chunk_size + extra
        chunks.append(lines[start:end])
        start = end

    # Use a multiprocessing pool to count words in parallel.
    with multiprocessing.Pool(processes=n_cores) as pool:
        results = pool.map(count_words, chunks)

    # Print each resulting dictionary as a JSON object on a separate line.
    for result in results:
        print(json.dumps(result))


if __name__ == "__main__":
    main()

Answer either 'The student passes' or 'The student fails'.
The student passes.
Testing if the previous output contains the string student passes
: True
Running the following program:
#!/usr/bin/env python3
"""
A self-contained Python program to count how often each word appears in a
text file located at /tmp/file.txt, in parallel using 4 processes.

The file is split into 4 chunks (by lines) and each process counts the words
in its chunk. The program then prints out 4 different JSON dictionaries (one
per line), one for each quarter of the file. (You can later merge the 4
dictionaries.)

For example, if /tmp/file.txt contains the text:
    This is is is, an example
then one of the printed JSON lines might be:
    {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}

Run this program without any command-line arguments.
"""

import json
import sys
import multiprocessing
from collections import Counter


def count_words(lines):
    """
    Count words in a list of text lines.
    Splitting is done using the default str.split() which splits on any
    whitespace. Note that punctuation is not removed, so "is" and "is,"
    are counted as different words.
    """
    counter = Counter()
    for line in lines:
        # Splitting on whitespace
        words = line.split()
        counter.update(words)
    # Convert Counter to a regular dict for a cleaner JSON output.
    return dict(counter)


def main():
    filepath = "/tmp/file.txt"
    try:
        with open(filepath, "r") as f:
            lines = f.readlines()
    except Exception as e:
        sys.exit(f"Error reading file {filepath}: {e}")

    n_cores = 4
    total_lines = len(lines)

    # Divide lines into approx equal chunks (by lines).
    # This approach works well for files that have multiple lines.
    # (If the file is one huge line, then splitting by byte offset would be needed.)
    chunks = []
    base_chunk_size = total_lines // n_cores
    remainder = total_lines % n_cores
    start = 0
    for i in range(n_cores):
        # Distribute the remainder over the first few chunks.
        extra = 1 if i < remainder else 0
        end = start + base_chunk_size + extra
        chunks.append(lines[start:end])
        start = end

    # Use a multiprocessing pool to count words in parallel.
    with multiprocessing.Pool(processes=n_cores) as pool:
        results = pool.map(count_words, chunks)

    # Print each resulting dictionary as a JSON object on a separate line.
    for result in results:
        print(json.dumps(result))


if __name__ == "__main__":
    main()
{"it": 10, "was": 10, "the": 11, "best": 1, "of": 10, "times,": 2, "worst": 1, "age": 2, "wisdom,": 1, "foolishness,": 1, "epoch": 2, "belief,": 1, "incredulity,": 1, "season": 2, "Light,": 1, "Darkness,": 1, "spring": 1, "hope,": 1, "winter": 1, "despair,": 1, "we": 4, "had": 2, "everything": 1, "before": 2, "us,": 2, "nothing": 1, "were": 2, "all": 2, "going": 2, "direct": 2, "to": 1, "Heaven,": 1, "other": 1, "way": 1}
{}
{}
{}
Resulting in output: {'it': 10, 'was': 10, 'the': 11, 'best': 1, 'of': 10, 'times,': 2, 'worst': 1, 'age': 2, 'wisdom,': 1, 'foolishness,': 1, 'epoch': 2, 'belief,': 1, 'incredulity,': 1, 'season': 2, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'we': 4, 'had': 2, 'everything': 1, 'before': 2, 'us,': 2, 'nothing': 1, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}
Testing if the previous output equals the string {'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}
: True
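Since the transcript ends with the harness merging the four per-chunk dicts itself ("I'll do the merging"), here is a minimal sketch of how that merge could look, assuming each stdout line is one JSON dict of counts; the function name merge_counts and the use of collections.Counter are assumptions, not something shown in the log.

import json
from collections import Counter

def merge_counts(json_lines):
    """Sum per-chunk word counts, e.g. the four JSON lines printed above."""
    total = Counter()
    for line in json_lines:
        total.update(json.loads(line))  # add counts key by key; empty dicts are no-ops
    return dict(total)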