I have set up the Docker container to run the model evaluation.
Write a self-contained Python program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout: {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1}. The program should be run by calling /path/to/python_word_count with no arguments. Make it run in parallel on 4 cores at the same time, each taking 1/4 of the file. Print out 4 different JSON dicts, one per line. I'll do the merging.
To solve this problem, we need to write a Python program that counts the frequency of each word in a text file located at `/tmp/file.txt` and prints the results as four separate JSON dictionaries, each produced in parallel by one of four cores. Each core handles a different chunk of the file, ensuring that words are not split across chunks.

### Approach

1. **File Splitting**: The file is divided into four chunks based on byte positions. Each chunk is adjusted to start and end at word boundaries (whitespace characters) to avoid splitting words.
2. **Parallel Processing**: Each chunk is processed in parallel using Python's `multiprocessing.Pool` to utilize four cores efficiently.
3. **Word Counting**: Each chunk is read, decoded, split into words, and the frequency of each word is counted.
4. **JSON Output**: The word counts from each chunk are printed as separate JSON dictionaries, one per line.

### Solution Code
```python
#!/usr/bin/env python3
import json
from multiprocessing import Pool


def find_word_boundary(file, position, direction='forward'):
    whitespace = {b' ', b'\t', b'\n', b'\r', b'\x0b', b'\x0c'}
    chunk_size = 1024
    file.seek(position)
    if direction == 'forward':
        while True:
            data = file.read(chunk_size)
            if not data:
                return file.tell()
            for i, byte in enumerate(data):
                if bytes([byte]) in whitespace:
                    return position + i + 1
            position += len(data)
        return file.tell()
    elif direction == 'backward':
        pos = position - 1
        while pos >= 0:
            file.seek(pos)
            byte = file.read(1)
            if not byte:
                break
            if byte in whitespace:
                return pos + 1
            pos -= 1
        return 0
    else:
        raise ValueError("Invalid direction")


def process_chunk(args):
    start, end = args
    counts = {}
    with open('/tmp/file.txt', 'rb') as f:
        f.seek(start)
        data = f.read(end - start)
    try:
        text = data.decode('utf-8')
    except UnicodeDecodeError:
        text = data.decode('utf-8', errors='replace')
    words = text.split()
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


if __name__ == '__main__':
    chunks = []
    with open('/tmp/file.txt', 'rb') as f:
        f.seek(0, 2)
        total_size = f.tell()
    if total_size == 0:
        print(json.dumps({}))
        print(json.dumps({}))
        print(json.dumps({}))
        print(json.dumps({}))
        exit()
    chunk_size = total_size // 4
    initial_ends = [(i + 1) * chunk_size for i in range(3)] + [total_size]
    adjusted_ends = []
    for i in range(3):
        end = initial_ends[i]
        with open('/tmp/file.txt', 'rb') as f_in:
            adjusted_end = find_word_boundary(f_in, end, direction='forward')
        adjusted_ends.append(adjusted_end)
    adjusted_ends.append(total_size)
    start = 0
    for end in adjusted_ends:
        chunks.append((start, end))
        start = end
    with Pool(4) as pool:
        results = pool.map(process_chunk, chunks)
    for res in results:
        print(json.dumps(res))
```
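As a sanity check on the boundary logic, it helps to trace the chunking by hand on the prompt's example sentence. The snippet below is a minimal sketch added for illustration, not part of the original answer; it re-implements the forward whitespace scan in pure Python on the 25-byte example so the resulting chunk ranges can be inspected without touching /tmp/file.txt.

```python
# Worked trace of the chunk splitting for the example input (illustrative
# sketch only; mirrors the program's forward whitespace scan).
data = b"This is is is, an example"  # 25 bytes
total_size = len(data)
chunk_size = total_size // 4         # 6

ends = []
for i in range(3):
    end = (i + 1) * chunk_size       # initial ends: 6, 12, 18
    # Advance to just past the next whitespace byte, as find_word_boundary
    # does; if none remains, the chunk runs to end of file.
    while end < total_size and data[end:end + 1] not in b" \t\n\r\x0b\x0c":
        end += 1
    ends.append(min(end + 1, total_size))
ends.append(total_size)

start = 0
for end in ends:                     # chunks: (0, 8), (8, 15), (15, 25), (25, 25)
    print((start, end), data[start:end])
    start = end
```

The fourth chunk is empty for this input because the adjusted third boundary already reaches the end of the file, so its worker would print {}, consistent with the program's one-dict-per-line contract.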
I extracted the following code from that output:
(the same program as shown in the answer above)
Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Just tell me if they tried to parallelize it. [The extracted program shown above was included here verbatim.] Answer either 'The student passes' or 'The student fails'.
The student passes.
Testing if the previous output contains the string "student passes": True
Running the following program:
(the same extracted program as above)
{"it": 4, "was": 4, "the": 4, "best": 1, "of": 4, "times,": 2, "worst": 1, "age": 2, "wisdom,": 1, "foolishness,": 1}
{"it": 4, "was": 4, "the": 4, "epoch": 2, "of": 3, "belief,": 1, "incredulity,": 1, "season": 1, "Light,": 1}
{"season": 1, "of": 3, "Darkness,": 1, "it": 2, "was": 2, "the": 2, "spring": 1, "hope,": 1, "winter": 1, "despair,": 1, "we": 1, "had": 1, "everything": 1, "before": 1, "us,": 1}
{"we": 3, "had": 1, "nothing": 1, "before": 1, "us,": 1, "were": 2, "all": 2, "going": 2, "direct": 2, "to": 1, "Heaven,": 1, "the": 1, "other": 1, "way": 1}
Resulting in output: {'it': 10, 'was': 10, 'the': 11, 'best': 1, 'of': 10, 'times,': 2, 'worst': 1, 'age': 2, 'wisdom,': 1, 'foolishness,': 1, 'epoch': 2, 'belief,': 1, 'incredulity,': 1, 'season': 2, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'we': 4, 'had': 2, 'everything': 1, 'before': 2, 'us,': 2, 'nothing': 1, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}
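The merging step the user promised to do isn't shown in the transcript; the sketch below is one plausible way to combine the four per-line JSON dicts into the merged dict above. It is an assumption for illustration, not the harness's actual code.

```python
# Possible merge step (illustrative sketch; the harness's real merging
# code is not shown in the transcript). Reads one JSON dict per line on
# stdin and sums the counts per word.
import json
import sys
from collections import Counter

merged = Counter()
for line in sys.stdin:
    if line.strip():
        merged.update(json.loads(line))
print(dict(merged))
```

Piping the program's four output lines into this script reproduces the merged result, since `Counter.update` sums counts for keys that appear in more than one chunk.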
Testing if the previous output equals the string {'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}: True