I have set up the Docker container to run the model evaluation.
Write a self-contained Rust program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout: {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1} Don't import any crates that would require a Cargo.toml file. I'll just compile it with `rustc yourfile.rs` and it should work. The program should be run by calling /path/to/rust_word_count with no arguments. Make it run in parallel on 4 cores at the same time, each job taking ~1/4 of the file. Print out 4 different JSON dicts, one per line. I'll do the merging.
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, Read, ErrorKind}; // Import ErrorKind for better error handling
use std::thread;

// No external crates (like serde, rayon, etc.) are needed.

// --- Configuration ---

/// The path to the input text file.
const FILE_PATH: &str = "/tmp/file.txt";
/// The number of threads to use for parallel processing.
const NUM_THREADS: usize = 4;

/// Reads the file, splits the work among threads, counts words in parallel,
/// and prints one JSON output per thread.
fn main() -> io::Result<()> {
    // 1. Read the entire file content into a String
    // Provide more specific error messages if file operations fail.
    let mut file = File::open(FILE_PATH)
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to open '{}': {}", FILE_PATH, e)))?;
    let mut content = String::new();
    file.read_to_string(&mut content)
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to read '{}': {}", FILE_PATH, e)))?;

    // Handle empty file: Print N empty JSON objects immediately.
    if content.is_empty() {
        for _ in 0..NUM_THREADS {
            println!("{{}}");
        }
        return Ok(());
    }

    // 2. Determine chunk boundaries to divide the work among threads.
    // The goal is to find split points near (length / NUM_THREADS) intervals,
    // preferably at whitespace, to avoid splitting words across threads.
    let mut boundaries = vec![0]; // Stores the start index of each chunk's *potential* range.
    let total_len = content.len();

    if total_len > 0 { // Only calculate boundaries if there's content
        for i in 1..NUM_THREADS {
            // Calculate the approximate boundary point based on byte length.
            let approx_boundary = (total_len * i) / NUM_THREADS;

            // Find the index of the first whitespace character *at or after* the approximate boundary.
            // This ensures that the chunk processed by the *previous* thread ends cleanly before or at whitespace.
            // We search from the approximate boundary onwards.
            let boundary = content[approx_boundary..]
                .find(|c: char| c.is_whitespace())
                .map(|pos| approx_boundary + pos) // Calculate absolute index of the whitespace.
                .unwrap_or(total_len); // If no whitespace found until the end, the boundary is the end.

            boundaries.push(boundary.min(total_len)); // Add the boundary, ensuring it doesn't exceed total length.
        }
    }
    boundaries.push(total_len); // Add the final boundary (end of the string).

    // Sort and deduplicate boundaries. This handles cases where calculated boundaries
    // might be identical (e.g., multiple consecutive spaces at a boundary point).
    boundaries.sort_unstable();
    boundaries.dedup();

    // 3. Create (start, end) index pairs for the actual slices each thread will process.
    // We aim for exactly NUM_THREADS pairs, even if some are empty due to file size or boundary calculation.
    let mut chunks = Vec::with_capacity(NUM_THREADS);
    let mut last_boundary_end = 0;

    // Iterate through the calculated boundary points to define chunk ranges.
    // `boundaries` contains the end points (exclusive) of the ideal ranges.
    for i in 1..boundaries.len() {
        let current_boundary_end = boundaries[i];
        let start = last_boundary_end;
        let end = current_boundary_end;
        // Add the chunk definition (start, end). The thread will handle splitting words within this range.
        // Ensure start <= end, which should hold true after sort/dedup.
        chunks.push((start.min(end), end));
        last_boundary_end = current_boundary_end; // The end of this chunk is the start of the next.
    }

    // Ensure we have exactly NUM_THREADS chunk definitions. If boundary calculations
    // resulted in fewer actual chunks (e.g., very short file, or NUM_THREADS > number of words),
    // pad the `chunks` vector with empty ranges.
    while chunks.len() < NUM_THREADS {
        // Add an empty chunk range pointing to the end of the content.
        chunks.push((total_len, total_len));
    }
    // If, for some reason, we generated more chunks than threads (e.g., NUM_THREADS=1 but boundaries split), truncate.
    chunks.truncate(NUM_THREADS);

    // 4. Process chunks in parallel using `thread::scope`.
    // `thread::scope` ensures all spawned threads finish before `main` continues or returns.
    // This allows threads to safely borrow `content` without needing `Arc`.
    thread::scope(|s| {
        for i in 0..NUM_THREADS {
            // Get the specific (start, end) byte range for this thread.
            let (start, end) = chunks[i];

            // Borrow the content slice for this thread. Check bounds for safety, although
            // the calculation logic should ensure they are valid.
            if start <= end && end <= content.len() {
                // `content` is borrowed immutably here, which is safe for multiple threads.
                let chunk_str: &str = &content[start..end];

                // Spawn a new thread to process this chunk.
                s.spawn(move || {
                    // Each thread gets its own word count map.
                    let mut word_counts: HashMap<String, usize> = HashMap::new();

                    // `split_whitespace()` iterates over words separated by any amount of whitespace.
                    // It correctly handles leading/trailing whitespace within the `chunk_str`.
                    // It operates only on the provided slice, so words split exactly at the
                    // byte boundary `start` or `end` will be handled correctly by the respective threads.
                    for word in chunk_str.split_whitespace() {
                        // Increment the count for the word. `to_string()` creates an owned String key.
                        *word_counts.entry(word.to_string()).or_insert(0) += 1;
                    }

                    // Format and print this thread's result as a JSON object on its own line.
                    print_json(&word_counts);
                });
            } else {
                // This case should ideally not be reached with correct boundary/chunk logic.
                // However, to strictly guarantee NUM_THREADS outputs as requested,
                // spawn a thread that prints empty JSON if the chunk indices seem invalid.
                s.spawn(|| {
                    println!("{{}}");
                });
            }
        }
    }); // `scope` automatically waits for all spawned threads to complete here.

    Ok(())
}

/// Formats a HashMap<String, usize> into a JSON object string and prints it to stdout, followed by a newline.
/// Keys within the JSON object are sorted alphabetically for deterministic output.
fn print_json(counts: &HashMap<String, usize>) {
    // Handle the case of an empty map (no words found in the chunk).
    if counts.is_empty() {
        println!("{{}}");
        return;
    }

    let mut items = Vec::with_capacity(counts.len());

    // Sort keys for consistent output order. This is good practice, especially for testing.
    let mut sorted_keys: Vec<&String> = counts.keys().collect();
    sorted_keys.sort_unstable(); // `sort_unstable` might offer slightly better performance than stable sort.

    // Build the JSON parts: "key": value
    for key in sorted_keys {
        // We expect the key to be present, but `get` is safer than direct indexing.
        if let Some(count) = counts.get(key) {
            // Basic JSON string escaping for the key: only handles literal double quotes.
            // Words from split_whitespace typically don't contain quotes, but this adds robustness.
            let escaped_key = key.replace('"', "\\\"");
            items.push(format!("\"{}\": {}", escaped_key, count));
        }
    }

    // Join the key-value pairs with commas and wrap in curly braces for valid JSON.
    println!("{{{}}}", items.join(", "));
}

/*
// How to Compile and Run:
// 1. Save the code above as `rust_word_count.rs` (or any other name).
// 2. Create the input file:
//    echo "This is is is, an example file.\nAnother line is here, with is again." > /tmp/file.txt
// 3. Compile the Rust code using the standard Rust compiler (no Cargo needed):
//    rustc rust_word_count.rs
// 4. Run the compiled executable:
//    ./rust_word_count

// Example Expected Output (The order of the lines and the exact distribution of words
// will depend on the file content, length, and thread scheduling,
// but there will be exactly 4 lines of JSON):
// {"is": 2}
// {"This": 1, "is,": 1}
// {"an": 1, "example": 1, "file.": 1}
// {"Another": 1, "again.": 1, "here,": 1, "is": 1, "line": 1, "with": 1}

// Note: The word "is" might appear in counts from different threads if it spans across
// the calculated chunk boundaries. The merging step (done externally) would sum these up.
// Punctuation attached to words (like "is," or "file.") is treated as part of the word
// by `split_whitespace`.
*/
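As a quick illustration of the boundary search above, here is a minimal standalone sketch (an editorial aside, not part of the generated program) that applies the same whitespace-seeking split to the example sentence from the prompt:

fn main() {
    // The example sentence from the prompt, 25 bytes of ASCII.
    let content = "This is is is, an example";
    let num_threads: usize = 4;
    let total_len = content.len();

    // Same idea as the program above: aim for total_len * i / num_threads,
    // then move forward to the next whitespace so no word is cut in half.
    let mut boundaries = vec![0];
    for i in 1..num_threads {
        let approx = (total_len * i) / num_threads;
        let boundary = content[approx..]
            .find(|c: char| c.is_whitespace())
            .map(|pos| approx + pos)
            .unwrap_or(total_len);
        boundaries.push(boundary.min(total_len));
    }
    boundaries.push(total_len);
    boundaries.sort_unstable();
    boundaries.dedup();

    // Prints "This is", " is is," and " an example": the approximate cut points
    // 6 and 12 fall inside words and are pushed to the spaces at 7 and 14, while
    // the point at 18 finds no later whitespace and merges into the end.
    for w in boundaries.windows(2) {
        println!("{:?}", &content[w[0]..w[1]]);
    }
}

In the full program the missing fourth chunk is padded as an empty range, so the fourth thread simply prints {}.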
I extracted the code from that output; it is identical to the program shown above.
Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Logic errors are okay. So are syntax errors. Just tell me if they tried to parallelize it.

use std::collections::HashMap;
use std::fs::File;
use std::io::{self, Read, ErrorKind}; // Import ErrorKind for better error handling
use std::thread;

// No external crates (like serde, rayon, etc.) are needed.

// --- Configuration ---

/// The path to the input text file.
const FILE_PATH: &str = "/tmp/file.txt";
/// The number of threads to use for parallel processing.
const NUM_THREADS: usize = 4;

/// Reads the file, splits the work among threads, counts words in parallel,
/// and prints one JSON output per thread.
fn main() -> io::Result<()> {
    // 1. Read the entire file content into a String
    // Provide more specific error messages if file operations fail.
    let mut file = File::open(FILE_PATH)
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to open '{}': {}", FILE_PATH, e)))?;
    let mut content = String::new();
    file.read_to_string(&mut content)
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to read '{}': {}", FILE_PATH, e)))?;

    // Handle empty file: Print N empty JSON objects immediately.
    if content.is_empty() {
        for _ in 0..NUM_THREADS {
            println!("{{}}");
        }
        return Ok(());
    }

    // 2. Determine chunk boundaries to divide the work among threads.
    // The goal is to find split points near (length / NUM_THREADS) intervals,
    // preferably at whitespace, to avoid splitting words across threads.
    let mut boundaries = vec![0]; // Stores the start index of each chunk's *potential* range.
    let total_len = content.len();

    if total_len > 0 { // Only calculate boundaries if there's content
        for i in 1..NUM_THREADS {
            // Calculate the approximate boundary point based on byte length.
            let approx_boundary = (total_len * i) / NUM_THREADS;

            // Find the index of the first whitespace character *at or after* the approximate boundary.
            // This ensures that the chunk processed by the *previous* thread ends cleanly before or at whitespace.
            // We search from the approximate boundary onwards.
            let boundary = content[approx_boundary..]
                .find(|c: char| c.is_whitespace())
                .map(|pos| approx_boundary + pos) // Calculate absolute index of the whitespace.
                .unwrap_or(total_len); // If no whitespace found until the end, the boundary is the end.

            boundaries.push(boundary.min(total_len)); // Add the boundary, ensuring it doesn't exceed total length.
        }
    }
    boundaries.push(total_len); // Add the final boundary (end of the string).

    // Sort and deduplicate boundaries. This handles cases where calculated boundaries
    // might be identical (e.g., multiple consecutive spaces at a boundary point).
    boundaries.sort_unstable();
    boundaries.dedup();

    // 3. Create (start, end) index pairs for the actual slices each thread will process.
    // We aim for exactly NUM_THREADS pairs, even if some are empty due to file size or boundary calculation.
    let mut chunks = Vec::with_capacity(NUM_THREADS);
    let mut last_boundary_end = 0;

    // Iterate through the calculated boundary points to define chunk ranges.
    // `boundaries` contains the end points (exclusive) of the ideal ranges.
    for i in 1..boundaries.len() {
        let current_boundary_end = boundaries[i];
        let start = last_boundary_end;
        let end = current_boundary_end;
        // Add the chunk definition (start, end). The thread will handle splitting words within this range.
        // Ensure start <= end, which should hold true after sort/dedup.
        chunks.push((start.min(end), end));
        last_boundary_end = current_boundary_end; // The end of this chunk is the start of the next.
    }

    // Ensure we have exactly NUM_THREADS chunk definitions. If boundary calculations
    // resulted in fewer actual chunks (e.g., very short file, or NUM_THREADS > number of words),
    // pad the `chunks` vector with empty ranges.
    while chunks.len() < NUM_THREADS {
        // Add an empty chunk range pointing to the end of the content.
        chunks.push((total_len, total_len));
    }
    // If, for some reason, we generated more chunks than threads (e.g., NUM_THREADS=1 but boundaries split), truncate.
    chunks.truncate(NUM_THREADS);

    // 4. Process chunks in parallel using `thread::scope`.
    // `thread::scope` ensures all spawned threads finish before `main` continues or returns.
    // This allows threads to safely borrow `content` without needing `Arc`.
    thread::scope(|s| {
        for i in 0..NUM_THREADS {
            // Get the specific (start, end) byte range for this thread.
            let (start, end) = chunks[i];

            // Borrow the content slice for this thread. Check bounds for safety, although
            // the calculation logic should ensure they are valid.
            if start <= end && end <= content.len() {
                // `content` is borrowed immutably here, which is safe for multiple threads.
                let chunk_str: &str = &content[start..end];

                // Spawn a new thread to process this chunk.
                s.spawn(move || {
                    // Each thread gets its own word count map.
                    let mut word_counts: HashMap<String, usize> = HashMap::new();

                    // `split_whitespace()` iterates over words separated by any amount of whitespace.
                    // It correctly handles leading/trailing whitespace within the `chunk_str`.
                    // It operates only on the provided slice, so words split exactly at the
                    // byte boundary `start` or `end` will be handled correctly by the respective threads.
                    for word in chunk_str.split_whitespace() {
                        // Increment the count for the word. `to_string()` creates an owned String key.
                        *word_counts.entry(word.to_string()).or_insert(0) += 1;
                    }

                    // Format and print this thread's result as a JSON object on its own line.
                    print_json(&word_counts);
                });
            } else {
                // This case should ideally not be reached with correct boundary/chunk logic.
                // However, to strictly guarantee NUM_THREADS outputs as requested,
                // spawn a thread that prints empty JSON if the chunk indices seem invalid.
                s.spawn(|| {
                    println!("{{}}");
                });
            }
        }
    }); // `scope` automatically waits for all spawned threads to complete here.

    Ok(())
}

/// Formats a HashMap<String, usize> into a JSON object string and prints it to stdout, followed by a newline.
/// Keys within the JSON object are sorted alphabetically for deterministic output.
fn print_json(counts: &HashMap<String, usize>) {
    // Handle the case of an empty map (no words found in the chunk).
    if counts.is_empty() {
        println!("{{}}");
        return;
    }

    let mut items = Vec::with_capacity(counts.len());

    // Sort keys for consistent output order. This is good practice, especially for testing.
    let mut sorted_keys: Vec<&String> = counts.keys().collect();
    sorted_keys.sort_unstable(); // `sort_unstable` might offer slightly better performance than stable sort.

    // Build the JSON parts: "key": value
    for key in sorted_keys {
        // We expect the key to be present, but `get` is safer than direct indexing.
        if let Some(count) = counts.get(key) {
            // Basic JSON string escaping for the key: only handles literal double quotes.
            // Words from split_whitespace typically don't contain quotes, but this adds robustness.
            let escaped_key = key.replace('"', "\\\"");
            items.push(format!("\"{}\": {}", escaped_key, count));
        }
    }

    // Join the key-value pairs with commas and wrap in curly braces for valid JSON.
    println!("{{{}}}", items.join(", "));
}

/*
// How to Compile and Run:
// 1. Save the code above as `rust_word_count.rs` (or any other name).
// 2. Create the input file:
//    echo "This is is is, an example file.\nAnother line is here, with is again." > /tmp/file.txt
// 3. Compile the Rust code using the standard Rust compiler (no Cargo needed):
//    rustc rust_word_count.rs
// 4. Run the compiled executable:
//    ./rust_word_count

// Example Expected Output (The order of the lines and the exact distribution of words
// will depend on the file content, length, and thread scheduling,
// but there will be exactly 4 lines of JSON):
// {"is": 2}
// {"This": 1, "is,": 1}
// {"an": 1, "example": 1, "file.": 1}
// {"Another": 1, "again.": 1, "here,": 1, "is": 1, "line": 1, "with": 1}

// Note: The word "is" might appear in counts from different threads if it spans across
// the calculated chunk boundaries. The merging step (done externally) would sum these up.
// Punctuation attached to words (like "is," or "file.") is treated as part of the word
// by `split_whitespace`.
*/

Answer either 'The student passes' or 'The student fails'.
The student passes.
Testing if the previous output contains the string student passes: True
Running the extracted program (identical to the code shown above):
{"age": 2, "best": 1, "foolishness,": 1, "it": 4, "of": 4, "the": 4, "times,": 2, "was": 4, "wisdom,": 1, "worst": 1}
{"Light,": 1, "belief,": 1, "epoch": 2, "incredulity,": 1, "it": 4, "of": 3, "season": 1, "the": 4, "was": 4}
{"Darkness,": 1, "before": 1, "despair,": 1, "everything": 1, "had": 1, "hope,": 1, "it": 2, "of": 3, "season": 1, "spring": 1, "the": 2, "us,": 1, "was": 2, "we": 1, "winter": 1}
{"Heaven,": 1, "all": 2, "before": 1, "direct": 2, "going": 2, "had": 1, "nothing": 1, "other": 1, "the": 1, "to": 1, "us,": 1, "way": 1, "we": 3, "were": 2}
warning: unused import: `ErrorKind`
 --> main.rs:4:27
  |
4 | use std::io::{self, Read, ErrorKind}; // Import ErrorKind for better error handling
  |                           ^^^^^^^^^
  |
  = note: `#[warn(unused_imports)]` on by default

warning: 1 warning emitted
Resulting in output: [('Darkness,', 1), ('Heaven,', 1), ('Light,', 1), ('age', 2), ('all', 2), ('before', 2), ('belief,', 1), ('best', 1), ('despair,', 1), ('direct', 2), ('epoch', 2), ('everything', 1), ('foolishness,', 1), ('going', 2), ('had', 2), ('hope,', 1), ('incredulity,', 1), ('it', 10), ('nothing', 1), ('of', 10), ('other', 1), ('season', 2), ('spring', 1), ('the', 11), ('times,', 2), ('to', 1), ('us,', 2), ('was', 10), ('way', 1), ('we', 4), ('were', 2), ('winter', 1), ('wisdom,', 1), ('worst', 1)]
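For reference, the merge the harness performs here is just a key-wise sum of the four dicts printed above. A minimal sketch of that step, assuming the flat one-object-per-line format shown and keys without embedded quotes (the harness's actual implementation is not part of this log):

use std::collections::BTreeMap;
use std::io::{self, BufRead};

// Reads one simple JSON object per line from stdin, e.g. {"the": 4, "was": 4},
// sums the counts per word, and prints the totals as a sorted list of pairs.
fn main() {
    let mut totals: BTreeMap<String, usize> = BTreeMap::new();
    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        let line = line.expect("failed to read a line from stdin");
        let inner = line.trim().trim_start_matches('{').trim_end_matches('}');
        if inner.is_empty() {
            continue; // a thread that saw no words prints {}
        }
        for pair in inner.split(", ") {
            // Each pair looks like "word": 3; split on the last ": ".
            if let Some((key, value)) = pair.rsplit_once(": ") {
                let word = key.trim_matches('"').to_string();
                let count: usize = value.trim().parse().expect("count is not a number");
                *totals.entry(word).or_insert(0) += count;
            }
        }
    }
    // BTreeMap iterates in sorted key order, matching the sorted tuple list above.
    let merged: Vec<(String, usize)> = totals.into_iter().collect();
    println!("{:?}", merged);
}

Piping the four JSON lines above into this sketch should reproduce the merged counts shown in the final output line.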
Testing if the previous output equals the string [('Darkness,', 1), ('Heaven,', 1), ('Light,', 1), ('age', 2), ('all', 2), ('before', 2), ('belief,', 1), ('best', 1), ('despair,', 1), ('direct', 2), ('epoch', 2), ('everything', 1), ('foolishness,', 1), ('going', 2), ('had', 2), ('hope,', 1), ('incredulity,', 1), ('it', 10), ('nothing', 1), ('of', 10), ('other', 1), ('season', 2), ('spring', 1), ('the', 11), ('times,', 2), ('to', 1), ('us,', 2), ('was', 10), ('way', 1), ('we', 4), ('were', 2), ('winter', 1), ('wisdom,', 1), ('worst', 1)]: True