Given a parse function like def parse_transcript(content): transcript_segments = [] pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' matches = re.findall(pattern, content) for start_time, end_time, text in matches: start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() transcript_segments.append((start_seconds, end_seconds, text)) return transcript_segments for data like [00:00:14.240 --> 00:00:14.420] the [00:00:14.420 --> 00:00:14.860] robust [00:00:14.860 --> 00:00:15.020] ness [00:00:15.020 --> 00:00:15.140] of [00:00:15.140 --> 00:00:15.500] neural [00:00:15.500 --> 00:00:15.870] networks [00:00:15.870 --> 00:00:16.200] . [00:00:16.200 --> 00:00:16.410] And [00:00:16.410 --> 00:00:16.700] this [00:00:16.700 --> 00:00:16.840] is [00:00:16.840 --> 00:00:17.200] joint write a merge() function that (1) calls parse and (2) merges words into sentences that end in period. Start with the first word and end with the period. Return a list of strings.
Given a parse function like def parse_transcript(content): transcript_segments = [] pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' matches = re.findall(pattern, content) for start_time, end_time, text in matches: start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() transcript_segments.append((start_seconds, end_seconds, text)) return transcript_segments for data like [00:00:14.240 --> 00:00:14.420] the [00:00:14.420 --> 00:00:14.860] robust [00:00:14.860 --> 00:00:15.020] ness [00:00:15.020 --> 00:00:15.140] of [00:00:15.140 --> 00:00:15.500] neural [00:00:15.500 --> 00:00:15.870] networks [00:00:15.870 --> 00:00:16.200] . [00:00:16.200 --> 00:00:16.410] And [00:00:16.410 --> 00:00:16.700] this [00:00:16.700 --> 00:00:16.840] is [00:00:16.840 --> 00:00:17.200] joint write a merge() function that (1) calls parse and (2) merges words into sentences that end in period. Start with the first word and end with the period. Return a list of strings.
Okay, here's the Python code including the `parse_transcript` function and the new `merge_transcript_to_sentences` function.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | import re from datetime import timedelta from typing import List, Tuple def parse_transcript(content: str) -> List[Tuple[float, float, str]]: """ Parses transcript content with timestamps into segments. Args: content: A string containing the transcript data in the format: [HH:MM:SS.mmm --> HH:MM:SS.mmm] text Returns: A list of tuples, where each tuple contains: (start_time_seconds, end_time_seconds, text_segment) """ transcript_segments = [] # Regex captures start time, end time, and the text content. # Added \s* to handle potential variable spacing after the timestamp block. pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)' matches = re.findall(pattern, content) for start_time, end_time, text in matches: try: # Convert timestamp strings to total seconds using timedelta start_seconds = timedelta( hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:]) ).total_seconds() end_seconds = timedelta( hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:]) ).total_seconds() # Append the parsed data, stripping leading/trailing whitespace from text transcript_segments.append((start_seconds, end_seconds, text.strip())) except ValueError: # Handle potential errors in timestamp format if necessary print(f"Warning: Skipping segment due to invalid timestamp format: {start_time} --> {end_time}") continue return transcript_segments def merge_transcript_to_sentences(content: str) -> List[str]: """ Parses transcript content and merges word segments into sentences ending with a period. Args: content: A string containing the transcript data. Returns: A list of strings, where each string is a merged sentence. """ # 1. Call parse_transcript to get individual segments parsed_segments = parse_transcript(content) if not parsed_segments: return [] sentences = [] current_sentence_words = [] # 2. Iterate through parsed segments and merge words for _, _, text in parsed_segments: if not text: # Skip empty text segments continue current_sentence_words.append(text) # Check if the current text segment ends with a period if text.endswith('.'): # Join the collected words into a single sentence string sentence = " ".join(current_sentence_words) sentences.append(sentence) # Reset the list for the next sentence current_sentence_words = [] # Add any remaining words as the last sentence if the transcript # doesn't end neatly with a period. if current_sentence_words: sentence = " ".join(current_sentence_words) sentences.append(sentence) return sentences # --- Example Usage --- transcript_data = """ [00:00:14.240 --> 00:00:14.420] the [00:00:14.420 --> 00:00:14.860] robust [00:00:14.860 --> 00:00:15.020] ness [00:00:15.020 --> 00:00:15.140] of [00:00:15.140 --> 00:00:15.500] neural [00:00:15.500 --> 00:00:15.870] networks [00:00:15.870 --> 00:00:16.200] . [00:00:16.200 --> 00:00:16.410] And [00:00:16.410 --> 00:00:16.700] this [00:00:16.700 --> 00:00:16.840] is [00:00:16.840 --> 00:00:17.200] joint [00:00:17.200 --> 00:00:17.800] work with colleagues. [00:00:17.900 --> 00:00:18.500] This is another sentence [00:00:18.500 --> 00:00:18.900] without a period at the end """ # Parse and merge the transcript data merged_sentences = merge_transcript_to_sentences(transcript_data) # Print the resulting sentences for i, sentence in enumerate(merged_sentences): print(f"Sentence {i+1}: {sentence}") # --- Example Output --- # Sentence 1: the robust ness of neural networks . # Sentence 2: And this is joint work with colleagues. # Sentence 3: This is another sentence without a period at the end |
I extracted the following code from that output:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | import re from datetime import timedelta from typing import List, Tuple def parse_transcript(content: str) -> List[Tuple[float, float, str]]: """ Parses transcript content with timestamps into segments. Args: content: A string containing the transcript data in the format: [HH:MM:SS.mmm --> HH:MM:SS.mmm] text Returns: A list of tuples, where each tuple contains: (start_time_seconds, end_time_seconds, text_segment) """ transcript_segments = [] # Regex captures start time, end time, and the text content. # Added \s* to handle potential variable spacing after the timestamp block. pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)' matches = re.findall(pattern, content) for start_time, end_time, text in matches: try: # Convert timestamp strings to total seconds using timedelta start_seconds = timedelta( hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:]) ).total_seconds() end_seconds = timedelta( hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:]) ).total_seconds() # Append the parsed data, stripping leading/trailing whitespace from text transcript_segments.append((start_seconds, end_seconds, text.strip())) except ValueError: # Handle potential errors in timestamp format if necessary print(f"Warning: Skipping segment due to invalid timestamp format: {start_time} --> {end_time}") continue return transcript_segments def merge_transcript_to_sentences(content: str) -> List[str]: """ Parses transcript content and merges word segments into sentences ending with a period. Args: content: A string containing the transcript data. Returns: A list of strings, where each string is a merged sentence. """ # 1. Call parse_transcript to get individual segments parsed_segments = parse_transcript(content) if not parsed_segments: return [] sentences = [] current_sentence_words = [] # 2. Iterate through parsed segments and merge words for _, _, text in parsed_segments: if not text: # Skip empty text segments continue current_sentence_words.append(text) # Check if the current text segment ends with a period if text.endswith('.'): # Join the collected words into a single sentence string sentence = " ".join(current_sentence_words) sentences.append(sentence) # Reset the list for the next sentence current_sentence_words = [] # Add any remaining words as the last sentence if the transcript # doesn't end neatly with a period. if current_sentence_words: sentence = " ".join(current_sentence_words) sentences.append(sentence) return sentences # --- Example Usage --- transcript_data = """ [00:00:14.240 --> 00:00:14.420] the [00:00:14.420 --> 00:00:14.860] robust [00:00:14.860 --> 00:00:15.020] ness [00:00:15.020 --> 00:00:15.140] of [00:00:15.140 --> 00:00:15.500] neural [00:00:15.500 --> 00:00:15.870] networks [00:00:15.870 --> 00:00:16.200] . [00:00:16.200 --> 00:00:16.410] And [00:00:16.410 --> 00:00:16.700] this [00:00:16.700 --> 00:00:16.840] is [00:00:16.840 --> 00:00:17.200] joint [00:00:17.200 --> 00:00:17.800] work with colleagues. [00:00:17.900 --> 00:00:18.500] This is another sentence [00:00:18.500 --> 00:00:18.900] without a period at the end """ # Parse and merge the transcript data merged_sentences = merge_transcript_to_sentences(transcript_data) # Print the resulting sentences for i, sentence in enumerate(merged_sentences): print(f"Sentence {i+1}: {sentence}") # --- Example Output --- # Sentence 1: the robust ness of neural networks . # Sentence 2: And this is joint work with colleagues. # Sentence 3: This is another sentence without a period at the end |
Running the following program:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | import re from datetime import timedelta from typing import List, Tuple def parse_transcript(content: str) -> List[Tuple[float, float, str]]: """ Parses transcript content with timestamps into segments. Args: content: A string containing the transcript data in the format: [HH:MM:SS.mmm --> HH:MM:SS.mmm] text Returns: A list of tuples, where each tuple contains: (start_time_seconds, end_time_seconds, text_segment) """ transcript_segments = [] # Regex captures start time, end time, and the text content. # Added \s* to handle potential variable spacing after the timestamp block. pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)' matches = re.findall(pattern, content) for start_time, end_time, text in matches: try: # Convert timestamp strings to total seconds using timedelta start_seconds = timedelta( hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:]) ).total_seconds() end_seconds = timedelta( hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:]) ).total_seconds() # Append the parsed data, stripping leading/trailing whitespace from text transcript_segments.append((start_seconds, end_seconds, text.strip())) except ValueError: # Handle potential errors in timestamp format if necessary print(f"Warning: Skipping segment due to invalid timestamp format: {start_time} --> {end_time}") continue return transcript_segments def merge_transcript_to_sentences(content: str) -> List[str]: """ Parses transcript content and merges word segments into sentences ending with a period. Args: content: A string containing the transcript data. Returns: A list of strings, where each string is a merged sentence. """ # 1. Call parse_transcript to get individual segments parsed_segments = parse_transcript(content) if not parsed_segments: return [] sentences = [] current_sentence_words = [] # 2. Iterate through parsed segments and merge words for _, _, text in parsed_segments: if not text: # Skip empty text segments continue current_sentence_words.append(text) # Check if the current text segment ends with a period if text.endswith('.'): # Join the collected words into a single sentence string sentence = " ".join(current_sentence_words) sentences.append(sentence) # Reset the list for the next sentence current_sentence_words = [] # Add any remaining words as the last sentence if the transcript # doesn't end neatly with a period. if current_sentence_words: sentence = " ".join(current_sentence_words) sentences.append(sentence) return sentences # --- Example Usage --- transcript_data = """ [00:00:14.240 --> 00:00:14.420] the [00:00:14.420 --> 00:00:14.860] robust [00:00:14.860 --> 00:00:15.020] ness [00:00:15.020 --> 00:00:15.140] of [00:00:15.140 --> 00:00:15.500] neural [00:00:15.500 --> 00:00:15.870] networks [00:00:15.870 --> 00:00:16.200] . [00:00:16.200 --> 00:00:16.410] And [00:00:16.410 --> 00:00:16.700] this [00:00:16.700 --> 00:00:16.840] is [00:00:16.840 --> 00:00:17.200] joint [00:00:17.200 --> 00:00:17.800] work with colleagues. [00:00:17.900 --> 00:00:18.500] This is another sentence [00:00:18.500 --> 00:00:18.900] without a period at the end """ # Parse and merge the transcript data merged_sentences = merge_transcript_to_sentences(transcript_data) # Print the resulting sentences for i, sentence in enumerate(merged_sentences): print(f"Sentence {i+1}: {sentence}") # --- Example Output --- # Sentence 1: the robust ness of neural networks . # Sentence 2: And this is joint work with colleagues. # Sentence 3: This is another sentence without a period at the end def parse_transcript(content): transcript_segments = [] pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' matches = re.findall(pattern, content) for start_time, end_time, text in matches: start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() transcript_segments.append((start_seconds, end_seconds, text)) return transcript_segments answer = merge("[00:00:14.240 --> 00:00:14.420] the\n[00:00:14.420 --> 00:00:14.860] robust\n[00:00:14.860 --> 00:00:15.020] ness\n[00:00:15.020 --> 00:00:15.140] of\n[00:00:15.140 --> 00:00:15.500] neural\n[00:00:15.500 --> 00:00:15.870] networks\n[00:00:15.870 --> 00:00:16.200] .\n[00:00:16.200 --> 00:00:16.410] And\n[00:00:16.410 --> 00:00:16.700] this\n[00:00:16.700 --> 00:00:16.840] is\n[00:00:16.840 --> 00:00:17.200] joint\n") expected = [' the robustness of neural networks.', ' And this is joint'] assert answer == expected, f'Wrong answer; got {answer} instead of {expected}' print('All tests passed') |
Sentence 1: the robust ness of neural networks .
Sentence 2: And this is joint work with colleagues.
Sentence 3: This is another sentence without a period at the end
Traceback (most recent call last):
File "/usr/src/app/main.py", line 138, in <module>
answer = merge("[00:00:14.240 --> 00:00:14.420] the\n[00:00:14.420 --> 00:00:14.860] robust\n[00:00:14.860 --> 00:00:15.020] ness\n[00:00:15.020 --> 00:00:15.140] of\n[00:00:15.140 --> 00:00:15.500] neural\n[00:00:15.500 --> 00:00:15.870] networks\n[00:00:15.870 --> 00:00:16.200] .\n[00:00:16.200 --> 00:00:16.410] And\n[00:00:16.410 --> 00:00:16.700] this\n[00:00:16.700 --> 00:00:16.840] is\n[00:00:16.840 --> 00:00:17.200] joint\n")
^^^^^
NameError: name 'merge' is not defined
Testing if the previous output contains the string All tests passed
: False