Initial Query

Given a parse function like

def parse_transcript(content):
    transcript_segments = []
    pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)'
    matches = re.findall(pattern, content)
    for start_time, end_time, text in matches:
        start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds()
        end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds()
        transcript_segments.append((start_seconds, end_seconds, text))
    return transcript_segments

for data like

[00:00:14.240 --> 00:00:14.420] the
[00:00:14.420 --> 00:00:14.860] robust
[00:00:14.860 --> 00:00:15.020] ness
[00:00:15.020 --> 00:00:15.140] of
[00:00:15.140 --> 00:00:15.500] neural
[00:00:15.500 --> 00:00:15.870] networks
[00:00:15.870 --> 00:00:16.200] .
[00:00:16.200 --> 00:00:16.410] And
[00:00:16.410 --> 00:00:16.700] this
[00:00:16.700 --> 00:00:16.840] is
[00:00:16.840 --> 00:00:17.200] joint

write a merge() function that (1) calls parse and (2) merges words into sentences that end in period. Start with the first word and end with the period. Return a list of strings.

LLM Generation

Query

Given a parse function like

def parse_transcript(content):
    transcript_segments = []
    pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)'
    matches = re.findall(pattern, content)
    for start_time, end_time, text in matches:
        start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds()
        end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds()
        transcript_segments.append((start_seconds, end_seconds, text))
    return transcript_segments

for data like

[00:00:14.240 --> 00:00:14.420] the
[00:00:14.420 --> 00:00:14.860] robust
[00:00:14.860 --> 00:00:15.020] ness
[00:00:15.020 --> 00:00:15.140] of
[00:00:15.140 --> 00:00:15.500] neural
[00:00:15.500 --> 00:00:15.870] networks
[00:00:15.870 --> 00:00:16.200] .
[00:00:16.200 --> 00:00:16.410] And
[00:00:16.410 --> 00:00:16.700] this
[00:00:16.700 --> 00:00:16.840] is
[00:00:16.840 --> 00:00:17.200] joint

write a merge() function that (1) calls parse and (2) merges words into sentences that end in period. Start with the first word and end with the period. Return a list of strings.

Output

Okay, here's the Python code including the `parse_transcript` function and the new `merge_transcript_to_sentences` function.

import re
from datetime import timedelta
from typing import List, Tuple

def parse_transcript(content: str) -> List[Tuple[float, float, str]]:
    """
    Parses transcript content with timestamps into segments.

    Args:
        content: A string containing the transcript data in the format:
                 [HH:MM:SS.mmm --> HH:MM:SS.mmm]  text

    Returns:
        A list of tuples, where each tuple contains:
        (start_time_seconds, end_time_seconds, text_segment)
    """
    transcript_segments = []
    # Regex captures start time, end time, and the text content.
    # Added \s* to handle potential variable spacing after the timestamp block.
    pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)'
    matches = re.findall(pattern, content)

    for start_time, end_time, text in matches:
        try:
            # Convert timestamp strings to total seconds using timedelta
            start_seconds = timedelta(
                hours=int(start_time[:2]),
                minutes=int(start_time[3:5]),
                seconds=int(start_time[6:8]),
                milliseconds=int(start_time[9:])
            ).total_seconds()
            end_seconds = timedelta(
                hours=int(end_time[:2]),
                minutes=int(end_time[3:5]),
                seconds=int(end_time[6:8]),
                milliseconds=int(end_time[9:])
            ).total_seconds()
            # Append the parsed data, stripping leading/trailing whitespace from text
            transcript_segments.append((start_seconds, end_seconds, text.strip()))
        except ValueError:
            # Handle potential errors in timestamp format if necessary
            print(f"Warning: Skipping segment due to invalid timestamp format: {start_time} --> {end_time}")
            continue


    return transcript_segments

def merge_transcript_to_sentences(content: str) -> List[str]:
    """
    Parses transcript content and merges word segments into sentences
    ending with a period.

    Args:
        content: A string containing the transcript data.

    Returns:
        A list of strings, where each string is a merged sentence.
    """
    # 1. Call parse_transcript to get individual segments
    parsed_segments = parse_transcript(content)

    if not parsed_segments:
        return []

    sentences = []
    current_sentence_words = []

    # 2. Iterate through parsed segments and merge words
    for _, _, text in parsed_segments:
        if not text:  # Skip empty text segments
            continue

        current_sentence_words.append(text)

        # Check if the current text segment ends with a period
        if text.endswith('.'):
            # Join the collected words into a single sentence string
            sentence = " ".join(current_sentence_words)
            sentences.append(sentence)
            # Reset the list for the next sentence
            current_sentence_words = []

    # Add any remaining words as the last sentence if the transcript
    # doesn't end neatly with a period.
    if current_sentence_words:
        sentence = " ".join(current_sentence_words)
        sentences.append(sentence)

    return sentences

# --- Example Usage ---
transcript_data = """
[00:00:14.240 --> 00:00:14.420]   the
[00:00:14.420 --> 00:00:14.860]   robust
[00:00:14.860 --> 00:00:15.020]  ness
[00:00:15.020 --> 00:00:15.140]   of
[00:00:15.140 --> 00:00:15.500]   neural
[00:00:15.500 --> 00:00:15.870]   networks
[00:00:15.870 --> 00:00:16.200]  .
[00:00:16.200 --> 00:00:16.410]   And
[00:00:16.410 --> 00:00:16.700]   this
[00:00:16.700 --> 00:00:16.840]   is
[00:00:16.840 --> 00:00:17.200]   joint
[00:00:17.200 --> 00:00:17.800]   work with colleagues.
[00:00:17.900 --> 00:00:18.500]   This is another sentence
[00:00:18.500 --> 00:00:18.900]   without a period at the end
"""

# Parse and merge the transcript data
merged_sentences = merge_transcript_to_sentences(transcript_data)

# Print the resulting sentences
for i, sentence in enumerate(merged_sentences):
    print(f"Sentence {i+1}: {sentence}")

# --- Example Output ---
# Sentence 1: the robust ness of neural networks .
# Sentence 2: And this is joint work with colleagues.
# Sentence 3: This is another sentence without a period at the end
**Explanation:**

1. **`parse_transcript(content)`:**
   * This function remains largely the same as provided in the prompt.
   * A small improvement: `\s*` is added in the regex `pattern` after the timestamp block `\]` to correctly handle potentially varying amounts of whitespace before the actual text begins.
   * `text.strip()` is used when appending to `transcript_segments` to remove leading/trailing whitespace from the captured text, ensuring cleaner joining later.
   * Basic error handling for the `timedelta` conversion is added (optional but good practice).

2. **`merge_transcript_to_sentences(content)`:**
   * It first calls `parse_transcript(content)` to get the list of `(start, end, text)` tuples.
   * It initializes an empty list `sentences` to store the final merged sentences and `current_sentence_words` to temporarily hold words for the sentence being built.
   * It iterates through the `parsed_segments`. Only the `text` part is needed for merging, so each tuple is unpacked as `(_, _, text)`.
   * It skips segments whose `text` is empty after stripping.
   * Each non-empty `text` segment is appended to the `current_sentence_words` list.
   * Crucially, it checks whether the *current* `text` segment `endswith('.')`.
   * If it does end with a period, the sentence is complete: the words in `current_sentence_words` are joined with a single space (`" ".join(...)`) to form the `sentence` string.
   * This completed `sentence` is added to the `sentences` list.
   * `current_sentence_words` is reset to an empty list `[]` to start collecting words for the *next* sentence.
   * After the loop finishes, there may be remaining words in `current_sentence_words` if the last segment didn't end with a period. The code checks for this (`if current_sentence_words:`) and joins/appends this final (potentially incomplete) sentence.
   * Finally, it returns the `sentences` list.
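To make the whitespace point concrete, here is a tiny standalone check (an illustration added here, not part of the model's output). With a fixed number of literal spaces after the `]`, any extra padding ends up inside the captured text; with `\s*`, it is consumed:

import re

line = "[00:00:14.240 --> 00:00:14.420]   the"  # three spaces before the word

fixed = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]  (.*)'   # two literal spaces
flexible = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)'

print(repr(re.findall(fixed, line)[0][2]))     # ' the'  -> leading space kept in the capture
print(repr(re.findall(flexible, line)[0][2]))  # 'the'   -> padding absorbed by \s*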

Extract Code

I extracted the following code from that output:

import re
from datetime import timedelta
from typing import List, Tuple

def parse_transcript(content: str) -> List[Tuple[float, float, str]]:
    """
    Parses transcript content with timestamps into segments.

    Args:
        content: A string containing the transcript data in the format:
                 [HH:MM:SS.mmm --> HH:MM:SS.mmm]  text

    Returns:
        A list of tuples, where each tuple contains:
        (start_time_seconds, end_time_seconds, text_segment)
    """
    transcript_segments = []
    # Regex captures start time, end time, and the text content.
    # Added \s* to handle potential variable spacing after the timestamp block.
    pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)'
    matches = re.findall(pattern, content)

    for start_time, end_time, text in matches:
        try:
            # Convert timestamp strings to total seconds using timedelta
            start_seconds = timedelta(
                hours=int(start_time[:2]),
                minutes=int(start_time[3:5]),
                seconds=int(start_time[6:8]),
                milliseconds=int(start_time[9:])
            ).total_seconds()
            end_seconds = timedelta(
                hours=int(end_time[:2]),
                minutes=int(end_time[3:5]),
                seconds=int(end_time[6:8]),
                milliseconds=int(end_time[9:])
            ).total_seconds()
            # Append the parsed data, stripping leading/trailing whitespace from text
            transcript_segments.append((start_seconds, end_seconds, text.strip()))
        except ValueError:
            # Handle potential errors in timestamp format if necessary
            print(f"Warning: Skipping segment due to invalid timestamp format: {start_time} --> {end_time}")
            continue


    return transcript_segments

def merge_transcript_to_sentences(content: str) -> List[str]:
    """
    Parses transcript content and merges word segments into sentences
    ending with a period.

    Args:
        content: A string containing the transcript data.

    Returns:
        A list of strings, where each string is a merged sentence.
    """
    # 1. Call parse_transcript to get individual segments
    parsed_segments = parse_transcript(content)

    if not parsed_segments:
        return []

    sentences = []
    current_sentence_words = []

    # 2. Iterate through parsed segments and merge words
    for _, _, text in parsed_segments:
        if not text:  # Skip empty text segments
            continue

        current_sentence_words.append(text)

        # Check if the current text segment ends with a period
        if text.endswith('.'):
            # Join the collected words into a single sentence string
            sentence = " ".join(current_sentence_words)
            sentences.append(sentence)
            # Reset the list for the next sentence
            current_sentence_words = []

    # Add any remaining words as the last sentence if the transcript
    # doesn't end neatly with a period.
    if current_sentence_words:
        sentence = " ".join(current_sentence_words)
        sentences.append(sentence)

    return sentences

# --- Example Usage ---
transcript_data = """
[00:00:14.240 --> 00:00:14.420]   the
[00:00:14.420 --> 00:00:14.860]   robust
[00:00:14.860 --> 00:00:15.020]  ness
[00:00:15.020 --> 00:00:15.140]   of
[00:00:15.140 --> 00:00:15.500]   neural
[00:00:15.500 --> 00:00:15.870]   networks
[00:00:15.870 --> 00:00:16.200]  .
[00:00:16.200 --> 00:00:16.410]   And
[00:00:16.410 --> 00:00:16.700]   this
[00:00:16.700 --> 00:00:16.840]   is
[00:00:16.840 --> 00:00:17.200]   joint
[00:00:17.200 --> 00:00:17.800]   work with colleagues.
[00:00:17.900 --> 00:00:18.500]   This is another sentence
[00:00:18.500 --> 00:00:18.900]   without a period at the end
"""

# Parse and merge the transcript data
merged_sentences = merge_transcript_to_sentences(transcript_data)

# Print the resulting sentences
for i, sentence in enumerate(merged_sentences):
    print(f"Sentence {i+1}: {sentence}")

# --- Example Output ---
# Sentence 1: the robust ness of neural networks .
# Sentence 2: And this is joint work with colleagues.
# Sentence 3: This is another sentence without a period at the end

Run Code Interpreter

Running the following program:

import re
from datetime import timedelta
from typing import List, Tuple

def parse_transcript(content: str) -> List[Tuple[float, float, str]]:
    """
    Parses transcript content with timestamps into segments.

    Args:
        content: A string containing the transcript data in the format:
                 [HH:MM:SS.mmm --> HH:MM:SS.mmm]  text

    Returns:
        A list of tuples, where each tuple contains:
        (start_time_seconds, end_time_seconds, text_segment)
    """
    transcript_segments = []
    # Regex captures start time, end time, and the text content.
    # Added \s* to handle potential variable spacing after the timestamp block.
    pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)'
    matches = re.findall(pattern, content)

    for start_time, end_time, text in matches:
        try:
            # Convert timestamp strings to total seconds using timedelta
            start_seconds = timedelta(
                hours=int(start_time[:2]),
                minutes=int(start_time[3:5]),
                seconds=int(start_time[6:8]),
                milliseconds=int(start_time[9:])
            ).total_seconds()
            end_seconds = timedelta(
                hours=int(end_time[:2]),
                minutes=int(end_time[3:5]),
                seconds=int(end_time[6:8]),
                milliseconds=int(end_time[9:])
            ).total_seconds()
            # Append the parsed data, stripping leading/trailing whitespace from text
            transcript_segments.append((start_seconds, end_seconds, text.strip()))
        except ValueError:
            # Handle potential errors in timestamp format if necessary
            print(f"Warning: Skipping segment due to invalid timestamp format: {start_time} --> {end_time}")
            continue


    return transcript_segments

def merge_transcript_to_sentences(content: str) -> List[str]:
    """
    Parses transcript content and merges word segments into sentences
    ending with a period.

    Args:
        content: A string containing the transcript data.

    Returns:
        A list of strings, where each string is a merged sentence.
    """
    # 1. Call parse_transcript to get individual segments
    parsed_segments = parse_transcript(content)

    if not parsed_segments:
        return []

    sentences = []
    current_sentence_words = []

    # 2. Iterate through parsed segments and merge words
    for _, _, text in parsed_segments:
        if not text:  # Skip empty text segments
            continue

        current_sentence_words.append(text)

        # Check if the current text segment ends with a period
        if text.endswith('.'):
            # Join the collected words into a single sentence string
            sentence = " ".join(current_sentence_words)
            sentences.append(sentence)
            # Reset the list for the next sentence
            current_sentence_words = []

    # Add any remaining words as the last sentence if the transcript
    # doesn't end neatly with a period.
    if current_sentence_words:
        sentence = " ".join(current_sentence_words)
        sentences.append(sentence)

    return sentences

# --- Example Usage ---
transcript_data = """
[00:00:14.240 --> 00:00:14.420]   the
[00:00:14.420 --> 00:00:14.860]   robust
[00:00:14.860 --> 00:00:15.020]  ness
[00:00:15.020 --> 00:00:15.140]   of
[00:00:15.140 --> 00:00:15.500]   neural
[00:00:15.500 --> 00:00:15.870]   networks
[00:00:15.870 --> 00:00:16.200]  .
[00:00:16.200 --> 00:00:16.410]   And
[00:00:16.410 --> 00:00:16.700]   this
[00:00:16.700 --> 00:00:16.840]   is
[00:00:16.840 --> 00:00:17.200]   joint
[00:00:17.200 --> 00:00:17.800]   work with colleagues.
[00:00:17.900 --> 00:00:18.500]   This is another sentence
[00:00:18.500 --> 00:00:18.900]   without a period at the end
"""

# Parse and merge the transcript data
merged_sentences = merge_transcript_to_sentences(transcript_data)

# Print the resulting sentences
for i, sentence in enumerate(merged_sentences):
    print(f"Sentence {i+1}: {sentence}")

# --- Example Output ---
# Sentence 1: the robust ness of neural networks .
# Sentence 2: And this is joint work with colleagues.
# Sentence 3: This is another sentence without a period at the end




def parse_transcript(content):
    transcript_segments = []
    pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\]  (.*)'
    matches = re.findall(pattern, content)

    for start_time, end_time, text in matches:
        start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds()
        end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds()
        transcript_segments.append((start_seconds, end_seconds, text))

    return transcript_segments


answer = merge("[00:00:14.240 --> 00:00:14.420]   the\n[00:00:14.420 --> 00:00:14.860]   robust\n[00:00:14.860 --> 00:00:15.020]  ness\n[00:00:15.020 --> 00:00:15.140]   of\n[00:00:15.140 --> 00:00:15.500]   neural\n[00:00:15.500 --> 00:00:15.870]   networks\n[00:00:15.870 --> 00:00:16.200]  .\n[00:00:16.200 --> 00:00:16.410]   And\n[00:00:16.410 --> 00:00:16.700]   this\n[00:00:16.700 --> 00:00:16.840]   is\n[00:00:16.840 --> 00:00:17.200]   joint\n")
expected = [' the robustness of neural networks.', ' And this is joint']
assert answer == expected, f'Wrong answer; got {answer} instead of {expected}'
print('All tests passed')
And got the output:

Sentence 1: the robust ness of neural networks .
Sentence 2: And this is joint work with colleagues.
Sentence 3: This is another sentence without a period at the end
Traceback (most recent call last):
  File "/usr/src/app/main.py", line 138, in <module>
    answer = merge("[00:00:14.240 --> 00:00:14.420] the\n[00:00:14.420 --> 00:00:14.860] robust\n[00:00:14.860 --> 00:00:15.020] ness\n[00:00:15.020 --> 00:00:15.140] of\n[00:00:15.140 --> 00:00:15.500] neural\n[00:00:15.500 --> 00:00:15.870] networks\n[00:00:15.870 --> 00:00:16.200] .\n[00:00:16.200 --> 00:00:16.410] And\n[00:00:16.410 --> 00:00:16.700] this\n[00:00:16.700 --> 00:00:16.840] is\n[00:00:16.840 --> 00:00:17.200] joint\n")
             ^^^^^
NameError: name 'merge' is not defined
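The failure is an interface mismatch rather than a flaw in the merging logic itself: the harness calls merge(), but the generated code only defines merge_transcript_to_sentences(). The expected strings also imply plain concatenation (so the sub-word piece "ness" fuses with "robust", and the final "." attaches without a space) rather than joining with spaces. As a rough sketch only (an addition here, assuming the harness's own parse_transcript, whose regex keeps each word's leading space), a merge() matching that contract could look like:

def merge(content):
    # Sketch, not part of the original output: concatenate segment texts directly
    # (no separator), so "robust" + "ness" -> "robustness", and close a sentence
    # whenever a segment ends with a period.
    sentences = []
    current = ""
    for _, _, text in parse_transcript(content):
        current += text
        if text.endswith('.'):
            sentences.append(current)
            current = ""
    if current:  # keep any trailing words that never reached a period
        sentences.append(current)
    return sentences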

Substring Evaluation

Testing if the previous output contains the string All tests passed: False