1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
from evaluator import *

DESCRIPTION = "Test if the model can write a C program that draws an image. This test requires the ability to understand the .bmp specification, and draw a flag that can be correctly parsed and seen by the evaluator."

TAGS = ['code', 'c', 'visual']

question = """
Write me a C program that emits a .bmp that's a depiction of an american flag to stdout.
"""

TestFlagDrawBMP = question >> LLMRun() >> ExtractCode(keep_main=True) >> CRun(out_bytes=True) >> LLMVisionRun("What flag is shown in this image?") >> (SubstringEvaluator("United States") | SubstringEvaluator("USA") | SubstringEvaluator("America"))


question2 = """
Write me a C program that emits a .bmp that's a depiction of an italian flag to stdout.
"""

TestEasyFlagDrawBMP = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> CRun(out_bytes=True) >> LLMVisionRun("What flag is shown in this image?") >> (SubstringEvaluator("Italy") | SubstringEvaluator("Italian"))


if __name__ == "__main__":
    print(run_test(TestFlagDrawBMP))