Implementation Example
This end-to-end example shows how a Go travel-planning API can evaluate a real agent flow with a profile, shared runner options, deterministic route artifact contracts, custom metrics, scenario state, tool policies, and JSONL reporting.
Project Shape
The eval package sits beside the application code it tests. App-owned helpers build the agent, collect trajectory data, redact sensitive IDs, and translate agent outputs into eval.Case artifacts.
app/
├── go.mod
├── goeval.json
└── evals/travel/
    ├── runner_test.go
    ├── route_artifact_metrics_test.go
    ├── trajectory_scenarios_test.go
    └── testdata/conversation_smoke.json

1. Define The Eval Profile
The profile names the integration package, limits PR runs to the critical tier, writes JSONL results, and skips when the provider key is not present.
{
"profiles": {
"pr": {
"packages": [
"-tags=integration",
"./evals/travel/..."
],
"tiers": ["critical"],
"results_dir": ".goeval/pr",
"missing_prerequisite": "skip",
"prerequisites": [
{"type": "env", "name": "GEMINI_API_KEY"}
]
}
},
"compare": {
"case_id_key": "case_id",
"default": {
"score_tolerance": 0.02,
"fail_on_missing": true,
"fail_on_regression": true
}
}
}

2. Share Runner Options
A small helper keeps every eval on the same tier filter, JSONL sink, redaction rules, and optional case filter.
// goEvalOptions returns the eval options shared by every eval in this package,
// followed by any caller-supplied extras.
//
// The base set is the default tier filter, the default result sink, and a
// redactor list covering UUIDs plus the route_request_id, session_id, user_id,
// and quote_id fields. When caseFilterFromEnv yields a non-nil option it is
// appended before extra, so the extras always come last in the returned slice.
func goEvalOptions(t testing.TB, extra ...eval.Option) []eval.Option {
t.Helper()
opts := []eval.Option{
eval.DefaultTierFilter(),
eval.WithResultSink(eval.DefaultResultSink()),
eval.WithRedactors(
eval.UUIDRedactor(),
eval.FieldRedactor("route_request_id"),
eval.FieldRedactor("session_id"),
eval.FieldRedactor("user_id"),
eval.FieldRedactor("quote_id"),
),
}
// The case filter is optional: a nil result means none was requested.
// NOTE(review): presumably read from an environment variable, going by the
// helper's name — confirm against caseFilterFromEnv.
if filter := caseFilterFromEnv(t); filter != nil {
opts = append(opts, filter)
}
return append(opts, extra...)
}

3. Contract The Structured Output
The suite checks the route artifact before judging prose. The contract verifies that the route exists, succeeded, has enough stops, and includes expected cities with accent/case-tolerant matching.
// routeArtifactNormalizer returns the normalizer used when comparing route
// artifact values: a chain of the case-fold normalizer followed by the
// Spanish ASCII-fold normalizer.
func routeArtifactNormalizer() eval.Normalizer {
	caseFold := eval.CaseFoldNormalizer()
	asciiFold := eval.SpanishASCIIFoldNormalizer()
	return eval.ChainNormalizers(caseFold, asciiFold)
}
// routeStopContains builds an array-contains check on the "route" artifact
// that looks for name among the values at path "stops[*].name", comparing
// through routeArtifactNormalizer.
func routeStopContains(name string) eval.ArtifactArrayContains {
	check := eval.ArtifactArrayContains{
		Key:  "route",
		Path: "stops[*].name",
	}
	check.Expected = name
	check.Normalizer = routeArtifactNormalizer()
	return check
}
// readyRouteContract builds a contract, labelled name, for a "route" artifact.
//
// The contract always checks that the artifact exists, that it contains
// {"success":true} as a subset, and that its "stops" array has at least
// minStops entries. Any metrics passed in checks are appended after those
// three, in the order given.
func readyRouteContract(name string, minStops int, checks ...eval.Metric) eval.Contract {
// Enforce a floor of two stops regardless of what the caller asked for.
if minStops < 2 {
minStops = 2
}
contractChecks := []eval.Metric{
eval.ArtifactExists{Key: "route"},
eval.ArtifactSubset{
Key: "route",
Expected: json.RawMessage(`{"success":true}`),
},
eval.ArtifactArrayMinLen{Key: "route", Path: "stops", MinLen: minStops},
}
contractChecks = append(contractChecks, checks...)
return eval.NewContract(name, contractChecks...)
}

Product-specific invariants can be ordinary metrics. This one ensures a route has exactly one fewer leg than stops.
// RouteLegCountMatchesStopsMinusOne is a custom metric asserting that a route
// artifact has exactly one fewer leg than it has stops.
type RouteLegCountMatchesStopsMinusOne struct {
	// Key names the artifact that Score decodes as a route.
	Key string
}

// Name returns the fixed identifier reported for this metric.
func (RouteLegCountMatchesStopsMinusOne) Name() string {
	const metricName = "RouteLegCountMatchesStopsMinusOne"
	return metricName
}
// Score decodes the artifact stored under m.Key and passes when the number of
// legs equals the number of stops minus one.
//
// The judge argument is ignored. Every outcome is returned with a nil error:
// a decode failure or a count mismatch yields a failed Result with Score 0 and
// the cause in Reason, and a match yields a passing Result with Score 1.
func (m RouteLegCountMatchesStopsMinusOne) Score(ctx context.Context, _ eval.Judge, c eval.Case) (eval.Result, error) {
// ctx is intentionally unused by this metric.
_ = ctx
route, err := decodeRouteArtifact(c, m.Key)
if err != nil {
// A missing or undecodable artifact is reported as a metric failure, not as an error.
return eval.Result{Score: 0, Passed: false, Metric: m.Name(), Reason: err.Error()}, nil
}
// NOTE(review): with zero stops, expected is -1, so the metric fails and the
// reason reads "expected -1" — confirm that is the intended report.
expected := len(route.Stops) - 1
if len(route.Legs) != expected {
return eval.Result{
Score: 0,
Passed: false,
Metric: m.Name(),
Reason: fmt.Sprintf("got %d legs for %d stops, expected %d", len(route.Legs), len(route.Stops), expected),
}, nil
}
return eval.Result{Score: 1, Passed: true, Metric: m.Name(), Reason: "route leg count matches stop count"}, nil
}

4. Run A Multi-Step Agent Scenario
The scenario first proves the agent does not plan a route too early, then provides complete trip details and requires the planning tool plus the ready-route contract.
// TestScenario_IncrementalCompletionToReady runs a two-step, critical-tier
// scenario against the route-planning agent.
//
// Step "partial_route_idea" sends an underspecified request and forbids the
// plan_route tool and any tool matching maps_* or routes_*; it checks that no
// "route" artifact exists. Step "complete_details" supplies the full trip,
// requires plan_route, caps tool calls at 6, and checks the ready-route
// contract: stops containing Santiago and Puerto Montt, plus the leg-count
// invariant.
func TestScenario_IncrementalCompletionToReady(t *testing.T) {
// NOTE(review): presumably skips the test unless evals are enabled (GOEVAL=1) — confirm against requireGOEval.
requireGOEval(t)
ctx := context.Background()
agent := newRoutePlanningAgent(t, ctx)
requestID := uuid.NewString()
// The runner is built with a nil first argument and the shared option set.
runner := eval.NewRunner(nil, goEvalOptions(t)...)
driver := newRouteScenarioDriver(t, agent, requestID)
// State is keyed by the same names as the steps below.
state := map[string]any{
"partial_route_idea": "Pending: cities, dates, travelers, vehicle type\nStatus: planning",
"complete_details": "Confirmed: Santiago to Puerto Montt, Jan 10-Jan 20, 2 adults, campervan\nStatus: ready",
}
runner.RunScenario(t, eval.Scenario{
Name: "incremental_completion_ready",
Tier: "critical",
Tools: travelToolRegistry(),
Driver: driver,
State: state,
Metadata: map[string]any{
"flow": "travel.route",
"tier": "critical",
"case_id": "route-incremental-completion",
},
// NOTE(review): presumably runs the scenario 3 times and requires at least
// a 2/3 pass rate — confirm against eval.ScenarioRepeat.
Repeat: eval.ScenarioRepeat{N: 3, PassRate: 2.0 / 3.0},
Steps: []eval.Step{
{
// Step 1: the agent must not plan a route before it has the trip details.
Name: "partial_route_idea",
Input: "I want a scenic campervan road trip, but I have not picked cities, dates, travelers, or vehicle yet.",
ForbiddenTools: []string{"plan_route"},
ForbiddenToolPatterns: []string{"maps_*", "routes_*"},
Timeout: 45 * time.Second,
Checks: []eval.Metric{
eval.ArtifactNotExists{Key: "route"},
},
},
{
// Step 2: with complete details, the planning tool is required and the
// resulting route artifact must satisfy the ready-route contract.
Name: "complete_details",
Input: "Make it Santiago to Puerto Montt from January 10 to January 20 for 2 adults in a campervan. Propose the route with stops.",
RequiredTools: []string{"plan_route"},
MaxToolCalls: 6,
Timeout: 90 * time.Second,
Checks: []eval.Metric{
readyRouteContract(
"ready_route",
2,
routeStopContains("Santiago"),
routeStopContains("Puerto Montt"),
RouteLegCountMatchesStopsMinusOne{Key: "route"},
),
},
},
},
})
}

5. Run, Summarize, Compare
The same manifest powers local runs and CI gates. Normal go test remains fast unless the eval profile enables GOEVAL=1.
# Install the goeval CLI, pinned to v1.0.0.
go install github.com/igcodinap/go-eval/cmd/goeval@v1.0.0
# Run critical travel-planning evals, writing .goeval/pr/results.jsonl.
goeval test --profile pr
# Inspect one run.
goeval summarize --policy goeval.json .goeval/pr/results.jsonl
# Gate a prompt/model change against a baseline.
goeval compare --policy goeval.json baseline.jsonl .goeval/pr/results.jsonl