Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 59 additions & 9 deletions src/google/adk/evaluation/final_response_match_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

from __future__ import annotations

from collections import Counter
from collections import namedtuple
from typing import Optional

from google.genai import types as genai_types
Expand All @@ -27,6 +29,7 @@
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult
from .text_utils import normalize_text


class RougeEvaluator(Evaluator):
Expand All @@ -46,7 +49,7 @@ def evaluate_invocations(
conversation_scenario: Optional[ConversationScenario] = None,
) -> EvaluationResult:
if expected_invocations is None:
raise ValueError("expected_invocations is required for this metric.")
raise ValueError('expected_invocations is required for this metric.')
del conversation_scenario # not used by this metric.

total_score = 0.0
Expand Down Expand Up @@ -83,15 +86,18 @@ def evaluate_invocations(

def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
if content and content.parts:
return "\n".join([part.text for part in content.parts if part.text])
return '\n'.join([part.text for part in content.parts if part.text])

return ""
return ''


def _get_eval_status(score: float, threshold: float):
  """Maps a numeric score onto a pass/fail evaluation status.

  Args:
    score: The score that was computed.
    threshold: The minimum score that counts as passing (inclusive).

  Returns:
    EvalStatus.PASSED when score is at least threshold, otherwise
    EvalStatus.FAILED.
  """
  if score >= threshold:
    return EvalStatus.PASSED
  return EvalStatus.FAILED


# Lightweight result record exposing three fields: `precision`, `recall`,
# and `fmeasure`.
Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])


def _calculate_rouge_1_scores(candidate: str, reference: str):
"""Calculates the ROUGE-1 score between a candidate and reference text.

Expand All @@ -108,12 +114,56 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
reference: The ground-truth text to compare against.

Returns:
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
"""
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
# Normalize both texts before scoring to handle Unicode variations
normalized_candidate = normalize_text(candidate)
normalized_reference = normalize_text(reference)

# Check if the text contains spaces (word-separated languages)
has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate

if has_spaces:
# Use standard word-level ROUGE for space-separated languages
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
scores = scorer.score(normalized_reference, normalized_candidate)
return scores['rouge1']
else:
# For non-space-separated languages, use character-level comparison
return _calculate_character_level_rouge(
normalized_candidate, normalized_reference
)


def _calculate_character_level_rouge(candidate: str, reference: str):
"""Calculates character-level ROUGE-1 score for non-space-separated text.

Args:
candidate: The candidate text (already normalized).
reference: The reference text (already normalized).

Returns:
A Score namedtuple with precision, recall, and fmeasure.
"""

if not reference or not candidate:
return Score(precision=0.0, recall=0.0, fmeasure=0.0)

# Count character occurrences
ref_chars = Counter(reference)
cand_chars = Counter(candidate)

# Calculate overlapping characters
overlap = sum((ref_chars & cand_chars).values())

# Calculate precision and recall
precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
recall = overlap / len(reference) if len(reference) > 0 else 0.0
Comment on lines +160 to +161
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The checks `if len(candidate) > 0` and `if len(reference) > 0` are redundant. The guard clause on line 149 (`if not reference or not candidate:`) ensures that if this part of the code is reached, both `candidate` and `reference` are non-empty strings, so their lengths will be greater than 0. You can simplify the code by removing these checks.

Suggested change
precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
recall = overlap / len(reference) if len(reference) > 0 else 0.0
precision = overlap / len(candidate)
recall = overlap / len(reference)


# The score method returns a dictionary where keys are the ROUGE types
# and values are Score objects (tuples) with precision, recall, and fmeasure.
scores = scorer.score(reference, candidate)
# Calculate F-measure
if precision + recall > 0:
fmeasure = 2 * (precision * recall) / (precision + recall)
else:
fmeasure = 0.0

return scores["rouge1"]
return Score(precision=precision, recall=recall, fmeasure=fmeasure)
34 changes: 34 additions & 0 deletions src/google/adk/evaluation/text_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Text utilities for evaluation."""

from __future__ import annotations

import unicodedata


def normalize_text(text: str) -> str:
  """Return `text` in Unicode NFC form with surrounding whitespace removed.

  Applying the same canonical composition to both sides of a comparison
  keeps strings that differ only in their Unicode representation from being
  treated as different text.

  Args:
    text: The string to normalize.

  Returns:
    The NFC-normalized string, stripped of leading and trailing whitespace.
  """
  # Compose first, then trim, exactly in that order.
  composed = unicodedata.normalize("NFC", text)
  return composed.strip()
38 changes: 38 additions & 0 deletions tests/unittests/evaluation/test_non_english_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for final_response_match_v1."""

from __future__ import annotations

import pytest


def test_normalization_applied_in_rouge():
  """Identical Thai strings normalize equally and get a perfect ROUGE-1 score."""
  from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
  from google.adk.evaluation.text_utils import normalize_text

  expected_text = "สวัสดี"
  actual_text = "สวัสดี"

  # The two inputs must agree once each has been normalized on its own.
  assert normalize_text(expected_text) == normalize_text(actual_text)

  # The scorer takes the candidate first and the reference second.
  result = _calculate_rouge_1_scores(actual_text, expected_text)

  # Every component of the score should indicate a perfect match.
  for metric in (result.precision, result.recall, result.fmeasure):
    assert metric == pytest.approx(1.0)
Comment on lines +22 to +38
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This test is a good start for verifying the new character-level ROUGE scoring. To make it more robust and prevent future regressions, I recommend expanding the test suite. Consider parameterizing the test function to cover a wider range of scenarios, such as:

  • Partial matches for character-level scoring (e.g., with Thai text where candidate and reference are different).
  • Cases that trigger the word-level scoring logic (e.g., English sentences with spaces).
  • Edge cases like empty strings for candidate and/or reference.
  • A test case with two different Unicode representations of the same string that become equivalent only after NFC normalization.

Loading