google · robinpats182 · Jan 12, 2026 · Jan 13, 2026 · Jan 14, 2026 · gemini-code-assist
diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,8 @@
 
 from __future__ import annotations
 
+from collections import Counter
+from collections import namedtuple
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -27,6 +29,7 @@
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
+from .text_utils import normalize_text
 
 
 class RougeEvaluator(Evaluator):
@@ -46,7 +49,7 @@ def evaluate_invocations(
       conversation_scenario: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     if expected_invocations is None:
-      raise ValueError("expected_invocations is required for this metric.")
+      raise ValueError('expected_invocations is required for this metric.')
     del conversation_scenario  # not used by this metric.
 
     total_score = 0.0
@@ -83,15 +86,18 @@ def evaluate_invocations(
 
 def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
   if content and content.parts:
-    return "\n".join([part.text for part in content.parts if part.text])
+    return '\n'.join([part.text for part in content.parts if part.text])
 
-  return ""
+  return ''
 
 
 def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -108,12 +114,56 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
       reference: The ground-truth text to compare against.
 
   Returns:
-      A dictionary containing the ROUGE-1 precision, recall, and f-measure.
+      A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  # Normalize both texts before scoring to handle Unicode variations
+  normalized_candidate = normalize_text(candidate)
+  normalized_reference = normalize_text(reference)
+
+  # Check if the text contains spaces (word-separated languages)
+  has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate
+
+  if has_spaces:
+    # Use standard word-level ROUGE for space-separated languages
+    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
+    scores = scorer.score(normalized_reference, normalized_candidate)
+    return scores['rouge1']
+  else:
+    # For non-space-separated languages, use character-level comparison
+    return _calculate_character_level_rouge(
+        normalized_candidate, normalized_reference
+    )
+
+
+def _calculate_character_level_rouge(candidate: str, reference: str):
+  """Calculates character-level ROUGE-1 score for non-space-separated text.
+
+  Args:
+    candidate: The candidate text (already normalized).
+    reference: The reference text (already normalized).
+
+  Returns:
+    A Score namedtuple with precision, recall, and fmeasure.
+  """
+
+  if not reference or not candidate:
+    return Score(precision=0.0, recall=0.0, fmeasure=0.0)
+
+  # Count character occurrences
+  ref_chars = Counter(reference)
+  cand_chars = Counter(candidate)
+
+  # Calculate overlapping characters
+  overlap = sum((ref_chars & cand_chars).values())
+
+  # Calculate precision and recall
+  precision = overlap / len(candidate)
+  recall = overlap / len(reference)
 
-  # The score method returns a dictionary where keys are the ROUGE types
-  # and values are Score objects (tuples) with precision, recall, and fmeasure.
-  scores = scorer.score(reference, candidate)
+  # Calculate F-measure
+  if precision + recall > 0:
+    fmeasure = 2 * (precision * recall) / (precision + recall)
+  else:
+    fmeasure = 0.0
 
-  return scores["rouge1"]
+  return Score(precision=precision, recall=recall, fmeasure=fmeasure)
diff --git a/src/google/adk/evaluation/text_utils.py b/src/google/adk/evaluation/text_utils.py
@@ -0,0 +1,34 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Text utilities for evaluation."""
+
+from __future__ import annotations
+
+import unicodedata
+
+
+def normalize_text(text: str) -> str:
+  """Normalize text to Unicode NFC form and strip surrounding whitespace.
+
+  This ensures consistent text comparison across different Unicode
+  representations, which is particularly important for non-English text.
+
+  Args:
+    text: The text to normalize.
+
+  Returns:
+    The normalized text.
+  """
+  return unicodedata.normalize("NFC", text).strip()
diff --git a/tests/unittests/evaluation/test_non_english_eval.py b/tests/unittests/evaluation/test_non_english_eval.py
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for final_response_match_v1."""
+
+from __future__ import annotations
+
+import pytest
+
+
+def test_normalization_applied_in_rouge():
+  """Normalization should make identical Thai strings match."""
+  from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
+  from google.adk.evaluation.text_utils import normalize_text
+
+  reference = "สวัสดี"
+  candidate = "สวัสดี"
+
+  # Verify normalization directly
+  assert normalize_text(reference) == normalize_text(candidate)
+
+  # Verify ROUGE score reflects a perfect match
+  score = _calculate_rouge_1_scores(candidate, reference)
+
+  assert score.precision == pytest.approx(1.0)
+  assert score.recall == pytest.approx(1.0)
+  assert score.fmeasure == pytest.approx(1.0)