diff --git a/src/google/adk/dependencies/rouge_scorer.py b/src/google/adk/dependencies/rouge_scorer.py index cc987deb88..d9371ef46a 100644 --- a/src/google/adk/dependencies/rouge_scorer.py +++ b/src/google/adk/dependencies/rouge_scorer.py @@ -15,3 +15,4 @@ from __future__ import annotations from rouge_score import rouge_scorer +from rouge_score import tokenizers diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index 3047922c3f..11525d077d 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -251,6 +251,30 @@ class LlmBackedUserSimulatorCriterion(LlmAsAJudgeCriterion): ) +class RougeScoreCriterion(BaseCriterion): + """Criterion for ROUGE score evaluation with tokenizer options. + + This criterion allows specifying a custom tokenizer for ROUGE-1 + evaluation, particularly useful for CJK languages (Chinese, Japanese, + Korean) where the default tokenizer produces zero scores. + + Note: The `threshold` field is inherited from BaseCriterion but is + IGNORED by RougeEvaluator. Always use EvalMetric.threshold instead. + """ + + tokenizer: Optional[str] = Field( + default=None, + description=( + "Tokenizer for text tokenization. Options:\n" + "- None: Default word-based tokenization (ASCII alphanumeric only).\n" + " Non-ASCII text will produce score=0.0.\n" + "- 'cjk': Character-based tokenization for CJK (Chinese, Japanese,\n" + " Korean) + ASCII alphanumeric. Other scripts (Greek, Cyrillic,\n" + " fullwidth alphanumeric, etc.) are skipped. Stemming is disabled." 
+ ), + ) + + class EvalMetric(EvalBaseModel): """A metric used to evaluate a particular aspect of an eval case.""" diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index fb17fe80eb..fbeae957fa 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -14,29 +14,149 @@ from __future__ import annotations +import logging +import re +from typing import ClassVar +from typing import List from typing import Optional from google.genai import types as genai_types +from pydantic import ValidationError from typing_extensions import override from ..dependencies.rouge_scorer import rouge_scorer +from ..dependencies.rouge_scorer import tokenizers from .eval_case import ConversationScenario from .eval_case import Invocation +from .eval_metrics import BaseCriterion from .eval_metrics import EvalMetric +from .eval_metrics import RougeScoreCriterion from .evaluator import EvalStatus from .evaluator import EvaluationResult from .evaluator import Evaluator from .evaluator import PerInvocationResult +logger = logging.getLogger("google_adk." + __name__) + + +# ============================================================================= +# CJK Character Ranges +# ============================================================================= +# Each range is defined separately for maintainability. +# Order: Han (Chinese/Japanese/Korean) -> Japanese Kana -> Korean Hangul + +CJK_RANGES = ( + "\u4e00-\u9fff" # CJK Unified Ideographs (Han) + "\u3400-\u4dbf" # CJK Extension A (Han) + "\u3040-\u309f" # Hiragana (Japanese) + "\u30a0-\u30ff" # Katakana (Japanese) + "\uac00-\ud7af" # Hangul Syllables (Korean) +) + +# CJK Symbols and Punctuation block (U+3000-U+303F) +# Includes: 。、!?「」『』【】〈〉《》〔〕 etc. +# Note: Fullwidth forms (U+FF00-U+FFEF) are NOT included here. 
+CJK_PUNCTUATION = "\u3000-\u303f" + +CJK_CHAR_PATTERN = re.compile(f"[{CJK_RANGES}]") +CJK_PUNCT_PATTERN = re.compile(f"[{CJK_PUNCTUATION}]") + + +# Regex pattern for tokenization: matches CJK characters or ASCII alphanumeric words +_CJK_TOKEN_PATTERN = re.compile(f"[{CJK_RANGES}]|[a-z0-9]+") + + +def _contains_cjk(text: str) -> bool: + """Check if text contains any CJK characters.""" + return bool(CJK_CHAR_PATTERN.search(text)) if text else False + + +class CJKTokenizer(tokenizers.Tokenizer): + """Character-based tokenizer for CJK + ASCII alphanumeric mixed text. + + This tokenizer is designed for evaluating text in CJK languages + (Chinese, Japanese, Korean) where the default ROUGE tokenizer fails + because it only recognizes ASCII alphanumeric characters. + + Tokenization strategy: + - CJK characters: Each character becomes one token + - ASCII alphanumeric (a-z, 0-9): Word-based tokenization + - CJK punctuation/symbols (U+3000-U+303F): Removed + - All other characters: Skipped (not tokenized) + + Limitations: + - Fullwidth alphanumeric (A-Z, 0-9): Skipped + - Greek, Cyrillic, accented Latin: Skipped + - This is NOT a general multilingual tokenizer + + For morphological analysis, consider language-specific tokenizers + (e.g., MeCab for Japanese). + + Note: Stemming is not applicable to CJK and is always disabled. + """ + + def tokenize(self, text: Optional[str]) -> List[str]: + """Tokenize text with CJK-aware segmentation. + + Args: + text: Input text to tokenize. None or empty string returns []. + + Returns: + List of tokens. CJK characters are individual tokens, + ASCII words are single tokens. + """ + if not text: + return [] + + text = text.lower() + text = CJK_PUNCT_PATTERN.sub(" ", text) + return _CJK_TOKEN_PATTERN.findall(text) + class RougeEvaluator(Evaluator): - """Evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric. + """Evaluates using Rouge_1 metric with optional CJK support. 
Value range for this metric is [0,1], with values closer to 1 more desirable. + + Warning behavior: + When CJK characters are detected but no tokenizer is specified, + a warning is logged. This warning is logged at most ONCE per + RougeEvaluator instance, even if evaluate_invocations() is called + multiple times. """ + criterion_type: ClassVar[type[BaseCriterion]] = RougeScoreCriterion + def __init__(self, eval_metric: EvalMetric): self._eval_metric = eval_metric + # Warning is logged at most once per instance + self._warned_about_cjk = False + + tokenizer: Optional[tokenizers.Tokenizer] = None + use_stemmer = True + + if eval_metric.criterion: + try: + criterion = RougeScoreCriterion.model_validate( + eval_metric.criterion.model_dump() + ) + if criterion.tokenizer == "cjk": + tokenizer = CJKTokenizer() + use_stemmer = False # Stemming not applicable to CJK + except ValidationError: + pass # Different criterion type, ignore + + # Create scorer once for reuse across invocations (performance optimization) + if tokenizer: + self._scorer = rouge_scorer.RougeScorer( + ["rouge1"], use_stemmer=False, tokenizer=tokenizer + ) + self._has_cjk_tokenizer = True + else: + self._scorer = rouge_scorer.RougeScorer( + ["rouge1"], use_stemmer=use_stemmer + ) + self._has_cjk_tokenizer = False @override def evaluate_invocations( @@ -55,8 +175,13 @@ def evaluate_invocations( for actual, expected in zip(actual_invocations, expected_invocations): reference = _get_text_from_content(expected.final_response) response = _get_text_from_content(actual.final_response) - rouge_1_scores = _calculate_rouge_1_scores(response, reference) - score = rouge_1_scores.fmeasure + + # Log warning once if CJK detected without tokenizer + self._maybe_warn_cjk(reference, response) + + # Use pre-created scorer for performance + scores = self._scorer.score(reference, response) + score = scores["rouge1"].fmeasure per_invocation_results.append( PerInvocationResult( actual_invocation=actual, @@ -80,6 +205,21 @@ def 
evaluate_invocations( return EvaluationResult() + def _maybe_warn_cjk(self, reference: str, response: str) -> None: + """Log warning if CJK detected without tokenizer (once per instance).""" + if self._warned_about_cjk: + return + if self._has_cjk_tokenizer: + return + if _contains_cjk(reference) or _contains_cjk(response): + logger.warning( + "CJK characters detected in text but no tokenizer specified. " + "ROUGE scores will likely be 0.0 for CJK text. " + "Consider using RougeScoreCriterion(tokenizer='cjk') for " + "Chinese, Japanese, or Korean language support." + ) + self._warned_about_cjk = True + def _get_text_from_content(content: Optional[genai_types.Content]) -> str: if content and content.parts: @@ -92,25 +232,37 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED -def _calculate_rouge_1_scores(candidate: str, reference: str): +def _calculate_rouge_1_scores( + candidate: str, + reference: str, + tokenizer: Optional[tokenizers.Tokenizer] = None, + use_stemmer: bool = True, +): """Calculates the ROUGE-1 score between a candidate and reference text. ROUGE-1 measures the overlap of unigrams (single words) between the candidate and reference texts. The score is broken down into: - Precision: The proportion of unigrams in the candidate that are also in the - reference. + reference. - Recall: The proportion of unigrams in the reference that are also in the - candidate. + candidate. - F-measure: The harmonic mean of precision and recall. Args: candidate: The generated text to be evaluated. reference: The ground-truth text to compare against. + tokenizer: Custom tokenizer (e.g., CJKTokenizer). None for default. + use_stemmer: Whether to use Porter stemmer. Ignored if tokenizer is set. Returns: - A dictionary containing the ROUGE-1 precision, recall, and f-measure. + A Score object containing the ROUGE-1 precision, recall, and f-measure. 
""" - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + if tokenizer: + scorer = rouge_scorer.RougeScorer( + ["rouge1"], use_stemmer=False, tokenizer=tokenizer + ) + else: + scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. diff --git a/src/google/adk/evaluation/response_evaluator.py b/src/google/adk/evaluation/response_evaluator.py index 3fa3754913..c54618f51d 100644 --- a/src/google/adk/evaluation/response_evaluator.py +++ b/src/google/adk/evaluation/response_evaluator.py @@ -58,6 +58,8 @@ def __init__( " metric_name should be specified." ) + self._eval_metric = eval_metric + if eval_metric: threshold = eval_metric.threshold metric_name = eval_metric.metric_name @@ -82,9 +84,12 @@ def evaluate_invocations( ) -> EvaluationResult: # If the metric is response_match_score, just use the RougeEvaluator. if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value: - rouge_evaluator = RougeEvaluator( - EvalMetric(metric_name=self._metric_name, threshold=self._threshold) - ) + if self._eval_metric: + rouge_evaluator = RougeEvaluator(eval_metric=self._eval_metric) + else: + rouge_evaluator = RougeEvaluator( + EvalMetric(metric_name=self._metric_name, threshold=self._threshold) + ) return rouge_evaluator.evaluate_invocations( actual_invocations, expected_invocations, conversation_scenario ) diff --git a/tests/unittests/evaluation/test_final_response_match_v1.py b/tests/unittests/evaluation/test_final_response_match_v1.py index eef35d86d6..7bc8b923bc 100644 --- a/tests/unittests/evaluation/test_final_response_match_v1.py +++ b/tests/unittests/evaluation/test_final_response_match_v1.py @@ -139,3 +139,208 @@ def test_rouge_evaluator_multiple_invocations( expected_score, rel=1e-3 ) assert evaluation_result.overall_eval_status == expected_status + + +# 
============================================================================= +# CJK Tokenizer Tests (Issue #4122) +# ============================================================================= + +import logging + +from google.adk.evaluation.eval_metrics import RougeScoreCriterion +from google.adk.evaluation.final_response_match_v1 import _contains_cjk +from google.adk.evaluation.final_response_match_v1 import CJKTokenizer + + +class TestCJKTokenizer: + """Tests for CJKTokenizer tokenization behavior.""" + + def test_tokenize_japanese(self): + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("これはテスト") + assert tokens == ["こ", "れ", "は", "テ", "ス", "ト"] + + def test_tokenize_english(self): + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("This is a test") + assert tokens == ["this", "is", "a", "test"] + + def test_tokenize_mixed_cjk_and_ascii(self): + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("Hello世界World") + assert tokens == ["hello", "世", "界", "world"] + + def test_tokenize_fullwidth_alphanumeric_skipped(self): + """Fullwidth alphanumeric should be skipped.""" + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("ＡＢＣ１２３") + assert tokens == [] + + def test_tokenize_greek_skipped(self): + """Greek and other non-CJK scripts should be skipped.""" + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("αβγtest") + assert tokens == ["test"] + + def test_tokenize_empty_string(self): + tokenizer = CJKTokenizer() + assert tokenizer.tokenize("") == [] + + def test_tokenize_none(self): + """None input should return empty list.""" + tokenizer = CJKTokenizer() + assert tokenizer.tokenize(None) == [] + + def test_tokenize_chinese(self): + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("这是测试") + assert tokens == ["这", "是", "测", "试"] + + def test_tokenize_korean(self): + tokenizer = CJKTokenizer() + tokens = tokenizer.tokenize("테스트") + assert len(tokens) == 3 # 3 Hangul syllables + + +class TestContainsCJK: + """Tests for
_contains_cjk helper function.""" + + def test_contains_cjk_japanese(self): + assert _contains_cjk("これはテスト") is True + + def test_contains_cjk_english(self): + assert _contains_cjk("This is a test") is False + + def test_contains_cjk_mixed(self): + assert _contains_cjk("Hello世界") is True + + def test_contains_cjk_empty(self): + assert _contains_cjk("") is False + + def test_contains_cjk_none(self): + assert _contains_cjk(None) is False + + +class TestRougeScoreWithCJKTokenizer: + """Tests for ROUGE score calculation with CJK tokenizer.""" + + def test_english_identical_default_tokenizer(self): + """English identical text should score 1.0 with default tokenizer.""" + result = self._evaluate("This is a test", "This is a test", None) + assert result.overall_score == pytest.approx(1.0) + + def test_english_partial_default_tokenizer(self): + """English partial match should score between 0 and 1.""" + result = self._evaluate("This is test", "This is a test", None) + assert 0 < result.overall_score < 1 + + def test_japanese_without_tokenizer_scores_zero(self): + """Japanese text without CJK tokenizer should score 0.0.""" + result = self._evaluate("これはテスト", "これはテスト", None) + assert result.overall_score == pytest.approx(0.0) + + def test_japanese_identical_with_cjk_tokenizer(self): + """Japanese identical text with CJK tokenizer should score 1.0.""" + result = self._evaluate("これはテスト", "これはテスト", "cjk") + assert result.overall_score == pytest.approx(1.0) + + def test_japanese_partial_with_cjk_tokenizer(self): + """Japanese partial match should score between 0 and 1.""" + result = self._evaluate("これはテスト", "これはサンプル", "cjk") + assert 0 < result.overall_score < 1 + + def test_chinese_identical_with_cjk_tokenizer(self): + """Chinese identical text with CJK tokenizer should score 1.0.""" + result = self._evaluate("这是测试", "这是测试", "cjk") + assert result.overall_score == pytest.approx(1.0) + + def test_mixed_text_identical_with_cjk_tokenizer(self): + """Mixed CJK+ASCII identical text 
should score 1.0.""" + result = self._evaluate("Hello世界", "Hello世界", "cjk") + assert result.overall_score == pytest.approx(1.0) + + def test_cjk_punctuation_does_not_affect_score(self): + """CJK punctuation should be removed, not affecting score.""" + result_with = self._evaluate("これはテスト。", "これはテスト", "cjk") + result_without = self._evaluate("これはテスト", "これはテスト", "cjk") + assert result_with.overall_score == pytest.approx(1.0) + assert result_without.overall_score == pytest.approx(1.0) + + def _evaluate(self, candidate: str, reference: str, tokenizer_type: str): + """Helper to evaluate ROUGE score.""" + criterion = None + if tokenizer_type: + criterion = RougeScoreCriterion(threshold=0.8, tokenizer=tokenizer_type) + + eval_metric = EvalMetric( + metric_name="response_match_score", + threshold=0.8, + criterion=criterion, + ) + evaluator = RougeEvaluator(eval_metric=eval_metric) + + actual, expected = _create_test_invocations(candidate, reference) + + return evaluator.evaluate_invocations([actual], [expected]) + + +class TestCJKWarning: + """Tests for CJK detection warning behavior.""" + + def test_warning_logged_once_for_multiple_evaluations(self, caplog): + """Warning should be logged exactly once per evaluator instance.""" + eval_metric = EvalMetric( + metric_name="response_match_score", + threshold=0.8, + ) + evaluator = RougeEvaluator(eval_metric=eval_metric) + + actual1, expected1 = _create_test_invocations( + "これはテスト", "これはテスト" + ) + actual2, expected2 = _create_test_invocations("別のテスト", "別のテスト") + + with caplog.at_level(logging.WARNING): + # First evaluation with CJK - should trigger warning + evaluator.evaluate_invocations([actual1], [expected1]) + # Second evaluation with CJK - should NOT trigger warning + evaluator.evaluate_invocations([actual2], [expected2]) + + cjk_warnings = [r for r in caplog.records if "CJK" in r.message] + assert len(cjk_warnings) == 1 + + def test_no_warning_when_cjk_tokenizer_specified(self, caplog): + """No warning when CJK tokenizer 
is properly specified.""" + criterion = RougeScoreCriterion(threshold=0.8, tokenizer="cjk") + eval_metric = EvalMetric( + metric_name="response_match_score", + threshold=0.8, + criterion=criterion, + ) + evaluator = RougeEvaluator(eval_metric=eval_metric) + + actual, expected = _create_test_invocations("これはテスト", "これはテスト") + + with caplog.at_level(logging.WARNING): + evaluator.evaluate_invocations([actual], [expected]) + + cjk_warnings = [r for r in caplog.records if "CJK" in r.message] + assert len(cjk_warnings) == 0 + + def test_no_warning_for_english_text(self, caplog): + """No warning for ASCII-only text.""" + eval_metric = EvalMetric( + metric_name="response_match_score", + threshold=0.8, + ) + evaluator = RougeEvaluator(eval_metric=eval_metric) + + actual, expected = _create_test_invocations( + "This is a test", "This is a test" + ) + + with caplog.at_level(logging.WARNING): + evaluator.evaluate_invocations([actual], [expected]) + + cjk_warnings = [r for r in caplog.records if "CJK" in r.message] + assert len(cjk_warnings) == 0