1 change: 1 addition & 0 deletions src/google/adk/dependencies/rouge_scorer.py
@@ -15,3 +15,4 @@
from __future__ import annotations

from rouge_score import rouge_scorer
from rouge_score import tokenizers
24 changes: 24 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
@@ -251,6 +251,30 @@ class LlmBackedUserSimulatorCriterion(LlmAsAJudgeCriterion):
)


class RougeScoreCriterion(BaseCriterion):
  """Criterion for ROUGE score evaluation with tokenizer options.

  This criterion allows specifying a custom tokenizer for ROUGE-1
  evaluation, particularly useful for CJK languages (Chinese, Japanese,
  Korean) where the default tokenizer produces zero scores.

  Note: The `threshold` field is inherited from BaseCriterion but is
  IGNORED by RougeEvaluator. Always use EvalMetric.threshold instead.
  """

  tokenizer: Optional[str] = Field(
      default=None,
      description=(
          "Tokenizer for text tokenization. Options:\n"
          "- None: Default word-based tokenization (ASCII alphanumeric only).\n"
          "  Non-ASCII text will produce score=0.0.\n"
          "- 'cjk': Character-based tokenization for CJK (Chinese, Japanese,\n"
          "  Korean) + ASCII alphanumeric. Other scripts (Greek, Cyrillic,\n"
          "  fullwidth alphanumeric, etc.) are skipped. Stemming is disabled."
      ),
  )


class EvalMetric(EvalBaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""

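For reviewers trying out the new field: a minimal configuration sketch. It assumes the import path follows this PR's package layout and that `"response_match_score"` is the prebuilt ROUGE metric name (both taken from this diff); treat anything else as illustrative.

```python
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import RougeScoreCriterion

# Default behavior: word-based tokenization with Porter stemming.
ascii_metric = EvalMetric(
    metric_name="response_match_score",
    threshold=0.8,
)

# CJK-aware behavior: per-character tokenization, stemming disabled.
# Note: EvalMetric.threshold is the one that counts; the threshold field
# RougeScoreCriterion inherits from BaseCriterion is ignored by RougeEvaluator.
cjk_metric = EvalMetric(
    metric_name="response_match_score",
    threshold=0.8,
    criterion=RougeScoreCriterion(tokenizer="cjk"),
)
```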
168 changes: 160 additions & 8 deletions src/google/adk/evaluation/final_response_match_v1.py
@@ -14,29 +14,149 @@

from __future__ import annotations

import logging
import re
from typing import ClassVar
from typing import List
from typing import Optional

from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override

from ..dependencies.rouge_scorer import rouge_scorer
from ..dependencies.rouge_scorer import tokenizers
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalMetric
from .eval_metrics import RougeScoreCriterion
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult

logger = logging.getLogger("google_adk." + __name__)


# =============================================================================
# CJK Character Ranges
# =============================================================================
# Each range is defined separately for maintainability.
# Order: Han (Chinese/Japanese/Korean) -> Japanese Kana -> Korean Hangul

CJK_RANGES = (
    "\u4e00-\u9fff"  # CJK Unified Ideographs (Han)
    "\u3400-\u4dbf"  # CJK Extension A (Han)
    "\u3040-\u309f"  # Hiragana (Japanese)
    "\u30a0-\u30ff"  # Katakana (Japanese)
    "\uac00-\ud7af"  # Hangul Syllables (Korean)
)

# CJK Symbols and Punctuation block (U+3000-U+303F)
# Includes: 。、!?「」『』【】〈〉《》〔〕 etc.
# Note: Fullwidth forms (U+FF00-U+FFEF) are NOT included here.
CJK_PUNCTUATION = "\u3000-\u303f"

CJK_CHAR_PATTERN = re.compile(f"[{CJK_RANGES}]")
CJK_PUNCT_PATTERN = re.compile(f"[{CJK_PUNCTUATION}]")


# Regex pattern for tokenization: matches CJK characters or ASCII alphanumeric words
_CJK_TOKEN_PATTERN = re.compile(f"[{CJK_RANGES}]|[a-z0-9]+")


def _contains_cjk(text: str) -> bool:
  """Check if text contains any CJK characters."""
  return bool(CJK_CHAR_PATTERN.search(text)) if text else False
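As a quick sanity check of what these ranges cover (illustrative assertions; the behavior follows directly from the patterns above):

```python
# Han, Kana, and Hangul match; Latin and CJK punctuation do not.
assert _contains_cjk("猫")      # Han (Chinese/Japanese)
assert _contains_cjk("ねこ")    # Hiragana (Japanese)
assert _contains_cjk("고양이")  # Hangul (Korean)
assert not _contains_cjk("cat")
assert not _contains_cjk("。")  # punctuation lives in a separate pattern
assert not _contains_cjk("")    # empty input short-circuits to False
```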


class CJKTokenizer(tokenizers.Tokenizer):
  """Character-based tokenizer for CJK + ASCII alphanumeric mixed text.

  This tokenizer is designed for evaluating text in CJK languages
  (Chinese, Japanese, Korean) where the default ROUGE tokenizer fails
  because it only recognizes ASCII alphanumeric characters.

  Tokenization strategy:
  - CJK characters: Each character becomes one token
  - ASCII alphanumeric (a-z, 0-9): Word-based tokenization
  - CJK punctuation/symbols (U+3000-U+303F): Removed
  - All other characters: Skipped (not tokenized)

  Limitations:
  - Fullwidth alphanumeric (Ａ-Ｚ, ０-９): Skipped
  - Greek, Cyrillic, accented Latin: Skipped
  - This is NOT a general multilingual tokenizer

  For morphological analysis, consider language-specific tokenizers
  (e.g., MeCab for Japanese).

  Note: Stemming is not applicable to CJK and is always disabled.
  """

  def tokenize(self, text: Optional[str]) -> List[str]:
    """Tokenize text with CJK-aware segmentation.

    Args:
      text: Input text to tokenize. None or empty string returns [].

    Returns:
      List of tokens. CJK characters are individual tokens,
      ASCII words are single tokens.
    """
    if not text:
      return []

    text = text.lower()
    text = CJK_PUNCT_PATTERN.sub(" ", text)
    return _CJK_TOKEN_PATTERN.findall(text)
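A quick usage sketch of the tokenizer on mixed text (illustrative only; the output follows from the token pattern above):

```python
tok = CJKTokenizer()

# CJK characters become single-character tokens; ASCII runs stay whole words
# (lowercased first, since the pattern only matches a-z and 0-9).
print(tok.tokenize("ADKで評価する"))
# -> ['adk', 'で', '評', '価', 'す', 'る']

# CJK punctuation is stripped; unsupported scripts (e.g., Greek) are skipped.
print(tok.tokenize("今日は、α-test 123。"))
# -> ['今', '日', 'は', 'test', '123']
```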


class RougeEvaluator(Evaluator):
  """Evaluates using Rouge_1 metric with optional CJK support.

  Value range for this metric is [0,1], with values closer to 1 more desirable.

  Warning behavior:
    When CJK characters are detected but no tokenizer is specified,
    a warning is logged. This warning is logged at most ONCE per
    RougeEvaluator instance, even if evaluate_invocations() is called
    multiple times.
  """

  criterion_type: ClassVar[type[BaseCriterion]] = RougeScoreCriterion

  def __init__(self, eval_metric: EvalMetric):
    self._eval_metric = eval_metric
    # Warning is logged at most once per instance
    self._warned_about_cjk = False

    tokenizer: Optional[tokenizers.Tokenizer] = None
    use_stemmer = True

    if eval_metric.criterion:
      try:
        criterion = RougeScoreCriterion.model_validate(
            eval_metric.criterion.model_dump()
        )
        if criterion.tokenizer == "cjk":
          tokenizer = CJKTokenizer()
          use_stemmer = False  # Stemming not applicable to CJK
      except ValidationError:
        pass  # Different criterion type, ignore

    # Create scorer once for reuse across invocations (performance optimization)
    if tokenizer:
      self._scorer = rouge_scorer.RougeScorer(
          ["rouge1"], use_stemmer=False, tokenizer=tokenizer
      )
      self._has_cjk_tokenizer = True
    else:
      self._scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer)
      self._has_cjk_tokenizer = False
  @override
  def evaluate_invocations(
@@ -55,8 +175,13 @@ def evaluate_invocations(
    for actual, expected in zip(actual_invocations, expected_invocations):
      reference = _get_text_from_content(expected.final_response)
      response = _get_text_from_content(actual.final_response)

      # Log warning once if CJK detected without tokenizer
      self._maybe_warn_cjk(reference, response)

      # Use pre-created scorer for performance
      scores = self._scorer.score(reference, response)
      score = scores["rouge1"].fmeasure
      per_invocation_results.append(
          PerInvocationResult(
              actual_invocation=actual,
@@ -80,6 +205,21 @@

    return EvaluationResult()

  def _maybe_warn_cjk(self, reference: str, response: str) -> None:
    """Log warning if CJK detected without tokenizer (once per instance)."""
    if self._warned_about_cjk:
      return
    if self._has_cjk_tokenizer:
      return
    if _contains_cjk(reference) or _contains_cjk(response):
      logger.warning(
          "CJK characters detected in text but no tokenizer specified. "
          "ROUGE scores will likely be 0.0 for CJK text. "
          "Consider using RougeScoreCriterion(tokenizer='cjk') for "
          "Chinese, Japanese, or Korean language support."
      )
      self._warned_about_cjk = True
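Putting the class together: a hedged sketch of how criterion selection drives scorer construction in `__init__` above (it peeks at a private attribute purely for illustration):

```python
# With a CJK criterion, the evaluator builds a CJKTokenizer-backed scorer.
metric = EvalMetric(
    metric_name="response_match_score",
    threshold=0.8,
    criterion=RougeScoreCriterion(tokenizer="cjk"),
)
evaluator = RougeEvaluator(metric)
assert evaluator._has_cjk_tokenizer  # private; shown only for illustration

# Without a criterion, the default word tokenizer + Porter stemmer is used,
# and the first CJK input seen will trigger the one-time warning above.
plain = RougeEvaluator(
    EvalMetric(metric_name="response_match_score", threshold=0.8)
)
assert not plain._has_cjk_tokenizer
```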


def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
  if content and content.parts:
@@ -92,25 +232,37 @@ def _get_eval_status(score: float, threshold: float):
  return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


def _calculate_rouge_1_scores(
    candidate: str,
    reference: str,
    tokenizer: Optional[tokenizers.Tokenizer] = None,
    use_stemmer: bool = True,
):
  """Calculates the ROUGE-1 score between a candidate and reference text.

  ROUGE-1 measures the overlap of unigrams (single words) between the
  candidate and reference texts. The score is broken down into:
  - Precision: The proportion of unigrams in the candidate that are also in
    the reference.
  - Recall: The proportion of unigrams in the reference that are also in
    the candidate.
  - F-measure: The harmonic mean of precision and recall.

  Args:
    candidate: The generated text to be evaluated.
    reference: The ground-truth text to compare against.
    tokenizer: Custom tokenizer (e.g., CJKTokenizer). None for default.
    use_stemmer: Whether to use the Porter stemmer. Ignored if tokenizer is
      set.

  Returns:
    A Score object containing the ROUGE-1 precision, recall, and f-measure.
  """
  if tokenizer:
    scorer = rouge_scorer.RougeScorer(
        ["rouge1"], use_stemmer=False, tokenizer=tokenizer
    )
  else:
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer)

  # The score method returns a dictionary where keys are the ROUGE types
  # and values are Score objects (tuples) with precision, recall, and fmeasure.
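To make the scoring concrete, here is a hand-computed ROUGE-1 example under the CJK tokenizer. It assumes, per the docstring above, that the helper returns the rouge1 Score object (the return statement itself is collapsed in this diff):

```python
# Reference 今日は良い天気 -> 7 character tokens; candidate 今日は天気 -> 5.
# Overlap = 5 unigrams, so precision = 5/5 = 1.0, recall = 5/7 ≈ 0.714,
# f-measure = 2PR / (P + R) = 10/12 ≈ 0.833.
score = _calculate_rouge_1_scores(
    candidate="今日は天気",
    reference="今日は良い天気",
    tokenizer=CJKTokenizer(),
)
print(round(score.fmeasure, 3))  # ~0.833

# With the default tokenizer, both sides tokenize to [] and the score is 0.0,
# which is exactly the failure mode this PR addresses.
print(_calculate_rouge_1_scores("今日は天気", "今日は良い天気").fmeasure)  # 0.0
```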
11 changes: 8 additions & 3 deletions src/google/adk/evaluation/response_evaluator.py
@@ -58,6 +58,8 @@ def __init__(
" metric_name should be specified."
)

self._eval_metric = eval_metric

if eval_metric:
threshold = eval_metric.threshold
metric_name = eval_metric.metric_name
@@ -82,9 +84,12 @@
  ) -> EvaluationResult:
    # If the metric is response_match_score, just use the RougeEvaluator.
    if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
      if self._eval_metric:
        rouge_evaluator = RougeEvaluator(eval_metric=self._eval_metric)
      else:
        rouge_evaluator = RougeEvaluator(
            EvalMetric(metric_name=self._metric_name, threshold=self._threshold)
        )
      return rouge_evaluator.evaluate_invocations(
          actual_invocations, expected_invocations, conversation_scenario
      )
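The net effect of this file's change: when a full EvalMetric (including its criterion) is available, ResponseEvaluator forwards it instead of reconstructing a bare metric, so the tokenizer setting survives the hand-off. A hedged end-to-end sketch; ResponseEvaluator's full constructor signature is not shown in this diff, so treat the call as illustrative:

```python
metric = EvalMetric(
    metric_name="response_match_score",
    threshold=0.8,
    criterion=RougeScoreCriterion(tokenizer="cjk"),
)
evaluator = ResponseEvaluator(eval_metric=metric)
# evaluate_invocations() now delegates to RougeEvaluator(eval_metric=metric),
# preserving the CJK criterion rather than silently dropping it.
```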