feat: Implement GovernancePlugin to filter forbidden emojis from model responses

This commit is contained in:
2026-03-03 18:43:28 +00:00
parent 8722c146af
commit aabbbbe4c4

View File

@@ -0,0 +1,80 @@
"""GovernancePlugin: Guardrails for VAia, the virtual assistant for VA."""
import logging
import re
from google.adk.agents.callback_context import CallbackContext
from google.adk.models import LlmResponse
logger = logging.getLogger(__name__)
FORBIDDEN_EMOJIS = [
"🥵","🔪","🎰","🎲","🃏","😤","🤬","😡","😠","🩸","🧨","🪓","☠️","💀",
"💣","🔫","👗","💦","🍑","🍆","👄","👅","🫦","💩","⚖️","⚔️","✝️","🕍",
"🕌","","🍻","🍸","🥃","🍷","🍺","🚬","👹","👺","👿","😈","🤡","🧙",
"🧛","🔞","🧿","💊",
"💏"
]
class GovernancePlugin:
"""Guardrail executor for VAia requests as a Agent engine callbacks."""
def __init__(self) -> None:
"""Initialize guardrail model (structured output), prompt and emojis patterns."""
self._combined_pattern = self._get_combined_pattern()
def _get_combined_pattern(self):
person_pattern = r"(?:🧑|👩|👨)"
tone_pattern = r"[\U0001F3FB-\U0001F3FF]?"
# Unique pattern that combines all forbidden emojis, including complex ones with skin tones
combined_pattern = re.compile(
rf"{person_pattern}{tone_pattern}\u200d❤?\u200d💋\u200d{person_pattern}{tone_pattern}" # kiss
rf"|{person_pattern}{tone_pattern}\u200d❤?\u200d{person_pattern}{tone_pattern}" # lovers
rf"|🖕[\U0001F3FB-\U0001F3FF]?" # middle finger with all skin tone variations
rf"|{'|'.join(map(re.escape, sorted(FORBIDDEN_EMOJIS, key=len, reverse=True)))}" # simple emojis
rf"|\u200d|\uFE0F" # residual ZWJ and variation selectors
)
return combined_pattern
def _remove_emojis(self, text: str) -> tuple[str, list[str]]:
removed = self._combined_pattern.findall(text)
text = self._combined_pattern.sub("", text)
return text.strip(), removed
def after_model_callback(
self,
callback_context: CallbackContext | None = None,
llm_response: LlmResponse | None = None,
) -> None:
"""Guardrail post-processing.
Remove forbidden emojis from the model response.
"""
try:
text_out = ""
if llm_response and llm_response.content:
content = llm_response.content
parts = getattr(content, "parts", None)
if parts:
part = parts[0]
text_value = getattr(part, "text", "")
if isinstance(text_value, str):
text_out = text_value
if text_out:
new_text, deleted = self._remove_emojis(text_out)
if llm_response and llm_response.content and llm_response.content.parts:
llm_response.content.parts[0].text = new_text
if deleted:
if callback_context:
callback_context.state["removed_emojis"] = deleted
logger.warning(
"Removed forbidden emojis from response: %s",
deleted,
)
except Exception:
logger.exception("Error in after_model_callback")