feat_dev(guardrail): externalize labels and tighten censorship logic
Some checks failed
CI / ci (pull_request) Failing after 12s

This commit is contained in:
2026-03-13 00:24:51 +00:00
parent 6ce548e718
commit c244b35e00
4 changed files with 49 additions and 31 deletions

View File

@@ -13,6 +13,7 @@ mcp_audience: "https://ap01194-orq-cog-rag-connector-1007577023101.us-central1.r
agent_name: VAia agent_name: VAia
agent_model: gemini-2.5-flash agent_model: gemini-2.5-flash
agent_instructions: | agent_instructions: |
Eres VAia, el asistente virtual de VA en WhatsApp. VA es la opción digital de Banorte para los jóvenes. Fuiste creado por el equipo de inteligencia artificial de Banorte. Tu rol es resolver dudas sobre educación financiera y los productos/servicios de VA. Hablas como un amigo que sabe de finanzas: siempre vas directo al grano, con calidez y sin rodeos. Eres VAia, el asistente virtual de VA en WhatsApp. VA es la opción digital de Banorte para los jóvenes. Fuiste creado por el equipo de inteligencia artificial de Banorte. Tu rol es resolver dudas sobre educación financiera y los productos/servicios de VA. Hablas como un amigo que sabe de finanzas: siempre vas directo al grano, con calidez y sin rodeos.
@@ -50,6 +51,13 @@ agent_instructions: |
El teléfono de centro de contacto de VA es: +52 1 55 5140 5655 El teléfono de centro de contacto de VA es: +52 1 55 5140 5655
# Guardrail config
guardrail_censored_user_message: "[pregunta mala]"
guardrail_censored_model_response: "[respuesta de adversidad]"
guardrail_blocked_label: "[GUARDRAIL_BLOCKED]"
guardrail_passed_label: "[GUARDRAIL_PASSED]"
guardrail_error_label: "[GUARDRAIL_ERROR]"
guardrail_instruction: | guardrail_instruction: |
Eres una capa de seguridad y protección de marca para VAia, el asistente virtual de VA en WhatsApp. Eres una capa de seguridad y protección de marca para VAia, el asistente virtual de VA en WhatsApp.
VAia es un asistente de educación financiera y productos/servicios de VA (la opción digital de Banorte para jóvenes) VAia es un asistente de educación financiera y productos/servicios de VA (la opción digital de Banorte para jóvenes)

View File

@@ -21,9 +21,16 @@ class AgentSettings(BaseSettings):
# Agent configuration # Agent configuration
agent_name: str agent_name: str
agent_instructions: str
guardrail_instruction: str
agent_model: str agent_model: str
agent_instructions: str
# Guardrail configuration
guardrail_censored_user_message: str
guardrail_censored_model_response: str
guardrail_blocked_label: str
guardrail_passed_label: str
guardrail_error_label: str
guardrail_instruction: str
# Firestore configuration # Firestore configuration
firestore_db: str firestore_db: str

View File

@@ -171,19 +171,19 @@ class GovernancePlugin:
if decision == "unsafe": if decision == "unsafe":
callback_context.state["guardrail_blocked"] = True callback_context.state["guardrail_blocked"] = True
callback_context.state["guardrail_message"] = "[GUARDRAIL_BLOCKED]" callback_context.state["guardrail_message"] = settings.guardrail_blocked_label
callback_context.state["guardrail_reasoning"] = reasoning callback_context.state["guardrail_reasoning"] = reasoning
return LlmResponse( return LlmResponse(
content=Content(role="model", parts=[Part(text=blocking_response)]), content=Content(role="model", parts=[Part(text=blocking_response)]),
usage_metadata=resp.usage_metadata or None, usage_metadata=resp.usage_metadata or None,
) )
callback_context.state["guardrail_blocked"] = False callback_context.state["guardrail_blocked"] = False
callback_context.state["guardrail_message"] = "[GUARDRAIL_PASSED]" callback_context.state["guardrail_message"] = settings.guardrail_passed_label
callback_context.state["guardrail_reasoning"] = reasoning callback_context.state["guardrail_reasoning"] = reasoning
except Exception: except Exception:
# Fail safe: block with a generic error response and mark the reason # Fail safe: block with a generic error response and mark the reason
callback_context.state["guardrail_message"] = "[GUARDRAIL_ERROR]" callback_context.state["guardrail_message"] = settings.guardrail_error_label
logger.exception("Guardrail check failed") logger.exception("Guardrail check failed")
return LlmResponse( return LlmResponse(
content=Content( content=Content(

View File

@@ -25,12 +25,13 @@ from google.cloud.firestore_v1.field_path import FieldPath
from google.genai.types import Content, Part from google.genai.types import Content, Part
from .compaction import SessionCompactor from .compaction import SessionCompactor
from .config import settings
if TYPE_CHECKING: if TYPE_CHECKING:
from google import genai from google import genai
from google.cloud.firestore_v1.async_client import AsyncClient from google.cloud.firestore_v1.async_client import AsyncClient
logger = logging.getLogger("google_adk." + __name__) logger = logging.getLogger(__name__)
class FirestoreSessionService(BaseSessionService): class FirestoreSessionService(BaseSessionService):
@@ -382,7 +383,7 @@ class FirestoreSessionService(BaseSessionService):
# Determine if we need to censor this event (model response when guardrail blocked) # Determine if we need to censor this event (model response when guardrail blocked)
should_censor_model = ( should_censor_model = (
session.state.get("guardrail_blocked", False) session.state.get("guardrail_blocked", False)
and event.author == app_name and event.author != "user"
and hasattr(event, "content") and hasattr(event, "content")
and event.content and event.content
and event.content.parts and event.content.parts
@@ -396,34 +397,36 @@ class FirestoreSessionService(BaseSessionService):
# Create a censored version of the model response # Create a censored version of the model response
event_to_save = copy.deepcopy(event) event_to_save = copy.deepcopy(event)
event_to_save.content.parts[0].text = "[respuesta de adversidad]" event_to_save.content.parts[0].text = settings.guardrail_censored_model_response
event_data = event_to_save.model_dump(mode="json", exclude_none=True) event_data = event_to_save.model_dump(mode="json", exclude_none=True)
# Also censor the previous user message in Firestore # Also censor the previous user message in Firestore
# Find the last user event in the session (skip the current model event we just added) # Find the last user event in the session
for i in range(len(session.events) - 2, -1, -1): prev_user_event = next(
prev_event = session.events[i] (
if ( e
prev_event.author == "user" for e in reversed(session.events[:-1])
and prev_event.content if e.author == "user" and e.content and e.content.parts
and prev_event.content.parts ),
): None,
# Update this event in Firestore with censored content )
censored_user_content = Content( if prev_user_event:
role="user", parts=[Part(text="[pregunta mala]")] # Update this event in Firestore with censored content
censored_user_content = Content(
role="user",
parts=[Part(text=settings.guardrail_censored_user_message)],
)
await (
self._events_col(app_name, user_id, session_id)
.document(prev_user_event.id)
.update(
{
"content": censored_user_content.model_dump(
mode="json", exclude_none=True
)
}
) )
await ( )
self._events_col(app_name, user_id, session_id)
.document(prev_event.id)
.update(
{
"content": censored_user_content.model_dump(
mode="json", exclude_none=True
)
}
)
)
break
else: else:
event_data = event.model_dump(mode="json", exclude_none=True) event_data = event.model_dump(mode="json", exclude_none=True)