WIP: feature: Add before Guardrail #26

Draft
A8080816 wants to merge 16 commits from feature/before-guardrail into main
Showing only changes of commit db9400fcf3 - Show all commits

View File

@@ -1,5 +1,6 @@
# ruff: noqa: E501
"""GovernancePlugin: Guardrails for VAia, the virtual assistant for VA."""
import json
import logging
import re
@@ -22,10 +23,56 @@ logger = logging.getLogger(__name__)
FORBIDDEN_EMOJIS = [
"🥵","🔪","🎰","🎲","🃏","😤","🤬","😡","😠","🩸","🧨","🪓","☠️","💀",
"💣","🔫","👗","💦","🍑","🍆","👄","👅","🫦","💩","⚖️","⚔️","✝️","🕍",
"🕌","","🍻","🍸","🥃","🍷","🍺","🚬","👹","👺","👿","😈","🤡","🧙",
"🧙‍♀️", "🧙‍♂️", "🧛", "🧛‍♀️", "🧛‍♂️", "🔞","🧿","💊"
"🥵",
"🔪",
"🎰",
"🎲",
"🃏",
"😤",
"🤬",
"😡",
"😠",
"🩸",
"🧨",
"🪓",
"☠️",
"💀",
"💣",
"🔫",
"👗",
"💦",
"🍑",
"🍆",
"👄",
"👅",
"🫦",
"💩",
"⚖️",
"⚔️",
"✝️",
"🕍",
"🕌",
"",
"🍻",
"🍸",
"🥃",
"🍷",
"🍺",
"🚬",
"👹",
"👺",
"👿",
"😈",
"🤡",
"🧙",
"🧙‍♀️",
"🧙‍♂️",
"🧛",
"🧛‍♀️",
"🧛‍♂️",
"🔞",
"🧿",
"💊",
]
@@ -37,12 +84,11 @@ class GuardrailOutput(BaseModel):
description="Decision for the user prompt",
)
reasoning: str | None = Field(
default=None,
description="Optional reasoning for the decision"
default=None, description="Optional reasoning for the decision"
)
blocking_response: str | None = Field(
default=None,
description="Optional custom blocking response to return to the user if unsafe"
description="Optional custom blocking response to return to the user if unsafe",
)
@@ -54,7 +100,7 @@ class GovernancePlugin:
self.guardrail_llm = Client(
vertexai=True,
project=settings.google_cloud_project,
location=settings.google_cloud_location
location=settings.google_cloud_location,
)
_guardrail_instruction = """
Eres una capa de seguridad y protección de marca para VAia, el asistente virtual de VA en WhatsApp.
@@ -85,9 +131,9 @@ Devuelve un JSON con la siguiente estructura:
_schema = GuardrailOutput.model_json_schema()
# Force strict JSON output from the guardrail LLM
self._guardrail_gen_config = GenerateContentConfig(
system_instruction = _guardrail_instruction,
response_mime_type = "application/json",
response_schema = _schema,
system_instruction=_guardrail_instruction,
response_mime_type="application/json",
response_schema=_schema,
max_output_tokens=1000,
temperature=0.1,
)
@@ -106,7 +152,6 @@ Devuelve un JSON con la siguiente estructura:
rf"|🖕{tone_pattern}" # middle finger with all skin tone variations
)
def _remove_emojis(self, text: str) -> tuple[str, list[str]]:
removed = self._combined_pattern.findall(text)
text = self._combined_pattern.sub("", text)
@@ -139,8 +184,7 @@ Devuelve un JSON con la siguiente estructura:
decision = data.get("decision", "safe").lower()
reasoning = data.get("reasoning", "")
blocking_response = data.get(
"blocking_response",
"Lo siento, no puedo ayudarte con esa solicitud 😅"
"blocking_response", "Lo siento, no puedo ayudarte con esa solicitud 😅"
)
if decision == "unsafe":
@@ -148,13 +192,8 @@ Devuelve un JSON con la siguiente estructura:
callback_context.state["guardrail_message"] = "[GUARDRAIL_BLOCKED]"
callback_context.state["guardrail_reasoning"] = reasoning
return LlmResponse(
content=Content(
role="model",
parts=[
Part(text=blocking_response)
]
),
usage_metadata=resp.usage_metadata or None
content=Content(role="model", parts=[Part(text=blocking_response)]),
usage_metadata=resp.usage_metadata or None,
)
callback_context.state["guardrail_blocked"] = False
callback_context.state["guardrail_message"] = "[GUARDRAIL_PASSED]"
@@ -168,9 +207,7 @@ Devuelve un JSON con la siguiente estructura:
content=Content(
role="model",
parts=[
Part(
text="Lo siento, no puedo ayudarte con esa solicitud 😅"
)
Part(text="Lo siento, no puedo ayudarte con esa solicitud 😅")
],
),
interrupted=True,