diff --git a/src/va_agent/governance.py b/src/va_agent/governance.py index d8ff262..27936e0 100644 --- a/src/va_agent/governance.py +++ b/src/va_agent/governance.py @@ -1,5 +1,6 @@ # ruff: noqa: E501 """GovernancePlugin: Guardrails for VAia, the virtual assistant for VA.""" + import json import logging import re @@ -22,10 +23,56 @@ logger = logging.getLogger(__name__) FORBIDDEN_EMOJIS = [ - "πŸ₯΅","πŸ”ͺ","🎰","🎲","πŸƒ","😀","🀬","😑","😠","🩸","🧨","πŸͺ“","☠️","πŸ’€", - "πŸ’£","πŸ”«","πŸ‘—","πŸ’¦","πŸ‘","πŸ†","πŸ‘„","πŸ‘…","🫦","πŸ’©","βš–οΈ","βš”οΈ","✝️","πŸ•", - "πŸ•Œ","β›ͺ","🍻","🍸","πŸ₯ƒ","🍷","🍺","🚬","πŸ‘Ή","πŸ‘Ί","πŸ‘Ώ","😈","🀑","πŸ§™", - "πŸ§™β€β™€οΈ", "πŸ§™β€β™‚οΈ", "πŸ§›", "πŸ§›β€β™€οΈ", "πŸ§›β€β™‚οΈ", "πŸ”ž","🧿","πŸ’Š" + "πŸ₯΅", + "πŸ”ͺ", + "🎰", + "🎲", + "πŸƒ", + "😀", + "🀬", + "😑", + "😠", + "🩸", + "🧨", + "πŸͺ“", + "☠️", + "πŸ’€", + "πŸ’£", + "πŸ”«", + "πŸ‘—", + "πŸ’¦", + "πŸ‘", + "πŸ†", + "πŸ‘„", + "πŸ‘…", + "🫦", + "πŸ’©", + "βš–οΈ", + "βš”οΈ", + "✝️", + "πŸ•", + "πŸ•Œ", + "β›ͺ", + "🍻", + "🍸", + "πŸ₯ƒ", + "🍷", + "🍺", + "🚬", + "πŸ‘Ή", + "πŸ‘Ί", + "πŸ‘Ώ", + "😈", + "🀑", + "πŸ§™", + "πŸ§™β€β™€οΈ", + "πŸ§™β€β™‚οΈ", + "πŸ§›", + "πŸ§›β€β™€οΈ", + "πŸ§›β€β™‚οΈ", + "πŸ”ž", + "🧿", + "πŸ’Š", ] @@ -37,12 +84,11 @@ class GuardrailOutput(BaseModel): description="Decision for the user prompt", ) reasoning: str | None = Field( - default=None, - description="Optional reasoning for the decision" + default=None, description="Optional reasoning for the decision" ) blocking_response: str | None = Field( default=None, - description="Optional custom blocking response to return to the user if unsafe" + description="Optional custom blocking response to return to the user if unsafe", ) @@ -54,7 +100,7 @@ class GovernancePlugin: self.guardrail_llm = Client( vertexai=True, project=settings.google_cloud_project, - location=settings.google_cloud_location + location=settings.google_cloud_location, ) _guardrail_instruction = """ Eres una capa de seguridad y protecciΓ³n de marca para VAia, el asistente virtual de VA en WhatsApp. @@ -85,9 +131,9 @@ Devuelve un JSON con la siguiente estructura: _schema = GuardrailOutput.model_json_schema() # Force strict JSON output from the guardrail LLM self._guardrail_gen_config = GenerateContentConfig( - system_instruction = _guardrail_instruction, - response_mime_type = "application/json", - response_schema = _schema, + system_instruction=_guardrail_instruction, + response_mime_type="application/json", + response_schema=_schema, max_output_tokens=1000, temperature=0.1, ) @@ -100,13 +146,12 @@ Devuelve un JSON con la siguiente estructura: # Unique pattern that combines all forbidden emojis, including skin tones and compound emojis return re.compile( - rf"{person_pattern}{tone_pattern}\u200d❀️?\u200dπŸ’‹\u200d{person_pattern}{tone_pattern}" # kissers + rf"{person_pattern}{tone_pattern}\u200d❀️?\u200dπŸ’‹\u200d{person_pattern}{tone_pattern}" # kissers rf"|{person_pattern}{tone_pattern}\u200d❀️?\u200d{person_pattern}{tone_pattern}" # lovers rf"|{'|'.join(map(re.escape, sorted(FORBIDDEN_EMOJIS, key=len, reverse=True)))}" # simple emojis rf"|πŸ–•{tone_pattern}" # middle finger with all skin tone variations ) - def _remove_emojis(self, text: str) -> tuple[str, list[str]]: removed = self._combined_pattern.findall(text) text = self._combined_pattern.sub("", text) @@ -139,8 +184,7 @@ Devuelve un JSON con la siguiente estructura: decision = data.get("decision", "safe").lower() reasoning = data.get("reasoning", "") blocking_response = data.get( - "blocking_response", - "Lo siento, no puedo ayudarte con esa solicitud πŸ˜…" + "blocking_response", "Lo siento, no puedo ayudarte con esa solicitud πŸ˜…" ) if decision == "unsafe": @@ -148,13 +192,8 @@ Devuelve un JSON con la siguiente estructura: callback_context.state["guardrail_message"] = "[GUARDRAIL_BLOCKED]" callback_context.state["guardrail_reasoning"] = reasoning return LlmResponse( - content=Content( - role="model", - parts=[ - Part(text=blocking_response) - ] - ), - usage_metadata=resp.usage_metadata or None + content=Content(role="model", parts=[Part(text=blocking_response)]), + usage_metadata=resp.usage_metadata or None, ) callback_context.state["guardrail_blocked"] = False callback_context.state["guardrail_message"] = "[GUARDRAIL_PASSED]" @@ -168,9 +207,7 @@ Devuelve un JSON con la siguiente estructura: content=Content( role="model", parts=[ - Part( - text="Lo siento, no puedo ayudarte con esa solicitud πŸ˜…" - ) + Part(text="Lo siento, no puedo ayudarte con esa solicitud πŸ˜…") ], ), interrupted=True,