Files
searchbox/tests/test_engine/test_integration.py
Anibal Angulo cda334a45d Rename package to searchbox
This commit renames the package from vector-search-mcp to searchbox. The
package imports and executable name are updated accordingly.
2025-09-27 01:44:46 +00:00

362 lines
13 KiB
Python

from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from qdrant_client import models
from searchbox.engine import Backend, get_engine
from searchbox.models import Match, MatchAny, MatchExclude, SearchRow
class TestEngineIntegration:
    """Integration tests for the complete engine workflow."""

    @pytest.fixture
    def mock_complete_engine_setup(self):
        """Set up a fully mocked engine environment.

        Patches the Qdrant settings and async client so no real server is
        required, and primes ``client.search`` with a realistic three-document
        response. Yields the mocks so individual tests can inspect calls or
        override the canned response.
        """
        with (
            patch(
                "searchbox.engine.qdrant_engine.Settings"
            ) as mock_settings_class,
            patch(
                "searchbox.engine.qdrant_engine.AsyncQdrantClient"
            ) as mock_client_class,
        ):
            # Setup settings
            mock_settings = MagicMock()
            mock_settings.url = "http://localhost:6333"
            mock_settings.api_key = "test_api_key"
            mock_settings_class.return_value = mock_settings
            # Setup client with realistic response
            mock_client = AsyncMock()
            mock_client.search.return_value = [
                models.ScoredPoint(
                    id="doc_1",
                    score=0.95,
                    payload={
                        "text": "Advanced Python programming techniques for data science",
                        "category": "programming",
                        "language": "python",
                        "difficulty": "advanced",
                        "tags": ["python", "data-science", "machine-learning"],
                    },
                    version=1,
                ),
                models.ScoredPoint(
                    id="doc_2",
                    score=0.87,
                    payload={
                        "text": "Rust systems programming for performance-critical applications",
                        "category": "programming",
                        "language": "rust",
                        "difficulty": "intermediate",
                        "tags": ["rust", "systems", "performance"],
                    },
                    version=1,
                ),
                models.ScoredPoint(
                    id="doc_3",
                    score=0.78,
                    payload={
                        "text": "Introduction to machine learning with Python",
                        "category": "programming",
                        "language": "python",
                        "difficulty": "beginner",
                        "tags": ["python", "machine-learning", "tutorial"],
                    },
                    version=1,
                ),
            ]
            mock_client_class.return_value = mock_client
            yield {
                "settings": mock_settings,
                "client": mock_client,
                "settings_class": mock_settings_class,
                "client_class": mock_client_class,
            }

    @pytest.mark.asyncio
    async def test_complete_semantic_search_workflow(self, mock_complete_engine_setup):
        """Test the complete workflow from factory to results."""
        mocks = mock_complete_engine_setup
        # 1. Create engine through factory
        engine = get_engine(Backend.QDRANT)
        # 2. Prepare search parameters
        query_vector = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
        collection_name = "programming_docs"
        search_conditions = [
            Match(key="category", value="programming"),
            MatchAny(key="language", any=["python", "rust"]),
            MatchExclude(key="difficulty", exclude=["expert"]),
        ]
        # 3. Execute semantic search
        results = await engine.semantic_search(
            embedding=query_vector,
            collection=collection_name,
            limit=5,
            conditions=search_conditions,
            threshold=0.7,
        )
        # 4. Verify the complete flow
        # Check that client.search was called with correct parameters
        client_mock = mocks["client"]
        client_mock.search.assert_called_once()
        call_args = client_mock.search.call_args
        assert call_args[1]["collection_name"] == collection_name
        assert call_args[1]["query_vector"] == query_vector
        assert call_args[1]["limit"] == 5
        assert call_args[1]["score_threshold"] == 0.7
        assert call_args[1]["with_payload"] is True
        assert call_args[1]["with_vectors"] is False
        # Verify conditions were transformed to Qdrant filter
        qdrant_filter = call_args[1]["query_filter"]
        assert isinstance(qdrant_filter, models.Filter)
        assert len(qdrant_filter.must) == 3
        # Check individual conditions
        conditions = qdrant_filter.must
        # Match condition
        match_condition = next(c for c in conditions if c.key == "category")
        assert isinstance(match_condition.match, models.MatchValue)
        assert match_condition.match.value == "programming"
        # MatchAny condition
        match_any_condition = next(c for c in conditions if c.key == "language")
        assert isinstance(match_any_condition.match, models.MatchAny)
        assert match_any_condition.match.any == ["python", "rust"]
        # MatchExclude condition
        match_exclude_condition = next(c for c in conditions if c.key == "difficulty")
        assert isinstance(match_exclude_condition.match, models.MatchExcept)
        # 5. Verify results transformation
        assert isinstance(results, list)
        assert len(results) == 3
        assert all(isinstance(result, SearchRow) for result in results)
        # Check first result
        assert results[0].chunk_id == "doc_1"
        assert results[0].score == 0.95
        assert (
            results[0].payload["text"]
            == "Advanced Python programming techniques for data science"
        )
        assert results[0].payload["category"] == "programming"
        # Check second result
        assert results[1].chunk_id == "doc_2"
        assert results[1].score == 0.87
        assert results[1].payload["language"] == "rust"
        # Check third result
        assert results[2].chunk_id == "doc_3"
        assert results[2].score == 0.78
        assert results[2].payload["difficulty"] == "beginner"

    @pytest.mark.asyncio
    async def test_search_with_no_conditions(self, mock_complete_engine_setup):
        """Test semantic search without any conditions."""
        engine = get_engine(Backend.QDRANT)
        results = await engine.semantic_search(
            embedding=[0.1, 0.2, 0.3], collection="test_collection"
        )
        # Verify no filter was applied
        client_mock = mock_complete_engine_setup["client"]
        call_args = client_mock.search.call_args
        assert call_args[1]["query_filter"] is None
        # Results should still be transformed
        assert len(results) == 3
        assert all(isinstance(result, SearchRow) for result in results)

    @pytest.mark.asyncio
    async def test_search_with_empty_conditions(self, mock_complete_engine_setup):
        """Test semantic search with empty conditions list."""
        engine = get_engine(Backend.QDRANT)
        results = await engine.semantic_search(
            embedding=[0.1, 0.2, 0.3], collection="test_collection", conditions=[]
        )
        # Verify no filter was applied (empty list should behave like None)
        client_mock = mock_complete_engine_setup["client"]
        call_args = client_mock.search.call_args
        assert call_args[1]["query_filter"] is None
        assert len(results) == 3

    @pytest.mark.asyncio
    async def test_search_filters_null_payloads(self, mock_complete_engine_setup):
        """Test that results with null payloads are filtered out."""
        # Override the mock response to include null payload
        client_mock = mock_complete_engine_setup["client"]
        client_mock.search.return_value = [
            models.ScoredPoint(
                id="valid_1",
                score=0.95,
                payload={"text": "Valid document"},
                version=1,
            ),
            models.ScoredPoint(
                id="invalid",
                score=0.90,
                payload=None,  # This should be filtered out
                version=1,
            ),
            models.ScoredPoint(
                id="valid_2",
                score=0.85,
                payload={"text": "Another valid document"},
                version=1,
            ),
        ]
        engine = get_engine(Backend.QDRANT)
        results = await engine.semantic_search(
            embedding=[0.1, 0.2, 0.3], collection="test_collection"
        )
        # Should only have 2 results (null payload filtered out)
        assert len(results) == 2
        assert results[0].chunk_id == "valid_1"
        assert results[1].chunk_id == "valid_2"

    @pytest.mark.asyncio
    async def test_error_propagation_from_client(self, mock_complete_engine_setup):
        """Test that client errors are properly propagated."""
        # Make the client raise an exception
        client_mock = mock_complete_engine_setup["client"]
        client_mock.search.side_effect = Exception("Qdrant connection timeout")
        engine = get_engine(Backend.QDRANT)
        with pytest.raises(Exception, match="Qdrant connection timeout"):
            await engine.semantic_search(
                embedding=[0.1, 0.2, 0.3], collection="test_collection"
            )

    @pytest.mark.asyncio
    async def test_search_with_named_vector(self, mock_complete_engine_setup):
        """Test semantic search with NamedVector instead of regular vector."""
        engine = get_engine(Backend.QDRANT)
        named_vector = models.NamedVector(
            name="text_embedding", vector=[0.1, 0.2, 0.3, 0.4, 0.5]
        )
        results = await engine.semantic_search(
            embedding=named_vector,  # type: ignore - Testing duck typing
            collection="test_collection",
        )
        # Verify named vector was passed through unchanged
        client_mock = mock_complete_engine_setup["client"]
        call_args = client_mock.search.call_args
        assert call_args[1]["query_vector"] == named_vector
        assert len(results) == 3

    @pytest.mark.asyncio
    async def test_search_parameter_defaults(self, mock_complete_engine_setup):
        """Test that default parameters are applied correctly."""
        engine = get_engine(Backend.QDRANT)
        await engine.semantic_search(
            embedding=[0.1, 0.2, 0.3], collection="test_collection"
        )
        client_mock = mock_complete_engine_setup["client"]
        call_args = client_mock.search.call_args
        # Check defaults
        assert call_args[1]["limit"] == 10  # default limit
        assert call_args[1]["score_threshold"] is None  # default threshold
        assert call_args[1]["query_filter"] is None  # default conditions
        assert call_args[1]["with_payload"] is True
        assert call_args[1]["with_vectors"] is False

    @pytest.mark.asyncio
    async def test_multiple_engine_instances_independence(
        self, mock_complete_engine_setup
    ):
        """Test that multiple engine instances work independently."""
        # Create two engines
        engine1 = get_engine(Backend.QDRANT)
        engine2 = get_engine(Backend.QDRANT)
        # Verify they are the same instance due to caching
        assert engine1 is engine2
        # Both should work with the same instance
        results1 = await engine1.semantic_search(
            embedding=[0.1, 0.2, 0.3], collection="collection1"
        )
        results2 = await engine2.semantic_search(
            embedding=[0.4, 0.5, 0.6], collection="collection2"
        )
        assert len(results1) == 3
        assert len(results2) == 3
        # Verify client was called twice (same instance, multiple calls)
        client_mock = mock_complete_engine_setup["client"]
        assert client_mock.search.call_count == 2

    @pytest.mark.asyncio
    async def test_large_result_set_handling(self, mock_complete_engine_setup):
        """Test handling of large result sets."""
        # Create a large mock response with decreasing scores
        large_response = [
            models.ScoredPoint(
                id=f"doc_{i}",
                score=0.9 - (i * 0.001),  # Decreasing scores
                payload={"text": f"Document {i}", "index": i},
                version=1,
            )
            for i in range(100)
        ]
        client_mock = mock_complete_engine_setup["client"]
        client_mock.search.return_value = large_response
        engine = get_engine(Backend.QDRANT)
        results = await engine.semantic_search(
            embedding=[0.1, 0.2, 0.3], collection="large_collection", limit=100
        )
        # Should handle all 100 results
        assert len(results) == 100
        assert results[0].chunk_id == "doc_0"
        assert results[0].score == 0.9
        assert results[99].chunk_id == "doc_99"
        # 0.9 - (99 * 0.001) is computed in binary floating point, so compare
        # approximately rather than with exact equality, which is fragile.
        assert results[99].score == pytest.approx(0.801)

    def test_engine_type_consistency(self):
        """Test that engine types are consistent across multiple calls."""
        with (
            patch("searchbox.engine.qdrant_engine.Settings"),
            patch("searchbox.engine.qdrant_engine.AsyncQdrantClient"),
        ):
            engines = [get_engine(Backend.QDRANT) for _ in range(5)]
            # All should be the same instance due to caching
            assert all(engine is engines[0] for engine in engines)
            # All should be QdrantEngine instances
            from searchbox.engine.qdrant_engine import QdrantEngine

            assert all(isinstance(engine, QdrantEngine) for engine in engines)