Response Cache
Production - Tenant-scoped LLM response caching with cost savings tracking
The Response Cache reduces LLM costs and latency by caching responses for identical or semantically similar queries. Each cache entry is scoped to a tenant and tracks estimated cost savings.
12.6.2.1 Cache Architecture
Implemented in data-plane/ai-service/src/llm/cache/cache_service.py:
class ResponseCacheService:
async def lookup(self, tenant_id, model, messages, temperature, ...):
cache_key = ResponseCacheKey(tenant_id, model, messages, temperature, ...)
cached = await self._cache_manager.llm_cache.get(cache_key.to_llm_cache_key())
if cached:
cost_saved = estimate_cost_saved(model, cached.prompt_tokens, cached.completion_tokens)
return CacheLookupResult(hit=True, content=cached.response, cost_saved_usd=cost_saved)
return CacheLookupResult(hit=False, ...)
async def store_response(self, tenant_id, model, messages, temperature, response_content, ...):
cache_key = ResponseCacheKey(tenant_id, model, messages, temperature, ...)
await self._cache_manager.llm_cache.set(cache_key, entry, ttl)
async def invalidate_for_tenant(self, tenant_id, pattern=""):
await self._cache_manager.llm_cache.clear()
async def get_analytics(self, tenant_id, window_minutes=60):
return await self._store.get_cache_summary(tenant_id, window_minutes)12.6.2.2API Endpoints
# Get cache analytics
curl "http://localhost:8000/api/v1/llm/cache/analytics?tenant_id=acme-corp&window_minutes=60"
# Get hit rate
curl "http://localhost:8000/api/v1/llm/cache/hit-rate?tenant_id=acme-corp"
# Invalidate cache
curl -X POST "http://localhost:8000/api/v1/llm/cache/invalidate?tenant_id=acme-corp"
# Invalidate for tables (data change)
curl -X POST http://localhost:8000/api/v1/llm/cache/invalidate-tables \
-H "Content-Type: application/json" \
-H "X-Tenant-ID: acme-corp" \
  -d '{"table_names": ["orders", "customers"]}'

Example analytics response:
{
"hit_rate": 0.42,
"total_lookups": 3420,
"hits": 1436,
"misses": 1984,
"total_cost_saved_usd": 18.92,
"avg_hit_latency_ms": 12.5,
"avg_miss_latency_ms": 2340.0
}