Complete OpenRouter integration skill pack with 30 skills covering LLM routing, model selection, cost optimization, and multi-provider orchestration. Flagship+ tier vendor pack.
Installation
Open Claude Code and run this command:
/plugin install openrouter-pack@claude-code-plugins-plus
Use --global to install for all projects, or --project for current project only.
Skills (30)
Implement audit logging for OpenRouter API calls.
OpenRouter Audit Logging
Overview
Every OpenRouter API call returns a generation ID and metadata that enables comprehensive audit logging. The generation endpoint (GET /api/v1/generation?id=) provides exact cost, token counts, provider used, and latency -- data that the initial response doesn't always include. This skill covers structured logging, cost tracking, PII redaction, and compliance-ready audit trails.
Core: Generation Metadata Retrieval
import os, json, time, hashlib, logging
from datetime import datetime, timezone
from dataclasses import dataclass, asdict
from typing import Optional
import requests
from openai import OpenAI
log = logging.getLogger("openrouter.audit")
@dataclass
class AuditEntry:
timestamp: str
generation_id: str
model_requested: str
model_used: str # Actual model served (may differ with fallbacks)
prompt_tokens: int
completion_tokens: int
total_cost: float
latency_ms: float
status: str # "success" | "error" | "timeout"
user_id: str
prompt_hash: str # SHA-256 of prompt (not raw content)
error_code: Optional[str] = None
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={
"HTTP-Referer": "https://my-app.com",
"X-Title": "my-app",
},
)
def audited_completion(
messages: list[dict],
model: str = "anthropic/claude-3.5-sonnet",
user_id: str = "system",
**kwargs,
) -> tuple:
"""Make a completion request with full audit logging."""
prompt_text = json.dumps(messages)
prompt_hash = hashlib.sha256(prompt_text.encode()).hexdigest()[:16]
start = time.monotonic()
status = "success"
error_code = None
response = None  # Guard: referenced in the finally block even when the call raises
try:
response = client.chat.completions.create(
model=model, messages=messages, **kwargs
)
except Exception as e:
status = "error"
error_code = type(e).__name__
raise
finally:
latency = (time.monotonic() - start) * 1000
# Fetch exact cost from generation endpoint (only when a response exists)
gen_data = {}
if response is not None:
try:
gen = requests.get(
f"https://openrouter.ai/api/v1/generation?id={response.id}",
headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
timeout=5,
).json()
gen_data = gen.get("data", {})
except Exception:
log.warning(f"Failed to fetch generation metadata for {response.id}")
entry = AuditEntry(
timestamp=datetime.now(timezone.utc).isoformat(),
generation_id=response.id if response else "",
model_requested=model,
model_used=response.model if response else "",
prompt_tokens=gen_data.get("tokens_prompt", 0),
completion_tokens=gen_data.get("tokens_completion", 0),
total_cost=gen_data.get("total_cost", 0.0),
latency_ms=round(latency, 1),
status=status,
user_id=user_id,
prompt_hash=prompt_hash,
error_code=error_code,
)
log.info(json.dumps(asdict(entry)))
return response, entry
Implement caching for OpenRouter API responses to reduce cost and latency.
OpenRouter Caching Strategy
Overview
OpenRouter charges per token, so caching identical or similar requests can dramatically cut costs. Deterministic requests (temperature=0) with the same model and messages produce identical outputs -- these are safe to cache. This skill covers in-memory caching, persistent caching with TTL, and Anthropic prompt caching via OpenRouter.
In-Memory Cache
import os, hashlib, json, time
from typing import Optional
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
class LLMCache:
def __init__(self, ttl_seconds: int = 3600):
self._cache: dict[str, tuple[dict, float]] = {}
self._ttl = ttl_seconds
self.hits = 0
self.misses = 0
def _key(self, model: str, messages: list, **kwargs) -> str:
blob = json.dumps({"model": model, "messages": messages, **kwargs}, sort_keys=True)
return hashlib.sha256(blob.encode()).hexdigest()
def get(self, model: str, messages: list, **kwargs) -> Optional[dict]:
k = self._key(model, messages, **kwargs)
if k in self._cache:
data, ts = self._cache[k]
if time.time() - ts < self._ttl:
self.hits += 1
return data
del self._cache[k]
self.misses += 1
return None
def set(self, model: str, messages: list, response: dict, **kwargs):
k = self._key(model, messages, **kwargs)
self._cache[k] = (response, time.time())
cache = LLMCache(ttl_seconds=1800)
def cached_completion(messages, model="anthropic/claude-3.5-sonnet", **kwargs):
"""Only cache deterministic requests (temperature=0)."""
kwargs.setdefault("temperature", 0)
kwargs.setdefault("max_tokens", 1024)
cached = cache.get(model, messages, **kwargs)
if cached:
return cached
response = client.chat.completions.create(model=model, messages=messages, **kwargs)
result = {
"content": response.choices[0].message.content,
"model": response.model,
"usage": {"prompt": response.usage.prompt_tokens, "completion": response.usage.completion_tokens},
}
cache.set(model, messages, result, **kwargs)
return result
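Anthropic prompt caching (mentioned in the overview) works by marking a large, stable prefix as cacheable so repeated requests reuse it. A minimal sketch, assuming OpenRouter passes Anthropic's cache_control field through in multipart message content (the file name is illustrative; verify the field format against current docs):
# Anthropic prompt caching via OpenRouter: mark a large stable prefix as cacheable.
big_document = open("contract.txt").read()  # hypothetical large, reused context
response = client.chat.completions.create(
    model="anthropic/claude-3.5-sonnet",
    max_tokens=500,
    messages=[
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You analyze legal contracts."},
                {
                    "type": "text",
                    "text": big_document,
                    "cache_control": {"type": "ephemeral"},  # Reused across requests at reduced cost
                },
            ],
        },
        {"role": "user", "content": "Summarize the termination clauses."},
    ],
)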
Persistent Cache with Redis
import redis, json, hashlib
r = redis.Redis(host="localhost", port=6379, db=0)
def redis_cached_completion(messages, model="openai/gpt-4o-mini", ttl=3600, **kwargs):
"""Cache in Redis with automatic TTL expiry."""
kwargs["temperatDiagnose and fix common OpenRouter API errors.
OpenRouter Common Errors
Overview
OpenRouter returns standard HTTP error codes plus OpenRouter-specific error codes in the response body. The most common: 401 (auth), 402 (credits), 429 (rate limit), 400 (bad request), and 5xx (upstream provider errors). Each error includes a code field and a human-readable message. This skill covers every common error, its root cause, and the exact fix.
Complete Error Reference
| HTTP | Error Code | Cause | Fix |
|---|---|---|---|
| 400 | bad_request | Malformed request body | Validate the messages array format; ensure the model ID includes the provider prefix |
| 400 | invalid_model | Model ID not found | Check the model exists: curl -s https://openrouter.ai/api/v1/models \| jq '.data[].id' |
| 400 | context_length_exceeded | Prompt + max_tokens exceeds the model limit | Reduce prompt size or use a larger-context model |
| 400 | invalid_tool_schema | Tool definition uses unsupported types | Use basic JSON Schema types only (string, number, boolean, object, array) |
| 401 | invalid_api_key | Key malformed, revoked, or wrong | Regenerate at openrouter.ai/keys; key must start with sk-or-v1- |
| 401 | missing_api_key | No Authorization header | Add an Authorization: Bearer sk-or-v1-... header |
| 402 | insufficient_credits | Credit balance is zero | Top up at openrouter.ai/credits |
| 402 | credit_limit_reached | Per-key credit limit hit | Increase the key limit in the dashboard or create a new key |
| 403 | key_disabled | Key was disabled by an admin | Re-enable in the dashboard or create a new key |
| 408 | request_timeout | Model took too long | Reduce max_tokens; use streaming; try a faster model |
| 429 | rate_limit_exceeded | Too many requests per interval | SDK auto-retries; increase max_retries; use multiple keys |
| 502 | provider_error | Upstream provider returned an error | Retry with backoff; try a different provider via provider.order |
| 503 | model_unavailable | Model temporarily offline | Use fallback models; check status.openrouter.ai |
Review OpenRouter integration for regulatory compliance (SOC2, GDPR, HIPAA).
OpenRouter Compliance Review
Overview
OpenRouter is a proxy that routes requests to upstream providers (OpenAI, Anthropic, Google, etc.). Compliance depends on both OpenRouter's data handling and the selected provider's policies. Key considerations: data transit through OpenRouter infrastructure, provider-specific data retention, model selection for regulated data, and audit trail requirements.
Compliance Checklist
COMPLIANCE_CHECKLIST = {
"data_handling": [
"Verify OpenRouter does NOT train on your data (confirmed in their privacy policy)",
"Confirm provider-level data policies (OpenAI, Anthropic, Google each differ)",
"Document data flow: your app -> OpenRouter -> provider -> OpenRouter -> your app",
"Identify if prompts contain PII, PHI, or regulated data",
"Implement PII redaction before sending to API",
],
"access_control": [
"Use per-service API keys (not shared keys)",
"Set credit limits per key to isolate blast radius",
"Rotate keys on a 90-day schedule",
"Store keys in secrets manager (not .env files in repos)",
"Enable management keys for programmatic key provisioning",
],
"audit_trail": [
"Log every API call with generation_id, model, user_id, cost",
"Hash prompts (SHA-256) instead of logging raw content",
"Retain audit logs per regulation (90d operational, 7yr financial)",
"Ship logs to append-only storage (S3, immutable DB)",
],
"provider_selection": [
"Route regulated data only to compliant providers",
"Use provider routing to exclude non-compliant providers",
"Document which models are approved for which data classifications",
"Test that fallback routing doesn't route to unapproved providers",
],
}
Provider Routing for Compliance
import os
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
# Route ONLY to specific providers (e.g., Anthropic for SOC2)
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet",
messages=[{"role": "user", "content": "Analyze this contract..."}],
max_tokens=2048,
extra_body={
"provider": {
"order": ["Anthropic"], # Only Anthropic's infrastructure
"allow_fallbacks": False, # Do NOT fall Optimize context window usage for OpenRouter models to reduce cost and improve quality.
OpenRouter Context Optimization
Overview
OpenRouter models have varying context windows (4K to 1M+ tokens). Since pricing is per-token, stuffing unnecessary context wastes money and can degrade output quality. This skill covers context window lookup, token estimation, conversation trimming, chunking strategies, and Anthropic prompt caching for large contexts.
Query Context Limits
# Check context window for specific models
curl -s https://openrouter.ai/api/v1/models | jq '[.data[] | select(
.id == "anthropic/claude-3.5-sonnet" or
.id == "openai/gpt-4o" or
.id == "google/gemini-2.0-flash-001" or
.id == "meta-llama/llama-3.1-70b-instruct"
) | {id, context_length, prompt_per_M: ((.pricing.prompt|tonumber)*1000000)}]'
Context-Aware Model Selection
import os, requests
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
# Cache model metadata at startup
MODELS = {m["id"]: m for m in requests.get("https://openrouter.ai/api/v1/models").json()["data"]}
def estimate_tokens(text: str) -> int:
"""Rough estimate: 1 token ~ 4 characters for English text."""
return len(text) // 4
def select_model_for_context(messages: list, preferred: str = "anthropic/claude-3.5-sonnet") -> str:
"""Pick a model that fits the context, falling back to larger windows."""
estimated_tokens = sum(len(m.get("content", "")) for m in messages) // 4
FALLBACK_CHAIN = [
("openai/gpt-4o-mini", 128_000),
("anthropic/claude-3.5-sonnet", 200_000),
("google/gemini-2.0-flash-001", 1_000_000),
]
# Try preferred model first
preferred_ctx = MODELS.get(preferred, {}).get("context_length", 0)
if estimated_tokens < preferred_ctx * 0.8: # 80% safety margin
return preferred
for model_id, ctx in FALLBACK_CHAIN:
if estimated_tokens < ctx * 0.8:
return model_id
raise ValueError(f"Content too large ({estimated_tokens} est. tokens)")
Conversation Trimming
def trim_conversation(
messages: list[dict],
max_tokens: int = 100_000,
keep_system: bool = True,
keep_last_n: int = 4,
) -> list[dict]:
"""Trim conversation history to fit context window.
Strategy: Keep system prompt + last N messages.
If still too large, reduce to last 2 messages.
"""
system = [m for m in messages if m["role"] == "system"] if keep_system else []
rest = [m for m in messages if m["role"] != "system"]
trimmed = system + rest[-keep_last_n:]
if sum(len(m.get("content", "")) for m in trimmed) // 4 > max_tokens:
trimmed = system + rest[-2:] # Still too large: keep only the last exchange
return trimmed
Implement cost controls for OpenRouter API usage.
OpenRouter Cost Controls
Overview
OpenRouter provides per-key credit limits, a credit balance API, and per-generation cost queries. Combined with client-side budget middleware, you can enforce hard spending caps at the key level and soft caps in your application. This skill covers key-level limits, per-request cost tracking, budget enforcement middleware, and alert systems.
Check Credit Balance
# Current balance and limits
curl -s https://openrouter.ai/api/v1/auth/key \
-H "Authorization: Bearer $OPENROUTER_API_KEY" | jq '{
credits_used: .data.usage,
credit_limit: .data.limit,
remaining: ((.data.limit // 0) - .data.usage),
is_free_tier: .data.is_free_tier,
rate_limit: .data.rate_limit
}'
Per-Key Credit Limits
import os, requests
MGMT_KEY = os.environ["OPENROUTER_MGMT_KEY"] # Management key
# Create a key with a $50 credit limit
resp = requests.post(
"https://openrouter.ai/api/v1/keys",
headers={"Authorization": f"Bearer {MGMT_KEY}"},
json={"name": "backend-prod", "limit": 50.0},
)
new_key = resp.json()["data"]["key"] # sk-or-v1-...
# List all keys with their limits and usage
keys = requests.get(
"https://openrouter.ai/api/v1/keys",
headers={"Authorization": f"Bearer {MGMT_KEY}"},
).json()
for k in keys.get("data", []):
print(f"{k['name']}: ${k.get('usage', 0):.4f} / ${k.get('limit', 'unlimited')}")
Budget Enforcement Middleware
import os, time, requests
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
class BudgetEnforcer:
"""Client-side budget enforcement with server-side cost verification."""
def __init__(self, daily_limit: float = 10.0, per_request_limit: float = 0.50):
self.daily_limit = daily_limit
self.per_request_limit = per_request_limit
self._daily_spend = 0.0
self._day = time.strftime("%Y-%m-%d")
def _reset_if_new_day(self):
today = time.strftime("%Y-%m-%d")
if today != self._day:
self._daily_spend = 0.0
self._day = today
def estimate_cost(self, model: str, prompt_tokens: int, max_tokens: int) -> float:
"""Pre-flight cost estimate using cached pricing."""
# Representative rates (fetch from /models in production)
RATES = {
"anthropic/claude-3.5-sonnet": (3.0, 15.0), # per 1M tokens
Implement data privacy controls for OpenRouter API usage.
OpenRouter Data Privacy
Overview
When sending data through OpenRouter to upstream LLM providers, you're responsible for ensuring prompts don't leak PII inappropriately. OpenRouter itself does not train on API data, but each upstream provider has its own data retention and training policies. This skill covers PII detection and redaction, placeholder substitution, provider selection for privacy, and consent tracking.
PII Detection and Redaction
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class PiiScanResult:
clean_text: str
findings: list[dict]
has_pii: bool
PII_RULES = [
("email", r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
("phone", r'\b(?:\+1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'),
("ssn", r'\b\d{3}-\d{2}-\d{4}\b'),
("credit_card", r'\b(?:\d{4}[- ]?){3}\d{4}\b'),
("api_key", r'\bsk-or-v1-[a-zA-Z0-9]+\b'),
("ip_address", r'\b(?:\d{1,3}\.){3}\d{1,3}\b'),
]
REPLACEMENTS = {
"email": "[EMAIL]", "phone": "[PHONE]", "ssn": "[SSN]",
"credit_card": "[CARD]", "api_key": "[API_KEY]", "ip_address": "[IP]",
}
def scan_and_redact(text: str) -> PiiScanResult:
"""Scan text for PII and return redacted version with findings."""
findings = []
clean = text
for pii_type, pattern in PII_RULES:
matches = re.findall(pattern, clean)
for match in matches:
findings.append({"type": pii_type, "value_prefix": match[:4] + "..."})
clean = re.sub(pattern, REPLACEMENTS[pii_type], clean)
return PiiScanResult(clean_text=clean, findings=findings, has_pii=len(findings) > 0)
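A quick usage check on illustrative input:
result = scan_and_redact("Contact jane@example.com or call 555-123-4567")
print(result.clean_text)   # "Contact [EMAIL] or call [PHONE]"
print(result.has_pii)      # True
print(result.findings)     # [{"type": "email", ...}, {"type": "phone", ...}]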
Placeholder Substitution Pattern
import os, uuid
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
class PrivacyProxy:
"""Replace PII with placeholders before API, restore after."""
def __init__(self):
self._map: dict[str, str] = {}
def anonymize(self, text: str) -> str:
"""Replace PII with unique placeholders."""
result = scan_and_redact(text)
if not result.has_pii:
return text
# Use deterministic placeholders for consistent replacement
anonymized = text
for pii_type, pattern in PII_RULES:
for match in re.finditer(pattern, anonymized):
original = match.group(0)
# Deterministic per value: the same PII string always maps to the same placeholder
placeholder = f"[{pii_type.upper()}_{uuid.uuid5(uuid.NAMESPACE_DNS, original).hex[:6]}]"
self._map[placeholder] = original
anonymized = anonymized.replace(original, placeholder)
return anonymized
def deanonymize(self, text: str) -> str:
"""Restore original PII values in the API response."""
for placeholder, original in self._map.items():
text = text.replace(placeholder, original)
return text
Create debug bundles for troubleshooting OpenRouter API issues.
OpenRouter Debug Bundle
Current State
!node --version 2>/dev/null || echo 'N/A'
!python3 --version 2>/dev/null || echo 'N/A'
Overview
When an OpenRouter request fails or returns unexpected results, you need a structured debug bundle: the exact request, response, headers, generation metadata, and environment info. The generation ID (gen-* prefix in response.id) is the key correlator -- it lets you look up exact cost, provider used, and latency via GET /api/v1/generation?id=.
Quick Debug: curl
# Send a request and capture full response with headers
curl -v https://openrouter.ai/api/v1/chat/completions \
-H "Authorization: Bearer $OPENROUTER_API_KEY" \
-H "Content-Type: application/json" \
-H "HTTP-Referer: https://my-app.com" \
-H "X-Title: debug-test" \
-d '{
"model": "openai/gpt-4o-mini",
"messages": [{"role": "user", "content": "Say hello"}],
"max_tokens": 50
}' -o /tmp/openrouter-body.json 2>/tmp/openrouter-headers.txt
# Extract generation ID from the response body (-v header output is captured separately)
GEN_ID=$(jq -r '.id' /tmp/openrouter-body.json)
echo "Generation ID: $GEN_ID"
# Look up generation metadata (exact cost, provider, latency)
curl -s "https://openrouter.ai/api/v1/generation?id=$GEN_ID" \
-H "Authorization: Bearer $OPENROUTER_API_KEY" | jq '.data | {
model: .model,
total_cost: .total_cost,
tokens_prompt: .tokens_prompt,
tokens_completion: .tokens_completion,
generation_time: .generation_time,
provider: .provider_name
}'
Python Debug Bundle Generator
import os, json, time, platform, sys
from datetime import datetime, timezone
from dataclasses import dataclass, asdict
from typing import Optional
from openai import OpenAI, APIError
import requests as http_requests
@dataclass
class DebugBundle:
timestamp: str
generation_id: Optional[str]
request_model: str
request_messages: list
request_params: dict
response_status: str
response_model: Optional[str]
response_content: Optional[str]
error_type: Optional[str]
error_message: Optional[str]
error_code: Optional[int]
latency_ms: float
generation_metadata: Optional[dict]
environment: dict
def to_json(self) -> str:
return json.dumps(asdict(self), indent=2)
def save(self, path: str = "debug_bundle.json"):
with open(path, "w") as f:
f.write(self.to_json())
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
Configure automatic model fallbacks for high availability on OpenRouter.
OpenRouter Fallback Config
Overview
OpenRouter supports native model fallbacks: pass multiple model IDs and OpenRouter tries each in order until one succeeds. You can also use provider.order to control which provider serves a specific model. This skill covers native fallbacks, provider routing, client-side fallback chains, and timeout configuration.
Native Model Fallback (Server-Side)
import os
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
# Pass multiple models -- OpenRouter tries each in order
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet", # Primary (used for param validation)
messages=[{"role": "user", "content": "Explain recursion"}],
max_tokens=500,
extra_body={
"models": [
"anthropic/claude-3.5-sonnet",
"openai/gpt-4o",
"google/gemini-2.0-flash-001",
],
"route": "fallback", # Try in order until one succeeds
},
)
# Check which model actually served the request
print(f"Served by: {response.model}")
Provider Fallback (Same Model, Different Providers)
# Route to specific providers in priority order
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet",
messages=[{"role": "user", "content": "Hello"}],
max_tokens=200,
extra_body={
"provider": {
"order": ["Anthropic", "AWS Bedrock", "GCP Vertex"],
"allow_fallbacks": True, # Fall to next provider if first fails
},
},
)
Client-Side Fallback Chain
import logging
from openai import OpenAI, APIError, APITimeoutError
log = logging.getLogger("openrouter.fallback")
FALLBACK_CHAIN = [
{"model": "anthropic/claude-3.5-sonnet", "timeout": 30.0, "label": "primary"},
{"model": "openai/gpt-4o", "timeout": 25.0, "label": "secondary"},
{"model": "openai/gpt-4o-mini", "timeout": 15.0, "label": "budget-fallback"},
{"model": "google/gemini-2.0-flash-001", "timeout": 15.0, "label": "last-resort"},
]
def resilient_completion(messages: list[dict], max_tokens: int = 1024, **kwargs):
"""Try each model in the fallback chain until one succeeds.&qImplement function/tool calling with OpenRouter models.
OpenRouter Function Calling
Overview
OpenRouter supports OpenAI-compatible tool/function calling across multiple providers. Define tools as JSON Schema, send them with your request, and the model returns structured tool_calls instead of free text. This works with GPT-4o, Claude 3.5, Gemini, and other tool-capable models via the same API. The key difference from direct provider APIs: OpenRouter normalizes the tool calling interface, so the same code works across providers.
Basic Tool Calling
import os, json
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
# Define tools with JSON Schema
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get current weather for a location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City name"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
},
{
"type": "function",
"function": {
"name": "search_database",
"description": "Search the product database",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"},
"limit": {"type": "integer", "default": 10},
},
"required": ["query"],
},
},
},
]
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet", # Also works with openai/gpt-4o, etc.
messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
tools=tools,
tool_choice="auto", # "auto" | "required" | "none" | {"type":"function","function":{"name":"..."}}
max_tokens=1024,
)
message = response.choices[0].message
if message.tool_calls:
for tc in message.tool_calls:
print(f"Function: {tc.function.name}")
print(f"Args: {json.loads(tc.functSend your first OpenRouter API request and understand the response.
OpenRouter Hello World
Overview
Send a minimal chat completion request through OpenRouter, understand the response format, try different models, and verify the full round-trip works. All requests go to the single endpoint POST https://openrouter.ai/api/v1/chat/completions.
Minimal Request (cURL)
curl -s https://openrouter.ai/api/v1/chat/completions \
-H "Authorization: Bearer $OPENROUTER_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "google/gemma-2-9b-it:free",
"messages": [{"role": "user", "content": "Say hello in three languages"}],
"max_tokens": 100
}' | jq .
Response Format
{
"id": "gen-abc123xyz",
"model": "google/gemma-2-9b-it:free",
"object": "chat.completion",
"created": 1711234567,
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "Hello! Bonjour! Hola!"
},
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 12,
"completion_tokens": 8,
"total_tokens": 20
}
}
Key fields:
- id (gen-...) -- use this to query generation stats via GET /api/v1/generation?id=gen-abc123xyz
- model -- confirms which model actually served the request
- usage -- token counts for cost calculation
- finish_reason -- stop (complete), length (hit max_tokens), tool_calls (function call)
Python Example
from openai import OpenAI
import os
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://your-app.com", "X-Title": "My App"},
)
# Basic completion
response = client.chat.completions.create(
model="google/gemma-2-9b-it:free",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is OpenRouter in one sentence?"},
],
max_tokens=100,
)
print(response.choices[0].message.content)
print(f"Model: {response.model}")
print(f"Tokens: {response.usage.prompt_tokens} prompt + {response.usage.completion_tokens} completion")
TypeScript Example
import OpenAI from "openai";
const client = new OpenAI({
baseURL: "https://openrouter.ai/api/v1",
apiKey: process.env.OPENROUTER_API_KEY,
defaultHeaders: { "HTTP-Referer": "https://your-app.com", "X-Title": "My App" },
});
const response = await client.chat.completions.create({
model: "google/gemma-2-9b-it:free",
messages: [{ role: "user", content: "What is OpenRouter in one sentence?" }],
max_tokens: 100,
});
console.log(response.choices[0].message.content);
Set up OpenRouter API authentication and configure API keys.
OpenRouter Install & Auth
Overview
Set up OpenRouter API credentials, configure the OpenAI-compatible client, verify authentication, and check credit balance. OpenRouter keys start with sk-or-v1- and authenticate against https://openrouter.ai/api/v1.
Prerequisites
- OpenRouter account (free at openrouter.ai)
- Python 3.8+ or Node.js 18+
- OpenAI SDK (pip install openai or npm install openai)
Quick Setup
1. Generate an API Key
- Go to openrouter.ai/keys
- Click Create Key and name it (e.g., my-app-dev)
- Copy the sk-or-v1-... value immediately (shown only once)
- Optionally set a credit limit on the key for spend control
2. Configure Environment
# .env file (add .env to .gitignore!)
OPENROUTER_API_KEY=sk-or-v1-your-key-here
# Or export directly
export OPENROUTER_API_KEY="sk-or-v1-your-key-here"
3. Initialize the Client
Python:
import os
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={
"HTTP-Referer": "https://your-app.com", # For analytics attribution
"X-Title": "Your App Name", # Shows in dashboard
},
)
TypeScript:
import OpenAI from "openai";
const client = new OpenAI({
baseURL: "https://openrouter.ai/api/v1",
apiKey: process.env.OPENROUTER_API_KEY,
defaultHeaders: {
"HTTP-Referer": "https://your-app.com",
"X-Title": "Your App Name",
},
});
4. Verify Authentication
# Quick auth + credit check
import os, requests
resp = requests.get(
"https://openrouter.ai/api/v1/auth/key",
headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
)
data = resp.json()["data"]
print(f"Key: {data['label']}")
print(f"Credits used: ${data['usage']:.4f}")
print(f"Credit limit: ${data.get('limit', 'unlimited')}")
print(f"Free tier: {data['is_free_tier']}")
print(f"Rate limit: {data['rate_limit']['requests']} req / {data['rate_limit']['interval']}")
5. Send a Test Request
response = client.chat.completions.create(
model="google/gemma-2-9b-it:free", # Free model for testing
messages=[{"role": "userAvoid common OpenRouter integration mistakes and gotchas.
OpenRouter Known Pitfalls
Overview
A curated list of real-world mistakes developers make when integrating OpenRouter, each with the specific API behavior that causes the problem and the exact fix. These are not theoretical -- they come from production incidents and support requests.
Pitfall 1: Missing Provider Prefix on Model ID
# WRONG: Model ID without provider prefix
response = client.chat.completions.create(
model="gpt-4o", # ← Will fail with 400 "model not found"
messages=[{"role": "user", "content": "Hello"}],
)
# RIGHT: Always include provider/model format
response = client.chat.completions.create(
model="openai/gpt-4o", # ← Correct
messages=[{"role": "user", "content": "Hello"}],
)
Pitfall 2: No max_tokens = Runaway Costs
# WRONG: No max_tokens -- model may generate 4000+ tokens
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet", # $15/1M completion tokens
messages=[{"role": "user", "content": "Write a story"}],
# No max_tokens → could generate $0.06+ per request
)
# RIGHT: Always set max_tokens
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet",
messages=[{"role": "user", "content": "Write a story"}],
max_tokens=500, # ← Caps cost at ~$0.0075
)
Pitfall 3: Hardcoded Model IDs Break When Models Are Renamed
# WRONG: Hardcoded model ID scattered across codebase
# When "claude-3-opus" becomes "claude-3-opus-20240229", everything breaks
# RIGHT: Centralize model IDs in config
MODELS = {
"primary": "anthropic/claude-3.5-sonnet",
"budget": "openai/gpt-4o-mini",
"free": "google/gemma-2-9b-it:free",
}
# Validate at startup
import requests
available = {m["id"] for m in requests.get("https://openrouter.ai/api/v1/models").json()["data"]}
for name, model_id in MODELS.items():
if model_id not in available:
print(f"WARNING: {name} model '{model_id}' not available!")
Pitfall 4: Fallbacks Route to Unexpected Providers
# WRONG: Default allow_fallbacks=True without controlling which providers
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet",
messages=[{"role": "user", "content": sensitive_data}],
# OpenRouter might fall back to a different provider you didn't approve
)
# RIGHT: Control fallback behavior explicitly
response = client.chat.completions.create(
model="anthropic/claude-3.5Distribute OpenRouter requests across multiple keys and models for high throughput.
OpenRouter Load Balancing
Overview
A single OpenRouter API key has rate limits (requests/minute and tokens/minute). To scale beyond those limits, distribute requests across multiple keys. OpenRouter also provides server-side load balancing via provider routing and the :nitro variant for low-latency inference. This skill covers multi-key rotation, health-based routing, circuit breakers, and concurrent request patterns.
Multi-Key Round Robin
import os, itertools, time, logging
from openai import OpenAI, RateLimitError
from dataclasses import dataclass, field
log = logging.getLogger("openrouter.lb")
@dataclass
class KeyPool:
"""Round-robin API key pool with health tracking."""
keys: list[str]
_cycle: itertools.cycle = field(init=False, repr=False)
_health: dict[str, dict] = field(init=False, default_factory=dict)
def __post_init__(self):
self._cycle = itertools.cycle(self.keys)
self._health = {k: {"errors": 0, "last_error": 0, "healthy": True} for k in self.keys}
def next_key(self) -> str:
"""Get next healthy key."""
attempts = 0
while attempts < len(self.keys):
key = next(self._cycle)
h = self._health[key]
# Recover after 60s cooldown
if not h["healthy"] and time.time() - h["last_error"] > 60:
h["healthy"] = True
h["errors"] = 0
if h["healthy"]:
return key
attempts += 1
# All keys unhealthy -- return any and hope for the best
return next(self._cycle)
def mark_error(self, key: str):
h = self._health[key]
h["errors"] += 1
h["last_error"] = time.time()
if h["errors"] >= 3: # Circuit breaker: 3 errors → unhealthy
h["healthy"] = False
log.warning(f"Key {key[:12]}... marked unhealthy after {h['errors']} errors")
def mark_success(self, key: str):
self._health[key]["errors"] = 0
self._health[key]["healthy"] = True
pool = KeyPool(keys=[
os.environ.get("OPENROUTER_KEY_1", ""),
os.environ.get("OPENROUTER_KEY_2", ""),
os.environ.get("OPENROUTER_KEY_3", ""),
])
def balanced_completion(messages, model="anthropic/claude-3.5-sonnet", **kwargs):
"""Send request using next healthy key from the pool."""
key = pool.next_key()
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=key,
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
try:
response = client.chat.completions.create(model=model, messages=messages, **kwargs)
pool.mark_success(key)
return response
except RateLimitError:
pool.mark_error(key)
raise
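To push throughput further, fan requests out across the key pool with a thread pool; a minimal sketch building on balanced_completion above (prompts and worker count are illustrative):
from concurrent.futures import ThreadPoolExecutor

def parallel_balanced(prompts: list[str], max_workers: int = 8) -> list[str]:
    """Run many completions concurrently, each drawing a key from the pool."""
    def one(prompt: str) -> str:
        response = balanced_completion(
            [{"role": "user", "content": prompt}], max_tokens=256
        )
        return response.choices[0].message.content
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(one, prompts))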
Monitor OpenRouter model availability and implement health checks.
OpenRouter Model Availability
Overview
OpenRouter's /api/v1/models endpoint is the source of truth for model availability. Models can be temporarily unavailable, have degraded performance, or be permanently removed. This skill covers querying model status, building health probes, tracking availability over time, and automating failover.
Query Model Status
# Check if specific models exist and their status
curl -s https://openrouter.ai/api/v1/models | jq '[.data[] | select(
.id == "anthropic/claude-3.5-sonnet" or
.id == "openai/gpt-4o" or
.id == "openai/gpt-4o-mini"
) | {
id,
context_length,
prompt_per_M: ((.pricing.prompt | tonumber) * 1000000),
completion_per_M: ((.pricing.completion | tonumber) * 1000000)
}]'
# List all available models (just IDs)
curl -s https://openrouter.ai/api/v1/models | jq '[.data[].id] | sort'
# Count models by provider
curl -s https://openrouter.ai/api/v1/models | jq '[.data[].id | split("/")[0]] | group_by(.) | map({provider: .[0], count: length}) | sort_by(-.count)'
Health Check Service
import os, time, logging
from datetime import datetime, timezone
from dataclasses import dataclass
import requests
from openai import OpenAI, APIError, APITimeoutError
log = logging.getLogger("openrouter.health")
@dataclass
class HealthStatus:
model: str
available: bool
latency_ms: float
checked_at: str
error: str = ""
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
timeout=15.0,
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "health-check"},
)
def probe_model(model_id: str) -> HealthStatus:
"""Send a minimal request to test model availability."""
start = time.monotonic()
try:
response = client.chat.completions.create(
model=model_id,
messages=[{"role": "user", "content": "hi"}],
max_tokens=1, # Minimal cost
)
latency = (time.monotonic() - start) * 1000
return HealthStatus(
model=model_id, available=True, latency_ms=round(latency, 1),
checked_at=datetime.now(timezone.utc).isoformat(),
)
except (APIError, APITimeoutError) as e:
latency = (time.monotonic() - start) * 1000
return HealthStatus(
model=model_id, available=False, latency_ms=round(latency, 1),
checked_at=datetime.now(timezone.utc).isoformat(),
error=str(e),
)
def check_critical_models() -> list[HealthStatus]:
"""Probe all critical models."""
CRITICAL_MODELS = [
"anthropic/claude-3.5-soQuery, filter, and select from OpenRouter's 400+ model catalog.
OpenRouter Model Catalog
Overview
Query the GET /api/v1/models endpoint to browse 400+ models, filter by capabilities, compare pricing, and check provider endpoints. No API key required for the models endpoint.
List All Models
# Full catalog (no auth required)
curl -s https://openrouter.ai/api/v1/models | jq '.data | length'
# → 400+
# Filter to models that support tool calling
curl -s "https://openrouter.ai/api/v1/models?supported_parameters=tools" | jq '.data | length'
Model Object Shape
{
"id": "anthropic/claude-3.5-sonnet",
"name": "Claude 3.5 Sonnet",
"description": "Anthropic's most intelligent model...",
"context_length": 200000,
"pricing": {
"prompt": "0.000003",
"completion": "0.000015",
"image": "0.0048",
"request": "0"
},
"top_provider": {
"context_length": 200000,
"max_completion_tokens": 8192,
"is_moderated": false
},
"per_request_limits": null,
"architecture": {
"modality": "text+image->text",
"tokenizer": "Claude",
"instruct_type": null
}
}
Key fields:
- pricing.prompt / pricing.completion -- cost per token (not per million; multiply by 1M for readable rates)
- context_length -- max input tokens
- top_provider.max_completion_tokens -- max output tokens
- architecture.modality -- text->text, text+image->text, etc.
Python: Query and Filter
import requests
models = requests.get("https://openrouter.ai/api/v1/models").json()["data"]
# Find all free models
free_models = [m for m in models if m["pricing"]["prompt"] == "0"]
print(f"Free models: {len(free_models)}")
# Models with tool calling support
# (query with supported_parameters)
tool_models = requests.get(
"https://openrouter.ai/api/v1/models?supported_parameters=tools"
).json()["data"]
print(f"Tool-calling models: {len(tool_models)}")
# Sort by prompt price (cheapest first, excluding free)
paid = [m for m in models if float(m["pricing"]["prompt"]) > 0]
paid.sort(key=lambda m: float(m["pricing"]["prompt"]))
for m in paid[:10]:
cost_per_m = float(m["pricing"]["prompt"]) * 1_000_000
print(f" ${cost_per_m:.2f}/M tokens — {m['id']} ({m['context_length']//1000}K ctx)")
# Filter by context length (128K+)
large_ctx = [m for m in models if (m.get("context_length") or 0) >= 128_000]
print(f"128K+ context models: {len(large_ctx)}")
Implement intelligent model routing to optimize cost, quality, and latency on OpenRouter.
OpenRouter Model Routing
Overview
OpenRouter gives you access to 400+ models through one API. The key to cost efficiency is routing each request to the right model based on task complexity, required capabilities, cost budget, and latency requirements. This skill covers task-based routing, complexity classification, cost-aware selection, and OpenRouter's native routing features.
Task-Based Router
import os, re
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
# Model tiers by cost and capability
MODELS = {
"free": "google/gemma-2-9b-it:free", # $0/0 — testing only
"budget": "meta-llama/llama-3.1-8b-instruct", # $0.06/$0.06 per 1M
"mid": "openai/gpt-4o-mini", # $0.15/$0.60 per 1M
"standard":"anthropic/claude-3.5-sonnet", # $3/$15 per 1M
"premium": "openai/o1", # $15/$60 per 1M
}
TASK_ROUTING = {
"classification": "budget", # Simple label assignment
"translation": "mid", # Moderate quality needed
"summarization": "mid", # Good quality, cost-effective
"code_generation": "standard", # Needs high accuracy
"code_review": "standard", # Needs reasoning
"analysis": "standard", # Complex reasoning
"creative_writing":"standard", # Quality matters
"deep_reasoning": "premium", # Multi-step logic
"simple_qa": "budget", # Basic questions
"chat": "mid", # General conversation
}
def route_request(task_type: str, messages: list[dict], **kwargs) -> dict:
"""Route to appropriate model based on task type."""
tier = TASK_ROUTING.get(task_type, "mid")
model = MODELS[tier]
response = client.chat.completions.create(
model=model, messages=messages, **kwargs
)
return {
"content": response.choices[0].message.content,
"model": response.model,
"tier": tier,
"tokens": response.usage.prompt_tokens + response.usage.completion_tokens,
}
Complexity-Based Auto-Router
def classify_complexity(prompt: str) -> str:
"""Classify prompt complexity to select model tier.
Simple heuristics -- replace with a trained classifier for production.
"""
word_count = len(prompt.split())
reasoning_markers = ("prove", "derive", "step by step", "analyze", "architecture")
code_markers = ("def ", "class ", "function", "refactor", "bug")
if any(k in prompt.lower() for k in reasoning_markers) or word_count > 500:
return "premium" if word_count > 1000 else "standard"
if any(k in prompt.lower() for k in code_markers):
return "standard"
return "budget" if word_count < 50 else "mid"
Use multiple AI providers (OpenAI, Anthropic, Google, Meta) through OpenRouter's unified API.
OpenRouter Multi-Provider
Overview
OpenRouter's unified API lets you access models from OpenAI, Anthropic, Google, Meta, Mistral, and others with a single API key and endpoint. Model IDs use provider/model-name format. The same OpenAI SDK code works for any provider by simply changing the model ID. This skill covers provider comparison, cross-provider routing, feature normalization, and BYOK (Bring Your Own Key).
Provider Landscape
# List all providers and their model counts
curl -s https://openrouter.ai/api/v1/models | jq '
[.data[].id | split("/")[0]] |
group_by(.) | map({provider: .[0], models: length}) |
sort_by(-.models)'
Cross-Provider Comparison
import os, time, json
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
def compare_models(prompt: str, models: list[str], max_tokens: int = 500) -> list[dict]:
"""Run the same prompt across multiple models and compare results."""
results = []
for model in models:
start = time.monotonic()
try:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
max_tokens=max_tokens,
temperature=0,
)
latency = (time.monotonic() - start) * 1000
results.append({
"model": model,
"served_by": response.model,
"content": response.choices[0].message.content[:200] + "...",
"tokens": response.usage.prompt_tokens + response.usage.completion_tokens,
"latency_ms": round(latency, 1),
"status": "ok",
})
except Exception as e:
results.append({"model": model, "status": "error", "error": str(e)})
return results
# Compare top-tier models on the same task
results = compare_models(
"Explain the CAP theorem in distributed systems",
models=[
"anthropic/claude-3.5-sonnet", # Anthropic
"openai/gpt-4o", # OpenAI
"google/gemini-2.0-flash-001", # Google
"meta-llama/llama-3.1-70b-instruct", # Meta (open-source)
],
)
for r in results:
print(f"{r['model']}: {r.get('latency_ms', 'N/A')}ms, {r.get('tokens', 'N/A')} tokens")
Model ID Mapping (OpenAI Direct → OpenRouter)
| OpenAI Direct | OpenRouter ID |
|---|---|
| gpt-4o | openai/gpt-4o |
| gpt-4o-mini | openai/gpt-4o-mini |
| gpt-4-turbo | openai/gpt-4-turbo |
| o1 | openai/o1 |
| o1-mini | openai/o1-mini |
You also gain access to non-OpenAI models through the same SDK:
# Same client, any provider
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet", # Anthropic
messages=[{"role": "user", "content": "Hello"}],
)
response = client.chat.completions.create(
model="google/gemini-2.0-flash", # Google
messages=[{"role": "user&Optimize OpenRouter request latency and throughput.
OpenRouter Performance Tuning
Overview
OpenRouter adds minimal overhead (~50-100ms) to direct provider calls. Most latency comes from the upstream model. Key levers: model selection (smaller = faster), streaming (lower TTFT), parallel requests, prompt size reduction, and provider routing to faster infrastructure. This skill covers benchmarking, streaming optimization, concurrent processing, and connection tuning.
Benchmark Latency
import os, time, statistics
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
def benchmark_model(model: str, prompt: str = "Say hello", n: int = 5) -> dict:
"""Benchmark a model's latency over N requests."""
latencies = []
for _ in range(n):
start = time.monotonic()
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
max_tokens=50,
)
latencies.append((time.monotonic() - start) * 1000)
return {
"model": model,
"p50_ms": round(statistics.median(latencies)),
"p95_ms": round(sorted(latencies)[int(len(latencies) * 0.95)]),
"avg_ms": round(statistics.mean(latencies)),
"min_ms": round(min(latencies)),
"max_ms": round(max(latencies)),
}
# Compare fast vs slow models
for model in ["openai/gpt-4o-mini", "anthropic/claude-3-haiku", "anthropic/claude-3.5-sonnet"]:
result = benchmark_model(model)
print(f"{result['model']}: p50={result['p50_ms']}ms p95={result['p95_ms']}ms")
Streaming for Lower TTFT
def stream_completion(messages, model="openai/gpt-4o-mini", **kwargs):
"""Stream response for lower time-to-first-token."""
start = time.monotonic()
first_token_time = None
full_content = []
stream = client.chat.completions.create(
model=model, messages=messages, stream=True,
stream_options={"include_usage": True}, # Get token counts at end
**kwargs,
)
for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
if first_token_time is None:
first_token_time = (time.monotonic() - start) * 1000
full_content.append(chunk.choices[0].delta.content)
total_time = (time.monotonic() - start) * 1000
return {
"content": "".join(full_content),
"ttft_ms": round(first_token_time or 0),
"toUnderstand OpenRouter pricing, calculate costs, and optimize spend.
OpenRouter Pricing Basics
Overview
OpenRouter charges per token with separate rates for prompt (input) and completion (output) tokens. Prices are listed per token in the models API (multiply by 1M for per-million rates). Credits are prepaid with a 5.5% processing fee ($0.80 minimum). Free models are available for testing and low-volume use.
How Pricing Works
- Buy credits at openrouter.ai/credits (5.5% fee, $0.80 minimum)
- Each request deducts (prompt_tokens × prompt_rate) + (completion_tokens × completion_rate)
- Check balance via GET /api/v1/auth/key or the dashboard
- Auto-topup is available to prevent service interruption
Query Model Pricing
# Get pricing for all models
curl -s https://openrouter.ai/api/v1/models | jq '.data[] | select(.id == "anthropic/claude-3.5-sonnet") | {
id: .id,
prompt_per_M: ((.pricing.prompt | tonumber) * 1000000),
completion_per_M: ((.pricing.completion | tonumber) * 1000000),
context: .context_length
}'
# → { "id": "anthropic/claude-3.5-sonnet", "prompt_per_M": 3, "completion_per_M": 15, "context": 200000 }
Cost Tiers (Representative)
| Tier | Example Model | Prompt/1M | Completion/1M | Use Case |
|---|---|---|---|---|
| Free | google/gemma-2-9b-it:free | $0.00 | $0.00 | Testing, prototyping |
| Budget | meta-llama/llama-3.1-8b-instruct | $0.06 | $0.06 | Simple Q&A, classification |
| Mid | openai/gpt-4o-mini | $0.15 | $0.60 | General purpose |
| Standard | anthropic/claude-3.5-sonnet | $3.00 | $15.00 | Complex reasoning, code |
| Premium | openai/o1 | $15.00 | $60.00 | Deep reasoning |
Calculate Request Cost
def estimate_cost(model_id: str, prompt_tokens: int, completion_tokens: int) -> float:
"""Calculate cost for a single request."""
import requests
models = requests.get("https://openrouter.ai/api/v1/models").json()["data"]
model = next((m for m in models if m["id"] == model_id), None)
if not model:
raise ValueError(f"Model {model_id} not found")
prompt_rate = float(model["pricing"]["prompt"]) # Cost per token
completion_rate = float(model["pricing"]["completion"])
return (prompt_tokens * prompt_rate) + (completion_tokens * completion_rate)
Validate production readiness of your OpenRouter integration.
OpenRouter Production Checklist
Overview
A comprehensive production readiness checklist for OpenRouter integrations covering security, reliability, observability, cost management, and operational procedures. Each item includes the specific API endpoint or configuration needed to verify compliance.
Security Checklist
SECURITY = {
"api_key_storage": {
"check": "API keys stored in secrets manager (not .env files on disk)",
"verify": "grep -r 'sk-or-v1-' --include='*.py' --include='*.ts' . | grep -v node_modules",
"pass": "Zero matches",
},
"key_rotation": {
"check": "Keys rotated on 90-day schedule",
"verify": "Check key creation dates in OpenRouter dashboard",
"api": "GET /api/v1/keys (management key)",
},
"credit_limits": {
"check": "Per-key credit limits set to isolate blast radius",
"verify": "curl -s https://openrouter.ai/api/v1/auth/key -H 'Authorization: Bearer $KEY' | jq '.data.limit'",
"pass": "Non-null limit value",
},
"secret_scanning": {
"check": "CI pipeline includes secret scanning (gitleaks, trufflehog)",
"verify": "Check CI config for secret scanning step",
},
"https_enforced": {
"check": "All requests use https://openrouter.ai/api/v1",
"verify": "Grep codebase for 'http://openrouter' (should be zero)",
},
}
Reliability Checklist
RELIABILITY = {
"fallback_models": {
"check": "Fallback chain configured for critical models",
"config": """extra_body={"models": ["primary", "secondary", "tertiary"], "route": "fallback"}""",
},
"retry_logic": {
"check": "Retry with exponential backoff for 429 and 5xx errors",
"config": "OpenAI SDK max_retries=3 (built-in backoff)",
},
"timeouts": {
"check": "Per-request timeout configured",
"config": "OpenAI(timeout=30.0) # 30s per request",
},
"circuit_breaker": {
"check": "Circuit breaker on primary model (3 failures → fallback)",
"verify": "Review client wrapper for circuit breaker pattern",
},
"max_tokens": {
"check": "max_tokens set on EVERY request",
"verify": &qUnderstand and handle OpenRouter rate limits.
OpenRouter Rate Limits
Overview
OpenRouter rate limits are per-key, not per-account. Free tier keys get lower limits; paid keys get higher limits that scale with credit balance. The OpenAI SDK has built-in retry with exponential backoff for 429 responses. Check your current limits via GET /api/v1/auth/key. Rate limit headers are returned on every response.
Check Your Rate Limits
# Query current rate limit configuration for your key
curl -s https://openrouter.ai/api/v1/auth/key \
-H "Authorization: Bearer $OPENROUTER_API_KEY" | jq '{
label: .data.label,
rate_limit: .data.rate_limit,
is_free_tier: .data.is_free_tier,
credits_used: .data.usage,
credit_limit: .data.limit
}'
# Example output:
# {
# "label": "my-app-prod",
# "rate_limit": {"requests": 200, "interval": "10s"},
# "is_free_tier": false,
# "credits_used": 12.34,
# "credit_limit": 100
# }
Rate Limit Tiers
| Tier | Requests | Interval | Who |
|---|---|---|---|
| Free (no credits) | 20 | 10s | New accounts |
| Free (with credits) | 200 | 10s | Accounts with any credits |
| Paid | Higher | Varies | Based on credit balance |
Free models have separate limits: 50 req/day (free users), 1000 req/day (with $10+ credits).
Read Rate Limit Headers
import os
from openai import OpenAI
import requests as http_requests
# The OpenAI SDK abstracts headers, so use requests for direct access
def check_rate_headers():
"""Make a request and inspect rate limit headers."""
resp = http_requests.post(
"https://openrouter.ai/api/v1/chat/completions",
headers={
"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
"Content-Type": "application/json",
"HTTP-Referer": "https://my-app.com",
},
json={
"model": "openai/gpt-4o-mini",
"messages": [{"role": "user", "content": "hi"}],
"max_tokens": 1,
},
)
return {
"status": resp.status_code,
"x-ratelimit-limit": resp.headers.get("x-ratelimit-limit"),
"x-ratelimit-remaining": resp.headers.get("x-ratelimit-remaining"),
"x-ratelimit-reset": resp.headers.get("x-ratelimit-reset"),
"retry-after": resp.headers.get("retry-after"),
}
Retry Strategy with OpenAI SDK
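The SDK retries 429 and transient 5xx responses with exponential backoff when max_retries is set; a minimal configuration sketch (retry counts and timeouts are illustrative):
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
    max_retries=3,   # Built-in exponential backoff on 429/5xx
    timeout=30.0,    # Cap each attempt so retries don't stack unbounded
)

# Per-request override for latency-sensitive paths
fast = client.with_options(max_retries=1, timeout=10.0)
response = fast.chat.completions.create(
    model="openai/gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
    max_tokens=20,
)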
Design production architectures using OpenRouter as the LLM gateway.
OpenRouter Reference Architecture
Overview
OpenRouter serves as a unified LLM gateway, abstracting provider complexity. A production architecture wraps it with caching, rate limiting, cost controls, observability, and async processing. This skill provides three reference architectures: simple (single service), standard (microservice), and enterprise (event-driven).
Architecture 1: Simple (Single Service)
┌─────────────┐ ┌──────────────────────────┐ ┌──────────────┐
│ Your App │────▶│ OpenRouter Client │────▶│ OpenRouter │
│ │ │ - Retry (SDK built-in) │ │ /api/v1 │
│ │◀────│ - Cost tracking │◀────│ │
│ │ │ - Structured logging │ └──────────────┘
└─────────────┘ └──────────────────────────┘
import os, logging
from openai import OpenAI
log = logging.getLogger("llm")
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
max_retries=3,
timeout=30.0,
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
def complete(prompt, model="openai/gpt-4o-mini", **kwargs):
kwargs.setdefault("max_tokens", 1024)
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
**kwargs,
)
log.info(f"[{response.model}] {response.usage.prompt_tokens}+{response.usage.completion_tokens} tokens")
return response.choices[0].message.content
Architecture 2: Standard (Microservice)
┌─────────────┐ ┌─────────────────────┐ ┌──────────────┐
│ API Gateway│────▶│ AI Service │────▶│ OpenRouter │
│ (auth, │ │ ┌─────────────┐ │ │ /api/v1 │
│ rate-limit│ │ │ Router │ │ └──────────────┘
│ logging) │ │ │ (task→model)│ │
└─────────────┘ │ └─────────────┘ │
│ ┌─────────────┐ │
│ │ Cache │◀──▶│── Redis
│ │ (TTL-based) │ │
│ └─────────────┘ │
│ ┌─────────────┐ │
│ │ Budget │◀──▶│── SQLite/Postgres
│ │ Enforcer │ │
│ └─────────────┘ │
└─────────────────────┘
from fastapi import FastAPI, Depends, HTTPException
from pydantic import BaseModel
app = FastAPI()
class CompletionRequest(BaseModel):
prompt: str
task_type: str = "general" # classification, code, analysis, etc.
max_tokens: int = 1024
user_id: str = "anonymous"
ROUTING_TABLE = {
"classification": "openaDefine custom routing rules for OpenRouter requests based on user tier, task type, cost budget, and availability.
OpenRouter Routing Rules
Overview
Beyond simple task-based model selection, production systems need configurable routing rules that consider user tier, cost budget, time of day, model availability, and feature requirements. This skill covers building a rules engine for OpenRouter model selection with config-driven rules, dynamic conditions, and override capabilities.
Rules Engine
import os, json, time
from dataclasses import dataclass
from typing import Optional, Callable
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
@dataclass
class RoutingContext:
user_tier: str = "free" # "free" | "basic" | "pro" | "enterprise"
task_type: str = "general" # "chat" | "code" | "analysis" | "classification"
budget_remaining: float = 0.0 # Remaining daily budget in dollars
prompt_tokens_est: int = 0 # Estimated prompt tokens
needs_tools: bool = False # Requires function calling
needs_vision: bool = False # Requires image input
max_latency_ms: int = 30000 # Latency SLA
@dataclass
class RoutingRule:
name: str
priority: int # Lower = higher priority
condition: Callable[[RoutingContext], bool]
model: str
fallbacks: Optional[list[str]] = None
max_tokens: int = 1024
def matches(self, ctx: RoutingContext) -> bool:
try:
return self.condition(ctx)
except Exception:
return False
# Define rules in priority order
RULES = [
# Rule 1: Free users get free models only
RoutingRule(
name="free-tier",
priority=1,
condition=lambda ctx: ctx.user_tier == "free",
model="google/gemma-2-9b-it:free",
fallbacks=["meta-llama/llama-3.1-8b-instruct"],
max_tokens=512,
),
# Rule 2: Low budget → cheap models
RoutingRule(
name="low-budget",
priority=2,
condition=lambda ctx: ctx.budget_remaining < 1.0 and ctx.user_tier != "enterprise",
model="openai/gpt-4o-mini",
fallbacks=["meta-llama/llama-3.1-8b-instruct"],
max_tokens=512,
),
# Rule 3: Tool calling required → tool-capable models
RoutingRule(
name="tools-required",
priority=3,
condition=lambda ctx: ctx.needs_tools,
model="openai/gpt-4o",
fallbacks=["anthropic/claude-3.5-sonnet"],
),
# Rule 4: Vision required
RoutingRule(
name="vision-required",
priority=4,
condition=lambda ctx: ctx.needs_vision,
model="openai/gpt-4o",
fallbacks=["anthropic/claude-3.5-sonnet"],
),
]
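A minimal resolver that evaluates these rules in priority order (a sketch; the catch-all default model is an assumption):
def select_route(ctx: RoutingContext) -> RoutingRule:
    """Return the highest-priority rule whose condition matches."""
    for rule in sorted(RULES, key=lambda r: r.priority):
        if rule.matches(ctx):
            return rule
    # No rule matched: fall back to a general-purpose default
    return RoutingRule(
        name="default", priority=99,
        condition=lambda ctx: True,
        model="openai/gpt-4o-mini",
    )

rule = select_route(RoutingContext(user_tier="pro", task_type="code", budget_remaining=5.0))
print(rule.name, rule.model)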
Build reusable OpenRouter client wrappers with retries, typing, and middleware.
OpenRouter SDK Patterns
Overview
Build production-grade OpenRouter client wrappers using the OpenAI SDK. The OpenAI Python/TypeScript SDKs work natively with OpenRouter by changing base_url to https://openrouter.ai/api/v1. This skill covers typed wrappers, retry strategies, middleware, and reusable patterns.
Python: Production Client Wrapper
import os, time, hashlib, json, logging
from dataclasses import dataclass
from typing import Optional
from openai import OpenAI, APIError, RateLimitError, APITimeoutError
log = logging.getLogger("openrouter")
@dataclass
class CompletionResult:
content: str
model: str
prompt_tokens: int
completion_tokens: int
generation_id: str
latency_ms: float
class OpenRouterClient:
def __init__(
self,
api_key: Optional[str] = None,
app_name: str = "my-app",
app_url: str = "https://my-app.com",
max_retries: int = 3,
timeout: float = 60.0,
):
self.client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=api_key or os.environ["OPENROUTER_API_KEY"],
max_retries=max_retries, # Built-in SDK retry with backoff
timeout=timeout,
default_headers={
"HTTP-Referer": app_url,
"X-Title": app_name,
},
)
self._cache: dict[str, CompletionResult] = {}
def complete(
self,
prompt: str,
model: str = "anthropic/claude-3.5-sonnet",
system: str = "",
max_tokens: int = 1024,
temperature: float = 0.7,
cache: bool = False,
**extra_params,
) -> CompletionResult:
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
# Optional caching (deterministic requests only)
cache_key = None
if cache and temperature == 0:
cache_key = hashlib.sha256(
json.dumps({"model": model, "messages": messages, "max_tokens": max_tokens}).encode()
).hexdigest()
if cache_key in self._cache:
log.debug(f"Cache hit: {cache_key[:12]}")
return self._cache[cache_key]
start = time.monotonic()
response = self.client.chat.completions.create(
model=model,
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
**extra_params,
)
latency = (time.monotonic() - start) * 1000
result = CompletionResult(
content=response.choices[0].message.content or "",
model=response.model,
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
generation_id=response.id,
latency_ms=round(latency, 1),
)
if cache_key:
self._cache[cache_key] = result
return result
Implement streaming responses with OpenRouter for real-time UIs.
OpenRouter Streaming Setup
Overview
OpenRouter supports Server-Sent Events (SSE) streaming via stream: true, compatible with the OpenAI SDK. Streaming returns tokens as they're generated, reducing time-to-first-token (TTFT) from seconds to milliseconds. Usage stats are available via stream_options: {"include_usage": true} in the final chunk. This skill covers Python and TypeScript streaming, SSE forwarding to browsers, and error recovery.
Python: Basic Streaming
import os
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
# Stream with usage stats
stream = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet",
messages=[{"role": "user", "content": "Explain how HTTP streaming works"}],
max_tokens=500,
stream=True,
stream_options={"include_usage": True}, # Get token counts in final chunk
)
full_content = []
for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
token = chunk.choices[0].delta.content
print(token, end="", flush=True)
full_content.append(token)
# Final chunk contains usage stats
if chunk.usage:
print(f"\n---\nTokens: {chunk.usage.prompt_tokens} in + {chunk.usage.completion_tokens} out")
result = "".join(full_content)
Python: Streaming with Metrics
import time
def stream_with_metrics(messages, model="anthropic/claude-3.5-sonnet", **kwargs):
"""Stream response and capture performance metrics."""
start = time.monotonic()
first_token_time = None
chunks = []
usage = None
stream = client.chat.completions.create(
model=model, messages=messages, stream=True,
stream_options={"include_usage": True},
**kwargs,
)
for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
token = chunk.choices[0].delta.content
if first_token_time is None:
first_token_time = (time.monotonic() - start) * 1000
chunks.append(token)
yield token # Yield each token as it arrives
if chunk.usage:
usage = {
"prompt_tokens": chunk.usage.prompt_tokens,
"completion_tokens": chunk.usage.completion_tokens,
}
total_time = (time.monotonic() - start) * 1000
# Metrics available after generator exhausted
stream_with_metrics.last_metrics = {
"ttft_ms": round(first_token_time or 0),
"total_ms": round(toConfigure OpenRouter for multi-user teams with per-user keys, budget controls, and usage attribution.
OpenRouter Team Setup
Overview
OpenRouter supports team usage through per-user API keys with individual credit limits, management keys for programmatic key provisioning, and usage attribution via headers. This skill covers key provisioning, per-user budgets, usage tracking, and governance policies for multi-user deployments.
Key Provisioning via Management API
import os, requests
MGMT_KEY = os.environ["OPENROUTER_MGMT_KEY"] # Management key (cannot call completions)
def create_team_key(name: str, credit_limit: float = 25.0) -> dict:
"""Create a new API key for a team member."""
resp = requests.post(
"https://openrouter.ai/api/v1/keys",
headers={"Authorization": f"Bearer {MGMT_KEY}"},
json={"name": name, "limit": credit_limit},
)
resp.raise_for_status()
data = resp.json()["data"]
return {
"key": data["key"], # sk-or-v1-... (shown once)
"hash": data["key_hash"], # For later identification
"name": name,
"limit": credit_limit,
}
def list_team_keys() -> list[dict]:
"""List all keys with usage and limits."""
resp = requests.get(
"https://openrouter.ai/api/v1/keys",
headers={"Authorization": f"Bearer {MGMT_KEY}"},
)
return [
{
"name": k.get("name"),
"hash": k.get("key_hash"),
"usage": k.get("usage", 0),
"limit": k.get("limit"),
"is_free_tier": k.get("is_free_tier", False),
}
for k in resp.json().get("data", [])
]
def delete_team_key(key_hash: str):
"""Revoke a team member's key."""
resp = requests.delete(
f"https://openrouter.ai/api/v1/keys/{key_hash}",
headers={"Authorization": f"Bearer {MGMT_KEY}"},
)
resp.raise_for_status()
# Provision keys for the team
for member in ["alice-backend", "bob-frontend", "carol-ml"]:
key_info = create_team_key(member, credit_limit=50.0)
print(f"Created key for {member}: {key_info['key'][:20]}...")
Shared Key with User Attribution
from openai import OpenAI
# Alternative: single shared key with user identification via headers
def get_client_for_user(user_id: str) -> OpenAI:
"""Create a client that attributes usage to a specific user."""
return OpenAI(
base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
        default_headers={
            "HTTP-Referer": "https://my-app.com",
            # Assumed completion: fold the user ID into the title header for dashboard attribution
            "X-Title": f"my-app/{user_id}",
        },
    )
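With a shared key, per-request attribution can also use the OpenAI-compatible user field, which OpenRouter accepts on chat completions; treating it as an attribution signal (rather than only an abuse-detection hint) is an assumption here:
client = get_client_for_user("alice")
response = client.chat.completions.create(
    model="openai/gpt-4o-mini",
    messages=[{"role": "user", "content": "Hello"}],
    user="alice",  # OpenAI-compatible end-user ID, passed through to OpenRouter
)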
Migrate to OpenRouter from direct provider APIs or upgrade between SDK/model versions.
OpenRouter Upgrade & Migration
Current State
!npm list openai 2>/dev/null | head -5
!pip show openai 2>/dev/null | head -5
Overview
Migrating to OpenRouter from a direct provider API (OpenAI, Anthropic) is minimal: change base_url and api_key, and add two headers. The OpenAI SDK works natively with OpenRouter. This skill covers migrating from direct APIs, switching between models, upgrading SDK versions, and running comparison tests.
Migration from Direct OpenAI
# BEFORE: Direct OpenAI
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello"}],
max_tokens=200,
)
# AFTER: Via OpenRouter (3 lines changed)
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1", # ← Changed
api_key=os.environ["OPENROUTER_API_KEY"], # ← Changed
default_headers={ # ← Added
"HTTP-Referer": "https://my-app.com",
"X-Title": "my-app",
},
)
response = client.chat.completions.create(
model="openai/gpt-4o", # ← Add provider prefix
messages=[{"role": "user", "content": "Hello"}],
max_tokens=200,
)
Migration from Direct Anthropic
# BEFORE: Direct Anthropic SDK
import anthropic
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
response = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=200,
messages=[{"role": "user", "content": "Hello"}],
)
content = response.content[0].text
# AFTER: Via OpenRouter (using OpenAI SDK instead of Anthropic SDK)
from openai import OpenAI
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={
"HTTP-Referer": "https://my-app.com",
"X-Title": "my-app",
},
)
response = client.chat.completions.create(
model="anthropic/claude-3.5-sonnet", # OpenRouter model ID
messages=[{"role": "user", "content": "Hello"}],
max_tokens=200,
)
content = response.choices[0].message.content # OpenAI response format
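The overview promises comparison tests. A minimal harness for spot-checking output parity across OpenRouter model IDs before switching traffic, reusing the client configured above (the model list and prompt are illustrative):
def compare_models(prompt: str, models: list[str], max_tokens: int = 200):
    """Run one prompt across several OpenRouter model IDs and print outputs side by side."""
    for model in models:
        r = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=0,  # deterministic-ish, for a fairer comparison
        )
        print(f"--- {model} ---\n{r.choices[0].message.content}\n")

compare_models(
    "Summarize the benefits of HTTP/2 in two sentences.",
    ["openai/gpt-4o", "anthropic/claude-3.5-sonnet"],
)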
TypeScript Migration
// BEFORE: Direct OpenAI
import OpenAI from "openai";
const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
// AFTER: Via OpenRouter
const client = new OpenAI({
baseURL: "https://openrouter.ai/api/v1",
  apiKey: process.env.OPENROUTER_API_KEY,
  defaultHeaders: {
    "HTTP-Referer": "https://my-app.com",
    "X-Title": "my-app",
  },
});
Track and analyze OpenRouter API usage patterns, costs, and performance.
OpenRouter Usage Analytics
Overview
OpenRouter provides usage data through three endpoints: GET /api/v1/auth/key (credit balance and rate limits), GET /api/v1/generation?id= (per-request cost and metadata), and response usage fields (token counts). This skill covers collecting metrics from these sources, building analytics pipelines, cost reporting, and performance dashboards.
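Of the three sources, the key endpoint is the simplest to poll. A sketch of a balance check against GET /api/v1/auth/key; handle the returned fields (usage, limit, rate_limit) defensively, since the payload shape can change:
import os
import requests

def get_credit_status() -> dict:
    """Fetch credit usage and rate-limit info for the current API key."""
    resp = requests.get(
        "https://openrouter.ai/api/v1/auth/key",
        headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
        timeout=5,
    )
    resp.raise_for_status()
    return resp.json().get("data", {})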
Collect Per-Request Metrics
import os, time, json, logging
from datetime import datetime, timezone
from openai import OpenAI
import requests as http_requests
log = logging.getLogger("openrouter.analytics")
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTER_API_KEY"],
default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)
def tracked_completion(messages, model="openai/gpt-4o-mini", user_id="system", **kwargs):
"""Make a completion and capture full analytics."""
start = time.monotonic()
response = client.chat.completions.create(
model=model, messages=messages, **kwargs
)
latency = (time.monotonic() - start) * 1000
# Fetch exact cost from generation endpoint
cost = 0.0
try:
gen = http_requests.get(
f"https://openrouter.ai/api/v1/generation?id={response.id}",
headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}"},
timeout=5,
).json()
cost = float(gen.get("data", {}).get("total_cost", 0))
except Exception:
pass
metric = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"generation_id": response.id,
"model_requested": model,
"model_used": response.model,
"prompt_tokens": response.usage.prompt_tokens,
"completion_tokens": response.usage.completion_tokens,
"total_cost": cost,
"latency_ms": round(latency, 1),
"user_id": user_id,
}
log.info(json.dumps(metric))
return response, metric
Analytics Database
import sqlite3
def init_analytics_db(db_path: str = "openrouter_analytics.db"):
conn = sqlite3.connect(db_path)
conn.execute("""
CREATE TABLE IF NOT EXISTS metrics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL,
generation_id TEXT UNIQUE,
model_requested TEXT,
model_used TEXT,
prompt_tokens INTEGER,
completion_tokens INTEGER,
total_cost REAL,
latency_ms REAL,
user_id TEXT
)
""Ready to use openrouter-pack?