重要前提
安装AI Skills的关键前提是:必须科学上网,且开启TUN模式,这一点至关重要,直接决定安装能否顺利完成,在此郑重提醒三遍:科学上网,科学上网,科学上网。查看完整安装教程 →
local-llm-router by hoodini/ai-agents-skills
npx skills add https://github.com/hoodini/ai-agents-skills --skill local-llm-router
通过集成 Serena LSP,将 AI 编码查询智能路由到本地 LLM,为安全、支持离线工作的开发环境提供支持。
在使用此技能前,请确保:
# 安装 Serena(必需)
pip install serena
# 或通过 uvx
uvx --from git+https://github.com/oraios/serena serena start-mcp-server
# 验证本地 LLM 服务
curl http://localhost:11434/api/version # Ollama
curl http://localhost:1234/v1/models # LM Studio
curl http://localhost:1337/v1/models # Jan
import httpx
import asyncio
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class TaskCategory(Enum):
    """Categories a user query can be classified into for routing."""
    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"
@dataclass
class RouterConfig:
    """Local LLM router configuration."""
    ollama_url: str = "http://localhost:11434"   # Ollama default endpoint
    lmstudio_url: str = "http://localhost:1234"  # LM Studio default endpoint
    jan_url: str = "http://localhost:1337"       # Jan default endpoint
    serena_enabled: bool = True                  # whether Serena MCP integration is used
    timeout: int = 30                            # request timeout (presumably seconds — confirm at call sites)
async def quick_route(query: str, config: Optional["RouterConfig"] = None):
    """Quick routing example - detect services and route the query.

    Args:
        query: the user query to classify and route.
        config: router configuration; a fresh RouterConfig is created per
            call when omitted.

    Raises:
        RuntimeError: when no local LLM service is reachable.

    Fix vs. original: the default was `config=RouterConfig()`, a mutable
    default evaluated once at definition time and shared by every caller;
    it is now created per call.
    """
    if config is None:
        config = RouterConfig()
    # 1. Detect available services
    services = await discover_services(config)
    if not services:
        raise RuntimeError("没有可用的本地 LLM 服务")
    # 2. Classify the task
    category = classify_task(query)
    # 3. Pick the best model for the task
    model = select_model(category, services)
    # 4. Execute the query
    return await execute_query(query, model, services[0])
# Example usage
async def main():
    """Demo entry point: route one sample coding query and print the reply."""
    response = await quick_route("Write a function to parse JSON safely")
    print(response)


if __name__ == "__main__":
    # Fix vs. original: asyncio.run() was executed unconditionally at
    # module import; guard it so importing this module has no side effects.
    asyncio.run(main())
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
关键:对于所有与代码相关的任务,必须首先调用 Serena MCP。这可以在路由到 LLM 之前提供代码库的语义理解。
import subprocess
import json
from typing import Any
class SerenaMCP:
    """Serena MCP client for code intelligence (JSON-RPC over stdio pipes)."""

    def __init__(self, workspace_root: str):
        self.workspace = workspace_root
        self.process = None
        self._next_id = 0  # monotonically increasing JSON-RPC request id

    async def start(self):
        """Start the Serena MCP server subprocess."""
        self.process = subprocess.Popen(
            ["serena", "start-mcp-server", "--workspace", self.workspace],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

    async def call(self, method: str, params: dict) -> Any:
        """Send one JSON-RPC request and read one response line.

        Fixes vs. original: the request id now increments per call (the
        original hard-coded id=1 for every request, so responses could not
        be correlated per JSON-RPC 2.0), and calling before start() raises
        a clear error instead of AttributeError on None.

        NOTE(review): this does blocking pipe I/O inside an async method
        and assumes strictly in-order request/response — confirm Serena's
        stdio framing before relying on concurrency here.
        """
        if self.process is None:
            raise RuntimeError("SerenaMCP.call() invoked before start()")
        self._next_id += 1
        request = {
            "jsonrpc": "2.0",
            "id": self._next_id,
            "method": method,
            "params": params
        }
        self.process.stdin.write(json.dumps(request).encode() + b"\n")
        self.process.stdin.flush()
        response = self.process.stdout.readline()
        return json.loads(response)

    async def find_symbol(self, name: str) -> dict:
        """Find a symbol definition by name."""
        return await self.call("find_symbol", {"name": name})

    async def get_references(self, file: str, line: int, char: int) -> list:
        """Get all references to the symbol at a position."""
        return await self.call("get_references", {
            "file": file,
            "line": line,
            "character": char
        })

    async def get_hover_info(self, file: str, line: int, char: int) -> dict:
        """Get type/documentation info at a position."""
        return await self.call("get_hover_info", {
            "file": file,
            "line": line,
            "character": char
        })

    async def get_diagnostics(self, file: str) -> list:
        """Get errors/warnings for a file."""
        return await self.call("get_diagnostics", {"file": file})

    async def apply_edit(self, file: str, edits: list) -> bool:
        """Apply code edits to a file."""
        return await self.call("apply_edit", {"file": file, "edits": edits})
# Serena tools ordered by priority (always prefer the higher-priority tool).
# Maps tool name -> {"priority": int (1 = highest), "use_for": [scenarios]}.
SERENA_TOOLS = {
    # Priority 1: symbol-level operations (highest)
    "find_symbol": {"priority": 1, "use_for": ["navigation", "definition"]},
    "get_references": {"priority": 1, "use_for": ["refactoring", "impact analysis"]},
    "get_hover_info": {"priority": 1, "use_for": ["type info", "documentation"]},
    # Priority 2: code navigation
    "go_to_definition": {"priority": 2, "use_for": ["navigation"]},
    "go_to_type_definition": {"priority": 2, "use_for": ["type navigation"]},
    "go_to_implementation": {"priority": 2, "use_for": ["interface impl"]},
    # Priority 3: code understanding
    "get_document_symbols": {"priority": 3, "use_for": ["file structure"]},
    "get_workspace_symbols": {"priority": 3, "use_for": ["codebase search"]},
    "get_call_hierarchy": {"priority": 3, "use_for": ["call analysis"]},
    # Priority 4: code modification
    "apply_edit": {"priority": 4, "use_for": ["editing"]},
    "rename_symbol": {"priority": 4, "use_for": ["refactoring"]},
    # Priority 5: diagnostics
    "get_diagnostics": {"priority": 5, "use_for": ["errors", "warnings"]},
    "get_code_actions": {"priority": 5, "use_for": ["quick fixes"]},
}
async def handle_code_request(
    query: str,
    file_context: Optional[dict] = None,
    serena: SerenaMCP = None,
    router: "LLMRouter" = None
):
    """
    Handle a code request with the Serena-first pattern.

    CRITICAL: for code tasks, Serena is always consulted first.

    Fix vs. original: the edit-application step dereferenced
    file_context["file"] even when file_context was None, raising
    TypeError; it now also requires file_context.
    """
    # Step 1: classify the task
    category = classify_task(query)
    # NOTE(review): classify_task elsewhere in this file returns a
    # ClassificationResult, while the membership test below expects a bare
    # TaskCategory — confirm which contract applies here.
    # Step 2: ALWAYS gather Serena code context when available
    serena_context = {}
    if serena and file_context:
        if file_context.get("file") and file_context.get("position"):
            file = file_context["file"]
            line = file_context["position"]["line"]
            char = file_context["position"]["character"]
            # Hover info (types, docs)
            serena_context["hover"] = await serena.get_hover_info(file, line, char)
            # For refactor/rename requests, collect references as well
            if category in [TaskCategory.ANALYSIS, TaskCategory.CODING]:
                if "refactor" in query.lower() or "rename" in query.lower():
                    serena_context["references"] = await serena.get_references(
                        file, line, char
                    )
            # Always fetch diagnostics for the file
            serena_context["diagnostics"] = await serena.get_diagnostics(file)
    # Step 3: build the enriched prompt with Serena context
    enriched_query = build_enriched_query(query, serena_context)
    # Step 4: select and route to the appropriate LLM
    model = router.select_model(category)
    response = await router.execute(enriched_query, model)
    # Step 5: if the response contains edits, apply them via Serena
    # (guard: a target file is required)
    if serena and file_context and contains_code_edit(response):
        edits = parse_code_edits(response)
        await serena.apply_edit(file_context["file"], edits)
    return response
def build_enriched_query(query: str, serena_context: dict) -> str:
    """Assemble a prompt consisting of the query plus optional Serena
    sections: hover/type info, up to 10 references, up to 5 diagnostics."""
    sections = [query]
    hover = serena_context.get("hover")
    if hover:
        sections.append(f"\n## 类型信息\n```\n{hover}\n```")
    refs = serena_context.get("references")
    if refs:
        sections.append(f"\n## 引用(找到 {len(refs)} 个)\n")
        # Cap at the first 10 references
        sections.extend(f"- {ref['file']}:{ref['line']}" for ref in refs[:10])
    diags = serena_context.get("diagnostics")
    if diags:
        sections.append(f"\n## 当前问题({len(diags)} 个)\n")
        # Cap at the first 5 diagnostics
        sections.extend(
            f"- 第 {diag['line']} 行:{diag['message']}" for diag in diags[:5]
        )
    return "\n".join(sections)
| 服务 | 默认端点 | 健康检查 | 模型端点 | 聊天端点 | API 风格 |
|---|---|---|---|---|---|
| Ollama | localhost:11434 | /api/version | /api/tags | /api/chat | Native |
| LM Studio | localhost:1234 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| Jan | localhost:1337 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| OpenWebUI | localhost:3000 | /api/health | /api/models | /api/chat | Custom |
| LocalAI | localhost:8080 | /readyz | /v1/models | /v1/chat/completions | OpenAI |
| vLLM | localhost:8000 | /health | /v1/models | /v1/chat/completions | OpenAI |
| llama.cpp | localhost:8080 | /health | /v1/models | /v1/chat/completions | OpenAI |
| Kobold.cpp | localhost:5001 | /api/v1/info | /api/v1/models | /api/v1/generate | Custom |
| GPT4All | localhost:4891 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| text-generation-webui | localhost:5000 | /api/v1/model | /api/v1/models | /api/v1/chat | Custom |
import sys
import os
import platform
from dataclasses import dataclass
@dataclass
class OSInfo:
    """Snapshot of the detected OS/runtime environment."""
    platform: str  # normalized: 'windows', 'linux', 'darwin'
    release: str   # OS release string (from platform.release())
    arch: str      # machine architecture, e.g. 'x64', 'arm64'
    is_wsl: bool   # True when running under Windows Subsystem for Linux
    is_container: bool  # True when running inside Docker/Kubernetes
def detect_os() -> OSInfo:
    """Detect the operating system and environment.

    Returns:
        OSInfo with a normalized platform name plus WSL/container
        heuristics.

    Fix vs. original: the /proc probes caught only FileNotFoundError, so
    a PermissionError (possible in locked-down containers) crashed
    detection; they now catch OSError.
    """
    # Normalize sys.platform to 'windows' / 'darwin' / 'linux'
    plat = sys.platform
    if plat == 'win32':
        plat = 'windows'
    elif plat != 'darwin':
        plat = 'linux'
    # WSL detection: kernel version string mentions "microsoft",
    # or the WSL_DISTRO_NAME environment variable is set
    is_wsl = False
    if plat == 'linux':
        try:
            with open('/proc/version', 'r') as f:
                is_wsl = 'microsoft' in f.read().lower()
        except OSError:
            pass
        is_wsl = is_wsl or os.environ.get('WSL_DISTRO_NAME') is not None
    # Container detection: Docker marker file, Kubernetes env var,
    # or docker/kubepods entries in PID 1's cgroup
    is_container = (
        os.path.exists('/.dockerenv') or
        os.environ.get('KUBERNETES_SERVICE_HOST') is not None
    )
    if not is_container and plat == 'linux':
        try:
            with open('/proc/1/cgroup', 'r') as f:
                cgroup = f.read()
            is_container = 'docker' in cgroup or 'kubepods' in cgroup
        except OSError:
            pass
    return OSInfo(
        platform=plat,
        release=platform.release(),
        arch=platform.machine(),
        is_wsl=is_wsl,
        is_container=is_container
    )
def adjust_endpoint_for_os(endpoint: str, os_info: OSInfo) -> str:
    """Rewrite an endpoint for the current environment.

    Inside WSL or a container, services bound to the host's localhost are
    reached through the host alias rather than the local loopback.
    """
    runs_behind_host = os_info.is_wsl or os_info.is_container
    if not runs_behind_host:
        return endpoint
    return endpoint.replace('localhost', 'host.docker.internal')
import httpx
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class DiscoveredModel:
    """A model advertised by a local LLM service."""
    id: str
    name: str
    size: int = 0  # reported size (presumably bytes, from Ollama's 'size' field — confirm)
    family: Optional[str] = None  # model family, e.g. Ollama details.family
    context_length: int = 4096    # context window in tokens; conservative default
    quantization: Optional[str] = None  # not populated anywhere in this chunk — TODO confirm
@dataclass
class LLMService:
    """A known or discovered local LLM service and its endpoint layout."""
    name: str
    type: str  # 'ollama', 'lmstudio', 'jan', 'openwebui', 'custom'
    endpoint: str
    status: str = 'unknown'  # 'online', 'offline', 'unknown'
    models: list = field(default_factory=list)
    last_checked: Optional[datetime] = None  # set on successful health check
    api_style: str = 'openai'  # 'openai', 'native'
    # Endpoint paths
    health_path: str = '/v1/models'
    models_path: str = '/v1/models'
    chat_path: str = '/v1/chat/completions'
# Default service configurations, keyed by service type. Endpoints are the
# stock localhost ports; they are adjusted for WSL/containers at discovery time.
SERVICE_DEFAULTS = {
    'ollama': LLMService(
        name='Ollama',
        type='ollama',
        endpoint='http://localhost:11434',
        health_path='/api/version',
        models_path='/api/tags',
        chat_path='/api/chat',
        api_style='native'
    ),
    'lmstudio': LLMService(
        name='LM Studio',
        type='lmstudio',
        endpoint='http://localhost:1234',
        health_path='/v1/models',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'jan': LLMService(
        name='Jan',
        type='jan',
        endpoint='http://localhost:1337',
        health_path='/v1/models',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'openwebui': LLMService(
        name='Open WebUI',
        type='openwebui',
        endpoint='http://localhost:3000',
        health_path='/api/health',
        models_path='/api/models',
        chat_path='/api/chat',
        api_style='custom'
    ),
    'localai': LLMService(
        name='LocalAI',
        type='localai',
        endpoint='http://localhost:8080',
        health_path='/readyz',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'vllm': LLMService(
        name='vLLM',
        type='vllm',
        endpoint='http://localhost:8000',
        health_path='/health',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'llamacpp': LLMService(
        name='llama.cpp',
        type='llamacpp',
        endpoint='http://localhost:8080',  # NOTE: same port as LocalAI — only one can be online
        health_path='/health',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'koboldcpp': LLMService(
        name='Kobold.cpp',
        type='koboldcpp',
        endpoint='http://localhost:5001',
        health_path='/api/v1/info',
        models_path='/api/v1/model',
        chat_path='/api/v1/generate',
        api_style='custom'
    ),
    'gpt4all': LLMService(
        name='GPT4All',
        type='gpt4all',
        endpoint='http://localhost:4891',
        health_path='/v1/models',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
}
class ServiceDiscovery:
    """Discover and monitor local LLM services.

    Fixes vs. original:
    - the httpx.AsyncClient was never closed (resource leak); aclose()
      and async-context-manager support are added, and
    - _check_service treats any httpx.HTTPError (not only connect/timeout
      errors) as 'offline' so protocol errors do not escape.
    """

    def __init__(self, custom_endpoints: list = None):
        self.services: dict[str, LLMService] = {}
        self.os_info = detect_os()
        self.custom_endpoints = custom_endpoints or []
        self._client = httpx.AsyncClient(timeout=5.0)

    async def aclose(self):
        """Release the shared HTTP client."""
        await self._client.aclose()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.aclose()

    async def discover_all(self) -> list[LLMService]:
        """Probe all known defaults and custom endpoints concurrently.

        Returns only services that answered their health check; also
        caches them in self.services keyed by service type.
        """
        discovered = []
        tasks = []
        # Known default services (endpoints adjusted for WSL/containers)
        for key, default in SERVICE_DEFAULTS.items():
            service = LLMService(
                name=default.name,
                type=default.type,
                endpoint=adjust_endpoint_for_os(default.endpoint, self.os_info),
                health_path=default.health_path,
                models_path=default.models_path,
                chat_path=default.chat_path,
                api_style=default.api_style
            )
            tasks.append(self._check_service(service))
        # User-supplied custom endpoints
        for custom in self.custom_endpoints:
            service = LLMService(
                name=custom.get('name', 'Custom'),
                type='custom',
                endpoint=custom['endpoint'],
                health_path=custom.get('health_path', '/v1/models'),
                models_path=custom.get('models_path', '/v1/models'),
                chat_path=custom.get('chat_path', '/v1/chat/completions'),
                api_style=custom.get('api_style', 'openai')
            )
            tasks.append(self._check_service(service))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, LLMService) and result.status == 'online':
                discovered.append(result)
                self.services[result.type] = result
        return discovered

    async def _check_service(self, service: LLMService) -> LLMService:
        """Health-check one service and, when online, discover its models."""
        try:
            response = await self._client.get(
                f"{service.endpoint}{service.health_path}"
            )
            if response.status_code == 200:
                service.status = 'online'
                service.last_checked = datetime.now()
                service.models = await self._discover_models(service)
            else:
                service.status = 'offline'
        except httpx.HTTPError:
            # Connect/timeout/protocol failures all mean "not usable".
            service.status = 'offline'
        return service

    async def _discover_models(self, service: LLMService) -> list[DiscoveredModel]:
        """List the models a service exposes; best-effort, [] on failure."""
        try:
            response = await self._client.get(
                f"{service.endpoint}{service.models_path}"
            )
            data = response.json()
            # Parse according to the service's API style
            if service.type == 'ollama':
                return [
                    DiscoveredModel(
                        id=m['name'],
                        name=m['name'],
                        size=m.get('size', 0),
                        family=m.get('details', {}).get('family'),
                        context_length=self._infer_context_length(m['name'])
                    )
                    for m in data.get('models', [])
                ]
            else:  # OpenAI-style
                return [
                    DiscoveredModel(
                        id=m['id'],
                        name=m['id'],
                        context_length=m.get('context_length', 4096)
                    )
                    for m in data.get('data', [])
                ]
        except Exception:
            # Deliberate best-effort: a failed model listing yields no models.
            return []

    def _infer_context_length(self, model_name: str) -> int:
        """Infer a context window (tokens) from the model name. Heuristic."""
        name_lower = model_name.lower()
        # Explicit context markers take precedence
        if '128k' in name_lower or '131k' in name_lower:
            return 131072
        if '64k' in name_lower:
            return 65536
        if '32k' in name_lower:
            return 32768
        if '16k' in name_lower:
            return 16384
        # Model-family defaults
        if 'qwen' in name_lower:
            return 131072  # Qwen models typically have 128K+
        if 'deepseek' in name_lower:
            return 128000
        if 'llama-3' in name_lower or 'llama3' in name_lower:
            return 128000
        if 'codellama' in name_lower:
            return 100000
        if 'mixtral' in name_lower:
            return 65536
        return 8192  # safe default
import re
from enum import Enum
from dataclasses import dataclass
class TaskCategory(Enum):
    """Task categories for routing (re-declared for this section of the page)."""
    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"
@dataclass
class ClassificationResult:
    """Outcome of classify_task()."""
    category: TaskCategory
    confidence: float  # 0.0 - 1.0 (capped at 1.0 by classify_task)
    requires_serena: bool  # True when semantic code navigation is needed
    keywords_matched: list[str]  # keywords that contributed to the score
# Task patterns (regular expressions). Each regex hit adds a flat score
# contribution for its category in classify_task().
TASK_PATTERNS = {
    TaskCategory.CODING: [
        r"(?:write|create|implement|code|generate)\s+(?:a\s+)?(?:function|class|method|component)",
        r"(?:fix|debug|solve)\s+(?:this|the)\s+(?:bug|error|issue)",
        r"refactor\s+(?:this|the)",
        r"add\s+(?:error\s+handling|validation|logging|tests?)",
        r"complete\s+(?:this|the)\s+code",
        r"(?:convert|translate)\s+(?:this|the)\s+code",
        r"(?:optimize|improve)\s+(?:this|the)\s+(?:function|code|performance)",
    ],
    TaskCategory.REASONING: [
        r"(?:design|architect|plan)\s+(?:a|the)\s+(?:system|architecture|solution)",
        r"how\s+should\s+(?:I|we)\s+(?:approach|structure|implement)",
        r"what\s+(?:is|would\s+be)\s+the\s+best\s+(?:way|approach|pattern)",
        r"explain\s+the\s+(?:logic|reasoning|algorithm)",
        r"compare\s+(?:and\s+contrast|between)",
        r"(?:recommend|suggest)\s+(?:an?\s+)?(?:approach|solution|pattern)",
        r"trade-?offs?\s+(?:between|of)",
    ],
    TaskCategory.ANALYSIS: [
        r"(?:review|analyze|audit)\s+(?:this|the)\s+code",
        r"find\s+(?:potential\s+)?(?:issues|vulnerabilities|bugs|problems)",
        r"(?:security|performance)\s+(?:review|analysis|audit)",
        r"what\s+(?:could|might)\s+go\s+wrong",
        r"identify\s+(?:problems|improvements|issues)",
        r"(?:check|scan)\s+for\s+(?:vulnerabilities|issues)",
    ],
    TaskCategory.DOCUMENTATION: [
        r"(?:write|create|generate)\s+(?:documentation|docs|docstring)",
        r"(?:add|write)\s+(?:comments|jsdoc|docstring|type\s+hints)",
        r"(?:document|explain)\s+(?:this|the)\s+(?:code|function|api)",
        r"(?:create|write)\s+(?:a\s+)?readme",
        r"(?:generate|write)\s+(?:api\s+)?documentation",
        r"describe\s+(?:what|how)\s+(?:this|the)",
    ],
}
# Keyword weights used for scoring: word -> (category, weight in 0..1).
# classify_task() adds weight * 0.5 to the category per occurrence.
KEYWORD_WEIGHTS = {
    # Coding
    "function": (TaskCategory.CODING, 0.3),
    "implement": (TaskCategory.CODING, 0.4),
    "code": (TaskCategory.CODING, 0.2),
    "debug": (TaskCategory.CODING, 0.5),
    "refactor": (TaskCategory.CODING, 0.6),
    "fix": (TaskCategory.CODING, 0.4),
    "test": (TaskCategory.CODING, 0.3),
    "bug": (TaskCategory.CODING, 0.5),
    # Reasoning
    "architecture": (TaskCategory.REASONING, 0.6),
    "design": (TaskCategory.REASONING, 0.4),
    "approach": (TaskCategory.REASONING, 0.3),
    "strategy": (TaskCategory.REASONING, 0.5),
    "tradeoff": (TaskCategory.REASONING, 0.5),
    "compare": (TaskCategory.REASONING, 0.4),
    "recommend": (TaskCategory.REASONING, 0.4),
    # Analysis
    "review": (TaskCategory.ANALYSIS, 0.5),
    "analyze": (TaskCategory.ANALYSIS, 0.6),
    "security": (TaskCategory.ANALYSIS, 0.4),
    "vulnerability": (TaskCategory.ANALYSIS, 0.7),
    "performance": (TaskCategory.ANALYSIS, 0.3),
    "audit": (TaskCategory.ANALYSIS, 0.6),
    # Documentation
    "document": (TaskCategory.DOCUMENTATION, 0.6),
    "readme": (TaskCategory.DOCUMENTATION, 0.8),
    "docstring": (TaskCategory.DOCUMENTATION, 0.8),
    "comment": (TaskCategory.DOCUMENTATION, 0.4),
    "explain": (TaskCategory.DOCUMENTATION, 0.3),
}
def classify_task(query: str) -> ClassificationResult:
    """Classify a query into a task category.

    Scores each category from regex pattern hits (0.5 each) and weighted
    keyword hits (weight * 0.5 each); falls back to CODING when no signal
    reaches the 0.2 threshold.
    """
    text = query.lower()
    scores = dict.fromkeys(TaskCategory, 0.0)
    matched = []
    # Pattern matches contribute a flat 0.5 per hit
    for cat, patterns in TASK_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, text):
                scores[cat] += 0.5
    # Keyword hits contribute half their configured weight
    for token in re.findall(r'\w+', text):
        entry = KEYWORD_WEIGHTS.get(token)
        if entry is not None:
            cat, weight = entry
            scores[cat] += weight * 0.5
            matched.append(token)
    best = max(scores, key=scores.get)
    confidence = min(scores[best], 1.0)
    # No clear signal: default to CODING with middling confidence
    if confidence < 0.2:
        best = TaskCategory.CODING
        confidence = 0.5
    # Serena is required for analysis tasks or navigation-style phrasing
    navigation_markers = (
        'definition', 'reference', 'symbol', 'rename',
        'where is', 'find all', 'go to', 'jump to'
    )
    needs_serena = best == TaskCategory.ANALYSIS or any(
        marker in text for marker in navigation_markers
    )
    return ClassificationResult(
        category=best,
        confidence=confidence,
        requires_serena=needs_serena,
        keywords_matched=matched
    )
from dataclasses import dataclass
from typing import Optional
@dataclass
class ModelCapability:
    """Capability/benchmark profile for one known model."""
    id: str
    family: str
    context_window: int  # context length in tokens
    vram_gb: float  # approximate VRAM required (presumably at default quantization — confirm)
    categories: list[TaskCategory]  # categories this model is recommended for
    performance_scores: dict[TaskCategory, int]  # 0-100
    tier: int  # 1=best, 2=good, 3=basic
    quantization: Optional[str] = None
# 综合模型数据库(40+ 模型)- 更新于 2025 年 1 月
MODEL_DATABASE: dict[str, ModelCapability] = {
# === 编码专家(第 1 层)===
"deepseek-v3": ModelCapability(
id="deepseek-v3",
family="deepseek",
context_window=128000,
vram_gb=48, # MoE: 685B total, 37B active
categories=[TaskCategory.CODING, TaskCategory.REASONING, TaskCategory.ANALYSIS],
performance_scores={
TaskCategory.CODING: 99,
TaskCategory.REASONING: 97,
TaskCategory.ANALYSIS: 96,
TaskCategory.DOCUMENTATION: 92
},
tier=1
),
"qwen2.5-coder-32b": ModelCapability(
id="qwen2.5-coder-32b",
family="qwen",
context_window=131072,
vram_gb=22,
categories=[TaskCategory.CODING, TaskCategory.ANALYSIS],
performance_scores={
TaskCategory.CODING: 96,
TaskCategory.REASONING: 82,
TaskCategory.ANALYSIS: 92,
TaskCategory.DOCUMENTATION: 88
},
tier=1
),
"deepseek-coder-v2": ModelCapability(
id="deepseek-coder-v2",
family="deepseek",
context_window=128000,
vram_gb=48, # MoE: 236B total, 21B active
categories=[TaskCategory.CODING, TaskCategory.ANALYSIS, TaskCategory.REASONING],
performance_scores={
TaskCategory.CODING: 95,
TaskCategory.REASONING: 88,
TaskCategory.ANALYSIS: 92,
TaskCategory.DOCUMENTATION: 80
},
tier=1
),
"codellama-70b": ModelCapability(
id="codellama-70b",
family="llama",
context_window=100000,
vram_gb=40,
categories=[TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 90,
TaskCategory.REASONING: 70,
TaskCategory.ANALYSIS: 85,
TaskCategory.DOCUMENTATION: 75
},
tier=1
),
"codellama-34b": ModelCapability(
id="codellama-34b",
family="llama",
context_window=100000,
vram_gb=20,
categories=[TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 85,
TaskCategory.REASONING: 65,
TaskCategory.ANALYSIS: 80,
TaskCategory.DOCUMENTATION: 70
},
tier=2
),
"qwen2.5-coder-14b": ModelCapability(
id="qwen2.5-coder-14b",
family="qwen",
context_window=131072,
vram_gb=10,
categories=[TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 82,
TaskCategory.REASONING: 60,
TaskCategory.ANALYSIS: 75,
TaskCategory.DOCUMENTATION: 70
},
tier=2
),
"starcoder2-15b": ModelCapability(
id="starcoder2-15b",
family="starcoder",
context_window=16384,
vram_gb=10,
categories=[TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 80,
TaskCategory.REASONING: 50,
TaskCategory.ANALYSIS: 70,
TaskCategory.DOCUMENTATION: 60
},
tier=2
),
"deepseek-coder-6.7b": ModelCapability(
id="deepseek-coder-6.7b",
family="deepseek",
context_window=16384,
vram_gb=5,
categories=[TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 75,
TaskCategory.REASONING: 50,
TaskCategory.ANALYSIS: 65,
TaskCategory.DOCUMENTATION: 55
},
tier=3
),
"codellama-7b": ModelCapability(
id="codellama-7b",
family="llama",
context_window=16384,
vram_gb=5,
categories=[TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 70,
TaskCategory.REASONING: 45,
TaskCategory.ANALYSIS: 60,
TaskCategory.DOCUMENTATION: 50
},
tier=3
),
# === 推理专家 ===
"deepseek-r1": ModelCapability(
id="deepseek-r1",
family="deepseek",
context_window=128000,
vram_gb=160, # 671B total
categories=[TaskCategory.REASONING, TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 92,
TaskCategory.REASONING: 99,
TaskCategory.ANALYSIS: 95,
TaskCategory.DOCUMENTATION: 90
},
tier=1
),
"deepseek-r1-distill-70b": ModelCapability(
id="deepseek-r1-distill-70b",
family="deepseek",
context_window=128000,
vram_gb=42,
categories=[TaskCategory.REASONING, TaskCategory.CODING],
performance_scores={
TaskCategory.CODING: 88,
TaskCategory.REASONING: 94,
TaskCategory.ANALYSIS: 90,
TaskCategory.DOCUMENTATION: 86
},
tier=1
),
"qwen2.5-72b-instruct": ModelCapability(
id="qwen2.5-72b-instruct",
family="qwen",
context_window=131072,
vram_gb=48,
categories=[TaskCategory.REASONING, Task
Intelligent routing of AI coding queries to local LLMs with Serena LSP integration for secure, offline-capable development environments.
Before using this skill, ensure:
# Install Serena (required)
pip install serena
# Or via uvx
uvx --from git+https://github.com/oraios/serena serena start-mcp-server
# Verify local LLM service
curl http://localhost:11434/api/version # Ollama
curl http://localhost:1234/v1/models # LM Studio
curl http://localhost:1337/v1/models # Jan
import httpx
import asyncio
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class TaskCategory(Enum):
    """Categories a user query can be classified into for routing."""
    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"
@dataclass
class RouterConfig:
    """Local LLM Router configuration."""
    ollama_url: str = "http://localhost:11434"   # Ollama default endpoint
    lmstudio_url: str = "http://localhost:1234"  # LM Studio default endpoint
    jan_url: str = "http://localhost:1337"       # Jan default endpoint
    serena_enabled: bool = True                  # whether Serena MCP integration is used
    timeout: int = 30                            # request timeout (presumably seconds — confirm at call sites)
async def quick_route(query: str, config: Optional["RouterConfig"] = None):
    """Quick routing example - detects services and routes the query.

    Args:
        query: the user query to classify and route.
        config: router configuration; a fresh RouterConfig is created per
            call when omitted.

    Raises:
        RuntimeError: when no local LLM service is reachable.

    Fix vs. original: the default was `config=RouterConfig()`, a mutable
    default evaluated once at definition time and shared by every caller;
    it is now created per call.
    """
    if config is None:
        config = RouterConfig()
    # 1. Detect available services
    services = await discover_services(config)
    if not services:
        raise RuntimeError("No local LLM services available")
    # 2. Classify task
    category = classify_task(query)
    # 3. Select best model for task
    model = select_model(category, services)
    # 4. Execute query
    return await execute_query(query, model, services[0])
# Example usage
async def main():
    """Demo entry point: route one sample coding query and print the reply."""
    response = await quick_route("Write a function to parse JSON safely")
    print(response)


if __name__ == "__main__":
    # Fix vs. original: asyncio.run() was executed unconditionally at
    # module import; guard it so importing this module has no side effects.
    asyncio.run(main())
CRITICAL : Serena MCP MUST be invoked FIRST for all code-related tasks. This provides semantic understanding of the codebase before routing to an LLM.
import subprocess
import json
from typing import Any
class SerenaMCP:
    """Serena MCP client for code intelligence (JSON-RPC over stdio pipes)."""

    def __init__(self, workspace_root: str):
        self.workspace = workspace_root
        self.process = None
        self._next_id = 0  # monotonically increasing JSON-RPC request id

    async def start(self):
        """Start Serena MCP server subprocess."""
        self.process = subprocess.Popen(
            ["serena", "start-mcp-server", "--workspace", self.workspace],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

    async def call(self, method: str, params: dict) -> Any:
        """Send one JSON-RPC request and read one response line.

        Fixes vs. original: the request id now increments per call (the
        original hard-coded id=1 for every request, so responses could not
        be correlated per JSON-RPC 2.0), and calling before start() raises
        a clear error instead of AttributeError on None.

        NOTE(review): this does blocking pipe I/O inside an async method
        and assumes strictly in-order request/response — confirm Serena's
        stdio framing before relying on concurrency here.
        """
        if self.process is None:
            raise RuntimeError("SerenaMCP.call() invoked before start()")
        self._next_id += 1
        request = {
            "jsonrpc": "2.0",
            "id": self._next_id,
            "method": method,
            "params": params
        }
        self.process.stdin.write(json.dumps(request).encode() + b"\n")
        self.process.stdin.flush()
        response = self.process.stdout.readline()
        return json.loads(response)

    async def find_symbol(self, name: str) -> dict:
        """Find symbol definition by name."""
        return await self.call("find_symbol", {"name": name})

    async def get_references(self, file: str, line: int, char: int) -> list:
        """Get all references to symbol at position."""
        return await self.call("get_references", {
            "file": file,
            "line": line,
            "character": char
        })

    async def get_hover_info(self, file: str, line: int, char: int) -> dict:
        """Get type/documentation info at position."""
        return await self.call("get_hover_info", {
            "file": file,
            "line": line,
            "character": char
        })

    async def get_diagnostics(self, file: str) -> list:
        """Get errors/warnings for file."""
        return await self.call("get_diagnostics", {"file": file})

    async def apply_edit(self, file: str, edits: list) -> bool:
        """Apply code edits to file."""
        return await self.call("apply_edit", {"file": file, "edits": edits})
# Serena tools by priority (always use higher priority first).
# Maps tool name -> {"priority": int (1 = highest), "use_for": [scenarios]}.
SERENA_TOOLS = {
    # Priority 1: Symbol-level operations (highest)
    "find_symbol": {"priority": 1, "use_for": ["navigation", "definition"]},
    "get_references": {"priority": 1, "use_for": ["refactoring", "impact analysis"]},
    "get_hover_info": {"priority": 1, "use_for": ["type info", "documentation"]},
    # Priority 2: Code navigation
    "go_to_definition": {"priority": 2, "use_for": ["navigation"]},
    "go_to_type_definition": {"priority": 2, "use_for": ["type navigation"]},
    "go_to_implementation": {"priority": 2, "use_for": ["interface impl"]},
    # Priority 3: Code understanding
    "get_document_symbols": {"priority": 3, "use_for": ["file structure"]},
    "get_workspace_symbols": {"priority": 3, "use_for": ["codebase search"]},
    "get_call_hierarchy": {"priority": 3, "use_for": ["call analysis"]},
    # Priority 4: Code modification
    "apply_edit": {"priority": 4, "use_for": ["editing"]},
    "rename_symbol": {"priority": 4, "use_for": ["refactoring"]},
    # Priority 5: Diagnostics
    "get_diagnostics": {"priority": 5, "use_for": ["errors", "warnings"]},
    "get_code_actions": {"priority": 5, "use_for": ["quick fixes"]},
}
async def handle_code_request(
    query: str,
    file_context: Optional[dict] = None,
    serena: SerenaMCP = None,
    router: "LLMRouter" = None
):
    """
    Handle code request with Serena-first pattern.
    CRITICAL: Serena is ALWAYS invoked first for code tasks.

    Fix vs. original: the edit-application step dereferenced
    file_context["file"] even when file_context was None, raising
    TypeError; it now also requires file_context.
    """
    # Step 1: Classify the task
    category = classify_task(query)
    # NOTE(review): classify_task elsewhere in this file returns a
    # ClassificationResult, while the membership test below expects a bare
    # TaskCategory — confirm which contract applies here.
    # Step 2: ALWAYS use Serena for code context (if available)
    serena_context = {}
    if serena and file_context:
        # Gather semantic context from Serena
        if file_context.get("file") and file_context.get("position"):
            file = file_context["file"]
            line = file_context["position"]["line"]
            char = file_context["position"]["character"]
            # Get hover info (type, docs)
            serena_context["hover"] = await serena.get_hover_info(file, line, char)
            # For refactoring/analysis, get references
            if category in [TaskCategory.ANALYSIS, TaskCategory.CODING]:
                if "refactor" in query.lower() or "rename" in query.lower():
                    serena_context["references"] = await serena.get_references(
                        file, line, char
                    )
            # Always get diagnostics for the file
            serena_context["diagnostics"] = await serena.get_diagnostics(file)
    # Step 3: Build enriched prompt with Serena context
    enriched_query = build_enriched_query(query, serena_context)
    # Step 4: Select and route to appropriate LLM
    model = router.select_model(category)
    response = await router.execute(enriched_query, model)
    # Step 5: If response contains edits, apply via Serena
    # (guard: a target file is required)
    if serena and file_context and contains_code_edit(response):
        edits = parse_code_edits(response)
        await serena.apply_edit(file_context["file"], edits)
    return response
def build_enriched_query(query: str, serena_context: dict) -> str:
    """Assemble a prompt consisting of the query plus optional Serena
    sections: hover/type info, up to 10 references, up to 5 diagnostics."""
    sections = [query]
    hover = serena_context.get("hover")
    if hover:
        sections.append(f"\n## Type Information\n```\n{hover}\n```")
    refs = serena_context.get("references")
    if refs:
        sections.append(f"\n## References ({len(refs)} found)\n")
        # Cap at the first 10 references
        sections.extend(f"- {ref['file']}:{ref['line']}" for ref in refs[:10])
    diags = serena_context.get("diagnostics")
    if diags:
        sections.append(f"\n## Current Issues ({len(diags)})\n")
        # Cap at the first 5 diagnostics
        sections.extend(
            f"- Line {diag['line']}: {diag['message']}" for diag in diags[:5]
        )
    return "\n".join(sections)
| Service | Default Endpoint | Health Check | Models Endpoint | Chat Endpoint | API Style |
|---|---|---|---|---|---|
| Ollama | localhost:11434 | /api/version | /api/tags | /api/chat | Native |
| LM Studio | localhost:1234 | /v1/models |
import sys
import os
import platform
from dataclasses import dataclass
@dataclass
class OSInfo:
    """Snapshot of the detected OS/runtime environment."""
    platform: str  # normalized: 'windows', 'linux', 'darwin'
    release: str   # OS release string (from platform.release())
    arch: str      # machine architecture, e.g. 'x64', 'arm64'
    is_wsl: bool   # True when running under Windows Subsystem for Linux
    is_container: bool  # True when running inside Docker/Kubernetes
def detect_os() -> OSInfo:
    """Detect operating system and environment.

    Returns:
        OSInfo with a normalized platform name plus WSL/container
        heuristics.

    Fix vs. original: the /proc probes caught only FileNotFoundError, so
    a PermissionError (possible in locked-down containers) crashed
    detection; they now catch OSError.
    """
    # Normalize sys.platform to 'windows' / 'darwin' / 'linux'
    plat = sys.platform
    if plat == 'win32':
        plat = 'windows'
    elif plat != 'darwin':
        plat = 'linux'
    # WSL detection: kernel version string mentions "microsoft",
    # or the WSL_DISTRO_NAME environment variable is set
    is_wsl = False
    if plat == 'linux':
        try:
            with open('/proc/version', 'r') as f:
                is_wsl = 'microsoft' in f.read().lower()
        except OSError:
            pass
        is_wsl = is_wsl or os.environ.get('WSL_DISTRO_NAME') is not None
    # Container detection: Docker marker file, Kubernetes env var,
    # or docker/kubepods entries in PID 1's cgroup
    is_container = (
        os.path.exists('/.dockerenv') or
        os.environ.get('KUBERNETES_SERVICE_HOST') is not None
    )
    if not is_container and plat == 'linux':
        try:
            with open('/proc/1/cgroup', 'r') as f:
                cgroup = f.read()
            is_container = 'docker' in cgroup or 'kubepods' in cgroup
        except OSError:
            pass
    return OSInfo(
        platform=plat,
        release=platform.release(),
        arch=platform.machine(),
        is_wsl=is_wsl,
        is_container=is_container
    )
def adjust_endpoint_for_os(endpoint: str, os_info: OSInfo) -> str:
    """Rewrite an endpoint for the current environment.

    Inside WSL or a container, services bound to the host's localhost are
    reached through the host alias rather than the local loopback.
    """
    runs_behind_host = os_info.is_wsl or os_info.is_container
    if not runs_behind_host:
        return endpoint
    return endpoint.replace('localhost', 'host.docker.internal')
import httpx
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class DiscoveredModel:
    """A model advertised by a local LLM service."""
    id: str
    name: str
    size: int = 0  # reported size (presumably bytes, from Ollama's 'size' field — confirm)
    family: Optional[str] = None  # model family, e.g. Ollama details.family
    context_length: int = 4096    # context window in tokens; conservative default
    quantization: Optional[str] = None  # not populated anywhere in this chunk — TODO confirm
@dataclass
class LLMService:
    """A known or discovered local LLM service and its endpoint layout."""
    name: str
    type: str  # 'ollama', 'lmstudio', 'jan', 'openwebui', 'custom'
    endpoint: str
    status: str = 'unknown'  # 'online', 'offline', 'unknown'
    models: list = field(default_factory=list)
    last_checked: Optional[datetime] = None  # set on successful health check
    api_style: str = 'openai'  # 'openai', 'native'
    # Endpoint paths
    health_path: str = '/v1/models'
    models_path: str = '/v1/models'
    chat_path: str = '/v1/chat/completions'
# Default service configurations, keyed by service type. Endpoints are the
# stock localhost ports; they are adjusted for WSL/containers at discovery time.
SERVICE_DEFAULTS = {
    'ollama': LLMService(
        name='Ollama',
        type='ollama',
        endpoint='http://localhost:11434',
        health_path='/api/version',
        models_path='/api/tags',
        chat_path='/api/chat',
        api_style='native'
    ),
    'lmstudio': LLMService(
        name='LM Studio',
        type='lmstudio',
        endpoint='http://localhost:1234',
        health_path='/v1/models',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'jan': LLMService(
        name='Jan',
        type='jan',
        endpoint='http://localhost:1337',
        health_path='/v1/models',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'openwebui': LLMService(
        name='Open WebUI',
        type='openwebui',
        endpoint='http://localhost:3000',
        health_path='/api/health',
        models_path='/api/models',
        chat_path='/api/chat',
        api_style='custom'
    ),
    'localai': LLMService(
        name='LocalAI',
        type='localai',
        endpoint='http://localhost:8080',
        health_path='/readyz',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'vllm': LLMService(
        name='vLLM',
        type='vllm',
        endpoint='http://localhost:8000',
        health_path='/health',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'llamacpp': LLMService(
        name='llama.cpp',
        type='llamacpp',
        endpoint='http://localhost:8080',  # NOTE: same port as LocalAI — only one can be online
        health_path='/health',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
    'koboldcpp': LLMService(
        name='Kobold.cpp',
        type='koboldcpp',
        endpoint='http://localhost:5001',
        health_path='/api/v1/info',
        models_path='/api/v1/model',
        chat_path='/api/v1/generate',
        api_style='custom'
    ),
    'gpt4all': LLMService(
        name='GPT4All',
        type='gpt4all',
        endpoint='http://localhost:4891',
        health_path='/v1/models',
        models_path='/v1/models',
        chat_path='/v1/chat/completions',
        api_style='openai'
    ),
}
class ServiceDiscovery:
"""Discover and monitor local LLM services."""
def __init__(self, custom_endpoints: list = None):
self.services: dict[str, LLMService] = {}
self.os_info = detect_os()
self.custom_endpoints = custom_endpoints or []
self._client = httpx.AsyncClient(timeout=5.0)
async def discover_all(self) -> list[LLMService]:
"""Discover all available LLM services."""
discovered = []
# Check default services
tasks = []
for key, default in SERVICE_DEFAULTS.items():
service = LLMService(
name=default.name,
type=default.type,
endpoint=adjust_endpoint_for_os(default.endpoint, self.os_info),
health_path=default.health_path,
models_path=default.models_path,
chat_path=default.chat_path,
api_style=default.api_style
)
tasks.append(self._check_service(service))
# Check custom endpoints
for custom in self.custom_endpoints:
service = LLMService(
name=custom.get('name', 'Custom'),
type='custom',
endpoint=custom['endpoint'],
health_path=custom.get('health_path', '/v1/models'),
models_path=custom.get('models_path', '/v1/models'),
chat_path=custom.get('chat_path', '/v1/chat/completions'),
api_style=custom.get('api_style', 'openai')
)
tasks.append(self._check_service(service))
results = await asyncio.gather(*tasks, return_exceptions=True)
for result in results:
if isinstance(result, LLMService) and result.status == 'online':
discovered.append(result)
self.services[result.type] = result
return discovered
async def _check_service(self, service: LLMService) -> LLMService:
"""Check if service is online and discover models."""
try:
# Health check
response = await self._client.get(
f"{service.endpoint}{service.health_path}"
)
if response.status_code == 200:
service.status = 'online'
service.last_checked = datetime.now()
# Discover models
service.models = await self._discover_models(service)
else:
service.status = 'offline'
except (httpx.ConnectError, httpx.TimeoutException):
service.status = 'offline'
return service
async def _discover_models(self, service: LLMService) -> list[DiscoveredModel]:
"""Discover available models on service."""
try:
response = await self._client.get(
f"{service.endpoint}{service.models_path}"
)
data = response.json()
# Parse based on service type
if service.type == 'ollama':
return [
DiscoveredModel(
id=m['name'],
name=m['name'],
size=m.get('size', 0),
family=m.get('details', {}).get('family'),
context_length=self._infer_context_length(m['name'])
)
for m in data.get('models', [])
]
else: # OpenAI-style
return [
DiscoveredModel(
id=m['id'],
name=m['id'],
context_length=m.get('context_length', 4096)
)
for m in data.get('data', [])
]
except Exception:
return []
def _infer_context_length(self, model_name: str) -> int:
"""Infer context length from model name."""
name_lower = model_name.lower()
# Check for explicit context markers
if '128k' in name_lower or '131k' in name_lower:
return 131072
if '64k' in name_lower:
return 65536
if '32k' in name_lower:
return 32768
if '16k' in name_lower:
return 16384
# Model family defaults
if 'qwen' in name_lower:
return 131072 # Qwen models typically have 128K+
if 'deepseek' in name_lower:
return 128000
if 'llama-3' in name_lower or 'llama3' in name_lower:
return 128000
if 'codellama' in name_lower:
return 100000
if 'mixtral' in name_lower:
return 65536
return 8192 # Safe default
import re
from enum import Enum
from dataclasses import dataclass
class TaskCategory(Enum):
    """High-level categories a user query can be classified into for routing."""
    CODING = "coding"                # write / fix / refactor code
    REASONING = "reasoning"          # design, architecture, trade-off questions
    ANALYSIS = "analysis"            # code review, security/performance audit
    DOCUMENTATION = "documentation"  # docstrings, comments, READMEs
@dataclass
class ClassificationResult:
    """Outcome of classify_task() for a single query."""
    category: TaskCategory        # best-scoring task category
    confidence: float             # 0.0 - 1.0
    requires_serena: bool         # True when LSP-backed code intelligence is needed
    keywords_matched: list[str]   # keywords that contributed to the score
# Task patterns (regex)
# Per-category regex lists matched against the lowercased query; each hit
# adds a fixed pattern weight to that category's score in classify_task().
TASK_PATTERNS = {
    TaskCategory.CODING: [
        r"(?:write|create|implement|code|generate)\s+(?:a\s+)?(?:function|class|method|component)",
        r"(?:fix|debug|solve)\s+(?:this|the)\s+(?:bug|error|issue)",
        r"refactor\s+(?:this|the)",
        r"add\s+(?:error\s+handling|validation|logging|tests?)",
        r"complete\s+(?:this|the)\s+code",
        r"(?:convert|translate)\s+(?:this|the)\s+code",
        r"(?:optimize|improve)\s+(?:this|the)\s+(?:function|code|performance)",
    ],
    TaskCategory.REASONING: [
        r"(?:design|architect|plan)\s+(?:a|the)\s+(?:system|architecture|solution)",
        r"how\s+should\s+(?:I|we)\s+(?:approach|structure|implement)",
        r"what\s+(?:is|would\s+be)\s+the\s+best\s+(?:way|approach|pattern)",
        r"explain\s+the\s+(?:logic|reasoning|algorithm)",
        r"compare\s+(?:and\s+contrast|between)",
        r"(?:recommend|suggest)\s+(?:an?\s+)?(?:approach|solution|pattern)",
        r"trade-?offs?\s+(?:between|of)",
    ],
    TaskCategory.ANALYSIS: [
        r"(?:review|analyze|audit)\s+(?:this|the)\s+code",
        r"find\s+(?:potential\s+)?(?:issues|vulnerabilities|bugs|problems)",
        r"(?:security|performance)\s+(?:review|analysis|audit)",
        r"what\s+(?:could|might)\s+go\s+wrong",
        r"identify\s+(?:problems|improvements|issues)",
        r"(?:check|scan)\s+for\s+(?:vulnerabilities|issues)",
    ],
    TaskCategory.DOCUMENTATION: [
        r"(?:write|create|generate)\s+(?:documentation|docs|docstring)",
        r"(?:add|write)\s+(?:comments|jsdoc|docstring|type\s+hints)",
        r"(?:document|explain)\s+(?:this|the)\s+(?:code|function|api)",
        r"(?:create|write)\s+(?:a\s+)?readme",
        r"(?:generate|write)\s+(?:api\s+)?documentation",
        r"describe\s+(?:what|how)\s+(?:this|the)",
    ],
}
# Keyword weights for scoring
# word -> (category, weight). classify_task() halves each weight (keyword
# scoring carries 0.5 of the total), so these are relative strengths only.
# Matching is on exact word tokens, so plurals ("bugs") do not match.
KEYWORD_WEIGHTS = {
    # Coding
    "function": (TaskCategory.CODING, 0.3),
    "implement": (TaskCategory.CODING, 0.4),
    "code": (TaskCategory.CODING, 0.2),
    "debug": (TaskCategory.CODING, 0.5),
    "refactor": (TaskCategory.CODING, 0.6),
    "fix": (TaskCategory.CODING, 0.4),
    "test": (TaskCategory.CODING, 0.3),
    "bug": (TaskCategory.CODING, 0.5),
    # Reasoning
    "architecture": (TaskCategory.REASONING, 0.6),
    "design": (TaskCategory.REASONING, 0.4),
    "approach": (TaskCategory.REASONING, 0.3),
    "strategy": (TaskCategory.REASONING, 0.5),
    "tradeoff": (TaskCategory.REASONING, 0.5),
    "compare": (TaskCategory.REASONING, 0.4),
    "recommend": (TaskCategory.REASONING, 0.4),
    # Analysis
    "review": (TaskCategory.ANALYSIS, 0.5),
    "analyze": (TaskCategory.ANALYSIS, 0.6),
    "security": (TaskCategory.ANALYSIS, 0.4),
    "vulnerability": (TaskCategory.ANALYSIS, 0.7),
    "performance": (TaskCategory.ANALYSIS, 0.3),
    "audit": (TaskCategory.ANALYSIS, 0.6),
    # Documentation
    "document": (TaskCategory.DOCUMENTATION, 0.6),
    "readme": (TaskCategory.DOCUMENTATION, 0.8),
    "docstring": (TaskCategory.DOCUMENTATION, 0.8),
    "comment": (TaskCategory.DOCUMENTATION, 0.4),
    "explain": (TaskCategory.DOCUMENTATION, 0.3),
}
def classify_task(query: str) -> ClassificationResult:
    """Classify a query into a task category.

    Scoring is half pattern-based and half keyword-based: each TASK_PATTERNS
    regex hit adds 0.5 to its category, and each KEYWORD_WEIGHTS token adds
    half its listed weight. Confidence is the winning score capped at 1.0;
    queries with no clear signal default to CODING at confidence 0.5.
    """
    text = query.lower()
    scores: dict[TaskCategory, float] = dict.fromkeys(TaskCategory, 0.0)
    hits: list[str] = []

    # Regex patterns contribute a flat 0.5 per match.
    for cat, patterns in TASK_PATTERNS.items():
        scores[cat] += 0.5 * sum(1 for pattern in patterns if re.search(pattern, text))

    # Keywords contribute half of their configured weight per occurrence.
    for token in re.findall(r'\w+', text):
        entry = KEYWORD_WEIGHTS.get(token)
        if entry is not None:
            cat, weight = entry
            scores[cat] += weight * 0.5
            hits.append(token)

    winner = max(scores, key=scores.get)
    confidence = min(scores[winner], 1.0)

    # Weak signal: fall back to CODING with moderate confidence.
    if confidence < 0.2:
        winner, confidence = TaskCategory.CODING, 0.5

    # Serena (LSP) is mandatory for analysis, or whenever the query uses
    # symbol-navigation vocabulary.
    navigation_markers = (
        'definition', 'reference', 'symbol', 'rename',
        'where is', 'find all', 'go to', 'jump to'
    )
    needs_serena = (
        winner is TaskCategory.ANALYSIS
        or any(marker in text for marker in navigation_markers)
    )

    return ClassificationResult(
        category=winner,
        confidence=confidence,
        requires_serena=needs_serena,
        keywords_matched=hits
    )
from dataclasses import dataclass
from typing import Optional
@dataclass
class ModelCapability:
    """Static capability profile for one known local model."""
    id: str                                       # canonical model identifier
    family: str                                   # model family (e.g. 'qwen', 'llama')
    context_window: int                           # max context length in tokens
    vram_gb: float                                # approximate VRAM needed to run
    categories: list[TaskCategory]                # categories this model specializes in
    performance_scores: dict[TaskCategory, int]   # 0-100
    tier: int                                     # 1=best, 2=good, 3=basic
    quantization: Optional[str] = None            # quantization label, if any
# Comprehensive model database (40+ models) - Updated January 2025
# Maps canonical model id -> ModelCapability. Used by ModelSelector together
# with TASK_MODEL_PRIORITY to pick the best available model for a task.
# FIX: the original literal defined "gemma-2-27b" twice (once in the
# reasoning section, once under "additional models"); Python keeps the first
# insertion position with the LAST value, so the merged entry below preserves
# the effective runtime contents while removing the silent overwrite.
MODEL_DATABASE: dict[str, ModelCapability] = {
    # === CODING SPECIALISTS (Tier 1) ===
    "deepseek-v3": ModelCapability(
        id="deepseek-v3",
        family="deepseek",
        context_window=128000,
        vram_gb=48,  # MoE: 685B total, 37B active
        categories=[TaskCategory.CODING, TaskCategory.REASONING, TaskCategory.ANALYSIS],
        performance_scores={
            TaskCategory.CODING: 99,
            TaskCategory.REASONING: 97,
            TaskCategory.ANALYSIS: 96,
            TaskCategory.DOCUMENTATION: 92
        },
        tier=1
    ),
    "qwen2.5-coder-32b": ModelCapability(
        id="qwen2.5-coder-32b",
        family="qwen",
        context_window=131072,
        vram_gb=22,
        categories=[TaskCategory.CODING, TaskCategory.ANALYSIS],
        performance_scores={
            TaskCategory.CODING: 96,
            TaskCategory.REASONING: 82,
            TaskCategory.ANALYSIS: 92,
            TaskCategory.DOCUMENTATION: 88
        },
        tier=1
    ),
    "deepseek-coder-v2": ModelCapability(
        id="deepseek-coder-v2",
        family="deepseek",
        context_window=128000,
        vram_gb=48,  # MoE: 236B total, 21B active
        categories=[TaskCategory.CODING, TaskCategory.ANALYSIS, TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 95,
            TaskCategory.REASONING: 88,
            TaskCategory.ANALYSIS: 92,
            TaskCategory.DOCUMENTATION: 80
        },
        tier=1
    ),
    "codellama-70b": ModelCapability(
        id="codellama-70b",
        family="llama",
        context_window=100000,
        vram_gb=40,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 90,
            TaskCategory.REASONING: 70,
            TaskCategory.ANALYSIS: 85,
            TaskCategory.DOCUMENTATION: 75
        },
        tier=1
    ),
    "codellama-34b": ModelCapability(
        id="codellama-34b",
        family="llama",
        context_window=100000,
        vram_gb=20,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 85,
            TaskCategory.REASONING: 65,
            TaskCategory.ANALYSIS: 80,
            TaskCategory.DOCUMENTATION: 70
        },
        tier=2
    ),
    "qwen2.5-coder-14b": ModelCapability(
        id="qwen2.5-coder-14b",
        family="qwen",
        context_window=131072,
        vram_gb=10,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 82,
            TaskCategory.REASONING: 60,
            TaskCategory.ANALYSIS: 75,
            TaskCategory.DOCUMENTATION: 70
        },
        tier=2
    ),
    "starcoder2-15b": ModelCapability(
        id="starcoder2-15b",
        family="starcoder",
        context_window=16384,
        vram_gb=10,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 80,
            TaskCategory.REASONING: 50,
            TaskCategory.ANALYSIS: 70,
            TaskCategory.DOCUMENTATION: 60
        },
        tier=2
    ),
    "deepseek-coder-6.7b": ModelCapability(
        id="deepseek-coder-6.7b",
        family="deepseek",
        context_window=16384,
        vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 75,
            TaskCategory.REASONING: 50,
            TaskCategory.ANALYSIS: 65,
            TaskCategory.DOCUMENTATION: 55
        },
        tier=3
    ),
    "codellama-7b": ModelCapability(
        id="codellama-7b",
        family="llama",
        context_window=16384,
        vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 70,
            TaskCategory.REASONING: 45,
            TaskCategory.ANALYSIS: 60,
            TaskCategory.DOCUMENTATION: 50
        },
        tier=3
    ),
    # === REASONING SPECIALISTS ===
    "deepseek-r1": ModelCapability(
        id="deepseek-r1",
        family="deepseek",
        context_window=128000,
        vram_gb=160,  # 671B total
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 92,
            TaskCategory.REASONING: 99,
            TaskCategory.ANALYSIS: 95,
            TaskCategory.DOCUMENTATION: 90
        },
        tier=1
    ),
    "deepseek-r1-distill-70b": ModelCapability(
        id="deepseek-r1-distill-70b",
        family="deepseek",
        context_window=128000,
        vram_gb=42,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 88,
            TaskCategory.REASONING: 94,
            TaskCategory.ANALYSIS: 90,
            TaskCategory.DOCUMENTATION: 86
        },
        tier=1
    ),
    "qwen2.5-72b-instruct": ModelCapability(
        id="qwen2.5-72b-instruct",
        family="qwen",
        context_window=131072,
        vram_gb=48,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 88,
            TaskCategory.REASONING: 95,
            TaskCategory.ANALYSIS: 92,
            TaskCategory.DOCUMENTATION: 94
        },
        tier=1
    ),
    "llama-3.3-70b-instruct": ModelCapability(
        id="llama-3.3-70b-instruct",
        family="llama",
        context_window=128000,
        vram_gb=42,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 85,
            TaskCategory.REASONING: 92,
            TaskCategory.ANALYSIS: 88,
            TaskCategory.DOCUMENTATION: 90
        },
        tier=1
    ),
    "deepseek-r1-distill-32b": ModelCapability(
        id="deepseek-r1-distill-32b",
        family="deepseek",
        context_window=128000,
        vram_gb=22,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 82,
            TaskCategory.REASONING: 90,
            TaskCategory.ANALYSIS: 85,
            TaskCategory.DOCUMENTATION: 82
        },
        tier=2
    ),
    "mistral-small-24b": ModelCapability(
        id="mistral-small-24b",
        family="mistral",
        context_window=32768,
        vram_gb=16,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 80,
            TaskCategory.REASONING: 85,
            TaskCategory.ANALYSIS: 82,
            TaskCategory.DOCUMENTATION: 84
        },
        tier=2
    ),
    "qwen2.5-32b-instruct": ModelCapability(
        id="qwen2.5-32b-instruct",
        family="qwen",
        context_window=131072,
        vram_gb=22,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 78,
            TaskCategory.REASONING: 86,
            TaskCategory.ANALYSIS: 82,
            TaskCategory.DOCUMENTATION: 88
        },
        tier=2
    ),
    "phi-4": ModelCapability(
        id="phi-4",
        family="phi",
        context_window=16384,
        vram_gb=10,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 82,
            TaskCategory.REASONING: 88,
            TaskCategory.ANALYSIS: 80,
            TaskCategory.DOCUMENTATION: 78
        },
        tier=2
    ),
    "deepseek-r1-distill-14b": ModelCapability(
        id="deepseek-r1-distill-14b",
        family="deepseek",
        context_window=128000,
        vram_gb=10,
        categories=[TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 75,
            TaskCategory.REASONING: 85,
            TaskCategory.ANALYSIS: 78,
            TaskCategory.DOCUMENTATION: 76
        },
        tier=2
    ),
    "llama-3.2-11b-vision": ModelCapability(
        id="llama-3.2-11b-vision",
        family="llama",
        context_window=128000,
        vram_gb=8,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 68,
            TaskCategory.REASONING: 78,
            TaskCategory.ANALYSIS: 75,
            TaskCategory.DOCUMENTATION: 80
        },
        tier=2
    ),
    # Merged entry: was duplicated in the original (values below are the
    # ones that actually took effect at runtime).
    "gemma-2-27b": ModelCapability(
        id="gemma-2-27b",
        family="gemma",
        context_window=8192,
        vram_gb=18,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 78,
            TaskCategory.REASONING: 80,
            TaskCategory.ANALYSIS: 75,
            TaskCategory.DOCUMENTATION: 78
        },
        tier=2
    ),
    "deepseek-r1-distill-8b": ModelCapability(
        id="deepseek-r1-distill-8b",
        family="deepseek",
        context_window=128000,
        vram_gb=6,
        categories=[TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 68,
            TaskCategory.REASONING: 78,
            TaskCategory.ANALYSIS: 70,
            TaskCategory.DOCUMENTATION: 68
        },
        tier=3
    ),
    "gemma-2-9b": ModelCapability(
        id="gemma-2-9b",
        family="gemma",
        context_window=8192,
        vram_gb=7,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 72,
            TaskCategory.REASONING: 75,
            TaskCategory.ANALYSIS: 70,
            TaskCategory.DOCUMENTATION: 74
        },
        tier=3
    ),
    "llama-3.2-3b": ModelCapability(
        id="llama-3.2-3b",
        family="llama",
        context_window=128000,
        vram_gb=3,
        categories=[TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 55,
            TaskCategory.REASONING: 65,
            TaskCategory.ANALYSIS: 58,
            TaskCategory.DOCUMENTATION: 65
        },
        tier=3
    ),
    # === ANALYSIS SPECIALISTS (Serena Required) ===
    "codellama-34b-instruct": ModelCapability(
        id="codellama-34b-instruct",
        family="llama",
        context_window=100000,
        vram_gb=20,
        categories=[TaskCategory.ANALYSIS],
        performance_scores={
            TaskCategory.CODING: 80,
            TaskCategory.REASONING: 70,
            TaskCategory.ANALYSIS: 88,
            TaskCategory.DOCUMENTATION: 75
        },
        tier=2
    ),
    # === DOCUMENTATION SPECIALISTS ===
    "mistral-nemo-12b": ModelCapability(
        id="mistral-nemo-12b",
        family="mistral",
        context_window=128000,
        vram_gb=8,
        categories=[TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 65,
            TaskCategory.REASONING: 70,
            TaskCategory.ANALYSIS: 65,
            TaskCategory.DOCUMENTATION: 82
        },
        tier=2
    ),
    "mistral-7b": ModelCapability(
        id="mistral-7b",
        family="mistral",
        context_window=32768,
        vram_gb=5,
        categories=[TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 55,
            TaskCategory.REASONING: 60,
            TaskCategory.ANALYSIS: 55,
            TaskCategory.DOCUMENTATION: 72
        },
        tier=3
    ),
    # === ADDITIONAL MODELS ===
    "phi-3-medium": ModelCapability(
        id="phi-3-medium",
        family="phi",
        context_window=128000,
        vram_gb=8,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 72,
            TaskCategory.REASONING: 75,
            TaskCategory.ANALYSIS: 68,
            TaskCategory.DOCUMENTATION: 70
        },
        tier=2
    ),
    "yi-34b": ModelCapability(
        id="yi-34b",
        family="yi",
        context_window=200000,
        vram_gb=20,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 72,
            TaskCategory.REASONING: 82,
            TaskCategory.ANALYSIS: 75,
            TaskCategory.DOCUMENTATION: 80
        },
        tier=2
    ),
    "command-r-plus": ModelCapability(
        id="command-r-plus",
        family="cohere",
        context_window=128000,
        vram_gb=48,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 70,
            TaskCategory.REASONING: 85,
            TaskCategory.ANALYSIS: 78,
            TaskCategory.DOCUMENTATION: 88
        },
        tier=1
    ),
    "wizardcoder-33b": ModelCapability(
        id="wizardcoder-33b",
        family="wizard",
        context_window=16384,
        vram_gb=20,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 85,
            TaskCategory.REASONING: 60,
            TaskCategory.ANALYSIS: 75,
            TaskCategory.DOCUMENTATION: 65
        },
        tier=2
    ),
    "magicoder-7b": ModelCapability(
        id="magicoder-7b",
        family="magicoder",
        context_window=16384,
        vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores={
            TaskCategory.CODING: 78,
            TaskCategory.REASONING: 50,
            TaskCategory.ANALYSIS: 65,
            TaskCategory.DOCUMENTATION: 55
        },
        tier=3
    ),
    "dolphin-mixtral-8x7b": ModelCapability(
        id="dolphin-mixtral-8x7b",
        family="dolphin",
        context_window=32768,
        vram_gb=28,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 75,
            TaskCategory.REASONING: 78,
            TaskCategory.ANALYSIS: 72,
            TaskCategory.DOCUMENTATION: 75
        },
        tier=2
    ),
    "nous-hermes-2-mixtral": ModelCapability(
        id="nous-hermes-2-mixtral",
        family="nous",
        context_window=32768,
        vram_gb=28,
        categories=[TaskCategory.REASONING],
        performance_scores={
            TaskCategory.CODING: 72,
            TaskCategory.REASONING: 82,
            TaskCategory.ANALYSIS: 75,
            TaskCategory.DOCUMENTATION: 78
        },
        tier=2
    ),
    "solar-10.7b": ModelCapability(
        id="solar-10.7b",
        family="solar",
        context_window=4096,
        vram_gb=7,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={
            TaskCategory.CODING: 60,
            TaskCategory.REASONING: 72,
            TaskCategory.ANALYSIS: 65,
            TaskCategory.DOCUMENTATION: 75
        },
        tier=3
    ),
}
# Task-to-model priority mapping (Updated January 2025)
# Ordered preference lists consumed by ModelSelector.select(): the first
# available model that also satisfies context/VRAM constraints wins.
# NOTE(review): "qwen2.5-coder-7b" below has no MODEL_DATABASE entry, so the
# selector will always skip it — presumably a missing entry; confirm.
TASK_MODEL_PRIORITY = {
    TaskCategory.CODING: [
        # Tier 1 - Best
        "deepseek-v3", "qwen2.5-coder-32b", "deepseek-coder-v2",
        # Tier 2 - Good
        "codellama-70b", "qwen2.5-coder-14b", "codellama-34b",
        "starcoder2-15b", "phi-4",
        # Tier 3 - Basic
        "qwen2.5-coder-7b", "codellama-7b", "deepseek-coder-6.7b"
    ],
    TaskCategory.REASONING: [
        # Tier 1 - Best
        "deepseek-r1", "deepseek-v3", "deepseek-r1-distill-70b",
        "qwen2.5-72b-instruct", "llama-3.3-70b-instruct",
        # Tier 2 - Good
        "deepseek-r1-distill-32b", "mistral-small-24b", "qwen2.5-32b-instruct",
        "phi-4", "gemma-2-27b",
        # Tier 3 - Basic
        "deepseek-r1-distill-14b", "deepseek-r1-distill-8b", "gemma-2-9b"
    ],
    TaskCategory.ANALYSIS: [
        # Requires Serena LSP
        "deepseek-v3", "qwen2.5-coder-32b", "deepseek-coder-v2",
        "codellama-34b-instruct", "qwen2.5-72b-instruct"
    ],
    TaskCategory.DOCUMENTATION: [
        "qwen2.5-72b-instruct", "llama-3.3-70b-instruct", "qwen2.5-32b-instruct",
        "mistral-small-24b", "mistral-nemo-12b", "gemma-2-27b"
    ],
}
from typing import Optional
class ModelSelector:
    """Select optimal model for task based on availability and requirements.

    Availability is matched fuzzily (case-insensitive substring in either
    direction) so that tagged names like "qwen2.5-coder-32b:latest" still
    match the canonical ids used in MODEL_DATABASE.
    """

    def __init__(self, available_models: list[str]):
        # Lowercased set of model names reported by the local services.
        self.available = {name.lower() for name in available_models}

    def select(
        self,
        category: TaskCategory,
        required_context: int = 0,
        max_vram_gb: Optional[float] = None
    ) -> Optional[str]:
        """Return the best available model id for *category*, or None.

        Walks the category's priority list, skipping models that are not
        available, unknown to MODEL_DATABASE, or that fail the context /
        VRAM constraints. Falls back to any available known model.
        """
        for candidate in TASK_MODEL_PRIORITY.get(category, []):
            if not self._is_available(candidate):
                continue
            capability = MODEL_DATABASE.get(candidate)
            if capability is None:
                continue
            if required_context > 0 and capability.context_window < required_context:
                continue
            if max_vram_gb and capability.vram_gb > max_vram_gb:
                continue
            return candidate
        # Last resort: any model from the database that is actually present.
        return next(
            (model_id for model_id in MODEL_DATABASE if self._is_available(model_id)),
            None
        )

    def _is_available(self, model_id: str) -> bool:
        """Check if model is available (fuzzy, bidirectional substring match)."""
        target = model_id.lower()
        if target in self.available:
            return True
        return any(
            target in candidate or candidate in target
            for candidate in self.available
        )

    def get_fallback_models(self, category: TaskCategory) -> list[str]:
        """Return available tier-2/3 models for *category*, in priority order."""
        return [
            model_id
            for model_id in TASK_MODEL_PRIORITY.get(category, [])
            if self._is_available(model_id)
            and (cap := MODEL_DATABASE.get(model_id)) is not None
            and cap.tier >= 2
        ]
from abc import ABC, abstractmethod
import re
class TokenCounter(ABC):
    """Interface for approximate, model-family-aware token counting."""

    @abstractmethod
    def count(self, text: str) -> int:
        """Return the estimated number of tokens in *text*."""
        pass


class EstimationCounter(TokenCounter):
    """Heuristic counter: tokens ~ characters / chars_per_token."""

    def __init__(self, chars_per_token: float = 4.0):
        self.chars_per_token = chars_per_token

    def count(self, text: str) -> int:
        estimate = len(text) / self.chars_per_token
        return int(estimate)


class QwenCounter(TokenCounter):
    """Counter tuned for Qwen models (~3.5 chars per token)."""

    def count(self, text: str) -> int:
        # Qwen uses slightly different tokenization
        return int(len(text) / 3.5)


class LlamaCounter(TokenCounter):
    """Counter tuned for Llama models (SentencePiece, ~3.8 chars per token)."""

    def count(self, text: str) -> int:
        return int(len(text) / 3.8)


# Model family to counter mapping; "default" is the catch-all fallback.
TOKEN_COUNTERS = {
    "qwen": QwenCounter(),
    "deepseek": EstimationCounter(4.0),
    "llama": LlamaCounter(),
    "mistral": EstimationCounter(4.0),
    "mixtral": EstimationCounter(4.0),
    "default": EstimationCounter(4.0),
}


def get_token_counter(model_id: str) -> TokenCounter:
    """Return the counter for *model_id*'s family, else the default counter."""
    capability = MODEL_DATABASE.get(model_id)
    if capability is None:
        return TOKEN_COUNTERS["default"]
    return TOKEN_COUNTERS.get(capability.family, TOKEN_COUNTERS["default"])
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class Message:
    """A single conversation turn tracked by ContextManager."""
    role: str      # 'system', 'user', 'assistant', 'tool'
    content: str   # raw message text
    timestamp: datetime = field(default_factory=datetime.now)
    token_count: int = 0                          # estimate from the active TokenCounter
    metadata: dict = field(default_factory=dict)  # e.g. tool_name, truncated flags
@dataclass
class ConversationContext:
    """Mutable state of one conversation session."""
    session_id: str
    messages: list[Message] = field(default_factory=list)
    total_tokens: int = 0                # system prompt + all message tokens
    system_prompt: str = ""
    system_prompt_tokens: int = 0
    active_model: str = ""               # model currently serving the session
    model_history: list[str] = field(default_factory=list)  # previously active models
    compaction_count: int = 0            # how many times _compact() summarized history
class ContextManager:
    """Manage conversation context with compaction support.

    Maintains a running approximate token total. When the total crosses
    compaction_threshold * max_tokens, _compact() first truncates large tool
    outputs in place, then (if still over target) replaces older messages
    with a generated summary, keeping only the most recent ones.
    """
    def __init__(
        self,
        session_id: str,
        system_prompt: str = "",
        compaction_threshold: float = 0.8,  # 80% of context window
        compaction_target: float = 0.5,  # Compact to 50%
        preserve_recent: int = 10  # Keep last N messages
    ):
        self.context = ConversationContext(
            session_id=session_id,
            system_prompt=system_prompt
        )
        self.compaction_threshold = compaction_threshold
        self.compaction_target = compaction_target
        self.preserve_recent = preserve_recent
        # No counter until set_model() is called; until then token counts are 0.
        self._counter: Optional[TokenCounter] = None
    def set_model(self, model_id: str):
        """Set active model and update token counter."""
        # Record the outgoing model so handoffs remain traceable.
        if self.context.active_model:
            self.context.model_history.append(self.context.active_model)
        self.context.active_model = model_id
        self._counter = get_token_counter(model_id)
        # Recount all tokens with new counter
        self._recount_tokens()
    def add_message(self, role: str, content: str, metadata: Optional[dict] = None):
        """Add message to context and update the running token total."""
        # Without an active counter the message is stored with count 0.
        token_count = self._counter.count(content) if self._counter else 0
        message = Message(
            role=role,
            content=content,
            token_count=token_count,
            metadata=metadata or {}
        )
        self.context.messages.append(message)
        self.context.total_tokens += token_count
    def check_and_compact(self, max_tokens: int) -> bool:
        """Check if compaction needed and perform if so.

        Returns:
            True when a compaction pass was run.
        """
        threshold = int(max_tokens * self.compaction_threshold)
        if self.context.total_tokens > threshold:
            self._compact(max_tokens)
            return True
        return False
    def _compact(self, max_tokens: int):
        """Compact context to target size (two escalating steps)."""
        target = int(max_tokens * self.compaction_target)
        # Step 1: Truncate large tool outputs (cheapest win; content is
        # replaced in place and the original size kept in metadata).
        for msg in self.context.messages:
            if msg.role == 'tool' and msg.token_count > 500:
                original = msg.token_count
                msg.content = f"[Tool output truncated - {msg.metadata.get('tool_name', 'unknown')}]"
                msg.token_count = self._counter.count(msg.content)
                msg.metadata['truncated'] = True
                msg.metadata['original_tokens'] = original
        self._recalculate_total()
        if self.context.total_tokens <= target:
            return
        # Step 2: Summarize older messages, keeping the most recent
        # preserve_recent messages verbatim.
        if len(self.context.messages) > self.preserve_recent:
            older = self.context.messages[:-self.preserve_recent]
            recent = self.context.messages[-self.preserve_recent:]
            # Create summary of older messages
            summary = self._create_summary(older)
            summary_msg = Message(
                role='system',
                content=f"[Previous conversation summary]\n{summary}",
                token_count=self._counter.count(summary),
                metadata={'compacted': True}
            )
            self.context.messages = [summary_msg] + recent
            self.context.compaction_count += 1
            self._recalculate_total()
    def _create_summary(self, messages: list[Message]) -> str:
        """Create summary of messages (simple implementation)."""
        # In production, this would use a lightweight LLM
        key_points = []
        for msg in messages:
            if msg.role == 'user':
                # Extract first sentence of user queries
                first_sentence = msg.content.split('.')[0][:100]
                key_points.append(f"- User asked: {first_sentence}")
            elif msg.role == 'assistant' and len(key_points) < 10:
                # Extract key decisions/results
                if 'created' in msg.content.lower() or 'implemented' in msg.content.lower():
                    first_sentence = msg.content.split('.')[0][:100]
                    key_points.append(f"- Assistant: {first_sentence}")
        # Cap the summary at 10 bullet points.
        return "\n".join(key_points[:10])
    def _recount_tokens(self):
        """Recount all tokens with current counter."""
        if not self._counter:
            return
        self.context.system_prompt_tokens = self._counter.count(self.context.system_prompt)
        for msg in self.context.messages:
            msg.token_count = self._counter.count(msg.content)
        self._recalculate_total()
    def _recalculate_total(self):
        """Recalculate total token count (system prompt + every message)."""
        self.context.total_tokens = (
            self.context.system_prompt_tokens +
            sum(m.token_count for m in self.context.messages)
        )
    def export_for_api(self) -> list[dict]:
        """Export messages in API format (system prompt first, if any)."""
        messages = []
        if self.context.system_prompt:
            messages.append({
                "role": "system",
                "content": self.context.system_prompt
            })
        for msg in self.context.messages:
            messages.append({
                "role": msg.role,
                "content": msg.content
            })
        return messages
    def prepare_handoff(self, new_model: str) -> "ContextManager":
        """Prepare context for model switch; returns self for chaining."""
        self.set_model(new_model)
        return self
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class ServiceConfig:
    """Configuration for a single LLM service."""
    enabled: bool = True
    endpoint: str = ""       # base URL of the service
    priority: int = 1        # lower value = tried first
    timeout: int = 30000     # presumably milliseconds (YAML example uses 30000) — TODO confirm
    max_retries: int = 3
    api_style: str = "openai"  # 'openai', 'native', or 'custom'
@dataclass
class TaskRoutingConfig:
    """Configuration for task routing."""
    primary_models: list[str] = field(default_factory=list)    # tried in order first
    fallback_models: list[str] = field(default_factory=list)   # used when no primary fits
    min_context: int = 8192        # minimum acceptable context window (tokens)
    require_serena: bool = False   # True forces Serena LSP involvement
@dataclass
class SecurityConfig:
    """Security configuration for air-gapped networks.

    Note: literal hostnames go in allowed_hosts, network ranges in
    allowed_cidrs — the two are kept as separate fields (enforcement logic
    is not visible in this chunk).
    """
    allow_external: bool = False
    allowed_hosts: list[str] = field(default_factory=lambda: [
        "localhost", "127.0.0.1", "host.docker.internal"
    ])
    allowed_cidrs: list[str] = field(default_factory=lambda: [
        "192.168.0.0/16", "10.0.0.0/8", "172.16.0.0/12"
    ])
    audit_enabled: bool = True
    audit_log_path: str = "./audit.log"
    log_queries: bool = True
    log_responses: bool = False  # Don't log sensitive responses
    verify_checksums: bool = True
@dataclass
class ContextConfig:
    """Context management configuration (see ContextManager)."""
    compaction_threshold: float = 0.8        # compact when usage exceeds this fraction
    compaction_target: float = 0.5           # compact down to this fraction
    preserve_recent_messages: int = 10       # messages kept verbatim during compaction
    preserve_recent_tool_calls: int = 5
    max_tool_output_tokens: int = 500        # tool outputs above this get truncated
@dataclass
class RouterConfig:
    """Complete router configuration.

    Aggregates per-service settings, per-category routing preferences,
    Serena integration flags, context management, and security policy.
    """
    # Services (lower priority value = preferred)
    ollama: ServiceConfig = field(default_factory=lambda: ServiceConfig(
        endpoint="http://localhost:11434",
        priority=1
    ))
    lmstudio: ServiceConfig = field(default_factory=lambda: ServiceConfig(
        endpoint="http://localhost:1234",
        priority=2
    ))
    jan: ServiceConfig = field(default_factory=lambda: ServiceConfig(
        endpoint="http://localhost:1337",
        priority=3
    ))
    custom_endpoints: list[dict] = field(default_factory=list)
    # Task routing (Updated January 2025)
    coding: TaskRoutingConfig = field(default_factory=lambda: TaskRoutingConfig(
        primary_models=["deepseek-v3", "qwen2.5-coder-32b", "deepseek-coder-v2"],
        fallback_models=["codellama-34b", "qwen2.5-coder-14b", "phi-4"],
        min_context=8192
    ))
    reasoning: TaskRoutingConfig = field(default_factory=lambda: TaskRoutingConfig(
        primary_models=["deepseek-r1", "deepseek-v3", "qwen2.5-72b-instruct"],
        fallback_models=["deepseek-r1-distill-32b", "mistral-small-24b"],
        min_context=16384
    ))
    analysis: TaskRoutingConfig = field(default_factory=lambda: TaskRoutingConfig(
        primary_models=["deepseek-v3", "qwen2.5-coder-32b"],
        fallback_models=["codellama-34b-instruct", "qwen2.5-72b-instruct"],
        min_context=16384,
        require_serena=True  # analysis always routes through Serena LSP
    ))
    documentation: TaskRoutingConfig = field(default_factory=lambda: TaskRoutingConfig(
        primary_models=["qwen2.5-72b-instruct", "llama-3.3-70b-instruct"],
        fallback_models=["qwen2.5-32b-instruct", "mistral-nemo-12b"],
        min_context=8192
    ))
    # Serena
    serena_enabled: bool = True
    serena_priority: str = "always_first"
    # Context
    context: ContextConfig = field(default_factory=ContextConfig)
    # Security
    security: SecurityConfig = field(default_factory=SecurityConfig)
# Default configuration instance
# NOTE: module-level shared instance — mutating it affects every importer;
# treat as read-only defaults and build fresh configs via load_config_from_dict.
DEFAULT_CONFIG = RouterConfig()
def load_config_from_dict(data: dict) -> RouterConfig:
    """Load configuration from dictionary (e.g., parsed YAML).

    Starts from a default RouterConfig and overlays whatever sections
    are present in *data*; missing sections keep their defaults.

    Args:
        data: Parsed configuration mapping (see the YAML example).

    Returns:
        A populated RouterConfig.
    """
    config = RouterConfig()
    # Update services. The example YAML nests `custom_endpoints` (a list)
    # inside `services:`; it must not be fed to ServiceConfig(**...) —
    # doing so raised TypeError ("argument after ** must be a mapping").
    for service_name, service_data in data.get('services', {}).items():
        if service_name == 'custom_endpoints':
            config.custom_endpoints = list(service_data)
        elif hasattr(config, service_name) and isinstance(service_data, dict):
            setattr(config, service_name, ServiceConfig(**service_data))
    # Update task routing
    for category in ['coding', 'reasoning', 'analysis', 'documentation']:
        if category in data.get('task_routing', {}):
            setattr(config, category, TaskRoutingConfig(**data['task_routing'][category]))
    # Update Serena flags (section present in the example YAML but was ignored)
    if 'serena' in data:
        config.serena_enabled = data['serena'].get('enabled', config.serena_enabled)
        config.serena_priority = data['serena'].get('priority', config.serena_priority)
    # Update context compaction (partial dicts are fine — dataclass defaults fill in)
    if 'context' in data:
        config.context = ContextConfig(**data['context'])
    # Update security
    if 'security' in data:
        config.security = SecurityConfig(**data['security'])
    return config
# local-llm-router.yaml
# Copy this to your project and customize
version: "1.0"
environment: "air-gapped"

# Local inference services; lower priority value = preferred.
services:
  ollama:
    enabled: true
    endpoint: "http://localhost:11434"
    priority: 1
    timeout: 30000  # presumably milliseconds — confirm against client
  lmstudio:
    enabled: true
    endpoint: "http://localhost:1234"
    priority: 2
  jan:
    enabled: false
    endpoint: "http://localhost:1337"
    priority: 3
  # Additional endpoints; priority 0 means tried before the built-ins.
  custom_endpoints:
    - name: "internal-gpu-server"
      endpoint: "http://192.168.1.100:8000"
      priority: 0
      api_style: "openai"

# Preferred and fallback models per task category.
task_routing:
  coding:
    primary_models:
      - "deepseek-v3"
      - "qwen2.5-coder-32b"
      - "deepseek-coder-v2"
    fallback_models:
      - "codellama-34b"
      - "qwen2.5-coder-14b"
      - "phi-4"
    min_context: 8192
  reasoning:
    primary_models:
      - "deepseek-r1"
      - "deepseek-v3"
      - "qwen2.5-72b-instruct"
    fallback_models:
      - "deepseek-r1-distill-32b"
      - "mistral-small-24b"
    min_context: 16384
  analysis:
    primary_models:
      - "deepseek-v3"
      - "qwen2.5-coder-32b"
    require_serena: true  # analysis needs semantic code context
  documentation:
    primary_models:
      - "qwen2.5-72b-instruct"
      - "llama-3.3-70b-instruct"
    fallback_models:
      - "mistral-nemo-12b"

# Serena MCP (semantic code intelligence) integration.
serena:
  enabled: true
  priority: "always_first"
  workspace: "${WORKSPACE_ROOT}"

# Conversation-context compaction (see ContextConfig for remaining defaults).
context:
  compaction_threshold: 0.8
  preserve_recent_messages: 10

# Network policy: local-only by default, with audit logging.
security:
  allow_external: false
  allowed_hosts:
    - "localhost"
    - "127.0.0.1"
    - "192.168.0.0/16"
  audit_enabled: true
  audit_log_path: "./llm-router-audit.log"
from enum import IntEnum
from dataclasses import dataclass
from typing import Optional, Any
class FallbackLevel(IntEnum):
    """Degradation level reached while executing a query.

    IntEnum so levels compare numerically — higher value means a more
    degraded execution path.
    """
    PRIMARY = 0           # primary model list (level 0 per executor comments)
    FALLBACK_MODELS = 1   # secondary model list
    REDUCED_CONTEXT = 2   # presumably a retry with trimmed context — confirm
    SMALLEST_MODEL = 3    # presumably last-resort smallest model — confirm
    FAILED = 4            # no level succeeded
@dataclass
class ExecutionResult:
    """Outcome of a routed query execution."""
    # Whether any model produced a response
    success: bool
    # Model that answered (None on failure)
    model: Optional[str] = None
    # Service that hosted the model (None on failure)
    service: Optional[str] = None
    # Raw response payload; shape depends on the backing service
    response: Any = None
    # How far down the fallback ladder execution had to go
    fallback_level: FallbackLevel = FallbackLevel.PRIMARY
    # Error description when success is False
    error: Optional[str] = None
class FallbackExecutor:
    """Execute queries with multi-level fallback."""

    def __init__(
        self,
        discovery: ServiceDiscovery,
        context_manager: ContextManager,
        config: RouterConfig
    ):
        """Store collaborators; no I/O happens here.

        Args:
            discovery: Locates available local LLM services.
            context_manager: Conversation-context handler
                (note: stored under the shorter name ``self.context``).
            config: Router configuration; per-category routing is read
                via ``getattr(config, category.value)``.
        """
        self.discovery = discovery
        self.context = context_manager
        self.config = config
async def execute_with_fallback(
self,
query: str,
category: TaskCategory
) -> ExecutionResult:
"""Execute query with fallback strategy."""
# Get model lists
task_config = getattr(self.config, category.value)
primary_models = task_config.primary_models
fallback_models = task_config.fallback_models
# Level 0: Try primary models
for model in primary_models:
result = await self._try_model(model, query)
React 组合模式指南:Vercel 组件架构最佳实践,提升代码可维护性
123,700 周安装
Browserbase:AI代理与自动化云端浏览器基础设施,支持Playwright/Puppeteer
56 周安装
子代理驱动开发:AI辅助编程新方法,通过独立子代理与两阶段审查提升代码质量
56 周安装
使用Google Gemini 3 Pro生成AI图像 - 支持自定义宽高比、分辨率与参考图
56 周安装
Home Assistant 配置完全指南:从 configuration.yaml 到包组织,掌握智能家居核心配置
56 周安装
Terraform Cloud Plan JSON 下载与分析工具 - 详细资源变更解析
56 周安装
iOS专家技能:SwiftUI开发最佳实践、项目结构与铁律指南(2024最新)
56 周安装
| Service | Endpoint | Health Check | Models Endpoint | Chat Endpoint | API Style |
|---------|----------|--------------|-----------------|---------------|-----------|
| Ollama | localhost:11434 | /api/version | /api/tags | /api/chat | Ollama |
| LM Studio | localhost:1234 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| Jan | localhost:1337 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| OpenWebUI | localhost:3000 | /api/health | /api/models | /api/chat | Custom |
| LocalAI | localhost:8080 | /readyz | /v1/models | /v1/chat/completions | OpenAI |
| vLLM | localhost:8000 | /health | /v1/models | /v1/chat/completions | OpenAI |
| llama.cpp | localhost:8080 | /health | /v1/models | /v1/chat/completions | OpenAI |
| Kobold.cpp | localhost:5001 | /api/v1/info | /api/v1/models | /api/v1/generate | Custom |
| GPT4All | localhost:4891 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| text-generation-webui | localhost:5000 | /api/v1/model | /api/v1/models | /api/v1/chat | Custom |