sglang by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill sglang
用于 LLM 和 VLM 的高性能服务框架,具备 RadixAttention 以实现自动前缀缓存。
在以下情况使用 SGLang:
在以下情况使用 vLLM:
在以下情况使用 TensorRT-LLM:
# pip 安装(推荐)
pip install "sglang[all]"
# 使用 FlashInfer(更快,CUDA 11.8/12.1)
pip install sglang[all] flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
# 从源码安装
git clone https://github.com/sgl-project/sglang.git
cd sglang
pip install -e "python[all]"
# 基础服务器(Llama 3-8B)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-8B-Instruct \
--port 30000
# 启用 RadixAttention(自动前缀缓存)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-8B-Instruct \
--port 30000 \
--enable-radix-cache # 默认:启用
# 多 GPU(张量并行)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-70B-Instruct \
--tp 4 \
--port 30000
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
import sglang as sgl
# 设置后端
sgl.set_default_backend(sgl.OpenAI("http://localhost:30000/v1"))
# 简单生成
@sgl.function
def simple_gen(s, question):
    """Answer a question with a plain Q/A prompt template."""
    # Build the prompt, then let the model fill the answer slot.
    s += f"Q: {question}\n"
    s += "A:" + sgl.gen("answer", max_tokens=100)
# 运行
state = simple_gen.run(question="What is the capital of France?")
print(state["answer"])
# 输出:"The capital of France is Paris."
import sglang as sgl
@sgl.function
def extract_person(s, text):
    """Extract person info from *text* as regex-constrained JSON."""
    s += f"Extract person information from: {text}\n"
    s += "Output JSON:\n"
    # The regex forces the output into a single JSON object with
    # exactly the name/age/occupation fields.
    s += sgl.gen(
        "json_output",
        max_tokens=200,
        regex=r'\{"name": "[^"]+", "age": \d+, "occupation": "[^"]+"\}',
    )
# 运行
state = extract_person.run(
text="John Smith is a 35-year-old software engineer."
)
print(state["json_output"])
# 输出:{"name": "John Smith", "age": 35, "occupation": "software engineer"}
功能:自动缓存并跨请求重用公共前缀。
性能:
工作原理:
示例(具有系统提示词的智能体):
Request 1: [SYSTEM_PROMPT] + "What's the weather?"
→ 计算完整提示词(1000 个令牌)
Request 2: [SAME_SYSTEM_PROMPT] + "Book a flight"
→ 重用系统提示词 KV 缓存(998 个令牌)
→ 仅计算 2 个新令牌
→ 快 5 倍!
@sgl.function
def structured_extraction(s, article):
    """Summarize *article* into JSON constrained by a JSON schema."""
    s += f"Article: {article}\n\n"
    s += "Extract key information as JSON:\n"
    # Schema: four required string fields; sentiment is a closed enum.
    schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "author": {"type": "string"},
            "summary": {"type": "string"},
            "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
        },
        "required": ["title", "author", "summary", "sentiment"],
    }
    s += sgl.gen("info", max_tokens=300, json_schema=schema)
state = structured_extraction.run(article="...")
print(state["info"])
# 输出:符合模式的有效 JSON
@sgl.function
def extract_email(s, text):
    """Pull an email address out of *text*, regex-constrained."""
    s += f"Extract email from: {text}\n"
    s += "Email: "
    # Constrain generation to a syntactically valid email address.
    s += sgl.gen(
        "email",
        max_tokens=50,
        regex=r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
    )
state = extract_email.run(text="Contact john.doe@example.com for details")
print(state["email"])
# 输出:"john.doe@example.com"
@sgl.function
def generate_code(s, description):
    """Generate Python code for *description*, constrained by an EBNF grammar."""
    s += f"Generate Python code for: {description}\n"
    s += "```python\n"
    # Minimal Lark-style grammar: the output must be a single `def`.
    # NOTE: kept byte-identical — the grammar text is consumed at runtime.
    python_grammar = """
?start: function_def
function_def: "def" NAME "(" [parameters] "):" suite
parameters: parameter ("," parameter)*
parameter: NAME
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
"""
    s += sgl.gen("code", max_tokens=200, grammar=python_grammar)
    s += "\n```"
import sglang as sgl
# 定义工具
tools = [
{
"name": "get_weather",
"description": "Get weather for a location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string"}
}
}
},
{
"name": "book_flight",
"description": "Book a flight",
"parameters": {
"type": "object",
"properties": {
"from": {"type": "string"},
"to": {"type": "string"},
"date": {"type": "string"}
}
}
}
]
@sgl.function
def agent_workflow(s, user_query, tools):
    """Run one agent turn: shared system prompt plus tool-enabled generation."""
    # The constant prefix below is cached by RadixAttention across calls.
    s += "You are a helpful assistant with access to tools.\n"
    s += f"Available tools: {tools}\n\n"
    s += f"User: {user_query}\n"
    s += "Assistant: "
    # SGLang formats the tool/function-call output; stop before the next turn.
    s += sgl.gen(
        "response",
        max_tokens=200,
        tools=tools,
        stop=["User:", "\n\n"],
    )
# 多个查询重用系统提示词
state1 = agent_workflow.run(
user_query="What's the weather in NYC?",
tools=tools
)
# 第一次调用:计算完整的系统提示词
state2 = agent_workflow.run(
user_query="Book a flight to LA",
tools=tools
)
# 第二次调用:重用系统提示词(快 5 倍)
少样本提示(提示词中包含 10 个示例):
智能体工作流(1000 令牌的系统提示词):
JSON 解码:
| 工作负载 | vLLM | SGLang | 加速比 |
|---|---|---|---|
| 简单生成 | 2500 令牌/秒 | 2800 令牌/秒 | 1.12× |
| 少样本(10 个示例) | 500 令牌/秒 | 5000 令牌/秒 | 10× |
| 智能体(工具调用) | 800 令牌/秒 | 4000 令牌/秒 | 5× |
| JSON 输出 | 600 令牌/秒 | 2400 令牌/秒 | 4× |
@sgl.function
def multi_turn_chat(s, history, new_message):
    """Continue a chat; the growing prefix is reused via the KV cache."""
    s += "You are a helpful AI assistant.\n\n"
    # Replay earlier turns verbatim so their cached KV entries match.
    for msg in history:
        s += f"{msg['role']}: {msg['content']}\n"
    # Only the new message (and the reply) require fresh computation.
    s += f"User: {new_message}\n"
    s += "Assistant: "
    s += sgl.gen("response", max_tokens=200)
# 第 1 轮
history = []
state = multi_turn_chat.run(history=history, new_message="Hi there!")
history.append({"role": "User", "content": "Hi there!"})
history.append({"role": "Assistant", "content": state["response"]})
# 第 2 轮(重用第 1 轮的 KV 缓存)
state = multi_turn_chat.run(history=history, new_message="What's 2+2?")
# 仅计算新消息(快得多!)
# 第 3 轮(重用第 1 轮 + 第 2 轮的 KV 缓存)
state = multi_turn_chat.run(history=history, new_message="Tell me a joke")
# 随着历史记录增长,速度逐渐加快
# 使用草稿模型启动(快 2-3 倍)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-70B-Instruct \
--speculative-model meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-num-steps 5
@sgl.function
def describe_image(s, image_path):
    """Describe the image at *image_path* with a vision-language model."""
    s += sgl.image(image_path)
    s += "Describe this image in detail: "
    s += sgl.gen("description", max_tokens=200)
state = describe_image.run(image_path="photo.jpg")
print(state["description"])
# 自动批处理(连续批处理)
states = sgl.run_batch(
[
simple_gen.bind(question="What is AI?"),
simple_gen.bind(question="What is ML?"),
simple_gen.bind(question="What is DL?"),
]
)
# 所有 3 个请求在单个批次中处理(高效)
# 使用 OpenAI API 启动服务器
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-8B-Instruct \
--port 30000
# 使用 OpenAI 客户端
curl http://localhost:30000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "default",
"messages": [
{"role": "system", "content": "You are helpful"},
{"role": "user", "content": "Hello"}
],
"temperature": 0.7,
"max_tokens": 100
}'
# 与 OpenAI Python SDK 兼容
from openai import OpenAI
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
model="default",
messages=[{"role": "user", "content": "Hello"}]
)
文本模型:
视觉模型:
来自 HuggingFace 的 100+ 模型
NVIDIA:A100, H100, L4, T4 (CUDA 11.8+) AMD:MI300, MI250 (ROCm 6.0+) Intel:带 GPU 的 Xeon(即将推出) Apple:通过 MPS 的 M1/M2/M3(实验性)
每周安装量
151
仓库
GitHub 星标数
22.6K
首次出现
2026 年 1 月 21 日
安全审计
安装于
claude-code: 123
opencode: 118
gemini-cli: 111
cursor: 108
codex: 100
antigravity: 96
High-performance serving framework for LLMs and VLMs with RadixAttention for automatic prefix caching.
Use SGLang when:
Use vLLM instead when:
Use TensorRT-LLM instead when:
# pip install (recommended)
pip install "sglang[all]"
# With FlashInfer (faster, CUDA 11.8/12.1)
pip install sglang[all] flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
# From source
git clone https://github.com/sgl-project/sglang.git
cd sglang
pip install -e "python[all]"
# Basic server (Llama 3-8B)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-8B-Instruct \
--port 30000
# With RadixAttention (automatic prefix caching)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-8B-Instruct \
--port 30000 \
--enable-radix-cache # Default: enabled
# Multi-GPU (tensor parallelism)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-70B-Instruct \
--tp 4 \
--port 30000
import sglang as sgl
# Set backend
sgl.set_default_backend(sgl.OpenAI("http://localhost:30000/v1"))
# Simple generation
@sgl.function
def simple_gen(s, question):
    """Generate an answer for *question* using a Q/A template."""
    # Prompt first, then the model completes the answer slot.
    s += f"Q: {question}\n"
    s += "A:" + sgl.gen("answer", max_tokens=100)
# Run
state = simple_gen.run(question="What is the capital of France?")
print(state["answer"])
# Output: "The capital of France is Paris."
import sglang as sgl
@sgl.function
def extract_person(s, text):
    """Extract a person record from *text* as regex-constrained JSON."""
    s += f"Extract person information from: {text}\n"
    s += "Output JSON:\n"
    # Regex constraint pins the output to one JSON object with
    # name (string), age (integer), occupation (string).
    s += sgl.gen(
        "json_output",
        max_tokens=200,
        regex=r'\{"name": "[^"]+", "age": \d+, "occupation": "[^"]+"\}',
    )
# Run
state = extract_person.run(
text="John Smith is a 35-year-old software engineer."
)
print(state["json_output"])
# Output: {"name": "John Smith", "age": 35, "occupation": "software engineer"}
What it does : Automatically caches and reuses common prefixes across requests.
Performance :
How it works :
Example (Agent with system prompt):
Request 1: [SYSTEM_PROMPT] + "What's the weather?"
→ Computes full prompt (1000 tokens)
Request 2: [SAME_SYSTEM_PROMPT] + "Book a flight"
→ Reuses system prompt KV cache (998 tokens)
→ Only computes 2 new tokens
→ 5× faster!
@sgl.function
def structured_extraction(s, article):
    """Extract key facts from *article* as schema-valid JSON."""
    s += f"Article: {article}\n\n"
    s += "Extract key information as JSON:\n"
    # All four properties are required; sentiment is limited to a
    # three-value enum so downstream parsing is trivial.
    schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "author": {"type": "string"},
            "summary": {"type": "string"},
            "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
        },
        "required": ["title", "author", "summary", "sentiment"],
    }
    s += sgl.gen("info", max_tokens=300, json_schema=schema)
state = structured_extraction.run(article="...")
print(state["info"])
# Output: Valid JSON matching schema
@sgl.function
def extract_email(s, text):
    """Extract an email address from *text* under a regex constraint."""
    s += f"Extract email from: {text}\n"
    s += "Email: "
    # Generation may only emit characters matching a valid address.
    s += sgl.gen(
        "email",
        max_tokens=50,
        regex=r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
    )
state = extract_email.run(text="Contact john.doe@example.com for details")
print(state["email"])
# Output: "john.doe@example.com"
@sgl.function
def generate_code(s, description):
    """Emit a fenced Python snippet for *description*, grammar-constrained."""
    s += f"Generate Python code for: {description}\n"
    s += "```python\n"
    # Lark-style EBNF restricting output to one function definition.
    # Grammar text is runtime data — reproduced byte-for-byte.
    python_grammar = """
?start: function_def
function_def: "def" NAME "(" [parameters] "):" suite
parameters: parameter ("," parameter)*
parameter: NAME
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
"""
    s += sgl.gen("code", max_tokens=200, grammar=python_grammar)
    s += "\n```"
import sglang as sgl
# Define tools
tools = [
{
"name": "get_weather",
"description": "Get weather for a location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string"}
}
}
},
{
"name": "book_flight",
"description": "Book a flight",
"parameters": {
"type": "object",
"properties": {
"from": {"type": "string"},
"to": {"type": "string"},
"date": {"type": "string"}
}
}
}
]
@sgl.function
def agent_workflow(s, user_query, tools):
    """One tool-using agent turn; the fixed prefix is prefix-cache friendly."""
    # Identical system prompt across calls → RadixAttention reuses its KV cache.
    s += "You are a helpful assistant with access to tools.\n"
    s += f"Available tools: {tools}\n\n"
    s += f"User: {user_query}\n"
    s += "Assistant: "
    # `tools=` lets SGLang handle the function-call format; stop tokens
    # cut generation off before a new user turn starts.
    s += sgl.gen(
        "response",
        max_tokens=200,
        tools=tools,
        stop=["User:", "\n\n"],
    )
# Multiple queries reuse system prompt
state1 = agent_workflow.run(
user_query="What's the weather in NYC?",
tools=tools
)
# First call: Computes full system prompt
state2 = agent_workflow.run(
user_query="Book a flight to LA",
tools=tools
)
# Second call: Reuses system prompt (5× faster)
Few-shot prompting (10 examples in prompt):
Agent workflows (1000-token system prompt):
JSON decoding :
| Workload | vLLM | SGLang | Speedup |
|---|---|---|---|
| Simple generation | 2500 tok/s | 2800 tok/s | 1.12× |
| Few-shot (10 examples) | 500 tok/s | 5000 tok/s | 10× |
| Agent (tool calls) | 800 tok/s | 4000 tok/s | 5× |
| JSON output | 600 tok/s | 2400 tok/s | 4× |
@sgl.function
def multi_turn_chat(s, history, new_message):
    """Append a turn to a running conversation (prefix-cache friendly)."""
    s += "You are a helpful AI assistant.\n\n"
    # Prior turns are replayed verbatim so cached prefixes stay valid.
    for msg in history:
        s += f"{msg['role']}: {msg['content']}\n"
    # Only this new tail is genuinely new work for the server.
    s += f"User: {new_message}\n"
    s += "Assistant: "
    s += sgl.gen("response", max_tokens=200)
# Turn 1
history = []
state = multi_turn_chat.run(history=history, new_message="Hi there!")
history.append({"role": "User", "content": "Hi there!"})
history.append({"role": "Assistant", "content": state["response"]})
# Turn 2 (reuses Turn 1 KV cache)
state = multi_turn_chat.run(history=history, new_message="What's 2+2?")
# Only computes new message (much faster!)
# Turn 3 (reuses Turn 1 + Turn 2 KV cache)
state = multi_turn_chat.run(history=history, new_message="Tell me a joke")
# Progressively faster as history grows
# Launch with draft model (2-3× faster)
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-70B-Instruct \
--speculative-model meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-num-steps 5
@sgl.function
def describe_image(s, image_path):
    """Generate a detailed description of the image at *image_path*."""
    s += sgl.image(image_path)
    s += "Describe this image in detail: "
    s += sgl.gen("description", max_tokens=200)
state = describe_image.run(image_path="photo.jpg")
print(state["description"])
# Automatic batching (continuous batching)
states = sgl.run_batch(
[
simple_gen.bind(question="What is AI?"),
simple_gen.bind(question="What is ML?"),
simple_gen.bind(question="What is DL?"),
]
)
# All 3 processed in single batch (efficient)
# Start server with OpenAI API
python -m sglang.launch_server \
--model-path meta-llama/Meta-Llama-3-8B-Instruct \
--port 30000
# Use with OpenAI client
curl http://localhost:30000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "default",
"messages": [
{"role": "system", "content": "You are helpful"},
{"role": "user", "content": "Hello"}
],
"temperature": 0.7,
"max_tokens": 100
}'
# Works with OpenAI Python SDK
from openai import OpenAI
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
model="default",
messages=[{"role": "user", "content": "Hello"}]
)
Text models :
Vision models :
100+ models from HuggingFace
NVIDIA : A100, H100, L4, T4 (CUDA 11.8+) AMD : MI300, MI250 (ROCm 6.0+) Intel : Xeon with GPU (coming soon) Apple : M1/M2/M3 via MPS (experimental)
Weekly Installs
151
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
claude-code: 123
opencode: 118
gemini-cli: 111
cursor: 108
codex: 100
antigravity: 96
React 组合模式指南:Vercel 组件架构最佳实践,提升代码可维护性
118,000 周安装
Vite 8 高级配置指南:基于Rolldown的性能优化、环境API与构建策略
136 周安装
Ant Design React 组件库使用指南 - 快速构建企业级React应用UI
118 周安装
Nano Banana Pro:基于Gemini 3 Pro的AI图像生成与编辑工具,支持1K/2K/4K分辨率
117 周安装
Godot 4 GDScript 设计模式与最佳实践 | 游戏开发架构、信号、场景优化指南
149 周安装
Agent Builder:AI智能体构建框架,简化客户服务、研究、运营等领域的AI应用开发
120 周安装
Tailwind CSS UI重构指南:基于《重构UI》的52条最佳实践与代码规范
140 周安装