phoenix-observability by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill phoenix-observability

面向 LLM 应用的开源 AI 可观测性与评估平台,提供追踪、评估、数据集、实验和实时监控功能。
在以下场景使用 Phoenix:
核心特性:
请改用以下替代方案:
pip install arize-phoenix
# 安装特定后端
pip install arize-phoenix[embeddings] # 嵌入分析
pip install arize-phoenix-otel # OpenTelemetry 配置
pip install arize-phoenix-evals # 评估框架
pip install arize-phoenix-client # 轻量级 REST 客户端
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
import phoenix as px
# 在 notebook 中启动(ThreadServer 模式)
session = px.launch_app()
# 查看 UI
session.view() # 嵌入式 iframe
print(session.url) # http://localhost:6006
# 启动 Phoenix 服务器
phoenix serve
# 使用 PostgreSQL
export PHOENIX_SQL_DATABASE_URL="postgresql://user:pass@host/db"
phoenix serve --port 6006
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
# 使用 Phoenix 配置 OpenTelemetry
tracer_provider = register(
project_name="my-llm-app",
endpoint="http://localhost:6006/v1/traces"
)
# 插桩 OpenAI SDK
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
# 所有 OpenAI 调用现在都会被追踪
from openai import OpenAI
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello!"}]
)
**追踪**(trace)代表一个完整的执行流程,而**跨度**(span)是该追踪内的单个操作。
from phoenix.otel import register
from opentelemetry import trace
# 设置追踪
tracer_provider = register(project_name="my-app")
tracer = trace.get_tracer(__name__)
# 创建自定义跨度
with tracer.start_as_current_span("process_query") as span:
span.set_attribute("input.value", query)
# 子跨度会自动嵌套
with tracer.start_as_current_span("retrieve_context"):
context = retriever.search(query)
with tracer.start_as_current_span("generate_response"):
response = llm.generate(query, context)
span.set_attribute("output.value", response)
项目用于组织相关的追踪:
import os
os.environ["PHOENIX_PROJECT_NAME"] = "production-chatbot"
# 或者按追踪设置
from phoenix.otel import register
tracer_provider = register(project_name="experiment-v2")
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
tracer_provider = register()
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
from phoenix.otel import register
from openinference.instrumentation.langchain import LangChainInstrumentor
tracer_provider = register()
LangChainInstrumentor().instrument(tracer_provider=tracer_provider)
# 所有 LangChain 操作都会被追踪
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o")
response = llm.invoke("Hello!")
from phoenix.otel import register
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
tracer_provider = register()
LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)
from phoenix.otel import register
from openinference.instrumentation.anthropic import AnthropicInstrumentor
tracer_provider = register()
AnthropicInstrumentor().instrument(tracer_provider=tracer_provider)
from phoenix.evals import (
OpenAIModel,
HallucinationEvaluator,
RelevanceEvaluator,
ToxicityEvaluator,
llm_classify
)
# 设置评估模型
eval_model = OpenAIModel(model="gpt-4o")
# 评估幻觉
hallucination_eval = HallucinationEvaluator(eval_model)
results = hallucination_eval.evaluate(
input="What is the capital of France?",
output="The capital of France is Paris.",
reference="Paris is the capital of France."
)
from phoenix.evals import llm_classify
# 定义自定义评估
def evaluate_helpfulness(input_text, output_text):
template = """
评估该回答对于给定问题是否有帮助。
问题:{input}
回答:{output}
这个回答有帮助吗?回答 'helpful' 或 'not_helpful'。
"""
result = llm_classify(
model=eval_model,
template=template,
input=input_text,
output=output_text,
rails=["helpful", "not_helpful"]
)
return result
from phoenix import Client
from phoenix.evals import run_evals
client = Client()
# 获取待评估的跨度
spans_df = client.get_spans_dataframe(
project_name="my-app",
filter_condition="span_kind == 'LLM'"
)
# 运行评估
eval_results = run_evals(
dataframe=spans_df,
evaluators=[
HallucinationEvaluator(eval_model),
RelevanceEvaluator(eval_model)
],
provide_explanation=True
)
# 将结果记录回 Phoenix
client.log_evaluations(eval_results)
from phoenix import Client
client = Client()
# 创建数据集
dataset = client.create_dataset(
name="qa-test-set",
description="QA 评估数据集"
)
# 添加示例
client.add_examples_to_dataset(
dataset_name="qa-test-set",
examples=[
{
"input": {"question": "What is Python?"},
"output": {"answer": "A programming language"}
},
{
"input": {"question": "What is ML?"},
"output": {"answer": "Machine learning"}
}
]
)
from phoenix import Client
from phoenix.experiments import run_experiment
client = Client()
def my_model(input_data):
"""你的模型函数。"""
question = input_data["question"]
return {"answer": generate_answer(question)}
def accuracy_evaluator(input_data, output, expected):
"""自定义评估器。"""
return {
"score": 1.0 if expected["answer"].lower() in output["answer"].lower() else 0.0,
"label": "correct" if expected["answer"].lower() in output["answer"].lower() else "incorrect"
}
# 运行实验
results = run_experiment(
dataset_name="qa-test-set",
task=my_model,
evaluators=[accuracy_evaluator],
experiment_name="baseline-v1"
)
print(f"平均准确率:{results.aggregate_metrics['accuracy']}")
from phoenix import Client
client = Client(endpoint="http://localhost:6006")
# 获取跨度为 DataFrame
spans_df = client.get_spans_dataframe(
project_name="my-app",
filter_condition="span_kind == 'LLM'",
limit=1000
)
# 获取特定跨度
span = client.get_span(span_id="abc123")
# 获取追踪
trace = client.get_trace(trace_id="xyz789")
from phoenix import Client
client = Client()
# 记录用户反馈
client.log_annotation(
span_id="abc123",
name="user_rating",
annotator_kind="HUMAN",
score=0.8,
label="helpful",
metadata={"comment": "Good response"}
)
# 导出到 pandas
df = client.get_spans_dataframe(project_name="my-app")
# 导出追踪
traces = client.list_traces(project_name="my-app")
docker run -p 6006:6006 arizephoenix/phoenix:latest
# 设置数据库 URL
export PHOENIX_SQL_DATABASE_URL="postgresql://user:pass@host:5432/phoenix"
# 启动服务器
phoenix serve --host 0.0.0.0 --port 6006
| 变量 | 描述 | 默认值 |
|---|---|---|
| PHOENIX_PORT | HTTP 服务器端口 | 6006 |
| PHOENIX_HOST | 服务器绑定地址 | 127.0.0.1 |
| PHOENIX_GRPC_PORT | gRPC/OTLP 端口 | 4317 |
| PHOENIX_SQL_DATABASE_URL | 数据库连接 | SQLite 临时文件 |
| PHOENIX_WORKING_DIR | 数据存储目录 | 操作系统临时目录 |
| PHOENIX_ENABLE_AUTH | 启用身份验证 | false |
| PHOENIX_SECRET | JWT 签名密钥 | 启用身份验证时必需 |
export PHOENIX_ENABLE_AUTH=true
export PHOENIX_SECRET="your-secret-key-min-32-chars"
export PHOENIX_ADMIN_SECRET="admin-bootstrap-token"
phoenix serve
追踪未显示:
from phoenix.otel import register
# 验证端点
tracer_provider = register(
project_name="my-app",
endpoint="http://localhost:6006/v1/traces" # 正确的端点
)
# 强制刷新
from opentelemetry import trace
trace.get_tracer_provider().force_flush()
Notebook 中内存占用高:
# 完成后关闭会话
session = px.launch_app()
# ... 执行工作 ...
session.close()
px.close_app()
数据库连接问题:
# 验证 PostgreSQL 连接
psql $PHOENIX_SQL_DATABASE_URL -c "SELECT 1"
# 检查 Phoenix 日志
phoenix serve --log-level debug
每周安装量
142
代码仓库
GitHub 星标数
22.6K
首次出现
2026年1月21日
安全审计
安装于
claude-code: 117
opencode: 113
gemini-cli: 107
cursor: 104
codex: 93
antigravity: 91
Open-source AI observability and evaluation platform for LLM applications with tracing, evaluation, datasets, experiments, and real-time monitoring.
Use Phoenix when:
Key features:
Use alternatives instead:
pip install arize-phoenix
# With specific backends
pip install arize-phoenix[embeddings] # Embedding analysis
pip install arize-phoenix-otel # OpenTelemetry config
pip install arize-phoenix-evals # Evaluation framework
pip install arize-phoenix-client # Lightweight REST client
import phoenix as px
# Launch in notebook (ThreadServer mode)
session = px.launch_app()
# View UI
session.view() # Embedded iframe
print(session.url) # http://localhost:6006
# Start Phoenix server
phoenix serve
# With PostgreSQL
export PHOENIX_SQL_DATABASE_URL="postgresql://user:pass@host/db"
phoenix serve --port 6006
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
# Configure OpenTelemetry with Phoenix
tracer_provider = register(
project_name="my-llm-app",
endpoint="http://localhost:6006/v1/traces"
)
# Instrument OpenAI SDK
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
# All OpenAI calls are now traced
from openai import OpenAI
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello!"}]
)
A **trace** represents a complete execution flow, while **spans** are individual operations within that trace.
from phoenix.otel import register
from opentelemetry import trace
# Setup tracing
tracer_provider = register(project_name="my-app")
tracer = trace.get_tracer(__name__)
# Create custom spans
with tracer.start_as_current_span("process_query") as span:
span.set_attribute("input.value", query)
# Child spans are automatically nested
with tracer.start_as_current_span("retrieve_context"):
context = retriever.search(query)
with tracer.start_as_current_span("generate_response"):
response = llm.generate(query, context)
span.set_attribute("output.value", response)
Projects organize related traces:
import os
os.environ["PHOENIX_PROJECT_NAME"] = "production-chatbot"
# Or per-trace
from phoenix.otel import register
tracer_provider = register(project_name="experiment-v2")
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
tracer_provider = register()
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
from phoenix.otel import register
from openinference.instrumentation.langchain import LangChainInstrumentor
tracer_provider = register()
LangChainInstrumentor().instrument(tracer_provider=tracer_provider)
# All LangChain operations traced
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o")
response = llm.invoke("Hello!")
from phoenix.otel import register
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
tracer_provider = register()
LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)
from phoenix.otel import register
from openinference.instrumentation.anthropic import AnthropicInstrumentor
tracer_provider = register()
AnthropicInstrumentor().instrument(tracer_provider=tracer_provider)
from phoenix.evals import (
OpenAIModel,
HallucinationEvaluator,
RelevanceEvaluator,
ToxicityEvaluator,
llm_classify
)
# Setup model for evaluation
eval_model = OpenAIModel(model="gpt-4o")
# Evaluate hallucination
hallucination_eval = HallucinationEvaluator(eval_model)
results = hallucination_eval.evaluate(
input="What is the capital of France?",
output="The capital of France is Paris.",
reference="Paris is the capital of France."
)
from phoenix.evals import llm_classify
# Define custom evaluation
def evaluate_helpfulness(input_text, output_text):
template = """
Evaluate if the response is helpful for the given question.
Question: {input}
Response: {output}
Is this response helpful? Answer 'helpful' or 'not_helpful'.
"""
result = llm_classify(
model=eval_model,
template=template,
input=input_text,
output=output_text,
rails=["helpful", "not_helpful"]
)
return result
from phoenix import Client
from phoenix.evals import run_evals
client = Client()
# Get spans to evaluate
spans_df = client.get_spans_dataframe(
project_name="my-app",
filter_condition="span_kind == 'LLM'"
)
# Run evaluations
eval_results = run_evals(
dataframe=spans_df,
evaluators=[
HallucinationEvaluator(eval_model),
RelevanceEvaluator(eval_model)
],
provide_explanation=True
)
# Log results back to Phoenix
client.log_evaluations(eval_results)
from phoenix import Client
client = Client()
# Create dataset
dataset = client.create_dataset(
name="qa-test-set",
description="QA evaluation dataset"
)
# Add examples
client.add_examples_to_dataset(
dataset_name="qa-test-set",
examples=[
{
"input": {"question": "What is Python?"},
"output": {"answer": "A programming language"}
},
{
"input": {"question": "What is ML?"},
"output": {"answer": "Machine learning"}
}
]
)
from phoenix import Client
from phoenix.experiments import run_experiment
client = Client()
def my_model(input_data):
"""Your model function."""
question = input_data["question"]
return {"answer": generate_answer(question)}
def accuracy_evaluator(input_data, output, expected):
"""Custom evaluator."""
return {
"score": 1.0 if expected["answer"].lower() in output["answer"].lower() else 0.0,
"label": "correct" if expected["answer"].lower() in output["answer"].lower() else "incorrect"
}
# Run experiment
results = run_experiment(
dataset_name="qa-test-set",
task=my_model,
evaluators=[accuracy_evaluator],
experiment_name="baseline-v1"
)
print(f"Average accuracy: {results.aggregate_metrics['accuracy']}")
from phoenix import Client
client = Client(endpoint="http://localhost:6006")
# Get spans as DataFrame
spans_df = client.get_spans_dataframe(
project_name="my-app",
filter_condition="span_kind == 'LLM'",
limit=1000
)
# Get specific span
span = client.get_span(span_id="abc123")
# Get trace
trace = client.get_trace(trace_id="xyz789")
from phoenix import Client
client = Client()
# Log user feedback
client.log_annotation(
span_id="abc123",
name="user_rating",
annotator_kind="HUMAN",
score=0.8,
label="helpful",
metadata={"comment": "Good response"}
)
# Export to pandas
df = client.get_spans_dataframe(project_name="my-app")
# Export traces
traces = client.list_traces(project_name="my-app")
docker run -p 6006:6006 arizephoenix/phoenix:latest
# Set database URL
export PHOENIX_SQL_DATABASE_URL="postgresql://user:pass@host:5432/phoenix"
# Start server
phoenix serve --host 0.0.0.0 --port 6006
| Variable | Description | Default |
|---|---|---|
| PHOENIX_PORT | HTTP server port | 6006 |
| PHOENIX_HOST | Server bind address | 127.0.0.1 |
| PHOENIX_GRPC_PORT | gRPC/OTLP port | 4317 |
| PHOENIX_SQL_DATABASE_URL | Database connection | SQLite temp |
export PHOENIX_ENABLE_AUTH=true
export PHOENIX_SECRET="your-secret-key-min-32-chars"
export PHOENIX_ADMIN_SECRET="admin-bootstrap-token"
phoenix serve
Traces not appearing:
from phoenix.otel import register
# Verify endpoint
tracer_provider = register(
project_name="my-app",
endpoint="http://localhost:6006/v1/traces" # Correct endpoint
)
# Force flush
from opentelemetry import trace
trace.get_tracer_provider().force_flush()
High memory in notebook:
# Close session when done
session = px.launch_app()
# ... do work ...
session.close()
px.close_app()
Database connection issues:
# Verify PostgreSQL connection
psql $PHOENIX_SQL_DATABASE_URL -c "SELECT 1"
# Check Phoenix logs
phoenix serve --log-level debug
Weekly Installs
142
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Fail · Snyk: Pass
Installed on
claude-code: 117
opencode: 113
gemini-cli: 107
cursor: 104
codex: 93
antigravity: 91
| PHOENIX_WORKING_DIR | Data storage directory | OS temp |
| PHOENIX_ENABLE_AUTH | Enable authentication | false |
| PHOENIX_SECRET | JWT signing secret | Required if auth enabled |