VLM by answerzhao/agent-skills
npx skills add https://github.com/answerzhao/agent-skills --skill VLM

此技能指导使用 z-ai-web-dev-sdk 包实现视觉聊天功能,使 AI 模型能够理解并结合文本提示对图像做出响应。
技能位置 : {project_path}/skills/VLM
此技能位于您项目的上述路径中。
参考脚本 : 示例测试脚本位于 {技能位置}/scripts/ 目录中,用于快速测试和参考。请参阅 {技能位置}/scripts/vlm.ts 获取工作示例。
视觉聊天允许您构建能够分析图像、从视觉内容中提取信息,并通过自然语言对话回答有关图像问题的应用程序。
重要提示 : z-ai-web-dev-sdk 必须仅在后端代码中使用。切勿在客户端代码中使用它。
z-ai-web-dev-sdk 包已安装。请按照以下示例所示导入它。
对于简单的图像分析任务,您可以使用 z-ai CLI 而无需编写代码。这非常适合快速图像描述、测试视觉能力或简单自动化。
# 描述来自 URL 的图像
z-ai vision --prompt "What's in this image?" --image "https://example.com/photo.jpg"
# 使用短选项
z-ai vision -p "Describe this image" -i "https://example.com/image.png"
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
# 分析本地图像文件
z-ai vision -p "What objects are in this photo?" -i "./photo.jpg"
# 将响应保存到文件
z-ai vision -p "Describe the scene" -i "./landscape.png" -o description.json
# 同时分析多张图像
z-ai vision \
-p "Compare these two images" \
-i "./photo1.jpg" \
-i "./photo2.jpg" \
-o comparison.json
# 多张图像的详细分析
z-ai vision \
--prompt "What are the differences between these images?" \
--image "https://example.com/before.jpg" \
--image "https://example.com/after.jpg"
# 为复杂的视觉推理启用思考
z-ai vision \
-p "Count the number of people in this image and describe their activities" \
-i "./crowd.jpg" \
--thinking \
-o analysis.json
# 流式传输视觉分析结果
z-ai vision -p "Describe this image in detail" -i "./photo.jpg" --stream
--prompt, -p <text>: 必需 - 关于图像的提问或指令
--image, -i <URL 或路径>: 可选 - 图像 URL 或本地文件路径(可多次使用)
--thinking, -t: 可选 - 启用思维链推理(默认:禁用)
--output, -o <path>: 可选 - 输出文件路径(JSON 格式)
--stream: 可选 - 实时流式传输响应

使用 CLI 适用于:
使用 SDK 适用于:
为了获得更好的性能和可靠性,请使用 base64 编码将图像传递给模型,而不是使用图像 URL。
Vision Chat API 支持三种类型的媒体内容:
将此类型用于静态图像(PNG、JPEG、GIF、WebP 等)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}
将此类型用于视频内容(MP4、AVI、MOV 等)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'video_url', video_url: { url: videoUrl } }
]
}
将此类型用于文档文件(PDF、DOCX、TXT 等)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'file_url', file_url: { url: fileUrl } }
]
}
注意 : 您可以在单条消息中组合多种内容类型。例如,您可以同时包含文本和多张图像,或者包含文本以及图像和文档。
import ZAI from 'z-ai-web-dev-sdk';
async function analyzeImage(imageUrl, question) {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: question
},
{
type: 'image_url',
image_url: {
url: imageUrl
}
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
// 用法
const result = await analyzeImage(
'https://example.com/product.jpg',
'Describe this product in detail'
);
console.log('Analysis:', result);
import ZAI from 'z-ai-web-dev-sdk';
async function compareImages(imageUrls, question) {
const zai = await ZAI.create();
const content = [
{
type: 'text',
text: question
},
...imageUrls.map(url => ({
type: 'image_url',
image_url: { url }
}))
];
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: content
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
// 用法
const comparison = await compareImages(
[
'https://example.com/before.jpg',
'https://example.com/after.jpg'
],
'Compare these two images and describe the differences'
);
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
async function analyzeLocalImage(imagePath, question) {
const zai = await ZAI.create();
// 读取图像文件并转换为 base64
const imageBuffer = fs.readFileSync(imagePath);
const base64Image = imageBuffer.toString('base64');
const mimeType = imagePath.endsWith('.png') ? 'image/png' : 'image/jpeg';
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: question
},
{
type: 'image_url',
image_url: {
url: `data:${mimeType};base64,${base64Image}`
}
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
import ZAI from 'z-ai-web-dev-sdk';
class VisionChatSession {
constructor() {
this.messages = [];
}
async initialize() {
this.zai = await ZAI.create();
}
async addImage(imageUrl, initialQuestion) {
this.messages.push({
role: 'user',
content: [
{
type: 'text',
text: initialQuestion
},
{
type: 'image_url',
image_url: { url: imageUrl }
}
]
});
return this.getResponse();
}
async followUp(question) {
this.messages.push({
role: 'user',
content: [
{
type: 'text',
text: question
}
]
});
return this.getResponse();
}
async getResponse() {
const response = await this.zai.chat.completions.createVision({
messages: this.messages,
thinking: { type: 'disabled' }
});
const assistantMessage = response.choices[0]?.message?.content;
this.messages.push({
role: 'assistant',
content: assistantMessage
});
return assistantMessage;
}
}
// 用法
const session = new VisionChatSession();
await session.initialize();
const initial = await session.addImage(
'https://example.com/chart.jpg',
'What does this chart show?'
);
console.log('Initial analysis:', initial);
const followup = await session.followUp('What are the key trends?');
console.log('Follow-up:', followup);
import ZAI from 'z-ai-web-dev-sdk';
async function classifyImage(imageUrl) {
const zai = await ZAI.create();
const prompt = `Analyze this image and provide:
1. Main subject/category
2. Key objects detected
3. Scene description
4. Suggested tags (comma-separated)
Format your response as JSON.`;
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: prompt
},
{
type: 'image_url',
image_url: { url: imageUrl }
}
]
}
],
thinking: { type: 'disabled' }
});
const content = response.choices[0]?.message?.content;
try {
return JSON.parse(content);
} catch (e) {
return { rawResponse: content };
}
}
import ZAI from 'z-ai-web-dev-sdk';
async function extractText(imageUrl) {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: 'Extract all text from this image. Preserve the layout and formatting as much as possible.'
},
{
type: 'image_url',
image_url: { url: imageUrl }
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
async function safeVisionChat(imageUrl, question) {
try {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}
],
thinking: { type: 'disabled' }
});
return {
success: true,
content: response.choices[0]?.message?.content
};
} catch (error) {
console.error('Vision chat error:', error);
return {
success: false,
error: error.message
};
}
}
import express from 'express';
import ZAI from 'z-ai-web-dev-sdk';
const app = express();
app.use(express.json());
let zaiInstance;
// 初始化 SDK(仅一次)
async function initZAI() {
zaiInstance = await ZAI.create();
}
app.post('/api/analyze-image', async (req, res) => {
try {
const { imageUrl, question } = req.body;
if (!imageUrl || !question) {
return res.status(400).json({
error: 'imageUrl and question are required'
});
}
const response = await zaiInstance.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}
],
thinking: { type: 'disabled' }
});
res.json({
success: true,
analysis: response.choices[0]?.message?.content
});
} catch (error) {
res.status(500).json({
success: false,
error: error.message
});
}
});
initZAI().then(() => {
app.listen(3000, () => {
console.log('Vision chat API running on port 3000');
});
});
问题 : "SDK 必须用于后端"
问题 : 图像未加载或未被分析
问题 : 分析质量差
问题 : 响应时间慢
每周安装量
104
代码仓库
GitHub 星标数
24
首次出现
2026年1月23日
安全审计
安装于
opencode93
codex86
gemini-cli86
cursor85
github-copilot80
kimi-cli77
This skill guides the implementation of vision chat functionality using the z-ai-web-dev-sdk package, enabling AI models to understand and respond to images combined with text prompts.
Skill Location : {project_path}/skills/VLM
This skill is located at the above path in your project.
Reference Scripts : Example test scripts are available in the {Skill Location}/scripts/ directory for quick testing and reference. See {Skill Location}/scripts/vlm.ts for a working example.
Vision Chat allows you to build applications that can analyze images, extract information from visual content, and answer questions about images through natural language conversation.
IMPORTANT : z-ai-web-dev-sdk MUST be used in backend code only. Never use it in client-side code.
The z-ai-web-dev-sdk package is already installed. Import it as shown in the examples below.
For simple image analysis tasks, you can use the z-ai CLI instead of writing code. This is ideal for quick image descriptions, testing vision capabilities, or simple automation.
# Describe an image from URL
z-ai vision --prompt "What's in this image?" --image "https://example.com/photo.jpg"
# Using short options
z-ai vision -p "Describe this image" -i "https://example.com/image.png"
# Analyze a local image file
z-ai vision -p "What objects are in this photo?" -i "./photo.jpg"
# Save response to file
z-ai vision -p "Describe the scene" -i "./landscape.png" -o description.json
# Analyze multiple images at once
z-ai vision \
-p "Compare these two images" \
-i "./photo1.jpg" \
-i "./photo2.jpg" \
-o comparison.json
# Multiple images with detailed analysis
z-ai vision \
--prompt "What are the differences between these images?" \
--image "https://example.com/before.jpg" \
--image "https://example.com/after.jpg"
# Enable thinking for complex visual reasoning
z-ai vision \
-p "Count the number of people in this image and describe their activities" \
-i "./crowd.jpg" \
--thinking \
-o analysis.json
# Stream the vision analysis
z-ai vision -p "Describe this image in detail" -i "./photo.jpg" --stream
--prompt, -p <text>: Required - Question or instruction about the image(s)
--image, -i <URL or path>: Optional - Image URL or local file path (can be used multiple times)
--thinking, -t: Optional - Enable chain-of-thought reasoning (default: disabled)
--output, -o <path>: Optional - Output file path (JSON format)
--stream: Optional - Stream the response in real-time

Use CLI for:
Use SDK for:
For better performance and reliability, use base64 encoding to pass images to the model instead of image URLs.
The Vision Chat API supports three types of media content:
Use this type for static images (PNG, JPEG, GIF, WebP, etc.)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}
Use this type for video content (MP4, AVI, MOV, etc.)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'video_url', video_url: { url: videoUrl } }
]
}
Use this type for document files (PDF, DOCX, TXT, etc.)
{
role: 'user',
content: [
{ type: 'text', text: prompt },
{ type: 'file_url', file_url: { url: fileUrl } }
]
}
Note : You can combine multiple content types in a single message. For example, you can include both text and multiple images, or text with both an image and a document.
import ZAI from 'z-ai-web-dev-sdk';
async function analyzeImage(imageUrl, question) {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: question
},
{
type: 'image_url',
image_url: {
url: imageUrl
}
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
// Usage
const result = await analyzeImage(
'https://example.com/product.jpg',
'Describe this product in detail'
);
console.log('Analysis:', result);
import ZAI from 'z-ai-web-dev-sdk';
async function compareImages(imageUrls, question) {
const zai = await ZAI.create();
const content = [
{
type: 'text',
text: question
},
...imageUrls.map(url => ({
type: 'image_url',
image_url: { url }
}))
];
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: content
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
// Usage
const comparison = await compareImages(
[
'https://example.com/before.jpg',
'https://example.com/after.jpg'
],
'Compare these two images and describe the differences'
);
import ZAI from 'z-ai-web-dev-sdk';
import fs from 'fs';
async function analyzeLocalImage(imagePath, question) {
const zai = await ZAI.create();
// Read image file and convert to base64
const imageBuffer = fs.readFileSync(imagePath);
const base64Image = imageBuffer.toString('base64');
const mimeType = imagePath.endsWith('.png') ? 'image/png' : 'image/jpeg';
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: question
},
{
type: 'image_url',
image_url: {
url: `data:${mimeType};base64,${base64Image}`
}
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
import ZAI from 'z-ai-web-dev-sdk';
class VisionChatSession {
constructor() {
this.messages = [];
}
async initialize() {
this.zai = await ZAI.create();
}
async addImage(imageUrl, initialQuestion) {
this.messages.push({
role: 'user',
content: [
{
type: 'text',
text: initialQuestion
},
{
type: 'image_url',
image_url: { url: imageUrl }
}
]
});
return this.getResponse();
}
async followUp(question) {
this.messages.push({
role: 'user',
content: [
{
type: 'text',
text: question
}
]
});
return this.getResponse();
}
async getResponse() {
const response = await this.zai.chat.completions.createVision({
messages: this.messages,
thinking: { type: 'disabled' }
});
const assistantMessage = response.choices[0]?.message?.content;
this.messages.push({
role: 'assistant',
content: assistantMessage
});
return assistantMessage;
}
}
// Usage
const session = new VisionChatSession();
await session.initialize();
const initial = await session.addImage(
'https://example.com/chart.jpg',
'What does this chart show?'
);
console.log('Initial analysis:', initial);
const followup = await session.followUp('What are the key trends?');
console.log('Follow-up:', followup);
import ZAI from 'z-ai-web-dev-sdk';
async function classifyImage(imageUrl) {
const zai = await ZAI.create();
const prompt = `Analyze this image and provide:
1. Main subject/category
2. Key objects detected
3. Scene description
4. Suggested tags (comma-separated)
Format your response as JSON.`;
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: prompt
},
{
type: 'image_url',
image_url: { url: imageUrl }
}
]
}
],
thinking: { type: 'disabled' }
});
const content = response.choices[0]?.message?.content;
try {
return JSON.parse(content);
} catch (e) {
return { rawResponse: content };
}
}
import ZAI from 'z-ai-web-dev-sdk';
async function extractText(imageUrl) {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: 'Extract all text from this image. Preserve the layout and formatting as much as possible.'
},
{
type: 'image_url',
image_url: { url: imageUrl }
}
]
}
],
thinking: { type: 'disabled' }
});
return response.choices[0]?.message?.content;
}
async function safeVisionChat(imageUrl, question) {
try {
const zai = await ZAI.create();
const response = await zai.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}
],
thinking: { type: 'disabled' }
});
return {
success: true,
content: response.choices[0]?.message?.content
};
} catch (error) {
console.error('Vision chat error:', error);
return {
success: false,
error: error.message
};
}
}
import express from 'express';
import ZAI from 'z-ai-web-dev-sdk';
const app = express();
app.use(express.json());
let zaiInstance;
// Initialize SDK once
async function initZAI() {
zaiInstance = await ZAI.create();
}
app.post('/api/analyze-image', async (req, res) => {
try {
const { imageUrl, question } = req.body;
if (!imageUrl || !question) {
return res.status(400).json({
error: 'imageUrl and question are required'
});
}
const response = await zaiInstance.chat.completions.createVision({
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } }
]
}
],
thinking: { type: 'disabled' }
});
res.json({
success: true,
analysis: response.choices[0]?.message?.content
});
} catch (error) {
res.status(500).json({
success: false,
error: error.message
});
}
});
initZAI().then(() => {
app.listen(3000, () => {
console.log('Vision chat API running on port 3000');
});
});
Issue : "SDK must be used in backend"
Issue : Image not loading or being analyzed
Issue : Poor analysis quality
Issue : Slow response times
Weekly Installs
104
Repository
GitHub Stars
24
First Seen
Jan 23, 2026
Security Audits
Security audit results: Gen Agent Trust Hub — Warn; Socket — Pass; Snyk — Warn
Installed on
opencode93
codex86
gemini-cli86
cursor85
github-copilot80
kimi-cli77
AI Elements:基于shadcn/ui的AI原生应用组件库,快速构建对话界面
62,200 周安装