web-reader by answerzhao/agent-skills
npx skills add https://github.com/answerzhao/agent-skills --skill web-reader

此技能指导使用 z-ai-web-dev-sdk 包实现网页读取和内容提取功能,使应用程序能够以编程方式获取和处理网页内容。
技能位置 : {project_path}/skills/web-reader
此技能位于您项目的上述路径中。
参考脚本 : 示例测试脚本位于 {技能位置}/scripts/ 目录中,用于快速测试和参考。请参阅 {技能位置}/scripts/web-reader.ts 以获取工作示例。
Web Reader 允许您构建能够从网页提取内容、检索文章元数据和处理 HTML 内容的应用程序。该 API 自动处理内容提取,从任何网页 URL 提供干净、结构化的数据。
重要提示 : z-ai-web-dev-sdk 必须仅在后端代码中使用。切勿在客户端代码中使用它。
z-ai-web-dev-sdk 包已安装。请按照以下示例所示导入它。
对于简单的网页内容提取,您可以使用 z-ai CLI 而无需编写代码。这非常适合快速内容抓取、测试 URL 或简单的自动化任务。
# 从网页提取内容
z-ai function --name "page_reader" --args '{"url": "https://example.com"}'
# 使用短选项
z-ai function -n page_reader -a '{"url": "https://www.example.com/article"}'
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
# 将提取的内容保存到 JSON 文件
z-ai function \
-n page_reader \
-a '{"url": "https://news.example.com/article"}' \
-o page_content.json
# 提取并保存博客文章
z-ai function \
-n page_reader \
-a '{"url": "https://blog.example.com/post/123"}' \
-o blog_post.json
# 提取新闻文章
z-ai function \
-n page_reader \
-a '{"url": "https://news.site.com/breaking-news"}' \
-o news.json
# 读取文档页面
z-ai function \
-n page_reader \
-a '{"url": "https://docs.example.com/getting-started"}' \
-o docs.json
# 抓取博客内容
z-ai function \
-n page_reader \
-a '{"url": "https://techblog.com/ai-trends-2024"}' \
-o blog.json
# 提取研究文章
z-ai function \
-n page_reader \
-a '{"url": "https://research.org/papers/quantum-computing"}' \
-o research.json
--name, -n: 必需 - 函数名称(使用 "page_reader")
--args, -a: 必需 - JSON 参数对象,包含:
  url (字符串,必需): 要读取的网页 URL
--output, -o <path>: 可选 - 输出文件路径(JSON 格式)

CLI 返回一个包含以下内容的 JSON 对象:
title: 页面标题
html: 主要内容 HTML
text: 纯文本内容
publish_time: 发布时间戳(如果可用)
url: 原始 URL
metadata: 额外的页面元数据
{
"title": "Introduction to Machine Learning",
"html": "<article><h1>Introduction to Machine Learning</h1><p>Machine learning is...</p></article>",
"text": "Introduction to Machine Learning\n\nMachine learning is...",
"publish_time": "2024-01-15T10:30:00Z",
"url": "https://example.com/ml-intro",
"metadata": {
"author": "John Doe",
"description": "A comprehensive guide to ML"
}
}
# 创建一个简单的脚本来处理多个 URL
for url in \
  "https://site1.com/article1" \
  "https://site2.com/article2" \
  "https://site3.com/article3"
do
  # Use the MD5 digest of the URL as a filesystem-safe output name.
  # (Fix: the computed $filename was previously unused — the output path
  # was the garbled literal "$(unknown).json".)
  filename=$(printf '%s' "$url" | md5sum | cut -d' ' -f1)
  z-ai function -n page_reader -a "{\"url\": \"$url\"}" -o "${filename}.json"
done
使用 CLI 适用于:
使用 SDK 适用于:
Web Reader 使用 page_reader 函数来:
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Reads a single web page via the page_reader function and logs a summary.
 * @param {string} url - Page URL to read.
 * @returns {Promise<object>} The page payload (title, url, html, ...).
 * @throws Re-throws any SDK error after logging it.
 */
async function readWebPage(url) {
  try {
    const client = await ZAI.create();
    const { data } = await client.functions.invoke('page_reader', { url: url });
    console.log('Title:', data.title);
    console.log('URL:', data.url);
    console.log('Published:', data.publishedTime);
    console.log('HTML Content:', data.html);
    console.log('Tokens Used:', data.usage.tokens);
    return data;
  } catch (error) {
    console.error('Page reading failed:', error.message);
    throw error;
  }
}
// Usage (top-level await requires an ES-module context)
const pageData = await readWebPage('https://example.com/article');
console.log('Page title:', pageData.title);
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Fetches a page and reduces its HTML to plain text (naive tag strip).
 * @param {string} url - Page URL to read.
 * @returns {Promise<{title: string, text: string, url: string, publishedTime: string}>}
 */
async function extractArticleText(url) {
  const client = await ZAI.create();
  const { data } = await client.functions.invoke('page_reader', { url: url });
  // Basic HTML-to-text: drop tags, then collapse runs of whitespace.
  const plainText = data.html
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return {
    title: data.title,
    text: plainText,
    url: data.url,
    publishedTime: data.publishedTime
  };
}
// 用法
const article = await extractArticleText('https://news.example.com/story');
console.log(article.title);
console.log(article.text.substring(0, 200) + '...');
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Reads several pages sequentially without ever throwing: each URL yields
 * a {url, success, data|error} record in input order.
 * @param {string[]} urls - Page URLs to read.
 */
async function readMultiplePages(urls) {
  const client = await ZAI.create();
  const outcomes = [];
  for (const url of urls) {
    try {
      const response = await client.functions.invoke('page_reader', { url: url });
      outcomes.push({ url: url, success: true, data: response.data });
    } catch (error) {
      outcomes.push({ url: url, success: false, error: error.message });
    }
  }
  return outcomes;
}
// 用法
const urls = [
'https://example.com/article1',
'https://example.com/article2',
'https://example.com/article3'
];
const pages = await readMultiplePages(urls);
pages.forEach(page => {
if (page.success) {
console.log(`✓ ${page.data.title}`);
} else {
console.log(`✗ ${page.url}: ${page.error}`);
}
});
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Analyzes web pages fetched via the z-ai page_reader function,
 * with an in-memory cache keyed by URL.
 */
class WebContentAnalyzer {
  constructor() {
    this.cache = new Map();
  }
  /** Creates the SDK client; must be called before any fetch. */
  async initialize() {
    this.zai = await ZAI.create();
  }
  /**
   * Fetches a page, serving it from the cache when possible.
   * @param {string} url - Page URL to read.
   * @param {boolean} useCache - Whether to read and write the cache.
   */
  async readPage(url, useCache = true) {
    if (useCache && this.cache.has(url)) {
      console.log('Returning cached result for:', url);
      return this.cache.get(url);
    }
    const response = await this.zai.functions.invoke('page_reader', { url: url });
    if (useCache) {
      this.cache.set(url, response.data);
    }
    return response.data;
  }
  /** Returns title/url/date plus size statistics for a page. */
  async getPageMetadata(url) {
    const page = await this.readPage(url);
    return {
      title: page.title,
      url: page.url,
      publishedTime: page.publishedTime,
      contentLength: page.html.length,
      wordCount: this.estimateWordCount(page.html)
    };
  }
  /** Rough word count: strip tags, then count whitespace-separated tokens. */
  estimateWordCount(html) {
    return html
      .replace(/<[^>]*>/g, ' ')
      .split(/\s+/)
      .filter((token) => token.length > 0)
      .length;
  }
  /** Fetches two pages concurrently and returns side-by-side stats. */
  async comparePages(url1, url2) {
    const [first, second] = await Promise.all([
      this.readPage(url1),
      this.readPage(url2)
    ]);
    const summarize = (page) => ({
      title: page.title,
      wordCount: this.estimateWordCount(page.html),
      published: page.publishedTime
    });
    return { page1: summarize(first), page2: summarize(second) };
  }
  /** Drops all cached pages. */
  clearCache() {
    this.cache.clear();
  }
}
// Usage (top-level await requires an ES-module context)
const analyzer = new WebContentAnalyzer();
await analyzer.initialize();
const metadata = await analyzer.getPageMetadata('https://example.com/article');
console.log('Article Metadata:', metadata);
const comparison = await analyzer.comparePages(
  'https://example.com/article1',
  'https://example.com/article2'
);
console.log('Comparison:', comparison);
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Fetches articles from a list of URLs via page_reader and offers
 * in-memory sorting and keyword search over the results.
 */
class FeedReader {
  constructor() {
    this.articles = [];
  }
  /** Creates the SDK client; call once before fetching. */
  async initialize() {
    this.zai = await ZAI.create();
  }
  /**
   * Fetches each URL sequentially; failures are logged and skipped.
   * Replaces and returns the stored article list.
   */
  async fetchArticlesFromUrls(urls) {
    const collected = [];
    for (const url of urls) {
      try {
        const response = await this.zai.functions.invoke('page_reader', { url: url });
        collected.push({
          title: response.data.title,
          url: response.data.url,
          publishedTime: response.data.publishedTime,
          content: response.data.html,
          fetchedAt: new Date().toISOString()
        });
        console.log(`Fetched: ${response.data.title}`);
      } catch (error) {
        console.error(`Failed to fetch ${url}:`, error.message);
      }
    }
    this.articles = collected;
    return collected;
  }
  /** Newest articles first, by publish time (fetch time as fallback). */
  getRecentArticles(limit = 10) {
    const byDateDesc = (a, b) =>
      new Date(b.publishedTime || b.fetchedAt) - new Date(a.publishedTime || a.fetchedAt);
    return this.articles.sort(byDateDesc).slice(0, limit);
  }
  /** Case-insensitive keyword match against title and body. */
  searchArticles(keyword) {
    const needle = keyword.toLowerCase();
    return this.articles.filter(
      (article) => `${article.title} ${article.content}`.toLowerCase().includes(needle)
    );
  }
}
// 用法
const reader = new FeedReader();
await reader.initialize();
const feedUrls = [
'https://example.com/article1',
'https://example.com/article2',
'https://example.com/article3'
];
await reader.fetchArticlesFromUrls(feedUrls);
const recent = reader.getRecentArticles(5);
console.log('Recent articles:', recent.map(a => a.title));
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Fetches each source URL and builds a summary: per-source word counts and
 * excerpts plus a running total. Failed fetches are logged and skipped.
 * @param {string[]} urls - Source URLs.
 * @param {{delay?: number}} options - Optional pause (ms) between fetches.
 */
async function aggregateContent(urls, options = {}) {
  const client = await ZAI.create();
  const summary = {
    sources: [],
    totalWords: 0,
    aggregatedAt: new Date().toISOString()
  };
  for (const url of urls) {
    try {
      const response = await client.functions.invoke('page_reader', { url: url });
      const text = response.data.html.replace(/<[^>]*>/g, ' ');
      const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
      summary.sources.push({
        title: response.data.title,
        url: response.data.url,
        publishedTime: response.data.publishedTime,
        wordCount: wordCount,
        excerpt: text.substring(0, 200).trim() + '...'
      });
      summary.totalWords += wordCount;
      // Optional politeness delay between requests.
      if (options.delay) {
        await new Promise((resolve) => setTimeout(resolve, options.delay));
      }
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error.message);
    }
  }
  return summary;
}
// 用法
const sources = [
'https://example.com/news1',
'https://example.com/news2',
'https://example.com/news3'
];
const aggregated = await aggregateContent(sources, { delay: 1000 });
console.log(`Aggregated ${aggregated.sources.length} sources`);
console.log(`Total words: ${aggregated.totalWords}`);
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Fetches a page once and runs it through a list of named processors,
 * collecting each processor's output (or null when it fails).
 */
class ScrapingPipeline {
  constructor() {
    this.processors = [];
  }
  /** Creates the SDK client; call once before scrape. */
  async initialize() {
    this.zai = await ZAI.create();
  }
  /** Registers a processor; `processorFn` receives the raw page data. */
  addProcessor(name, processorFn) {
    this.processors.push({ name, fn: processorFn });
  }
  /** Fetches `url` and applies every registered processor in order. */
  async scrape(url) {
    const response = await this.zai.functions.invoke('page_reader', { url: url });
    let data = {
      raw: response.data,
      processed: {}
    };
    for (const { name, fn } of this.processors) {
      try {
        data.processed[name] = await fn(data.raw);
        console.log(`✓ Processed with ${name}`);
      } catch (error) {
        // A failing processor never aborts the pipeline.
        console.error(`✗ Failed ${name}:`, error.message);
        data.processed[name] = null;
      }
    }
    return data;
  }
}
// Processor functions for ScrapingPipeline

/** Collects every absolute http(s) link in the page HTML, de-duplicated. */
function extractLinks(pageData) {
  const found = new Set();
  const pattern = /href=["'](https?:\/\/[^"']+)["']/g;
  for (const hit of pageData.html.matchAll(pattern)) {
    found.add(hit[1]);
  }
  return [...found];
}

/** Collects absolute image URLs (common raster formats), de-duplicated. */
function extractImages(pageData) {
  const found = new Set();
  const pattern = /src=["'](https?:\/\/[^"']+\.(jpg|jpeg|png|gif|webp))["']/gi;
  for (const hit of pageData.html.matchAll(pattern)) {
    found.add(hit[1]);
  }
  return [...found];
}

/** Strips scripts, styles and tags, collapsing whitespace to single spaces. */
function extractPlainText(pageData) {
  const withoutEmbedded = pageData.html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  return withoutEmbedded
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}
// 用法
const pipeline = new ScrapingPipeline();
await pipeline.initialize();
pipeline.addProcessor('links', extractLinks);
pipeline.addProcessor('images', extractImages);
pipeline.addProcessor('plainText', extractPlainText);
const result = await pipeline.scrape('https://example.com/article');
console.log('Links found:', result.processed.links.length);
console.log('Images found:', result.processed.images.length);
console.log('Text length:', result.processed.plainText.length);
{
code: 200,
status: 200,
data: {
title: "Article Title",
url: "https://example.com/article",
html: "<div>Article content...</div>",
publishedTime: "2025-01-15T10:30:00Z",
usage: {
tokens: 1500
}
},
meta: {
usage: {
tokens: 1500
}
}
}
| 字段 | 类型 | 描述 |
|---|---|---|
code | 数字 | 响应状态码 |
status | 数字 | HTTP 状态码 |
data.title | 字符串 | 页面标题 |
data.url | 字符串 | 页面 URL |
data.html | 字符串 | 提取的 HTML 内容 |
data.publishedTime | 字符串 | 发布日期(可选) |
data.usage.tokens | 数字 | 处理使用的令牌数 |
meta.usage.tokens | 数字 | 使用的总令牌数 |
/**
 * Reads a page defensively: validates the URL, checks the response status
 * and verifies essential fields, returning a {success, data|error} result
 * instead of throwing.
 * @param {string} url - Absolute http(s) URL to read.
 * @returns {Promise<{success: true, data: object} | {success: false, error: string}>}
 */
async function safeReadPage(url) {
  try {
    // Validate the URL before creating the SDK client. Parsing with `new URL`
    // (instead of the previous startsWith('http') prefix check) also rejects
    // look-alike schemes such as "httpfoo://".
    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      throw new Error('Invalid URL format');
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      throw new Error('Invalid URL format');
    }
    const zai = await ZAI.create();
    const result = await zai.functions.invoke('page_reader', {
      url: url
    });
    // 检查响应状态
    if (result.code !== 200) {
      throw new Error(`Failed to fetch page: ${result.code}`);
    }
    // 验证基本数据
    if (!result.data.html || !result.data.title) {
      throw new Error('Incomplete page data received');
    }
    return {
      success: true,
      data: result.data
    };
  } catch (error) {
    console.error('Page reading error:', error);
    return {
      success: false,
      error: error.message
    };
  }
}
/**
 * Wraps page_reader calls with a simple sliding-window rate limit
 * (at most `requestsPerMinute` requests in any 60-second window).
 */
class RateLimitedReader {
  constructor(requestsPerMinute = 10) {
    this.requestsPerMinute = requestsPerMinute;
    this.requestTimes = [];
  }
  /** Creates the SDK client; call once before readPage. */
  async initialize() {
    this.zai = await ZAI.create();
  }
  /** Reads one page, sleeping first if the window is full. */
  async readPage(url) {
    await this.waitForRateLimit();
    const response = await this.zai.functions.invoke('page_reader', { url: url });
    this.requestTimes.push(Date.now());
    return response.data;
  }
  /** Prunes stale timestamps and sleeps until a request slot frees up. */
  async waitForRateLimit() {
    const now = Date.now();
    const windowStart = now - 60000;
    // Keep only requests made inside the current one-minute window.
    this.requestTimes = this.requestTimes.filter((stamp) => stamp > windowStart);
    if (this.requestTimes.length < this.requestsPerMinute) {
      return;
    }
    const [oldest] = this.requestTimes;
    const delay = 60000 - (now - oldest);
    if (delay > 0) {
      console.log(`Rate limit reached. Waiting ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
// 用法
const reader = new RateLimitedReader(10); // 每分钟 10 个请求
await reader.initialize();
const urls = ['https://example.com/1', 'https://example.com/2'];
for (const url of urls) {
const data = await reader.readPage(url);
console.log('Fetched:', data.title);
}
import ZAI from 'z-ai-web-dev-sdk';
/**
 * page_reader client with a TTL-based in-memory cache keyed by URL.
 */
class CachedWebReader {
  constructor(cacheDuration = 3600000) { // TTL in ms; default one hour
    this.cache = new Map();
    this.cacheDuration = cacheDuration;
  }
  /** Creates the SDK client; call once before readPage. */
  async initialize() {
    this.zai = await ZAI.create();
  }
  /**
   * Returns page data for `url`, re-fetching when the cached entry is
   * missing, expired, or `forceRefresh` is set.
   */
  async readPage(url, forceRefresh = false) {
    const entry = this.cache.get(url);
    const isFresh = entry && Date.now() - entry.timestamp < this.cacheDuration;
    if (isFresh && !forceRefresh) {
      console.log('Returning cached content for:', url);
      return entry.data;
    }
    const response = await this.zai.functions.invoke('page_reader', { url: url });
    this.cache.set(url, { data: response.data, timestamp: Date.now() });
    return response.data;
  }
  /** Empties the cache. */
  clearCache() {
    this.cache.clear();
  }
  /** Reports how many URLs are cached and which ones. */
  getCacheStats() {
    return {
      size: this.cache.size,
      entries: Array.from(this.cache.keys())
    };
  }
}
// Usage
const reader = new CachedWebReader(3600000); // 1-hour cache TTL
await reader.initialize();
const data1 = await reader.readPage('https://example.com'); // fresh fetch
const data2 = await reader.readPage('https://example.com'); // served from cache
const data3 = await reader.readPage('https://example.com', true); // forced refresh
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Reads pages in batches of `concurrency`, collecting per-URL outcomes.
 * Each entry is {url, success, data|error}; a failed fetch never rejects.
 * @param {string[]} urls - Page URLs to read.
 * @param {number} concurrency - Max simultaneous requests per batch.
 */
async function readPagesInParallel(urls, concurrency = 3) {
  const client = await ZAI.create();
  const collected = [];
  // Turn a single fetch into an always-fulfilling outcome record.
  const readOne = (url) =>
    client.functions.invoke('page_reader', { url })
      .then((response) => ({ url: url, success: true, data: response.data }))
      .catch((error) => ({ url: url, success: false, error: error.message }));
  for (let start = 0; start < urls.length; start += concurrency) {
    const batch = urls.slice(start, start + concurrency);
    const settled = await Promise.allSettled(batch.map(readOne));
    // Every promise fulfills (errors were caught above), so .value is safe.
    collected.push(...settled.map((entry) => entry.value));
    console.log(`Completed batch ${Math.floor(start / concurrency) + 1}`);
  }
  return collected;
}
// 用法
const urls = [
'https://example.com/1',
'https://example.com/2',
'https://example.com/3',
'https://example.com/4',
'https://example.com/5'
];
const results = await readPagesInParallel(urls, 2); // 2 个并发请求
results.forEach(result => {
if (result.success) {
console.log(`✓ ${result.data.title}`);
} else {
console.log(`✗ ${result.url}: ${result.error}`);
}
});
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Static helpers for post-processing HTML returned by page_reader.
 */
class ContentProcessor {
  /** Removes scripts, styles and HTML comments, keeping the markup. */
  static extractMainContent(html) {
    let content = html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<!--[\s\S]*?-->/g, '');
    return content;
  }
  /**
   * Converts HTML to plain text: turns <br>/<p> into line breaks, strips
   * tags, decodes common HTML entities and collapses whitespace.
   *
   * BUG FIX: the entity replacements previously matched already-decoded
   * characters (e.g. `.replace(/&/g, '&')`), making them no-ops; they now
   * match the actual entities. `&amp;` is decoded last so that input like
   * "&amp;lt;" is not double-decoded.
   */
  static htmlToPlainText(html) {
    return html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/p>/gi, '\n\n')
      .replace(/<[^>]*>/g, '')
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();
  }
  /** Pulls description/keywords/author out of <meta> tags, when present. */
  static extractMetadata(html) {
    const metadata = {};
    // Meta description
    const descMatch = html.match(/<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i);
    if (descMatch) metadata.description = descMatch[1];
    // Comma-separated keywords
    const keywordsMatch = html.match(/<meta\s+name=["']keywords["']\s+content=["']([^"']+)["']/i);
    if (keywordsMatch) metadata.keywords = keywordsMatch[1].split(',').map(k => k.trim());
    // Author
    const authorMatch = html.match(/<meta\s+name=["']author["']\s+content=["']([^"']+)["']/i);
    if (authorMatch) metadata.author = authorMatch[1];
    return metadata;
  }
}
// Usage: combine page_reader with the ContentProcessor helpers above.
/**
 * Fetches a page and returns cleaned variants of its content plus metadata.
 * @param {string} url - Page URL to read.
 */
async function processWebPage(url) {
  const zai = await ZAI.create();
  const result = await zai.functions.invoke('page_reader', { url });
  return {
    title: result.data.title,
    url: result.data.url,
    mainContent: ContentProcessor.extractMainContent(result.data.html),
    plainText: ContentProcessor.htmlToPlainText(result.data.html),
    metadata: ContentProcessor.extractMetadata(result.data.html),
    publishedTime: result.data.publishedTime
  };
}
const processed = await processWebPage('https://example.com/article');
console.log('Processed content:', processed.title);
import express from 'express';
import ZAI from 'z-ai-web-dev-sdk';
// Create the HTTP server and enable JSON request-body parsing.
const app = express();
app.use(express.json());

// Shared SDK client; created once at startup (see initZAI / bottom of file).
let zaiInstance;
async function initZAI() {
  zaiInstance = await ZAI.create();
}

// POST /api/read-page — reads a single page. Body: { url: string }.
app.post('/api/read-page', async (req, res) => {
  try {
    const { url } = req.body;
    if (!url) {
      return res.status(400).json({
        error: 'URL is required'
      });
    }
    const result = await zaiInstance.functions.invoke('page_reader', {
      url: url
    });
    res.json({
      success: true,
      data: {
        title: result.data.title,
        url: result.data.url,
        content: result.data.html,
        publishedTime: result.data.publishedTime,
        tokensUsed: result.data.usage.tokens
      }
    });
  } catch (error) {
    // Any SDK or runtime failure maps to a 500 with the error message.
    res.status(500).json({
      success: false,
      error: error.message
    });
  }
});

// POST /api/read-multiple — reads many pages concurrently.
// Body: { urls: string[] }. Each entry reports success per URL.
app.post('/api/read-multiple', async (req, res) => {
  try {
    const { urls } = req.body;
    if (!urls || !Array.isArray(urls)) {
      return res.status(400).json({
        error: 'URLs array is required'
      });
    }
    const results = await Promise.allSettled(
      urls.map(url =>
        zaiInstance.functions.invoke('page_reader', { url })
          .then(result => ({
            url: url,
            success: true,
            data: result.data
          }))
          .catch(error => ({
            url: url,
            success: false,
            error: error.message
          }))
      )
    );
    res.json({
      success: true,
      // Every promise fulfills (errors are caught above), so .value is safe.
      results: results.map(r => r.value)
    });
  } catch (error) {
    res.status(500).json({
      success: false,
      error: error.message
    });
  }
});

// Only start listening after the SDK client is ready, so the route
// handlers can rely on zaiInstance being initialized.
initZAI().then(() => {
  app.listen(3000, () => {
    console.log('Web reader API running on port 3000');
  });
});
import ZAI from 'z-ai-web-dev-sdk';
import cron from 'node-cron';
/**
 * Periodically fetches configured URLs on cron schedules, keeping the
 * most recent 100 fetch results in memory.
 */
class ScheduledFetcher {
  constructor() {
    this.urls = [];
    this.results = [];
  }
  /** Creates the SDK client; call once before start(). */
  async initialize() {
    this.zai = await ZAI.create();
  }
  /** Registers a URL with a cron schedule expression. */
  addUrl(url, schedule) {
    this.urls.push({ url, schedule });
  }
  /** Fetches one URL, returning a success or failure record (never throws). */
  async fetchContent(url) {
    try {
      const response = await this.zai.functions.invoke('page_reader', { url: url });
      return {
        url: url,
        success: true,
        title: response.data.title,
        content: response.data.html,
        fetchedAt: new Date().toISOString()
      };
    } catch (error) {
      return {
        url: url,
        success: false,
        error: error.message,
        fetchedAt: new Date().toISOString()
      };
    }
  }
  /** Schedules repeated fetches of `url` and records each outcome. */
  startScheduledFetch(url, schedule) {
    cron.schedule(schedule, async () => {
      console.log(`Fetching ${url}...`);
      const outcome = await this.fetchContent(url);
      this.results.push(outcome);
      // Bound memory: retain only the latest 100 results.
      if (this.results.length > 100) {
        this.results = this.results.slice(-100);
      }
      console.log(`Fetched: ${outcome.success ? outcome.title : outcome.error}`);
    });
  }
  /** Starts all registered schedules. */
  start() {
    for (const { url, schedule } of this.urls) {
      this.startScheduledFetch(url, schedule);
    }
  }
  /** Returns the retained fetch results (most recent last). */
  getResults() {
    return this.results;
  }
}
// Usage (top-level await requires an ES-module context)
const fetcher = new ScheduledFetcher();
await fetcher.initialize();
// Fetch every hour, on the hour
fetcher.addUrl('https://example.com/news', '0 * * * *');
// Fetch daily at midnight
fetcher.addUrl('https://example.com/daily', '0 0 * * *');
fetcher.start();
console.log('Scheduled fetching started');
问题 : "SDK 必须用于后端"
问题 : 无法获取页面(404、403 等)
问题 : 内容不完整或缺失
问题 : 令牌使用量高
问题 : 响应时间慢
问题 : HTML 内容为空
每周安装量
1.4K
仓库
GitHub 星标数
24
首次出现时间
2026年1月23日
安全审计
安装于
opencode1.2K
gemini-cli1.2K
codex1.2K
github-copilot1.2K
kimi-cli1.2K
amp1.2K
This skill guides the implementation of web page reading and content extraction functionality using the z-ai-web-dev-sdk package, enabling applications to fetch and process web page content programmatically.
Skill Location : {project_path}/skills/web-reader
This skill is located at the above path in your project.
Reference Scripts : Example test scripts are available in the {Skill Location}/scripts/ directory for quick testing and reference. See {Skill Location}/scripts/web-reader.ts for a working example.
Web Reader allows you to build applications that can extract content from web pages, retrieve article metadata, and process HTML content. The API automatically handles content extraction, providing clean, structured data from any web URL.
IMPORTANT : z-ai-web-dev-sdk MUST be used in backend code only. Never use it in client-side code.
The z-ai-web-dev-sdk package is already installed. Import it as shown in the examples below.
For simple web page content extraction, you can use the z-ai CLI instead of writing code. This is ideal for quick content scraping, testing URLs, or simple automation tasks.
# Extract content from a web page
z-ai function --name "page_reader" --args '{"url": "https://example.com"}'
# Using short options
z-ai function -n page_reader -a '{"url": "https://www.example.com/article"}'
# Save extracted content to JSON file
z-ai function \
-n page_reader \
-a '{"url": "https://news.example.com/article"}' \
-o page_content.json
# Extract and save blog post
z-ai function \
-n page_reader \
-a '{"url": "https://blog.example.com/post/123"}' \
-o blog_post.json
# Extract news article
z-ai function \
-n page_reader \
-a '{"url": "https://news.site.com/breaking-news"}' \
-o news.json
# Read documentation page
z-ai function \
-n page_reader \
-a '{"url": "https://docs.example.com/getting-started"}' \
-o docs.json
# Scrape blog content
z-ai function \
-n page_reader \
-a '{"url": "https://techblog.com/ai-trends-2024"}' \
-o blog.json
# Extract research article
z-ai function \
-n page_reader \
-a '{"url": "https://research.org/papers/quantum-computing"}' \
-o research.json
--name, -n: Required - Function name (use "page_reader")
--args, -a: Required - JSON arguments object with:
  url (string, required): The URL of the web page to read
--output, -o <path>: Optional - Output file path (JSON format)

The CLI returns a JSON object containing:
title: Page title
html: Main content HTML
text: Plain text content
publish_time: Publication timestamp (if available)
url: Original URL
metadata: Additional page metadata
{
"title": "Introduction to Machine Learning",
"html": "<article><h1>Introduction to Machine Learning</h1><p>Machine learning is...</p></article>",
"text": "Introduction to Machine Learning\n\nMachine learning is...",
"publish_time": "2024-01-15T10:30:00Z",
"url": "https://example.com/ml-intro",
"metadata": {
"author": "John Doe",
"description": "A comprehensive guide to ML"
}
}
# Create a simple script to process multiple URLs
for url in \
  "https://site1.com/article1" \
  "https://site2.com/article2" \
  "https://site3.com/article3"
do
  # Use the MD5 digest of the URL as a filesystem-safe output name.
  # (Fix: the computed $filename was previously unused — the output path
  # was the garbled literal "$(unknown).json".)
  filename=$(printf '%s' "$url" | md5sum | cut -d' ' -f1)
  z-ai function -n page_reader -a "{\"url\": \"$url\"}" -o "${filename}.json"
done
Use CLI for:
Use SDK for:
The Web Reader uses the page_reader function to:
import ZAI from 'z-ai-web-dev-sdk';
async function readWebPage(url) {
try {
const zai = await ZAI.create();
const result = await zai.functions.invoke('page_reader', {
url: url
});
console.log('Title:', result.data.title);
console.log('URL:', result.data.url);
console.log('Published:', result.data.publishedTime);
console.log('HTML Content:', result.data.html);
console.log('Tokens Used:', result.data.usage.tokens);
return result.data;
} catch (error) {
console.error('Page reading failed:', error.message);
throw error;
}
}
// Usage
const pageData = await readWebPage('https://example.com/article');
console.log('Page title:', pageData.title);
import ZAI from 'z-ai-web-dev-sdk';
async function extractArticleText(url) {
const zai = await ZAI.create();
const result = await zai.functions.invoke('page_reader', {
url: url
});
// Convert HTML to plain text (basic approach)
const plainText = result.data.html
.replace(/<[^>]*>/g, ' ')
.replace(/\s+/g, ' ')
.trim();
return {
title: result.data.title,
text: plainText,
url: result.data.url,
publishedTime: result.data.publishedTime
};
}
// Usage
const article = await extractArticleText('https://news.example.com/story');
console.log(article.title);
console.log(article.text.substring(0, 200) + '...');
import ZAI from 'z-ai-web-dev-sdk';
async function readMultiplePages(urls) {
const zai = await ZAI.create();
const results = [];
for (const url of urls) {
try {
const result = await zai.functions.invoke('page_reader', {
url: url
});
results.push({
url: url,
success: true,
data: result.data
});
} catch (error) {
results.push({
url: url,
success: false,
error: error.message
});
}
}
return results;
}
// Usage
const urls = [
'https://example.com/article1',
'https://example.com/article2',
'https://example.com/article3'
];
const pages = await readMultiplePages(urls);
pages.forEach(page => {
if (page.success) {
console.log(`✓ ${page.data.title}`);
} else {
console.log(`✗ ${page.url}: ${page.error}`);
}
});
import ZAI from 'z-ai-web-dev-sdk';
class WebContentAnalyzer {
constructor() {
this.cache = new Map();
}
async initialize() {
this.zai = await ZAI.create();
}
async readPage(url, useCache = true) {
// Check cache
if (useCache && this.cache.has(url)) {
console.log('Returning cached result for:', url);
return this.cache.get(url);
}
// Fetch fresh content
const result = await this.zai.functions.invoke('page_reader', {
url: url
});
// Cache the result
if (useCache) {
this.cache.set(url, result.data);
}
return result.data;
}
async getPageMetadata(url) {
const data = await this.readPage(url);
return {
title: data.title,
url: data.url,
publishedTime: data.publishedTime,
contentLength: data.html.length,
wordCount: this.estimateWordCount(data.html)
};
}
estimateWordCount(html) {
const text = html.replace(/<[^>]*>/g, ' ');
const words = text.split(/\s+/).filter(word => word.length > 0);
return words.length;
}
async comparePages(url1, url2) {
const [page1, page2] = await Promise.all([
this.readPage(url1),
this.readPage(url2)
]);
return {
page1: {
title: page1.title,
wordCount: this.estimateWordCount(page1.html),
published: page1.publishedTime
},
page2: {
title: page2.title,
wordCount: this.estimateWordCount(page2.html),
published: page2.publishedTime
}
};
}
clearCache() {
this.cache.clear();
}
}
// Usage
const analyzer = new WebContentAnalyzer();
await analyzer.initialize();
const metadata = await analyzer.getPageMetadata('https://example.com/article');
console.log('Article Metadata:', metadata);
const comparison = await analyzer.comparePages(
'https://example.com/article1',
'https://example.com/article2'
);
console.log('Comparison:', comparison);
import ZAI from 'z-ai-web-dev-sdk';
class FeedReader {
constructor() {
this.articles = [];
}
async initialize() {
this.zai = await ZAI.create();
}
async fetchArticlesFromUrls(urls) {
const articles = [];
for (const url of urls) {
try {
const result = await this.zai.functions.invoke('page_reader', {
url: url
});
articles.push({
title: result.data.title,
url: result.data.url,
publishedTime: result.data.publishedTime,
content: result.data.html,
fetchedAt: new Date().toISOString()
});
console.log(`Fetched: ${result.data.title}`);
} catch (error) {
console.error(`Failed to fetch ${url}:`, error.message);
}
}
this.articles = articles;
return articles;
}
getRecentArticles(limit = 10) {
return this.articles
.sort((a, b) => {
const dateA = new Date(a.publishedTime || a.fetchedAt);
const dateB = new Date(b.publishedTime || b.fetchedAt);
return dateB - dateA;
})
.slice(0, limit);
}
searchArticles(keyword) {
return this.articles.filter(article => {
const searchText = `${article.title} ${article.content}`.toLowerCase();
return searchText.includes(keyword.toLowerCase());
});
}
}
// Usage
const reader = new FeedReader();
await reader.initialize();
const feedUrls = [
'https://example.com/article1',
'https://example.com/article2',
'https://example.com/article3'
];
await reader.fetchArticlesFromUrls(feedUrls);
const recent = reader.getRecentArticles(5);
console.log('Recent articles:', recent.map(a => a.title));
import ZAI from 'z-ai-web-dev-sdk';
async function aggregateContent(urls, options = {}) {
const zai = await ZAI.create();
const aggregated = {
sources: [],
totalWords: 0,
aggregatedAt: new Date().toISOString()
};
for (const url of urls) {
try {
const result = await zai.functions.invoke('page_reader', {
url: url
});
const text = result.data.html.replace(/<[^>]*>/g, ' ');
const wordCount = text.split(/\s+/).filter(w => w.length > 0).length;
aggregated.sources.push({
title: result.data.title,
url: result.data.url,
publishedTime: result.data.publishedTime,
wordCount: wordCount,
excerpt: text.substring(0, 200).trim() + '...'
});
aggregated.totalWords += wordCount;
if (options.delay) {
await new Promise(resolve => setTimeout(resolve, options.delay));
}
} catch (error) {
console.error(`Failed to fetch ${url}:`, error.message);
}
}
return aggregated;
}
// Usage
const sources = [
'https://example.com/news1',
'https://example.com/news2',
'https://example.com/news3'
];
const aggregated = await aggregateContent(sources, { delay: 1000 });
console.log(`Aggregated ${aggregated.sources.length} sources`);
console.log(`Total words: ${aggregated.totalWords}`);
import ZAI from 'z-ai-web-dev-sdk';
class ScrapingPipeline {
constructor() {
this.processors = [];
}
async initialize() {
this.zai = await ZAI.create();
}
addProcessor(name, processorFn) {
this.processors.push({ name, fn: processorFn });
}
async scrape(url) {
// Fetch the page
const result = await this.zai.functions.invoke('page_reader', {
url: url
});
let data = {
raw: result.data,
processed: {}
};
// Run through processors
for (const processor of this.processors) {
try {
data.processed[processor.name] = await processor.fn(data.raw);
console.log(`✓ Processed with ${processor.name}`);
} catch (error) {
console.error(`✗ Failed ${processor.name}:`, error.message);
data.processed[processor.name] = null;
}
}
return data;
}
}
// Processor functions for ScrapingPipeline

/** Collects every absolute http(s) link in the page HTML, de-duplicated. */
function extractLinks(pageData) {
  const found = new Set();
  const pattern = /href=["'](https?:\/\/[^"']+)["']/g;
  for (const hit of pageData.html.matchAll(pattern)) {
    found.add(hit[1]);
  }
  return [...found];
}

/** Collects absolute image URLs (common raster formats), de-duplicated. */
function extractImages(pageData) {
  const found = new Set();
  const pattern = /src=["'](https?:\/\/[^"']+\.(jpg|jpeg|png|gif|webp))["']/gi;
  for (const hit of pageData.html.matchAll(pattern)) {
    found.add(hit[1]);
  }
  return [...found];
}

/** Strips scripts, styles and tags, collapsing whitespace to single spaces. */
function extractPlainText(pageData) {
  const withoutEmbedded = pageData.html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  return withoutEmbedded
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}
// Usage
const pipeline = new ScrapingPipeline();
await pipeline.initialize();
pipeline.addProcessor('links', extractLinks);
pipeline.addProcessor('images', extractImages);
pipeline.addProcessor('plainText', extractPlainText);
const result = await pipeline.scrape('https://example.com/article');
console.log('Links found:', result.processed.links.length);
console.log('Images found:', result.processed.images.length);
console.log('Text length:', result.processed.plainText.length);
{
code: 200,
status: 200,
data: {
title: "Article Title",
url: "https://example.com/article",
html: "<div>Article content...</div>",
publishedTime: "2025-01-15T10:30:00Z",
usage: {
tokens: 1500
}
},
meta: {
usage: {
tokens: 1500
}
}
}
| Field | Type | Description |
|---|---|---|
code | number | Response status code |
status | number | HTTP status code |
data.title | string | Page title |
data.url | string | Page URL |
data.html | string | Extracted HTML content |
/**
 * Reads a page defensively: validates the URL, checks the response status
 * and verifies essential fields, returning a {success, data|error} result
 * instead of throwing.
 * @param {string} url - Absolute http(s) URL to read.
 * @returns {Promise<{success: true, data: object} | {success: false, error: string}>}
 */
async function safeReadPage(url) {
  try {
    // Validate the URL before creating the SDK client. Parsing with `new URL`
    // (instead of the previous startsWith('http') prefix check) also rejects
    // look-alike schemes such as "httpfoo://".
    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      throw new Error('Invalid URL format');
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      throw new Error('Invalid URL format');
    }
    const zai = await ZAI.create();
    const result = await zai.functions.invoke('page_reader', {
      url: url
    });
    // Check response status
    if (result.code !== 200) {
      throw new Error(`Failed to fetch page: ${result.code}`);
    }
    // Verify essential data
    if (!result.data.html || !result.data.title) {
      throw new Error('Incomplete page data received');
    }
    return {
      success: true,
      data: result.data
    };
  } catch (error) {
    console.error('Page reading error:', error);
    return {
      success: false,
      error: error.message
    };
  }
}
/**
 * Wraps the `page_reader` function with a simple sliding-window rate limit:
 * at most `requestsPerMinute` requests are started in any 60-second window.
 * Failed requests count against the limit too.
 */
class RateLimitedReader {
  constructor(requestsPerMinute = 10) {
    this.requestsPerMinute = requestsPerMinute;
    this.requestTimes = []; // start timestamps (ms) of recent requests, oldest first
  }
  async initialize() {
    this.zai = await ZAI.create();
  }
  /**
   * Fetch a page, waiting first if the rate-limit window is full.
   * @param {string} url - Page URL to read.
   * @returns {Promise<object>} The `data` payload from page_reader.
   */
  async readPage(url) {
    await this.waitForRateLimit();
    // Record the attempt before awaiting the result so that failed
    // requests also consume budget (fix: the old code only counted
    // requests that succeeded).
    this.requestTimes.push(Date.now());
    const result = await this.zai.functions.invoke('page_reader', {
      url: url
    });
    return result.data;
  }
  /**
   * Block until starting a new request keeps us within the limit.
   * Loops because a single sleep may not free a slot if the window is
   * still full when we wake up (fix: the old code waited only once).
   */
  async waitForRateLimit() {
    for (;;) {
      const now = Date.now();
      const oneMinuteAgo = now - 60000;
      // Drop timestamps that have aged out of the 60s window.
      this.requestTimes = this.requestTimes.filter(time => time > oneMinuteAgo);
      if (this.requestTimes.length < this.requestsPerMinute) {
        return;
      }
      // Wait until the oldest recorded request leaves the window.
      const waitTime = 60000 - (now - this.requestTimes[0]);
      if (waitTime > 0) {
        console.log(`Rate limit reached. Waiting ${waitTime}ms...`);
        await new Promise(resolve => setTimeout(resolve, waitTime));
      }
    }
  }
}
// Usage
const limitedReader = new RateLimitedReader(10); // 10 requests per minute
await limitedReader.initialize();
const pageUrls = ['https://example.com/1', 'https://example.com/2'];
for (const pageUrl of pageUrls) {
  const page = await limitedReader.readPage(pageUrl);
  console.log('Fetched:', page.title);
}
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Memoizing wrapper around the `page_reader` function: reads are kept in
 * memory and reused until they are older than `cacheDuration` (ms).
 */
class CachedWebReader {
  constructor(cacheDuration = 3600000) { // 1 hour default
    this.cache = new Map(); // url -> { data, timestamp }
    this.cacheDuration = cacheDuration;
  }
  async initialize() {
    this.zai = await ZAI.create();
  }
  /**
   * Read a page, serving the cached copy when it is still fresh.
   * @param {string} url - Page URL to read.
   * @param {boolean} [forceRefresh=false] - Bypass the cache when true.
   */
  async readPage(url, forceRefresh = false) {
    const entry = this.cache.get(url);
    const isFresh = entry && (Date.now() - entry.timestamp) < this.cacheDuration;
    if (isFresh && !forceRefresh) {
      console.log('Returning cached content for:', url);
      return entry.data;
    }
    // Fetch fresh content and overwrite any stale entry.
    const result = await this.zai.functions.invoke('page_reader', { url: url });
    this.cache.set(url, { data: result.data, timestamp: Date.now() });
    return result.data;
  }
  // Drop every cached entry.
  clearCache() {
    this.cache.clear();
  }
  // Report how many URLs are cached and which ones.
  getCacheStats() {
    return {
      size: this.cache.size,
      entries: Array.from(this.cache.keys())
    };
  }
}
// Usage
const cachingReader = new CachedWebReader(3600000); // 1 hour cache
await cachingReader.initialize();
const firstRead = await cachingReader.readPage('https://example.com');       // Fresh fetch
const secondRead = await cachingReader.readPage('https://example.com');      // From cache
const thirdRead = await cachingReader.readPage('https://example.com', true); // Force refresh
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Read several pages with bounded concurrency. URLs are processed in
 * batches of `concurrency`; each entry of the returned array is either
 * { url, success: true, data } or { url, success: false, error }.
 */
async function readPagesInParallel(urls, concurrency = 3) {
  const zai = await ZAI.create();
  const results = [];
  let batchNumber = 0;
  for (let start = 0; start < urls.length; start += concurrency) {
    const batch = urls.slice(start, start + concurrency);
    const settled = await Promise.allSettled(
      batch.map(async (url) => {
        // Per-URL try/catch so one failure never rejects the batch.
        try {
          const result = await zai.functions.invoke('page_reader', { url });
          return { url: url, success: true, data: result.data };
        } catch (error) {
          return { url: url, success: false, error: error.message };
        }
      })
    );
    // Every promise fulfills (errors are caught above), so .value is safe.
    results.push(...settled.map(outcome => outcome.value));
    batchNumber += 1;
    console.log(`Completed batch ${batchNumber}`);
  }
  return results;
}
// Usage
const targetUrls = [
  'https://example.com/1',
  'https://example.com/2',
  'https://example.com/3',
  'https://example.com/4',
  'https://example.com/5'
];
const fetchOutcomes = await readPagesInParallel(targetUrls, 2); // 2 concurrent requests
for (const outcome of fetchOutcomes) {
  if (outcome.success) {
    console.log(`✓ ${outcome.data.title}`);
  } else {
    console.log(`✗ ${outcome.url}: ${outcome.error}`);
  }
}
import ZAI from 'z-ai-web-dev-sdk';
/**
 * Static helpers for post-processing HTML returned by page_reader.
 */
class ContentProcessor {
  /**
   * Strip <script> and <style> elements and HTML comments, keeping the
   * remaining markup intact.
   * @param {string} html - Raw page HTML.
   * @returns {string} HTML without scripts, styles, or comments.
   */
  static extractMainContent(html) {
    // Remove scripts, styles, and comments
    let content = html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<!--[\s\S]*?-->/g, '');
    return content;
  }
  /**
   * Convert HTML to whitespace-normalized plain text, decoding the most
   * common character entities.
   * @param {string} html - Raw page HTML.
   * @returns {string} Plain text.
   */
  static htmlToPlainText(html) {
    return html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/p>/gi, '\n\n')
      .replace(/<[^>]*>/g, '')
      // Decode common HTML entities (fix: the previous replacements were
      // self-referential no-ops, e.g. replacing '&' with '&').
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      // Decode &amp; last so '&amp;lt;' is not double-decoded into '<'.
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();
  }
  /**
   * Pull description, keywords, and author out of <meta> tags.
   * @param {string} html - Raw page HTML.
   * @returns {{description?: string, keywords?: string[], author?: string}}
   */
  static extractMetadata(html) {
    const metadata = {};
    // Extract meta description
    const descMatch = html.match(/<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i);
    if (descMatch) metadata.description = descMatch[1];
    // Extract keywords
    const keywordsMatch = html.match(/<meta\s+name=["']keywords["']\s+content=["']([^"']+)["']/i);
    if (keywordsMatch) metadata.keywords = keywordsMatch[1].split(',').map(k => k.trim());
    // Extract author
    const authorMatch = html.match(/<meta\s+name=["']author["']\s+content=["']([^"']+)["']/i);
    if (authorMatch) metadata.author = authorMatch[1];
    return metadata;
  }
}
// Usage: fetch one page and run every ContentProcessor helper over it.
async function processWebPage(url) {
  const zai = await ZAI.create();
  const { data } = await zai.functions.invoke('page_reader', { url });
  return {
    title: data.title,
    url: data.url,
    mainContent: ContentProcessor.extractMainContent(data.html),
    plainText: ContentProcessor.htmlToPlainText(data.html),
    metadata: ContentProcessor.extractMetadata(data.html),
    publishedTime: data.publishedTime
  };
}
const processed = await processWebPage('https://example.com/article');
console.log('Processed content:', processed.title);
import express from 'express';
import ZAI from 'z-ai-web-dev-sdk';
// Express app exposing page_reader over HTTP; expects JSON request bodies.
const app = express();
app.use(express.json());
// Single shared SDK client, created once at startup (see initZAI().then(...)
// at the bottom of this example) and reused by every route handler.
let zaiInstance;
async function initZAI() {
  zaiInstance = await ZAI.create();
}
// POST /api/read-page { url } -> extracted content for a single page.
app.post('/api/read-page', async (req, res) => {
  try {
    const { url } = req.body;
    if (!url) {
      return res.status(400).json({
        error: 'URL is required'
      });
    }
    const result = await zaiInstance.functions.invoke('page_reader', {
      url: url
    });
    // Surface upstream failures instead of reporting success with missing
    // fields (fix: the old handler ignored result.code entirely).
    if (result.code !== 200) {
      return res.status(502).json({
        success: false,
        error: `Failed to fetch page: ${result.code}`
      });
    }
    res.json({
      success: true,
      data: {
        title: result.data.title,
        url: result.data.url,
        content: result.data.html,
        publishedTime: result.data.publishedTime,
        // usage may be absent on some responses; don't crash the handler.
        tokensUsed: result.data.usage?.tokens
      }
    });
  } catch (error) {
    res.status(500).json({
      success: false,
      error: error.message
    });
  }
});
// POST /api/read-multiple { urls: [...] } -> per-URL success/error results.
app.post('/api/read-multiple', async (req, res) => {
  try {
    const { urls } = req.body;
    if (!urls || !Array.isArray(urls)) {
      return res.status(400).json({
        error: 'URLs array is required'
      });
    }
    const settled = await Promise.allSettled(
      urls.map(async (url) => {
        // Per-URL try/catch so one failure never rejects the whole batch.
        try {
          const result = await zaiInstance.functions.invoke('page_reader', { url });
          return { url: url, success: true, data: result.data };
        } catch (error) {
          return { url: url, success: false, error: error.message };
        }
      })
    );
    res.json({
      success: true,
      results: settled.map(outcome => outcome.value)
    });
  } catch (error) {
    res.status(500).json({
      success: false,
      error: error.message
    });
  }
});
// Create the SDK client first, then start listening. Exit loudly if the
// client cannot be created (fix: the old code left the initZAI() rejection
// unhandled, silently failing startup).
initZAI()
  .then(() => {
    app.listen(3000, () => {
      console.log('Web reader API running on port 3000');
    });
  })
  .catch((error) => {
    console.error('Failed to initialize ZAI client:', error);
    process.exit(1);
  });
import ZAI from 'z-ai-web-dev-sdk';
import cron from 'node-cron';
/**
 * Fetches a set of URLs on cron schedules via the `page_reader` function,
 * keeping a rolling buffer of the last 100 fetch results.
 */
class ScheduledFetcher {
  constructor() {
    this.urls = [];    // [{ url, schedule }] registered before start()
    this.results = []; // most recent fetch outcomes, oldest first
  }
  async initialize() {
    this.zai = await ZAI.create();
  }
  // Register a URL with a cron expression; takes effect on start().
  addUrl(url, schedule) {
    this.urls.push({ url, schedule });
  }
  /**
   * Fetch one URL; never throws — failures are folded into the outcome.
   * @param {string} url - Page URL to fetch.
   * @returns {Promise<object>} { url, success, title/content | error, fetchedAt }
   */
  async fetchContent(url) {
    try {
      const response = await this.zai.functions.invoke('page_reader', { url: url });
      return {
        url: url,
        success: true,
        title: response.data.title,
        content: response.data.html,
        fetchedAt: new Date().toISOString()
      };
    } catch (error) {
      return {
        url: url,
        success: false,
        error: error.message,
        fetchedAt: new Date().toISOString()
      };
    }
  }
  // Install one cron job that fetches the URL and records the outcome.
  startScheduledFetch(url, schedule) {
    cron.schedule(schedule, async () => {
      console.log(`Fetching ${url}...`);
      const outcome = await this.fetchContent(url);
      this.results.push(outcome);
      // Keep only last 100 results
      if (this.results.length > 100) {
        this.results = this.results.slice(-100);
      }
      console.log(`Fetched: ${outcome.success ? outcome.title : outcome.error}`);
    });
  }
  // Kick off every registered schedule.
  start() {
    for (const entry of this.urls) {
      this.startScheduledFetch(entry.url, entry.schedule);
    }
  }
  getResults() {
    return this.results;
  }
}
// Usage
const scheduledFetcher = new ScheduledFetcher();
await scheduledFetcher.initialize();
// Fetch every hour
scheduledFetcher.addUrl('https://example.com/news', '0 * * * *');
// Fetch every day at midnight
scheduledFetcher.addUrl('https://example.com/daily', '0 0 * * *');
scheduledFetcher.start();
console.log('Scheduled fetching started');
Issue : "SDK must be used in backend"
Issue : Failed to fetch page (404, 403, etc.)
Issue : Incomplete or missing content
Issue : High token usage
Issue : Slow response times
Issue : Empty HTML content
Weekly Installs
1.4K
Repository
GitHub Stars
24
First Seen
Jan 23, 2026
Security Audits
Gen Agent Trust Hub: Fail · Socket: Pass · Snyk: Warn
Installed on
opencode1.2K
gemini-cli1.2K
codex1.2K
github-copilot1.2K
kimi-cli1.2K
amp1.2K
React 组合模式指南:Vercel 组件架构最佳实践,提升代码可维护性
102,200 周安装
AI智能体长期记忆系统 - 精英级架构,融合6种方法,永不丢失上下文
1,200 周安装
AI新闻播客制作技能:实时新闻转对话式播客脚本与音频生成
1,200 周安装
Word文档处理器:DOCX创建、编辑、分析与修订痕迹处理全指南 | 自动化办公解决方案
1,200 周安装
React Router 框架模式指南:全栈开发、文件路由、数据加载与渲染策略
1,200 周安装
Nano Banana AI 图像生成工具:使用 Gemini 3 Pro 生成与编辑高分辨率图像
1,200 周安装
SVG Logo Designer - AI 驱动的专业矢量标识设计工具,生成可缩放品牌标识
1,200 周安装
| data.publishedTime | string | Publication date (optional) |
| data.usage.tokens | number | Tokens used for processing |
| meta.usage.tokens | number | Total tokens used |