cheerio-parsing by mindrally/skills
npx skills add https://github.com/mindrally/skills --skill cheerio-parsing
您是 Cheerio、Node.js HTML 解析、DOM 操作以及为网络抓取构建高效数据提取管道的专家。
npm install cheerio axios
const cheerio = require('cheerio');
const axios = require('axios');
// Load from an HTML string
const $ = cheerio.load('<html><body><h1>Hello</h1></body></html>');
// Load with explicit parser options
const $ = cheerio.load(html, {
xmlMode: false, // false keeps HTML parsing; XML parsing is the `true` setting
decodeEntities: true, // decode HTML entities
lowerCaseTags: false, // keep tag case as written
lowerCaseAttributeNames: false // NOTE(review): presumably keeps attribute-name case as written — confirm against the parser options docs
});
// Fetch and parse
/**
 * Download a page and hand its body to Cheerio.
 *
 * @param {string} url - Address requested with `axios.get`.
 * @returns {Promise<*>} Whatever `cheerio.load` returns for the response's `data`.
 */
async function fetchAndParse(url) {
  const { data } = await axios.get(url);
  return cheerio.load(data);
}
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
// By tag name
$('h1')
// By class
$('.article')
// By ID
$('#main-content')
// By attribute
$('[data-id="123"]')
$('a[href^="https://"]') // href starts with "https://"
$('a[href$=".pdf"]') // href ends with ".pdf"
$('a[href*="example"]') // href contains "example"
// Combinators
$('div.article > h2') // h2 that is a direct child of div.article
$('div.article h2') // h2 anywhere inside div.article
$('h2 + p') // p immediately following an h2 (adjacent sibling)
$('h2 ~ p') // any p sibling after an h2 (general sibling)
// Pseudo-selectors
$('li:first-child')
$('li:last-child')
$('li:nth-child(2)')
$('li:nth-child(odd)')
$('tr:even')
$('input:not([type="hidden"])')
$('p:contains("specific text")')
// Select several element types at once
$('h1, h2, h3')
// Chain selections
$('.article').find('.title')
// Get text (includes text of child elements)
const text = $('h1').text();
// Get text with surrounding whitespace trimmed
const text = $('h1').text().trim();
// Get HTML
const html = $('div.content').html();
// Get outer HTML
const outerHtml = $.html($('div.content'));
// Get an attribute value
const href = $('a').attr('href');
const src = $('img').attr('src');
// Get a data attribute
const id = $('div').data('id'); // data-id attribute
// Check whether the element has a given class
const hasClass = $('div').hasClass('active');
// Iterate with each, collecting one record per .product element
const items = [];
$('.product').each((index, element) => {
items.push({
name: $(element).find('.name').text().trim(),
price: $(element).find('.price').text().trim(),
url: $(element).find('a').attr('href')
});
});
// Map to an array (.get() is called on the result of .map())
const titles = $('h2').map((i, el) => $(el).text()).get();
// Filter elements
const featured = $('.product').filter('.featured');
// First / last
const first = $('li').first();
const last = $('li').last();
// Get by zero-based index
const third = $('li').eq(2);
// Parents
$('span').parent()
$('span').parents() // all ancestors
$('span').parents('.container') // only ancestors matching .container
$('span').closest('.wrapper') // nearest ancestor matching the selector
// Children
$('ul').children() // direct children
$('ul').children('li.active') // direct children matching li.active
$('div').contents() // including text nodes
// Siblings
$('li').siblings()
$('li').next()
$('li').nextAll()
$('li').prev()
$('li').prevAll()
// Filter by selector
$('li').filter('.active')
// Filter by predicate function
$('li').filter((i, el) => $(el).data('price') > 100)
// Find within the current selection
$('.article').find('img')
// Check conditions
$('li').is('.active') // returns a boolean
$('li').has('span') // has a descendant matching the selector
/**
 * Extract the body rows of a table as objects keyed by header text.
 *
 * The loaded Cheerio function is read from `this`, so call it as
 * `extractTable.call($, '#prices')`.
 *
 * @param {string} tableSelector - Selector for the table to read.
 * @returns {Object[]} One object per `tbody tr`. Keys are the trimmed `th`
 *   texts and values the trimmed `td` texts. A cell whose column has no
 *   header text is stored under `column_<n>` (1-based position).
 * @throws {TypeError} If `this` is not a function.
 */
function extractTable(tableSelector) {
  const $ = this;
  if (typeof $ !== 'function') {
    throw new TypeError('extractTable must be called with the Cheerio instance as `this`');
  }
  const $table = $(tableSelector);

  // Header texts, in document order.
  const headers = [];
  $table.find('th').each((i, el) => {
    headers.push($(el).text().trim());
  });

  // One object per body row.
  const rows = [];
  $table.find('tbody tr').each((i, row) => {
    const rowData = {};
    $(row).find('td').each((j, cell) => {
      // Without a fallback, every header-less cell would be written to the
      // single key "undefined" and overwrite the previous one.
      const key = headers[j] || `column_${j + 1}`;
      rowData[key] = $(cell).text().trim();
    });
    rows.push(rowData);
  });
  return rows;
}
/**
 * Run `itemExtractor` over every element matched by `selector`.
 *
 * @param {string} selector - Selector passed to `$`.
 * @param {Function} itemExtractor - Receives each matched element wrapped with `$`.
 * @returns {Array} The extractor results, obtained via `.map(...).get()`.
 */
function extractList(selector, itemExtractor) {
  const toItem = (_index, node) => itemExtractor($(node));
  const mapped = $(selector).map(toItem);
  return mapped.get();
}
// Usage: build one record per `.product` element.
const products = extractList('.product', ($el) => ({
  name: $el.find('.name').text().trim(),
  // Strip everything except digits and the decimal point, so currency
  // symbols and thousands separators ("$1,299.50") do not truncate the number.
  price: parseFloat($el.find('.price').text().replace(/[^0-9.]/g, '')),
  image: $el.find('img').attr('src'),
  link: $el.find('a').attr('href'),
}));
/**
 * Collect the `href` of every `.pagination a` element, dropping missing or
 * empty values and any href that contains "#".
 *
 * @returns {string[]} The remaining href values, in match order.
 */
function extractPaginationLinks() {
  const hrefs = $('.pagination a')
    .map((_index, anchor) => $(anchor).attr('href'))
    .get();
  const isPageLink = (href) => {
    if (!href) return false;
    return !href.includes('#');
  };
  return hrefs.filter(isPageLink);
}
// Safe extraction with defaults
/**
 * Trimmed text of the selection, or a default when nothing matches.
 *
 * @param {string} selector - Selector passed to `$`.
 * @param {string} [defaultValue=''] - Returned when the selection is empty.
 * @returns {string} `text().trim()` of the selection, or `defaultValue`.
 */
function safeText(selector, defaultValue = '') {
  const matches = $(selector);
  if (!matches.length) {
    return defaultValue;
  }
  return matches.text().trim();
}
/**
 * Attribute value of the selection, or a default when it is unavailable.
 *
 * @param {string} selector - Selector passed to `$`.
 * @param {string} attr - Attribute name to read.
 * @param {*} [defaultValue=null] - Returned when nothing matches or when the
 *   attribute lookup yields `null`/`undefined`.
 * @returns {*} The attribute value, or `defaultValue`.
 */
function safeAttr(selector, attr, defaultValue = null) {
  const el = $(selector);
  if (!el.length) {
    return defaultValue;
  }
  // A matched element may still lack the attribute; apply the default there
  // too, so callers never receive `undefined`.
  return el.attr(attr) ?? defaultValue;
}
// Optional-chaining pattern: fall back to 'N/A' when the trimmed text is empty or missing
const price = $('.price').first().text()?.trim() || 'N/A';
const { URL } = require('url');
/**
 * Resolve a possibly-relative URL against a base.
 *
 * @param {string} baseUrl - Base used for resolution.
 * @param {string} [relativeUrl] - URL to resolve.
 * @returns {?string} The resolved `href`; `null` when `relativeUrl` is falsy;
 *   `relativeUrl` itself when the `URL` constructor throws.
 */
function resolveUrl(baseUrl, relativeUrl) {
  if (relativeUrl) {
    let resolved;
    try {
      resolved = new URL(relativeUrl, baseUrl).href;
    } catch {
      // Best effort: hand the input back unchanged when it cannot be resolved.
      resolved = relativeUrl;
    }
    return resolved;
  }
  return null;
}
// Usage: log the resolved form of the href of every <a> element
const baseUrl = 'https://example.com/products/';
$('a').each((i, el) => {
const href = $(el).attr('href');
const absoluteUrl = resolveUrl(baseUrl, href);
console.log(absoluteUrl);
});
// Cache the selection instead of re-running the selector
const $products = $('.product');
$products.each((i, el) => {
const $product = $(el); // wrap the element once
// ...then use $product multiple times
});
// Limit the search scope
const $article = $('.article');
const title = $article.find('.title').text(); // searches only within the article
// Use specific selectors
// Good
$('div.product > h2.title')
// Less efficient
$('div').find('.product').find('h2').filter('.title')
const cheerio = require('cheerio');
const axios = require('axios');
/**
 * Download a listing page and extract one record per `.product-card`.
 *
 * @param {string} url - Page to request; also the base for resolving product links.
 * @returns {Promise<Object[]>} Records shaped as
 *   `{ name, price, rating, image, url, inStock }`. `rating` is `null` when
 *   `data-rating` does not parse as a number; `url` is `null` when the card
 *   has no link or the link cannot be resolved.
 */
async function scrapeProducts(url) {
  const response = await axios.get(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)',
    },
  });
  const $ = cheerio.load(response.data);
  const products = [];
  $('.product-card').each((index, element) => {
    const $el = $(element);

    const rating = parseFloat($el.find('.rating').attr('data-rating'));

    // A missing href must not be resolved: `new URL(undefined, url)` yields
    // "<url>/undefined". A malformed href should only cost this card its
    // link, not abort the whole scrape.
    const href = $el.find('a').attr('href');
    let productUrl = null;
    if (href) {
      try {
        productUrl = new URL(href, url).href;
      } catch {
        productUrl = null;
      }
    }

    products.push({
      name: $el.find('.product-title').text().trim(),
      price: parseFloat($el.find('.price').text().replace(/[^0-9.]/g, '')),
      // `|| null` would also discard a legitimate rating of 0.
      rating: Number.isNaN(rating) ? null : rating,
      image: $el.find('img').attr('src'),
      url: productUrl,
      inStock: !$el.find('.out-of-stock').length,
    });
  });
  return products;
}
// With error handling
/**
 * Run `scrapeProducts` and report any failure instead of propagating it.
 *
 * @param {string} url - Page passed to `scrapeProducts`.
 * @returns {Promise<Array>} The scraped products, or `[]` after logging the
 *   error message with `console.error`.
 */
async function safeScrape(url) {
  let products = [];
  try {
    products = await scrapeProducts(url);
  } catch (failure) {
    console.error(`Failed to scrape ${url}:`, failure.message);
  }
  return products;
}
每周安装次数
172
代码仓库
GitHub 星标数
42
首次出现
2026年1月25日
安全审计
安装于
gemini-cli: 145
opencode: 144
codex: 140
github-copilot: 134
cursor: 133
kimi-cli: 125
You are an expert in Cheerio, Node.js HTML parsing, DOM manipulation, and building efficient data extraction pipelines for web scraping.
npm install cheerio axios
const cheerio = require('cheerio');
const axios = require('axios');
// Load from an HTML string
const $ = cheerio.load('<html><body><h1>Hello</h1></body></html>');
// Load with explicit parser options
const $ = cheerio.load(html, {
xmlMode: false, // false keeps HTML parsing; XML parsing is the `true` setting
decodeEntities: true, // decode HTML entities
lowerCaseTags: false, // keep tag case as written
lowerCaseAttributeNames: false // NOTE(review): presumably keeps attribute-name case as written — confirm against the parser options docs
});
// Fetch and parse
/**
 * Download a page and hand its body to Cheerio.
 *
 * @param {string} url - Address requested with `axios.get`.
 * @returns {Promise<*>} Whatever `cheerio.load` returns for the response's `data`.
 */
async function fetchAndParse(url) {
  const { data } = await axios.get(url);
  return cheerio.load(data);
}
// By tag name
$('h1')
// By class
$('.article')
// By ID
$('#main-content')
// By attribute
$('[data-id="123"]')
$('a[href^="https://"]') // href starts with "https://"
$('a[href$=".pdf"]') // href ends with ".pdf"
$('a[href*="example"]') // href contains "example"
// Combinators
$('div.article > h2') // h2 that is a direct child of div.article
$('div.article h2') // h2 anywhere inside div.article
$('h2 + p') // p immediately following an h2 (adjacent sibling)
$('h2 ~ p') // any p sibling after an h2 (general sibling)
// Pseudo-selectors
$('li:first-child')
$('li:last-child')
$('li:nth-child(2)')
$('li:nth-child(odd)')
$('tr:even')
$('input:not([type="hidden"])')
$('p:contains("specific text")')
// Select several element types at once
$('h1, h2, h3')
// Chain selections
$('.article').find('.title')
// Get text (includes text of child elements)
const text = $('h1').text();
// Get text with surrounding whitespace trimmed
const text = $('h1').text().trim();
// Get HTML
const html = $('div.content').html();
// Get outer HTML
const outerHtml = $.html($('div.content'));
// Get an attribute value
const href = $('a').attr('href');
const src = $('img').attr('src');
// Get a data attribute
const id = $('div').data('id'); // data-id attribute
// Check whether the element has a given class
const hasClass = $('div').hasClass('active');
// Iterate with each, collecting one record per .product element
const items = [];
$('.product').each((index, element) => {
items.push({
name: $(element).find('.name').text().trim(),
price: $(element).find('.price').text().trim(),
url: $(element).find('a').attr('href')
});
});
// Map to an array (.get() is called on the result of .map())
const titles = $('h2').map((i, el) => $(el).text()).get();
// Filter elements
const featured = $('.product').filter('.featured');
// First / last
const first = $('li').first();
const last = $('li').last();
// Get by zero-based index
const third = $('li').eq(2);
// Parents
$('span').parent()
$('span').parents() // all ancestors
$('span').parents('.container') // only ancestors matching .container
$('span').closest('.wrapper') // nearest ancestor matching the selector
// Children
$('ul').children() // direct children
$('ul').children('li.active') // direct children matching li.active
$('div').contents() // including text nodes
// Siblings
$('li').siblings()
$('li').next()
$('li').nextAll()
$('li').prev()
$('li').prevAll()
// Filter by selector
$('li').filter('.active')
// Filter by predicate function
$('li').filter((i, el) => $(el).data('price') > 100)
// Find within the current selection
$('.article').find('img')
// Check conditions
$('li').is('.active') // returns a boolean
$('li').has('span') // has a descendant matching the selector
/**
 * Extract the body rows of a table as objects keyed by header text.
 *
 * The loaded Cheerio function is read from `this`, so call it as
 * `extractTable.call($, '#prices')`.
 *
 * @param {string} tableSelector - Selector for the table to read.
 * @returns {Object[]} One object per `tbody tr`. Keys are the trimmed `th`
 *   texts and values the trimmed `td` texts. A cell whose column has no
 *   header text is stored under `column_<n>` (1-based position).
 * @throws {TypeError} If `this` is not a function.
 */
function extractTable(tableSelector) {
  const $ = this;
  if (typeof $ !== 'function') {
    throw new TypeError('extractTable must be called with the Cheerio instance as `this`');
  }
  const $table = $(tableSelector);

  // Header texts, in document order.
  const headers = [];
  $table.find('th').each((i, el) => {
    headers.push($(el).text().trim());
  });

  // One object per body row.
  const rows = [];
  $table.find('tbody tr').each((i, row) => {
    const rowData = {};
    $(row).find('td').each((j, cell) => {
      // Without a fallback, every header-less cell would be written to the
      // single key "undefined" and overwrite the previous one.
      const key = headers[j] || `column_${j + 1}`;
      rowData[key] = $(cell).text().trim();
    });
    rows.push(rowData);
  });
  return rows;
}
/**
 * Run `itemExtractor` over every element matched by `selector`.
 *
 * @param {string} selector - Selector passed to `$`.
 * @param {Function} itemExtractor - Receives each matched element wrapped with `$`.
 * @returns {Array} The extractor results, obtained via `.map(...).get()`.
 */
function extractList(selector, itemExtractor) {
  const toItem = (_index, node) => itemExtractor($(node));
  const mapped = $(selector).map(toItem);
  return mapped.get();
}
// Usage: build one record per `.product` element.
const products = extractList('.product', ($el) => ({
  name: $el.find('.name').text().trim(),
  // Strip everything except digits and the decimal point, so currency
  // symbols and thousands separators ("$1,299.50") do not truncate the number.
  price: parseFloat($el.find('.price').text().replace(/[^0-9.]/g, '')),
  image: $el.find('img').attr('src'),
  link: $el.find('a').attr('href'),
}));
/**
 * Collect the `href` of every `.pagination a` element, dropping missing or
 * empty values and any href that contains "#".
 *
 * @returns {string[]} The remaining href values, in match order.
 */
function extractPaginationLinks() {
  const hrefs = $('.pagination a')
    .map((_index, anchor) => $(anchor).attr('href'))
    .get();
  const isPageLink = (href) => {
    if (!href) return false;
    return !href.includes('#');
  };
  return hrefs.filter(isPageLink);
}
// Safe extraction with defaults
/**
 * Trimmed text of the selection, or a default when nothing matches.
 *
 * @param {string} selector - Selector passed to `$`.
 * @param {string} [defaultValue=''] - Returned when the selection is empty.
 * @returns {string} `text().trim()` of the selection, or `defaultValue`.
 */
function safeText(selector, defaultValue = '') {
  const matches = $(selector);
  if (!matches.length) {
    return defaultValue;
  }
  return matches.text().trim();
}
/**
 * Attribute value of the selection, or a default when it is unavailable.
 *
 * @param {string} selector - Selector passed to `$`.
 * @param {string} attr - Attribute name to read.
 * @param {*} [defaultValue=null] - Returned when nothing matches or when the
 *   attribute lookup yields `null`/`undefined`.
 * @returns {*} The attribute value, or `defaultValue`.
 */
function safeAttr(selector, attr, defaultValue = null) {
  const el = $(selector);
  if (!el.length) {
    return defaultValue;
  }
  // A matched element may still lack the attribute; apply the default there
  // too, so callers never receive `undefined`.
  return el.attr(attr) ?? defaultValue;
}
// Optional-chaining pattern: fall back to 'N/A' when the trimmed text is empty or missing
const price = $('.price').first().text()?.trim() || 'N/A';
const { URL } = require('url');
/**
 * Resolve a possibly-relative URL against a base.
 *
 * @param {string} baseUrl - Base used for resolution.
 * @param {string} [relativeUrl] - URL to resolve.
 * @returns {?string} The resolved `href`; `null` when `relativeUrl` is falsy;
 *   `relativeUrl` itself when the `URL` constructor throws.
 */
function resolveUrl(baseUrl, relativeUrl) {
  if (relativeUrl) {
    let resolved;
    try {
      resolved = new URL(relativeUrl, baseUrl).href;
    } catch {
      // Best effort: hand the input back unchanged when it cannot be resolved.
      resolved = relativeUrl;
    }
    return resolved;
  }
  return null;
}
// Usage: log the resolved form of the href of every <a> element
const baseUrl = 'https://example.com/products/';
$('a').each((i, el) => {
const href = $(el).attr('href');
const absoluteUrl = resolveUrl(baseUrl, href);
console.log(absoluteUrl);
});
// Cache the selection instead of re-running the selector
const $products = $('.product');
$products.each((i, el) => {
const $product = $(el); // wrap the element once
// ...then use $product multiple times
});
// Limit the search scope
const $article = $('.article');
const title = $article.find('.title').text(); // searches only within the article
// Use specific selectors
// Good
$('div.product > h2.title')
// Less efficient
$('div').find('.product').find('h2').filter('.title')
const cheerio = require('cheerio');
const axios = require('axios');
/**
 * Download a listing page and extract one record per `.product-card`.
 *
 * @param {string} url - Page to request; also the base for resolving product links.
 * @returns {Promise<Object[]>} Records shaped as
 *   `{ name, price, rating, image, url, inStock }`. `rating` is `null` when
 *   `data-rating` does not parse as a number; `url` is `null` when the card
 *   has no link or the link cannot be resolved.
 */
async function scrapeProducts(url) {
  const response = await axios.get(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)',
    },
  });
  const $ = cheerio.load(response.data);
  const products = [];
  $('.product-card').each((index, element) => {
    const $el = $(element);

    const rating = parseFloat($el.find('.rating').attr('data-rating'));

    // A missing href must not be resolved: `new URL(undefined, url)` yields
    // "<url>/undefined". A malformed href should only cost this card its
    // link, not abort the whole scrape.
    const href = $el.find('a').attr('href');
    let productUrl = null;
    if (href) {
      try {
        productUrl = new URL(href, url).href;
      } catch {
        productUrl = null;
      }
    }

    products.push({
      name: $el.find('.product-title').text().trim(),
      price: parseFloat($el.find('.price').text().replace(/[^0-9.]/g, '')),
      // `|| null` would also discard a legitimate rating of 0.
      rating: Number.isNaN(rating) ? null : rating,
      image: $el.find('img').attr('src'),
      url: productUrl,
      inStock: !$el.find('.out-of-stock').length,
    });
  });
  return products;
}
// With error handling
/**
 * Run `scrapeProducts` and report any failure instead of propagating it.
 *
 * @param {string} url - Page passed to `scrapeProducts`.
 * @returns {Promise<Array>} The scraped products, or `[]` after logging the
 *   error message with `console.error`.
 */
async function safeScrape(url) {
  let products = [];
  try {
    products = await scrapeProducts(url);
  } catch (failure) {
    console.error(`Failed to scrape ${url}:`, failure.message);
  }
  return products;
}
Weekly Installs
172
Repository
GitHub Stars
42
First Seen
Jan 25, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Warn
Installed on
gemini-cli: 145
opencode: 144
codex: 140
github-copilot: 134
cursor: 133
kimi-cli: 125
通过 LiteLLM 代理让 Claude Code 对接 GitHub Copilot 运行 | 高级变通方案指南
33,600 周安装