beautifulsoup-parsing by mindrally/skills
npx skills add https://github.com/mindrally/skills --skill beautifulsoup-parsing

您是 BeautifulSoup、Python HTML/XML 解析、DOM 导航以及为网络爬虫构建高效数据提取管道的专家。
pip install beautifulsoup4 requests lxml
from bs4 import BeautifulSoup
import requests
# 从字符串
html = '<html><body><h1>Hello</h1></body></html>'
soup = BeautifulSoup(html, 'lxml')
# 从文件
with open('page.html', 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'lxml')
# 从 URL
response = requests.get('https://example.com')
soup = BeautifulSoup(response.content, 'lxml')
# lxml - 快速,容错性强(推荐)
soup = BeautifulSoup(html, 'lxml')
# html.parser - 内置,无依赖
soup = BeautifulSoup(html, 'html.parser')
# html5lib - 最容错,最慢
soup = BeautifulSoup(html, 'html5lib')
# lxml-xml - 用于 XML 文档
soup = BeautifulSoup(xml, 'lxml-xml')
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
# 第一个匹配的元素
soup.find('h1')
# 所有匹配的元素
soup.find_all('p')
# 简写
soup.h1 # 等同于 soup.find('h1')
# 按类
soup.find('div', class_='article')
soup.find_all('div', class_='article')
# 按 ID
soup.find(id='main-content')
# 按任意属性
soup.find('a', href='https://example.com')
soup.find_all('input', attrs={'type': 'text', 'name': 'email'})
# 按 data 属性
soup.find('div', attrs={'data-id': '123'})
# 单个元素
soup.select_one('div.article > h2')
# 多个元素
soup.select('div.article h2')
# 复杂选择器
soup.select('a[href^="https://"]') # 以...开头
soup.select('a[href$=".pdf"]') # 以...结尾
soup.select('a[href*="example"]') # 包含
soup.select('li:nth-child(2)')
soup.select('h1, h2, h3') # 多个
import re
# 按正则表达式
soup.find_all('a', href=re.compile(r'^https://'))
# 按函数
def has_data_attr(tag):
    """find_all predicate: True when *tag* carries a ``data-id`` attribute."""
    return tag.has_attr('data-id')
soup.find_all(has_data_attr)
# 字符串匹配
soup.find_all(string='exact text')
soup.find_all(string=re.compile('pattern'))
# 获取文本
element.text
element.get_text()
# 获取带分隔符的文本
element.get_text(separator=' ')
# 获取去除空白的文本
element.get_text(strip=True)
# 获取字符串(生成器)
for string in element.stripped_strings:
print(string)
# 获取属性
element['href']
element.get('href') # 如果缺失则返回 None
element.get('href', 'default') # 带默认值
# 获取所有属性
element.attrs # 返回字典
# 检查属性是否存在
element.has_attr('class')
# 内部 HTML
str(element)
# 仅标签名
element.name
# 格式化的 HTML
element.prettify()
element.parent
element.parents # 所有祖先的生成器
# 查找特定祖先
for parent in element.parents:
if parent.name == 'div' and 'article' in parent.get('class', []):
break
element.children # 直接子级(生成器)
list(element.children)
element.contents # 直接子级(列表)
element.descendants # 所有后代(生成器)
# 在子级中查找
element.find('span') # 搜索后代
element.next_sibling
element.previous_sibling
element.next_siblings # 生成器
element.previous_siblings # 生成器
# 下一个/上一个元素(跳过空白)
element.next_element
element.previous_element
def safe_text(element, selector, default=''):
    """Return the stripped text of the first *selector* match under *element*.

    Falls back to *default* when nothing matches, so callers never have to
    guard against ``select_one`` returning ``None``.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get_text(strip=True)
def safe_attr(element, selector, attr, default=None):
    """Return attribute *attr* of the first *selector* match, or *default*.

    *default* is used both when the selector matches nothing and when the
    matched element lacks the attribute.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get(attr, default)
def extract_table(table):
    """Extract table data as a list of dicts keyed by header text.

    Headers come from every ``<th>`` in the table; each body row maps
    headers to its ``<td>`` texts (``zip`` truncates to the shorter side).
    Rows without ``<td>`` cells (e.g. header rows) are skipped.
    """
    headers = [th.get_text(strip=True) for th in table.select('th')]
    # Parsers such as html.parser do not synthesize an implicit <tbody>,
    # so 'tbody tr' can match nothing even though the table has rows;
    # fall back to all <tr> elements in that case (backward-compatible:
    # when a <tbody> exists the original selector is used unchanged).
    body_rows = table.select('tbody tr') or table.select('tr')
    rows = []
    for tr in body_rows:
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        if cells:
            rows.append(dict(zip(headers, cells)))
    return rows
def extract_items(soup, selector, extractor):
    """Apply *extractor* to every element matching *selector*, in document order."""
    return list(map(extractor, soup.select(selector)))
# 用法示例
def extract_product(item):
    """Map one product-card element to a plain dict of its display fields."""
    name = safe_text(item, '.name')
    price = safe_text(item, '.price')
    url = safe_attr(item, 'a', 'href')
    return {'name': name, 'price': price, 'url': url}
products = extract_items(soup, '.product', extract_product)
from urllib.parse import urljoin
def resolve_url(base_url, relative_url):
    """Join *relative_url* onto *base_url*; empty/None input yields None."""
    return urljoin(base_url, relative_url) if relative_url else None
# 用法示例
base_url = 'https://example.com/products/'
for link in soup.select('a'):
href = link.get('href')
absolute_url = resolve_url(base_url, href)
print(absolute_url)
# lxml 解析器对格式错误的 HTML 容错性强
soup = BeautifulSoup(malformed_html, 'lxml')
# 对于非常破碎的 HTML,使用 html5lib
soup = BeautifulSoup(very_broken_html, 'html5lib')
# 处理编码问题
response = requests.get(url)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'lxml')
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
class ProductScraper:
    """Scrape product listings from a site rooted at *base_url*.

    A shared ``requests.Session`` carries a fixed User-Agent across all
    page fetches; relative image/link URLs are resolved against *base_url*.
    """

    def __init__(self, base_url):
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)'
        })

    def fetch_page(self, url):
        """Fetch *url* through the shared session and parse it with lxml."""
        resp = self.session.get(url, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'lxml')

    def extract_product(self, item):
        """Turn one product-card element into a plain dict of fields."""
        return {
            'name': self._safe_text(item, '.product-title'),
            'price': self._parse_price(item.select_one('.price')),
            'rating': self._safe_attr(item, '.rating', 'data-rating'),
            'image': self._resolve(self._safe_attr(item, 'img', 'src')),
            'url': self._resolve(self._safe_attr(item, 'a', 'href')),
            'in_stock': not item.select_one('.out-of-stock')
        }

    def scrape_products(self, url):
        """Fetch *url* and extract every '.product-card' element on it."""
        page = self.fetch_page(url)
        return [self.extract_product(card) for card in page.select('.product-card')]

    def _safe_text(self, element, selector, default=''):
        # Stripped text of the first selector match, or *default* when absent.
        node = element.select_one(selector)
        if not node:
            return default
        return node.get_text(strip=True)

    def _safe_attr(self, element, selector, attr, default=None):
        # Attribute of the first selector match, or *default* when absent.
        node = element.select_one(selector)
        if not node:
            return default
        return node.get(attr, default)

    def _parse_price(self, element):
        # '$1,234.56' -> 1234.56; None for a missing element or bad text.
        if not element:
            return None
        raw = element.get_text(strip=True).replace('$', '').replace(',', '')
        try:
            return float(raw)
        except ValueError:
            return None

    def _resolve(self, url):
        # Absolute URL against the scraper's base; None/empty stays None.
        return urljoin(self.base_url, url) if url else None
# 用法示例
scraper = ProductScraper('https://example.com')
products = scraper.scrape_products('https://example.com/products')
for product in products:
print(product)
# 使用 SoupStrainer 仅解析需要的元素
from bs4 import SoupStrainer
only_articles = SoupStrainer('article')
soup = BeautifulSoup(html, 'lxml', parse_only=only_articles)
# 使用 lxml 解析器以提高速度
soup = BeautifulSoup(html, 'lxml') # 最快
# 分解不需要的元素
for script in soup.find_all('script'):
script.decompose()
# 使用生成器以提高内存效率
for item in soup.select('.item'):
yield extract_data(item)
使用 select() 和 select_one() 进行 CSS 选择器查询;使用 get_text(strip=True) 进行干净的文本提取。

每周安装次数
136
代码仓库
GitHub 星标数
42
首次出现
2026年1月25日
安全审计
安装于
opencode: 114
gemini-cli: 114
codex: 107
cursor: 106
github-copilot: 99
claude-code: 94
You are an expert in BeautifulSoup, Python HTML/XML parsing, DOM navigation, and building efficient data extraction pipelines for web scraping.
pip install beautifulsoup4 requests lxml
from bs4 import BeautifulSoup
import requests
# From string
html = '<html><body><h1>Hello</h1></body></html>'
soup = BeautifulSoup(html, 'lxml')
# From file
with open('page.html', 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'lxml')
# From URL
response = requests.get('https://example.com')
soup = BeautifulSoup(response.content, 'lxml')
# lxml - Fast, lenient (recommended)
soup = BeautifulSoup(html, 'lxml')
# html.parser - Built-in, no dependencies
soup = BeautifulSoup(html, 'html.parser')
# html5lib - Most lenient, slowest
soup = BeautifulSoup(html, 'html5lib')
# lxml-xml - For XML documents
soup = BeautifulSoup(xml, 'lxml-xml')
# First matching element
soup.find('h1')
# All matching elements
soup.find_all('p')
# Shorthand
soup.h1 # Same as soup.find('h1')
# By class
soup.find('div', class_='article')
soup.find_all('div', class_='article')
# By ID
soup.find(id='main-content')
# By any attribute
soup.find('a', href='https://example.com')
soup.find_all('input', attrs={'type': 'text', 'name': 'email'})
# By data attributes
soup.find('div', attrs={'data-id': '123'})
# Single element
soup.select_one('div.article > h2')
# Multiple elements
soup.select('div.article h2')
# Complex selectors
soup.select('a[href^="https://"]') # Starts with
soup.select('a[href$=".pdf"]') # Ends with
soup.select('a[href*="example"]') # Contains
soup.select('li:nth-child(2)')
soup.select('h1, h2, h3') # Multiple
import re
# By regex
soup.find_all('a', href=re.compile(r'^https://'))
# By function
def has_data_attr(tag):
    """find_all predicate: True when *tag* carries a ``data-id`` attribute."""
    return tag.has_attr('data-id')
soup.find_all(has_data_attr)
# String matching
soup.find_all(string='exact text')
soup.find_all(string=re.compile('pattern'))
# Get text
element.text
element.get_text()
# Get text with separator
element.get_text(separator=' ')
# Get stripped text
element.get_text(strip=True)
# Get strings (generator)
for string in element.stripped_strings:
print(string)
# Get attribute
element['href']
element.get('href') # Returns None if missing
element.get('href', 'default') # With default
# Get all attributes
element.attrs # Returns dict
# Check attribute exists
element.has_attr('class')
# Inner HTML
str(element)
# Just the tag
element.name
# Prettified HTML
element.prettify()
element.parent
element.parents # Generator of all ancestors
# Find specific ancestor
for parent in element.parents:
if parent.name == 'div' and 'article' in parent.get('class', []):
break
element.children # Direct children (generator)
list(element.children)
element.contents # Direct children (list)
element.descendants # All descendants (generator)
# Find in children
element.find('span') # Searches descendants
element.next_sibling
element.previous_sibling
element.next_siblings # Generator
element.previous_siblings # Generator
# Next/previous element (skips whitespace)
element.next_element
element.previous_element
def safe_text(element, selector, default=''):
    """Return the stripped text of the first *selector* match under *element*.

    Falls back to *default* when nothing matches, so callers never have to
    guard against ``select_one`` returning ``None``.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get_text(strip=True)
def safe_attr(element, selector, attr, default=None):
    """Return attribute *attr* of the first *selector* match, or *default*.

    *default* is used both when the selector matches nothing and when the
    matched element lacks the attribute.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get(attr, default)
def extract_table(table):
    """Extract table data as a list of dicts keyed by header text.

    Headers come from every ``<th>`` in the table; each body row maps
    headers to its ``<td>`` texts (``zip`` truncates to the shorter side).
    Rows without ``<td>`` cells (e.g. header rows) are skipped.
    """
    headers = [th.get_text(strip=True) for th in table.select('th')]
    # Parsers such as html.parser do not synthesize an implicit <tbody>,
    # so 'tbody tr' can match nothing even though the table has rows;
    # fall back to all <tr> elements in that case (backward-compatible:
    # when a <tbody> exists the original selector is used unchanged).
    body_rows = table.select('tbody tr') or table.select('tr')
    rows = []
    for tr in body_rows:
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        if cells:
            rows.append(dict(zip(headers, cells)))
    return rows
def extract_items(soup, selector, extractor):
    """Apply *extractor* to every element matching *selector*, in document order."""
    return list(map(extractor, soup.select(selector)))
# Usage
def extract_product(item):
    """Map one product-card element to a plain dict of its display fields."""
    name = safe_text(item, '.name')
    price = safe_text(item, '.price')
    url = safe_attr(item, 'a', 'href')
    return {'name': name, 'price': price, 'url': url}
products = extract_items(soup, '.product', extract_product)
from urllib.parse import urljoin
def resolve_url(base_url, relative_url):
    """Join *relative_url* onto *base_url*; empty/None input yields None."""
    return urljoin(base_url, relative_url) if relative_url else None
# Usage
base_url = 'https://example.com/products/'
for link in soup.select('a'):
href = link.get('href')
absolute_url = resolve_url(base_url, href)
print(absolute_url)
# lxml parser is lenient with malformed HTML
soup = BeautifulSoup(malformed_html, 'lxml')
# For very broken HTML, use html5lib
soup = BeautifulSoup(very_broken_html, 'html5lib')
# Handle encoding issues
response = requests.get(url)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'lxml')
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
class ProductScraper:
    """Scrape product listings from a site rooted at *base_url*.

    A shared ``requests.Session`` carries a fixed User-Agent across all
    page fetches; relative image/link URLs are resolved against *base_url*.
    """

    def __init__(self, base_url):
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)'
        })

    def fetch_page(self, url):
        """Fetch *url* through the shared session and parse it with lxml."""
        resp = self.session.get(url, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.content, 'lxml')

    def extract_product(self, item):
        """Turn one product-card element into a plain dict of fields."""
        return {
            'name': self._safe_text(item, '.product-title'),
            'price': self._parse_price(item.select_one('.price')),
            'rating': self._safe_attr(item, '.rating', 'data-rating'),
            'image': self._resolve(self._safe_attr(item, 'img', 'src')),
            'url': self._resolve(self._safe_attr(item, 'a', 'href')),
            'in_stock': not item.select_one('.out-of-stock')
        }

    def scrape_products(self, url):
        """Fetch *url* and extract every '.product-card' element on it."""
        page = self.fetch_page(url)
        return [self.extract_product(card) for card in page.select('.product-card')]

    def _safe_text(self, element, selector, default=''):
        # Stripped text of the first selector match, or *default* when absent.
        node = element.select_one(selector)
        if not node:
            return default
        return node.get_text(strip=True)

    def _safe_attr(self, element, selector, attr, default=None):
        # Attribute of the first selector match, or *default* when absent.
        node = element.select_one(selector)
        if not node:
            return default
        return node.get(attr, default)

    def _parse_price(self, element):
        # '$1,234.56' -> 1234.56; None for a missing element or bad text.
        if not element:
            return None
        raw = element.get_text(strip=True).replace('$', '').replace(',', '')
        try:
            return float(raw)
        except ValueError:
            return None

    def _resolve(self, url):
        # Absolute URL against the scraper's base; None/empty stays None.
        return urljoin(self.base_url, url) if url else None
# Usage
scraper = ProductScraper('https://example.com')
products = scraper.scrape_products('https://example.com/products')
for product in products:
print(product)
# Use SoupStrainer to parse only needed elements
from bs4 import SoupStrainer
only_articles = SoupStrainer('article')
soup = BeautifulSoup(html, 'lxml', parse_only=only_articles)
# Use lxml parser for speed
soup = BeautifulSoup(html, 'lxml') # Fastest
# Decompose unneeded elements
for script in soup.find_all('script'):
script.decompose()
# Use generators for memory efficiency
for item in soup.select('.item'):
yield extract_data(item)
Use select() and select_one() for CSS selector queries; use get_text(strip=True) for clean text extraction.

Weekly Installs
136
Repository
GitHub Stars
42
First Seen
Jan 25, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Warn
Installed on
opencode: 114
gemini-cli: 114
codex: 107
cursor: 106
github-copilot: 99
claude-code: 94
agent-browser 浏览器自动化工具 - Vercel Labs 命令行网页操作与测试
159,700 周安装
依赖解析器技能:推送前自动检测修复本地与CI环境不匹配,节省45分钟调试时间
85 周安装
dotnet-install:自动化 .NET SDK 和运行时安装指南 - 支持 Windows/macOS/Linux
85 周安装
UltraThink Orchestrator - 已弃用的AI工作流编排工具,推荐使用dev-orchestrator替代
85 周安装
数据库管理员技能:PostgreSQL/MySQL/MongoDB高可用架构、性能调优与灾难恢复
86 周安装
PDF编程技能:使用PDFKit、PDF.js、Puppeteer生成、解析、合并PDF文档
86 周安装
Spring Boot 3 工程师技能指南:微服务、云原生与响应式编程最佳实践
86 周安装