duckdb by silvainfm/claude-skills
npx skills add https://github.com/silvainfm/claude-skills --skill duckdb

DuckDB 是一个高性能的进程内分析数据库管理系统(通常被称为“用于分析的 SQLite”)。可以直接在 CSV、Parquet、JSON 文件以及 Python DataFrame(pandas、Polars)上执行复杂的 SQL 查询,而无需导入数据或运行单独的数据库服务器。
当用户出现以下情况时激活此技能:
检查是否已安装 DuckDB:
python3 -c "import duckdb; print(duckdb.__version__)"
如果未安装:
pip3 install duckdb
如需 Polars 集成:
pip3 install duckdb 'polars[pyarrow]'
DuckDB 可以在不将文件加载到内存的情况下查询它们:
import duckdb
# 查询 CSV 文件
result = duckdb.sql("SELECT * FROM 'data.csv' WHERE age > 25")
print(result.df()) # 转换为 pandas DataFrame
# 查询 Parquet 文件
result = duckdb.sql("""
SELECT category, SUM(amount) as total
FROM 'sales.parquet'
GROUP BY category
ORDER BY total DESC
""")
# 查询 JSON 文件
result = duckdb.sql("SELECT * FROM 'users.json' LIMIT 10")
# 使用通配符查询多个文件
result = duckdb.sql("SELECT * FROM 'data/*.parquet'")
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
DuckDB 可以直接查询 pandas DataFrame:
import duckdb
import pandas as pd
# 创建或加载一个 DataFrame
df = pd.read_csv('data.csv')
# 使用 SQL 查询 DataFrame
result = duckdb.sql("""
SELECT
category,
AVG(price) as avg_price,
COUNT(*) as count
FROM df
WHERE price > 100
GROUP BY category
HAVING count > 5
""")
# 将结果转换为 pandas DataFrame
result_df = result.df()
print(result_df)
DuckDB 通过 Apache Arrow 与 Polars 无缝集成:
import duckdb
import polars as pl
# 创建或加载一个 Polars DataFrame
df = pl.read_csv('data.csv')
# 使用 DuckDB 查询 Polars DataFrame
result = duckdb.sql("""
SELECT
date_trunc('month', date) as month,
SUM(revenue) as monthly_revenue
FROM df
GROUP BY month
ORDER BY month
""")
# 将结果转换为 Polars DataFrame
result_df = result.pl()
# 对于惰性求值,使用 lazy=True
lazy_result = result.pl(lazy=True)
为持久化存储创建数据库文件:
import duckdb
# 连接到持久化数据库(如果文件不存在则创建)
con = duckdb.connect('my_database.duckdb')
# 创建表并插入数据
con.execute("""
CREATE TABLE users AS
SELECT * FROM 'users.csv'
""")
# 查询数据库
result = con.execute("SELECT * FROM users WHERE age > 30").fetchdf()
# 关闭连接
con.close()
DuckDB 擅长分析查询:
import duckdb
# 窗口函数
result = duckdb.sql("""
SELECT
name,
department,
salary,
AVG(salary) OVER (PARTITION BY department) as dept_avg,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) as dept_rank
FROM 'employees.csv'
""")
# CTE 和子查询
result = duckdb.sql("""
WITH monthly_sales AS (
SELECT
date_trunc('month', sale_date) as month,
product_id,
SUM(amount) as total_sales
FROM 'sales.parquet'
GROUP BY month, product_id
)
SELECT
m.month,
p.product_name,
m.total_sales,
LAG(m.total_sales) OVER (
PARTITION BY m.product_id
ORDER BY m.month
) as prev_month_sales
FROM monthly_sales m
JOIN 'products.csv' p ON m.product_id = p.id
ORDER BY m.month DESC, m.total_sales DESC
""")
连接来自多个文件和 DataFrame 的数据:
import duckdb
import pandas as pd
# 加载 DataFrame
customers_df = pd.read_csv('customers.csv')
# 将 DataFrame 与 Parquet 文件连接
result = duckdb.sql("""
SELECT
c.customer_name,
c.email,
o.order_date,
o.total_amount
FROM customers_df c
JOIN 'orders.parquet' o ON c.customer_id = o.customer_id
WHERE o.order_date >= '2024-01-01'
ORDER BY o.order_date DESC
""")
import duckdb
# 获取表结构
duckdb.sql("DESCRIBE SELECT * FROM 'data.parquet'").show()
# 快速统计
duckdb.sql("""
SELECT
COUNT(*) as rows,
COUNT(DISTINCT user_id) as unique_users,
MIN(created_at) as earliest_date,
MAX(created_at) as latest_date
FROM 'data.csv'
""").show()
# 抽样数据
duckdb.sql("SELECT * FROM 'large_file.parquet' USING SAMPLE 1000").show()
import duckdb
# 使用 DuckDB 的 ETL 流水线
con = duckdb.connect('analytics.duckdb')
# 提取和转换
con.execute("""
CREATE TABLE clean_sales AS
SELECT
date_trunc('day', timestamp) as sale_date,
UPPER(TRIM(product_name)) as product_name,
quantity,
price,
quantity * price as total_amount,
CASE
WHEN quantity > 10 THEN 'bulk'
ELSE 'retail'
END as sale_type
FROM 'raw_sales.csv'
WHERE price > 0 AND quantity > 0
""")
# 创建聚合视图
con.execute("""
CREATE VIEW daily_summary AS
SELECT
sale_date,
sale_type,
COUNT(*) as num_sales,
SUM(total_amount) as revenue
FROM clean_sales
GROUP BY sale_date, sale_type
""")
result = con.execute("SELECT * FROM daily_summary ORDER BY sale_date DESC").fetchdf()
con.close()
import duckdb
import polars as pl
# 使用 Polars 读取多个 parquet 文件
df = pl.read_parquet('data/*.parquet')
# 使用 DuckDB 进行复杂的 SQL 分析
result = duckdb.sql("""
SELECT
customer_segment,
product_category,
COUNT(DISTINCT customer_id) as customers,
SUM(revenue) as total_revenue,
AVG(revenue) as avg_revenue,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY revenue) as median_revenue
FROM df
WHERE order_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY customer_segment, product_category
HAVING total_revenue > 10000
ORDER BY total_revenue DESC
""").pl() # 返回为 Polars DataFrame
# 使用 Polars 继续处理
final_result = result.with_columns([
(pl.col('total_revenue') / pl.col('customers')).alias('revenue_per_customer')
])
import duckdb
# 导出到 CSV
duckdb.sql("""
COPY (
SELECT * FROM 'input.parquet' WHERE status = 'active'
) TO 'output.csv' (HEADER, DELIMITER ',')
""")
# 导出到 Parquet
duckdb.sql("""
COPY (
SELECT date, category, SUM(amount) as total
FROM 'sales.csv'
GROUP BY date, category
) TO 'summary.parquet' (FORMAT PARQUET)
""")
# 导出到 JSON
duckdb.sql("""
COPY (SELECT * FROM users WHERE age > 21)
TO 'filtered_users.json' (FORMAT JSON)
""")
# 良好做法:尽早过滤和投影
duckdb.sql("SELECT name, age FROM 'users.parquet' WHERE age > 25")
# 效率较低:先选择所有数据再过滤
duckdb.sql("SELECT * FROM 'users.parquet'").df()[lambda x: x['age'] > 25]
DuckDB 和 Polars 通过 Apache Arrow 无缝协作:
import duckdb
import polars as pl
# Polars 用于数据加载和转换
df = (
pl.scan_parquet('data/*.parquet')
.filter(pl.col('date') >= '2024-01-01')
.collect()
)
# DuckDB 用于复杂的 SQL 分析
result = duckdb.sql("""
SELECT
user_id,
COUNT(*) as sessions,
SUM(duration) as total_duration,
AVG(duration) as avg_duration,
MAX(duration) as max_duration
FROM df
GROUP BY user_id
HAVING sessions > 5
""").pl()
# 返回 Polars 进行最终处理
top_users = result.top_k(10, by='total_duration')
有关更多 Polars 特定操作,请参阅 polars 技能,以及 references/integration.md 文件以获取详细的集成示例。
常见问题及解决方案:
import duckdb
try:
result = duckdb.sql("SELECT * FROM 'data.csv'")
except duckdb.Error as e:
print(f"DuckDB 错误: {e}")
except FileNotFoundError:
print("文件未找到")
except Exception as e:
print(f"意外错误: {e}")
每周安装次数
143
代码仓库
GitHub 星标数
2
首次出现
2026年1月22日
安全审计
已安装于
opencode: 127
codex: 123
gemini-cli: 122
github-copilot: 114
kimi-cli: 108
amp: 108
DuckDB is a high-performance, in-process analytical database management system (often called "SQLite for analytics"). Execute complex SQL queries directly on CSV, Parquet, JSON files, and Python DataFrames (pandas, Polars) without importing data or running a separate database server.
Activate when the user:
Check if DuckDB is installed:
python3 -c "import duckdb; print(duckdb.__version__)"
If not installed:
pip3 install duckdb
For Polars integration:
pip3 install duckdb 'polars[pyarrow]'
DuckDB can query files without loading them into memory:
import duckdb
# Query CSV file
result = duckdb.sql("SELECT * FROM 'data.csv' WHERE age > 25")
print(result.df()) # Convert to pandas DataFrame
# Query Parquet file
result = duckdb.sql("""
SELECT category, SUM(amount) as total
FROM 'sales.parquet'
GROUP BY category
ORDER BY total DESC
""")
# Query JSON file
result = duckdb.sql("SELECT * FROM 'users.json' LIMIT 10")
# Query multiple files with wildcards
result = duckdb.sql("SELECT * FROM 'data/*.parquet'")
DuckDB can directly query pandas DataFrames:
import duckdb
import pandas as pd
# Create or load a DataFrame
df = pd.read_csv('data.csv')
# Query the DataFrame using SQL
result = duckdb.sql("""
SELECT
category,
AVG(price) as avg_price,
COUNT(*) as count
FROM df
WHERE price > 100
GROUP BY category
HAVING count > 5
""")
# Convert result to pandas DataFrame
result_df = result.df()
print(result_df)
DuckDB integrates seamlessly with Polars using Apache Arrow:
import duckdb
import polars as pl
# Create or load a Polars DataFrame
df = pl.read_csv('data.csv')
# Query Polars DataFrame with DuckDB
result = duckdb.sql("""
SELECT
date_trunc('month', date) as month,
SUM(revenue) as monthly_revenue
FROM df
GROUP BY month
ORDER BY month
""")
# Convert result to Polars DataFrame
result_df = result.pl()
# For lazy evaluation, use lazy=True
lazy_result = result.pl(lazy=True)
Create database files for persistent storage:
import duckdb
# Connect to a persistent database (creates file if doesn't exist)
con = duckdb.connect('my_database.duckdb')
# Create table and insert data
con.execute("""
CREATE TABLE users AS
SELECT * FROM 'users.csv'
""")
# Query the database
result = con.execute("SELECT * FROM users WHERE age > 30").fetchdf()
# Close connection
con.close()
DuckDB excels at analytical queries:
import duckdb
# Window functions
result = duckdb.sql("""
SELECT
name,
department,
salary,
AVG(salary) OVER (PARTITION BY department) as dept_avg,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) as dept_rank
FROM 'employees.csv'
""")
# CTEs and subqueries
result = duckdb.sql("""
WITH monthly_sales AS (
SELECT
date_trunc('month', sale_date) as month,
product_id,
SUM(amount) as total_sales
FROM 'sales.parquet'
GROUP BY month, product_id
)
SELECT
m.month,
p.product_name,
m.total_sales,
LAG(m.total_sales) OVER (
PARTITION BY m.product_id
ORDER BY m.month
) as prev_month_sales
FROM monthly_sales m
JOIN 'products.csv' p ON m.product_id = p.id
ORDER BY m.month DESC, m.total_sales DESC
""")
Join data from multiple files and DataFrames:
import duckdb
import pandas as pd
# Load DataFrame
customers_df = pd.read_csv('customers.csv')
# Join DataFrame with Parquet file
result = duckdb.sql("""
SELECT
c.customer_name,
c.email,
o.order_date,
o.total_amount
FROM customers_df c
JOIN 'orders.parquet' o ON c.customer_id = o.customer_id
WHERE o.order_date >= '2024-01-01'
ORDER BY o.order_date DESC
""")
import duckdb
# Get table schema
duckdb.sql("DESCRIBE SELECT * FROM 'data.parquet'").show()
# Quick statistics
duckdb.sql("""
SELECT
COUNT(*) as rows,
COUNT(DISTINCT user_id) as unique_users,
MIN(created_at) as earliest_date,
MAX(created_at) as latest_date
FROM 'data.csv'
""").show()
# Sample data
duckdb.sql("SELECT * FROM 'large_file.parquet' USING SAMPLE 1000").show()
import duckdb
# ETL pipeline using DuckDB
con = duckdb.connect('analytics.duckdb')
# Extract and transform
con.execute("""
CREATE TABLE clean_sales AS
SELECT
date_trunc('day', timestamp) as sale_date,
UPPER(TRIM(product_name)) as product_name,
quantity,
price,
quantity * price as total_amount,
CASE
WHEN quantity > 10 THEN 'bulk'
ELSE 'retail'
END as sale_type
FROM 'raw_sales.csv'
WHERE price > 0 AND quantity > 0
""")
# Create aggregated view
con.execute("""
CREATE VIEW daily_summary AS
SELECT
sale_date,
sale_type,
COUNT(*) as num_sales,
SUM(total_amount) as revenue
FROM clean_sales
GROUP BY sale_date, sale_type
""")
result = con.execute("SELECT * FROM daily_summary ORDER BY sale_date DESC").fetchdf()
con.close()
import duckdb
import polars as pl
# Read multiple parquet files with Polars
df = pl.read_parquet('data/*.parquet')
# Use DuckDB for complex SQL analytics
result = duckdb.sql("""
SELECT
customer_segment,
product_category,
COUNT(DISTINCT customer_id) as customers,
SUM(revenue) as total_revenue,
AVG(revenue) as avg_revenue,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY revenue) as median_revenue
FROM df
WHERE order_date >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY customer_segment, product_category
HAVING total_revenue > 10000
ORDER BY total_revenue DESC
""").pl() # Return as Polars DataFrame
# Continue processing with Polars
final_result = result.with_columns([
(pl.col('total_revenue') / pl.col('customers')).alias('revenue_per_customer')
])
import duckdb
# Export to CSV
duckdb.sql("""
COPY (
SELECT * FROM 'input.parquet' WHERE status = 'active'
) TO 'output.csv' (HEADER, DELIMITER ',')
""")
# Export to Parquet
duckdb.sql("""
COPY (
SELECT date, category, SUM(amount) as total
FROM 'sales.csv'
GROUP BY date, category
) TO 'summary.parquet' (FORMAT PARQUET)
""")
# Export to JSON
duckdb.sql("""
COPY (SELECT * FROM users WHERE age > 21)
TO 'filtered_users.json' (FORMAT JSON)
""")
# Good: Filter and project early
duckdb.sql("SELECT name, age FROM 'users.parquet' WHERE age > 25")
# Less efficient: Select all then filter
duckdb.sql("SELECT * FROM 'users.parquet'").df()[lambda x: x['age'] > 25]
DuckDB and Polars work together seamlessly via Apache Arrow:
import duckdb
import polars as pl
# Polars for data loading and transformation
df = (
pl.scan_parquet('data/*.parquet')
.filter(pl.col('date') >= '2024-01-01')
.collect()
)
# DuckDB for complex SQL analytics
result = duckdb.sql("""
SELECT
user_id,
COUNT(*) as sessions,
SUM(duration) as total_duration,
AVG(duration) as avg_duration,
MAX(duration) as max_duration
FROM df
GROUP BY user_id
HAVING sessions > 5
""").pl()
# Back to Polars for final processing
top_users = result.top_k(10, by='total_duration')
See the polars skill for more Polars-specific operations and the references/integration.md file for detailed integration examples.
Common issues and solutions:
import duckdb
try:
result = duckdb.sql("SELECT * FROM 'data.csv'")
except duckdb.Error as e:
print(f"DuckDB error: {e}")
except FileNotFoundError:
print("File not found")
except Exception as e:
print(f"Unexpected error: {e}")
Weekly Installs
143
Repository
GitHub Stars
2
First Seen
Jan 22, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
opencode: 127
codex: 123
gemini-cli: 122
github-copilot: 114
kimi-cli: 108
amp: 108
Excel财务建模规范与xlsx文件处理指南:专业格式、零错误公式与数据分析
46,700 周安装