hugging-face-datasets by huggingface/skills
npx skills add https://github.com/huggingface/skills --skill hugging-face-datasets
This skill provides tools to manage datasets on the Hugging Face Hub with a focus on creation, configuration, content management, and SQL-based data manipulation. It is designed to complement the existing Hugging Face MCP server by providing dataset editing and querying capabilities.
Version: 2.1.0
Query any Hugging Face dataset using DuckDB SQL via scripts/sql_manager.py:
Run SQL on datasets over the hf:// protocol
Supports diverse dataset types through a template system
The skill includes two Python scripts that use PEP 723 inline dependency management:
All paths are relative to the directory containing this SKILL.md file. Scripts are run with:
uv run scripts/script_name.py [arguments]
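PEP 723 means each script carries its dependency list inline, so uv resolves packages at run time with no separate install step. A minimal sketch of such a header (the actual dependency list in these scripts may differ):
# /// script
# requires-python = ">=3.10"
# dependencies = ["duckdb", "huggingface_hub"]
# ///
import duckdb  # resolved by uv from the header above when run via `uv run`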
scripts/dataset_manager.py - Dataset creation and management
scripts/sql_manager.py - SQL-based dataset querying and transformation
Requirements:
uv package manager installed
HF_TOKEN environment variable set with a write-access token
Query, transform, and push Hugging Face datasets using DuckDB SQL. The hf:// protocol provides direct access to any public dataset (or to private datasets with a token).
# Query a dataset
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10"
# Get dataset schema
uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
# Sample random rows
uv run scripts/sql_manager.py sample --dataset "cais/mmlu" --n 5
# Count rows with filter
uv run scripts/sql_manager.py count --dataset "cais/mmlu" --where "subject='nutrition'"
Use data as the table name in your SQL - it gets replaced with the actual hf:// path:
-- Basic select
SELECT * FROM data LIMIT 10
-- Filtering
SELECT * FROM data WHERE subject='nutrition'
-- Aggregations
SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject ORDER BY cnt DESC
-- Column selection and transformation (DuckDB list indexing is 1-based)
SELECT question, choices[answer + 1] AS correct_answer FROM data
-- Regex matching
SELECT * FROM data WHERE regexp_matches(question, 'nutrition|diet')
-- String functions
SELECT regexp_replace(question, '\n', '') AS cleaned FROM data
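Under the hood, the substitution is plausibly just a string rewrite performed before the SQL reaches DuckDB. A minimal sketch of the idea (the real script may handle quoting and revisions differently; resolve_sql is a hypothetical helper):
import re

def resolve_sql(sql: str, dataset: str, config: str = "default", split: str = "train") -> str:
    # Swap the bare table name `data` for a quoted hf:// Parquet glob.
    # (Naive: this would also rewrite a column literally named `data`.)
    path = f"'hf://datasets/{dataset}@~parquet/{config}/{split}/*.parquet'"
    return re.sub(r"\bdata\b", path, sql)

# resolve_sql("SELECT * FROM data LIMIT 10", "cais/mmlu")
# -> SELECT * FROM 'hf://datasets/cais/mmlu@~parquet/default/train/*.parquet' LIMIT 10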
# Get schema
uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
# Get unique values in column
uv run scripts/sql_manager.py unique --dataset "cais/mmlu" --column "subject"
# Get value distribution
uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject" --bins 20
# Complex filtering with SQL
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--sql "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject HAVING cnt > 100"
# Using transform command
uv run scripts/sql_manager.py transform \
--dataset "cais/mmlu" \
--select "subject, COUNT(*) as cnt" \
--group-by "subject" \
--order-by "cnt DESC" \
--limit 10
# Query and push to new dataset
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--sql "SELECT * FROM data WHERE subject='nutrition'" \
--push-to "username/mmlu-nutrition-subset" \
--private
# Transform and push
uv run scripts/sql_manager.py transform \
--dataset "ibm/duorc" \
--config "ParaphraseRC" \
--select "question, answers" \
--where "LENGTH(question) > 50" \
--push-to "username/duorc-long-questions"
# Export to Parquet
uv run scripts/sql_manager.py export \
--dataset "cais/mmlu" \
--sql "SELECT * FROM data WHERE subject='nutrition'" \
--output "nutrition.parquet" \
--format parquet
# Export to JSONL
uv run scripts/sql_manager.py export \
--dataset "cais/mmlu" \
--sql "SELECT * FROM data LIMIT 100" \
--output "sample.jsonl" \
--format jsonl
# Specify config (subset)
uv run scripts/sql_manager.py query \
--dataset "ibm/duorc" \
--config "ParaphraseRC" \
--sql "SELECT * FROM data LIMIT 5"
# Specify split
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--split "test" \
--sql "SELECT COUNT(*) FROM data"
# Query all splits
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--split "*" \
--sql "SELECT * FROM data LIMIT 10"
For complex queries or joining datasets:
uv run scripts/sql_manager.py raw --sql "
SELECT a.*, b.*
FROM 'hf://datasets/dataset1@~parquet/default/train/*.parquet' a
JOIN 'hf://datasets/dataset2@~parquet/default/train/*.parquet' b
ON a.id = b.id
LIMIT 100
"
from sql_manager import HFDatasetSQL
sql = HFDatasetSQL()
# Query
results = sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10")
# Get schema
schema = sql.describe("cais/mmlu")
# Sample
samples = sql.sample("cais/mmlu", n=5, seed=42)
# Count
count = sql.count("cais/mmlu", where="subject='nutrition'")
# Histogram
dist = sql.histogram("cais/mmlu", "subject")
# Filter and transform
results = sql.filter_and_transform(
"cais/mmlu",
select="subject, COUNT(*) as cnt",
group_by="subject",
order_by="cnt DESC",
limit=10
)
# Push to Hub
url = sql.push_to_hub(
"cais/mmlu",
"username/nutrition-subset",
sql="SELECT * FROM data WHERE subject='nutrition'",
private=True
)
# Export locally
sql.export_to_parquet("cais/mmlu", "output.parquet", sql="SELECT * FROM data LIMIT 100")
sql.close()
DuckDB uses the hf:// protocol to access datasets:
hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet
Examples:
hf://datasets/cais/mmlu@~parquet/default/train/*.parquet
hf://datasets/ibm/duorc@~parquet/ParaphraseRC/test/*.parquet
The @~parquet revision provides auto-converted Parquet files for any dataset format.
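The same paths work from DuckDB's Python API directly, which is handy for checking what the script sees. A sketch for a public dataset (recent DuckDB builds autoload the httpfs extension that handles hf://):
import duckdb

con = duckdb.connect()
# For a private dataset, register your token first (syntax per DuckDB's
# Hugging Face secret support):
# con.sql("CREATE SECRET (TYPE HUGGINGFACE, TOKEN 'hf_...')")
df = con.sql("""
    SELECT subject, COUNT(*) AS cnt
    FROM 'hf://datasets/cais/mmlu@~parquet/default/train/*.parquet'
    GROUP BY subject
    ORDER BY cnt DESC
""").df()
con.close()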
-- String functions
LENGTH(column) -- String length
regexp_replace(col, '\n', '') -- Regex replace
regexp_matches(col, 'pattern') -- Regex match
LOWER(col), UPPER(col) -- Case conversion
-- Array functions
choices[1] -- Array indexing (1-based in DuckDB)
array_length(choices) -- Array length
unnest(choices) -- Expand array to rows
-- Aggregations
COUNT(*), SUM(col), AVG(col)
GROUP BY col HAVING condition
-- Sampling
USING SAMPLE 10 -- Random sample
USING SAMPLE 10 (RESERVOIR, 42) -- Reproducible sample
-- Window functions
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
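One concrete use of the window functions above is deduplicating rows that share a key, keeping one row per group. A sketch against the raw hf:// path (illustrative only; pick the PARTITION BY key that matches your data):
import duckdb

dedup = duckdb.sql("""
    SELECT * EXCLUDE (rn)
    FROM (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY question ORDER BY subject) AS rn
        FROM 'hf://datasets/cais/mmlu@~parquet/default/train/*.parquet'
    )
    WHERE rn = 1  -- keep the first row per duplicated question
""").df()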
1. Discovery (Use HF MCP Server):
# Use HF MCP tools to find existing datasets
search_datasets("conversational AI training")
get_dataset_details("username/dataset-name")
2. Creation (Use This Skill):
# Initialize new dataset
uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
# Configure with detailed system prompt
uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "$(cat system_prompt.txt)"
3. Content Management (Use This Skill):
# Quick setup with any template
uv run scripts/dataset_manager.py quick_setup \
--repo_id "your-username/dataset-name" \
--template classification
# Add data with template validation
uv run scripts/dataset_manager.py add_rows \
--repo_id "your-username/dataset-name" \
--template qa \
--rows_json "$(cat your_qa_data.json)"
1. Chat Template (--template chat)
{
"messages": [
{"role": "user", "content": "Natural user request"},
{"role": "assistant", "content": "Response with tool usage"},
{"role": "tool", "content": "Tool response", "tool_call_id": "call_123"}
],
"scenario": "Description of use case",
"complexity": "simple|intermediate|advanced"
}
2. Classification Template (--template classification)
{
"text": "Input text to be classified",
"label": "classification_label",
"confidence": 0.95,
"metadata": {"domain": "technology", "language": "en"}
}
3. QA Template (--template qa)
{
"question": "What is the question being asked?",
"answer": "The complete answer",
"context": "Additional context if needed",
"answer_type": "factual|explanatory|opinion",
"difficulty": "easy|medium|hard"
}
4. Completion Template (--template completion)
{
"prompt": "The beginning text or context",
"completion": "The expected continuation",
"domain": "code|creative|technical|conversational",
"style": "description of writing style"
}
5. Tabular Template (--template tabular)
{
"columns": [
{"name": "feature1", "type": "numeric", "description": "First feature"},
{"name": "target", "type": "categorical", "description": "Target variable"}
],
"data": [
{"feature1": 123, "target": "class_a"},
{"feature1": 456, "target": "class_b"}
]
}
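Validation against a template presumably reduces to checking required keys on each row before upload. A minimal sketch of that idea (REQUIRED_KEYS and the key sets are assumptions, not the script's actual schema):
REQUIRED_KEYS = {
    "chat": {"messages"},
    "classification": {"text", "label"},
    "qa": {"question", "answer"},
    "completion": {"prompt", "completion"},
    "tabular": {"columns", "data"},
}

def validate_rows(rows: list[dict], template: str) -> None:
    required = REQUIRED_KEYS[template]
    for i, row in enumerate(rows):
        missing = required - row.keys()
        if missing:
            raise ValueError(f"row {i} missing required keys: {sorted(missing)}")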
For high-quality training data generation:
You are an AI assistant expert at using MCP tools effectively.
## MCP SERVER DEFINITIONS
[Define available servers and tools]
## TRAINING EXAMPLE STRUCTURE
[Specify exact JSON schema for chat templating]
## QUALITY GUIDELINES
[Detail requirements for realistic scenarios, progressive complexity, proper tool usage]
## EXAMPLE CATEGORIES
[List development workflows, debugging scenarios, data management tasks]
The skill includes diverse training examples beyond just MCP usage:
Available Example Sets:
training_examples.json - MCP tool usage examples (debugging, project setup, database analysis)
diverse_training_examples.json - Broader scenarios including:
Using Different Example Sets:
# Add MCP-focused examples
uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
--rows_json "$(cat examples/training_examples.json)"
# Add diverse conversational examples
uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
--rows_json "$(cat examples/diverse_training_examples.json)"
# Mix both for comprehensive training data
uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \
--rows_json "$(jq -s '.[0] + .[1]' examples/training_examples.json examples/diverse_training_examples.json)"
List Available Templates:
uv run scripts/dataset_manager.py list_templates
Quick Setup (Recommended):
uv run scripts/dataset_manager.py quick_setup --repo_id "your-username/dataset-name" --template classification
Manual Setup:
# Initialize repository
uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private]
# Configure with system prompt
uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "Your prompt here"
# Add data with validation
uv run scripts/dataset_manager.py add_rows \
--repo_id "your-username/dataset-name" \
--template qa \
--rows_json '[{"question": "What is AI?", "answer": "Artificial Intelligence..."}]'
View Dataset Statistics:
uv run scripts/dataset_manager.py stats --repo_id "your-username/dataset-name"
# 1. Explore the source dataset
uv run scripts/sql_manager.py describe --dataset "cais/mmlu"
uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject"
# 2. Query and create subset
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--sql "SELECT * FROM data WHERE subject IN ('nutrition', 'anatomy', 'clinical_knowledge')" \
--push-to "username/mmlu-medical-subset" \
--private
# Transform MMLU to QA format with correct answers extracted
uv run scripts/sql_manager.py query \
--dataset "cais/mmlu" \
--sql "SELECT question, choices[answer] as correct_answer, subject FROM data" \
--push-to "username/mmlu-qa-format"
# Export multiple splits and combine
uv run scripts/sql_manager.py export \
--dataset "cais/mmlu" \
--split "*" \
--output "mmlu_all.parquet"
# Filter for high-quality examples
uv run scripts/sql_manager.py query \
--dataset "squad" \
--sql "SELECT * FROM data WHERE LENGTH(context) > 500 AND LENGTH(question) > 20" \
--push-to "username/squad-filtered"
# 1. Query source data
uv run scripts/sql_manager.py export \
--dataset "cais/mmlu" \
--sql "SELECT question, subject FROM data WHERE subject='nutrition'" \
--output "nutrition_source.jsonl" \
--format jsonl
# 2. Process with your pipeline (add answers, format, etc.)
# 3. Push processed data
uv run scripts/dataset_manager.py init --repo_id "username/nutrition-training"
uv run scripts/dataset_manager.py add_rows \
--repo_id "username/nutrition-training" \
--template qa \
--rows_json "$(cat processed_data.json)"
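Step 2 above is whatever your pipeline does. As one illustration, a sketch that reads the exported JSONL, fills in answers via a hypothetical answer_for() helper, and writes JSON shaped for the qa template:
import json

def answer_for(question: str) -> str:
    # Hypothetical: call your model, an API, or a lookup table here.
    raise NotImplementedError

rows = []
with open("nutrition_source.jsonl") as f:
    for line in f:
        record = json.loads(line)
        rows.append({
            "question": record["question"],
            "answer": answer_for(record["question"]),
            "answer_type": "factual",
            "difficulty": "medium",
        })

with open("processed_data.json", "w") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)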
Weekly Installs: 366
GitHub Stars: 9.7K
First Seen: Jan 20, 2026
Security Audits: Gen Agent Trust Hub: Pass, Socket: Pass, Snyk: Warn
Installed on: opencode (288), codex (284), gemini-cli (283), github-copilot (268), cursor (264), claude-code (263)