nemo-curator by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill nemo-curator
NVIDIA 用于为大语言模型准备高质量训练数据的工具包。
在以下情况下使用 NeMo Curator:
性能表现:
替代方案:
# 文本整理(CUDA 12)
uv pip install "nemo-curator[text_cuda12]"
# 所有模态
uv pip install "nemo-curator[all_cuda12]"
# 仅 CPU(速度较慢)
uv pip install "nemo-curator[cpu]"
from nemo_curator import ScoreFilter, Modify
from nemo_curator.datasets import DocumentDataset
import pandas as pd
# 加载数据
df = pd.DataFrame({"text": ["Good document", "Bad doc", "Excellent text"]})
dataset = DocumentDataset(df)
# 质量过滤
def quality_score(doc):
return len(doc["text"].split()) > 5 # 过滤短文档
filtered = ScoreFilter(quality_score)(dataset)
# 去重
from nemo_curator.modules import ExactDuplicates
deduped = ExactDuplicates()(filtered)
# 保存
deduped.to_parquet("curated_data/")
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
from nemo_curator.filters import (
WordCountFilter,
RepeatedLinesFilter,
UrlRatioFilter,
NonAlphaNumericFilter
)
# 应用 30 多种启发式过滤器
from nemo_curator import ScoreFilter
# 词数过滤器
dataset = dataset.filter(WordCountFilter(min_words=50, max_words=100000))
# 移除重复内容
dataset = dataset.filter(RepeatedLinesFilter(max_repeated_line_fraction=0.3))
# URL 比例过滤器
dataset = dataset.filter(UrlRatioFilter(max_url_ratio=0.2))
精确去重:
from nemo_curator.modules import ExactDuplicates
# 移除完全重复项
deduped = ExactDuplicates(id_field="id", text_field="text")(dataset)
模糊去重(在 GPU 上快 16 倍):
from nemo_curator.modules import FuzzyDuplicates
# MinHash + LSH 去重
fuzzy_dedup = FuzzyDuplicates(
id_field="id",
text_field="text",
num_hashes=260, # MinHash 参数
num_buckets=20,
hash_method="md5"
)
deduped = fuzzy_dedup(dataset)
语义去重:
from nemo_curator.modules import SemanticDuplicates
# 基于嵌入的去重
semantic_dedup = SemanticDuplicates(
id_field="id",
text_field="text",
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
threshold=0.8 # 余弦相似度阈值
)
deduped = semantic_dedup(dataset)
from nemo_curator.modules import Modify
from nemo_curator.modifiers import PIIRedactor
# 脱敏个人可识别信息
pii_redactor = PIIRedactor(
supported_entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON", "LOCATION"],
anonymize_action="replace" # 或 "redact"
)
redacted = Modify(pii_redactor)(dataset)
from nemo_curator.classifiers import QualityClassifier
# 质量分类
quality_clf = QualityClassifier(
model_path="nvidia/quality-classifier-deberta",
batch_size=256,
device="cuda"
)
# 过滤低质量文档
high_quality = dataset.filter(lambda doc: quality_clf(doc["text"]) > 0.5)
| 操作 | CPU (16 核) | GPU (A100) | 加速比 |
|---|---|---|---|
| 模糊去重 (8TB) | 120 小时 | 7.5 小时 | 16× |
| 精确去重 (1TB) | 8 小时 | 0.5 小时 | 16× |
| 质量过滤 | 2 小时 | 0.2 小时 | 10× |
from nemo_curator import get_client
import dask_cuda
# 初始化 GPU 集群
client = get_client(cluster_type="gpu", n_workers=8)
# 使用 8 个 GPU 处理
deduped = FuzzyDuplicates(...)(dataset)
from nemo_curator.image import (
AestheticFilter,
NSFWFilter,
CLIPEmbedder
)
# 美学评分
aesthetic_filter = AestheticFilter(threshold=5.0)
filtered_images = aesthetic_filter(image_dataset)
# NSFW 检测
nsfw_filter = NSFWFilter(threshold=0.9)
safe_images = nsfw_filter(filtered_images)
# 生成 CLIP 嵌入
clip_embedder = CLIPEmbedder(model="openai/clip-vit-base-patch32")
image_embeddings = clip_embedder(safe_images)
from nemo_curator.video import (
SceneDetector,
ClipExtractor,
InternVideo2Embedder
)
# 检测场景
scene_detector = SceneDetector(threshold=27.0)
scenes = scene_detector(video_dataset)
# 提取片段
clip_extractor = ClipExtractor(min_duration=2.0, max_duration=10.0)
clips = clip_extractor(scenes)
# 生成嵌入
video_embedder = InternVideo2Embedder()
video_embeddings = video_embedder(clips)
from nemo_curator.audio import (
ASRInference,
WERFilter,
DurationFilter
)
# ASR 转录
asr = ASRInference(model="nvidia/stt_en_fastconformer_hybrid_large_pc")
transcribed = asr(audio_dataset)
# 根据 WER(词错误率)过滤
wer_filter = WERFilter(max_wer=0.3)
high_quality_audio = wer_filter(transcribed)
# 时长过滤
duration_filter = DurationFilter(min_duration=1.0, max_duration=30.0)
filtered_audio = duration_filter(high_quality_audio)
from nemo_curator import ScoreFilter, Modify
from nemo_curator.filters import *
from nemo_curator.modules import *
from nemo_curator.datasets import DocumentDataset
# 加载 Common Crawl 数据
dataset = DocumentDataset.read_parquet("common_crawl/*.parquet")
# 流程
pipeline = [
# 1. 质量过滤
WordCountFilter(min_words=100, max_words=50000),
RepeatedLinesFilter(max_repeated_line_fraction=0.2),
SymbolToWordRatioFilter(max_symbol_to_word_ratio=0.3),
UrlRatioFilter(max_url_ratio=0.3),
# 2. 语言过滤
LanguageIdentificationFilter(target_languages=["en"]),
# 3. 去重
ExactDuplicates(id_field="id", text_field="text"),
FuzzyDuplicates(id_field="id", text_field="text", num_hashes=260),
# 4. PII 脱敏
PIIRedactor(),
# 5. NSFW 过滤
NSFWClassifier(threshold=0.8)
]
# 执行
for stage in pipeline:
dataset = stage(dataset)
# 保存
dataset.to_parquet("curated_common_crawl/")
from nemo_curator import get_client
from dask_cuda import LocalCUDACluster
# 多 GPU 集群
cluster = LocalCUDACluster(n_workers=8)
client = get_client(cluster=cluster)
# 处理大型数据集
dataset = DocumentDataset.read_parquet("s3://large_dataset/*.parquet")
deduped = FuzzyDuplicates(...)(dataset)
# 清理
client.close()
cluster.close()
基于 CPU 的整理(AWS c5.18xlarge × 10):
基于 GPU 的整理(AWS p4d.24xlarge × 2):
节省:成本降低 89%(节省 $3,828)
生产部署:
每周安装量
183
代码仓库
GitHub 星标数
23.4K
首次出现
2026 年 1 月 21 日
安全审计
安装于
opencode148
claude-code147
gemini-cli141
cursor133
codex128
antigravity118
NVIDIA's toolkit for preparing high-quality training data for LLMs.
Use NeMo Curator when:
Performance:
Use alternatives instead:
# Text curation (CUDA 12)
uv pip install "nemo-curator[text_cuda12]"
# All modalities
uv pip install "nemo-curator[all_cuda12]"
# CPU-only (slower)
uv pip install "nemo-curator[cpu]"
from nemo_curator import ScoreFilter, Modify
from nemo_curator.datasets import DocumentDataset
import pandas as pd
# Load data
df = pd.DataFrame({"text": ["Good document", "Bad doc", "Excellent text"]})
dataset = DocumentDataset(df)
# Quality filtering
def quality_score(doc):
return len(doc["text"].split()) > 5 # Filter short docs
filtered = ScoreFilter(quality_score)(dataset)
# Deduplication
from nemo_curator.modules import ExactDuplicates
deduped = ExactDuplicates()(filtered)
# Save
deduped.to_parquet("curated_data/")
from nemo_curator.filters import (
WordCountFilter,
RepeatedLinesFilter,
UrlRatioFilter,
NonAlphaNumericFilter
)
# Apply 30+ heuristic filters
from nemo_curator import ScoreFilter
# Word count filter
dataset = dataset.filter(WordCountFilter(min_words=50, max_words=100000))
# Remove repetitive content
dataset = dataset.filter(RepeatedLinesFilter(max_repeated_line_fraction=0.3))
# URL ratio filter
dataset = dataset.filter(UrlRatioFilter(max_url_ratio=0.2))
Exact deduplication:
from nemo_curator.modules import ExactDuplicates
# Remove exact duplicates
deduped = ExactDuplicates(id_field="id", text_field="text")(dataset)
Fuzzy deduplication (16× faster on GPU):
from nemo_curator.modules import FuzzyDuplicates
# MinHash + LSH deduplication
fuzzy_dedup = FuzzyDuplicates(
id_field="id",
text_field="text",
num_hashes=260, # MinHash parameters
num_buckets=20,
hash_method="md5"
)
deduped = fuzzy_dedup(dataset)
Semantic deduplication:
from nemo_curator.modules import SemanticDuplicates
# Embedding-based deduplication
semantic_dedup = SemanticDuplicates(
id_field="id",
text_field="text",
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
threshold=0.8 # Cosine similarity threshold
)
deduped = semantic_dedup(dataset)
from nemo_curator.modules import Modify
from nemo_curator.modifiers import PIIRedactor
# Redact personally identifiable information
pii_redactor = PIIRedactor(
supported_entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON", "LOCATION"],
anonymize_action="replace" # or "redact"
)
redacted = Modify(pii_redactor)(dataset)
from nemo_curator.classifiers import QualityClassifier
# Quality classification
quality_clf = QualityClassifier(
model_path="nvidia/quality-classifier-deberta",
batch_size=256,
device="cuda"
)
# Filter low-quality documents
high_quality = dataset.filter(lambda doc: quality_clf(doc["text"]) > 0.5)
| Operation | CPU (16 cores) | GPU (A100) | Speedup |
|---|---|---|---|
| Fuzzy dedup (8TB) | 120 hours | 7.5 hours | 16× |
| Exact dedup (1TB) | 8 hours | 0.5 hours | 16× |
| Quality filtering | 2 hours | 0.2 hours | 10× |
from nemo_curator import get_client
import dask_cuda
# Initialize GPU cluster
client = get_client(cluster_type="gpu", n_workers=8)
# Process with 8 GPUs
deduped = FuzzyDuplicates(...)(dataset)
from nemo_curator.image import (
AestheticFilter,
NSFWFilter,
CLIPEmbedder
)
# Aesthetic scoring
aesthetic_filter = AestheticFilter(threshold=5.0)
filtered_images = aesthetic_filter(image_dataset)
# NSFW detection
nsfw_filter = NSFWFilter(threshold=0.9)
safe_images = nsfw_filter(filtered_images)
# Generate CLIP embeddings
clip_embedder = CLIPEmbedder(model="openai/clip-vit-base-patch32")
image_embeddings = clip_embedder(safe_images)
from nemo_curator.video import (
SceneDetector,
ClipExtractor,
InternVideo2Embedder
)
# Detect scenes
scene_detector = SceneDetector(threshold=27.0)
scenes = scene_detector(video_dataset)
# Extract clips
clip_extractor = ClipExtractor(min_duration=2.0, max_duration=10.0)
clips = clip_extractor(scenes)
# Generate embeddings
video_embedder = InternVideo2Embedder()
video_embeddings = video_embedder(clips)
from nemo_curator.audio import (
ASRInference,
WERFilter,
DurationFilter
)
# ASR transcription
asr = ASRInference(model="nvidia/stt_en_fastconformer_hybrid_large_pc")
transcribed = asr(audio_dataset)
# Filter by WER (word error rate)
wer_filter = WERFilter(max_wer=0.3)
high_quality_audio = wer_filter(transcribed)
# Duration filtering
duration_filter = DurationFilter(min_duration=1.0, max_duration=30.0)
filtered_audio = duration_filter(high_quality_audio)
from nemo_curator import ScoreFilter, Modify
from nemo_curator.filters import *
from nemo_curator.modules import *
from nemo_curator.datasets import DocumentDataset
# Load Common Crawl data
dataset = DocumentDataset.read_parquet("common_crawl/*.parquet")
# Pipeline
pipeline = [
# 1. Quality filtering
WordCountFilter(min_words=100, max_words=50000),
RepeatedLinesFilter(max_repeated_line_fraction=0.2),
SymbolToWordRatioFilter(max_symbol_to_word_ratio=0.3),
UrlRatioFilter(max_url_ratio=0.3),
# 2. Language filtering
LanguageIdentificationFilter(target_languages=["en"]),
# 3. Deduplication
ExactDuplicates(id_field="id", text_field="text"),
FuzzyDuplicates(id_field="id", text_field="text", num_hashes=260),
# 4. PII redaction
PIIRedactor(),
# 5. NSFW filtering
NSFWClassifier(threshold=0.8)
]
# Execute
for stage in pipeline:
dataset = stage(dataset)
# Save
dataset.to_parquet("curated_common_crawl/")
from nemo_curator import get_client
from dask_cuda import LocalCUDACluster
# Multi-GPU cluster
cluster = LocalCUDACluster(n_workers=8)
client = get_client(cluster=cluster)
# Process large dataset
dataset = DocumentDataset.read_parquet("s3://large_dataset/*.parquet")
deduped = FuzzyDuplicates(...)(dataset)
# Cleanup
client.close()
cluster.close()
CPU-based curation (AWS c5.18xlarge × 10):
GPU-based curation (AWS p4d.24xlarge × 2):
Savings: 89% reduction ($3,828 saved)
Production deployments:
Weekly Installs
183
Repository
GitHub Stars
23.4K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Warn
Installed on
opencode148
claude-code147
gemini-cli141
cursor133
codex128
antigravity118
超能力技能使用指南:AI助手技能调用优先级与工作流程详解
46,500 周安装
高级安全工具包 - 自动化威胁建模、安全审计与渗透测试脚本
558 周安装
LangChain教程:使用智能体与RAG构建LLM应用,快速开发AI助手
557 周安装
translate-book-parallel:使用并行子代理翻译整本书籍(PDF/DOCX/EPUB)的Claude Code技能
180 周安装
简洁代码规范指南 - 实用AI编码原则与最佳实践 | 提升代码质量
560 周安装
Edict 多智能体编排系统:基于唐朝三省六部制的 AI 智能体治理架构
569 周安装
Slack消息最佳实践指南:mrkdwn格式、消息结构与频道礼仪完整教程
576 周安装