molfeat by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill molfeat

Molfeat 是一个全面的 Python 库,用于分子特征化,它统一了 100 多种预训练嵌入和手工制作的特征化器。可将化学结构(SMILES 字符串或 RDKit 分子)转换为数值表示,用于机器学习任务,包括 QSAR 建模、虚拟筛选、相似性搜索和深度学习应用。具有快速并行处理、scikit-learn 兼容的转换器以及内置缓存功能。
此技能适用于以下场景:
uv pip install molfeat
# 安装所有可选依赖项
uv pip install "molfeat[all]"
特定特征化器的可选依赖项:
molfeat[dgl] - GNN 模型(GIN 变体)
molfeat[graphormer] - Graphormer 模型
molfeat[transformer] - ChemBERTa、ChemGPT、MolT5
molfeat[fcd] - FCD 描述符
molfeat[map4] - MAP4 指纹

Molfeat 将特征化组织为三个层次化的类:
计算器(molfeat.calc):可调用对象,将单个分子转换为特征向量。接受 RDKit Chem.Mol 对象或 SMILES 字符串。
使用计算器进行:
示例:
from molfeat.calc import FPCalculator
calc = FPCalculator("ecfp", radius=3, fpSize=2048)
features = calc("CCO") # 返回 numpy 数组 (2048,)
转换器(molfeat.trans):scikit-learn 兼容的转换器,封装计算器以进行并行化的批量处理。
使用转换器进行:
示例:
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
features = transformer(smiles_list) # 并行处理
预训练转换器(molfeat.trans.pretrained):用于深度学习模型的专用转换器,具有批量推理和缓存功能。
使用预训练转换器进行:
示例:
from molfeat.trans.pretrained import PretrainedMolTransformer
transformer = PretrainedMolTransformer("ChemBERTa-77M-MLM", n_jobs=-1)
embeddings = transformer(smiles_list) # 深度学习嵌入
import datamol as dm
from molfeat.calc import FPCalculator
from molfeat.trans import MoleculeTransformer
# 加载分子数据
smiles = ["CCO", "CC(=O)O", "c1ccccc1", "CC(C)O"]
# 创建计算器和转换器
calc = FPCalculator("ecfp", radius=3)
transformer = MoleculeTransformer(calc, n_jobs=-1)
# 特征化分子
features = transformer(smiles)
print(f"Shape: {features.shape}") # (4, 2048)
# 保存特征化器配置以实现可重复性
transformer.to_state_yaml_file("featurizer_config.yml")
# 重新加载精确配置
loaded = MoleculeTransformer.from_state_yaml_file("featurizer_config.yml")
# 处理可能包含无效 SMILES 的数据集
transformer = MoleculeTransformer(
calc,
n_jobs=-1,
ignore_errors=True, # 失败时继续
verbose=True # 记录错误详情
)
features = transformer(smiles_with_errors)
# 对于失败的分子返回 None
从指纹开始:
# ECFP - 最流行,通用
FPCalculator("ecfp", radius=3, fpSize=2048)
# MACCS - 快速,适用于骨架跃迁
FPCalculator("maccs")
# MAP4 - 适用于大规模筛选的高效方法
FPCalculator("map4")
对于可解释的模型:
# RDKit 2D 描述符(200+ 个命名属性)
from molfeat.calc import RDKitDescriptors2D
RDKitDescriptors2D()
# Mordred(1800+ 个全面描述符)
from molfeat.calc import MordredDescriptors
MordredDescriptors()
组合多个特征化器:
from molfeat.trans import FeatConcat
concat = FeatConcat([
FPCalculator("maccs"), # 167 维
FPCalculator("ecfp") # 2048 维
]) # 结果:2215 维组合特征
基于 Transformer 的嵌入:
# ChemBERTa - 在 7700 万 PubChem 化合物上预训练
PretrainedMolTransformer("ChemBERTa-77M-MLM")
# ChemGPT - 自回归语言模型
PretrainedMolTransformer("ChemGPT-1.2B")
图神经网络:
# 具有不同预训练目标的 GIN 模型
PretrainedMolTransformer("gin-supervised-masking")
PretrainedMolTransformer("gin-supervised-infomax")
# 用于量子化学的 Graphormer
PretrainedMolTransformer("Graphormer-pcqm4mv2")
# ECFP - 通用,使用最广泛
FPCalculator("ecfp")
# MACCS - 快速,基于骨架的相似性
FPCalculator("maccs")
# MAP4 - 适用于大型数据库的高效方法
FPCalculator("map4")
# USR/USRCAT - 3D 形状相似性
from molfeat.calc import USRDescriptors
USRDescriptors()
# FCFP - 基于官能团
FPCalculator("fcfp")
# CATS - 药效团对分布
from molfeat.calc import CATSCalculator
CATSCalculator(mode="2D")
# Gobbi - 显式药效团特征
FPCalculator("gobbi2D")
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# 特征化分子
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
X = transformer(smiles_train)
# 训练模型
model = RandomForestRegressor(n_estimators=100)
scores = cross_val_score(model, X, y_train, cv=5)
print(f"R² = {scores.mean():.3f}")
# 保存配置以供部署
transformer.to_state_yaml_file("production_featurizer.yml")
from sklearn.ensemble import RandomForestClassifier
# 在已知活性/非活性化合物上训练
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
X_train = transformer(train_smiles)
clf = RandomForestClassifier(n_estimators=500)
clf.fit(X_train, train_labels)
# 筛选大型库
X_screen = transformer(screening_library) # 例如,100 万个化合物
predictions = clf.predict_proba(X_screen)[:, 1]
# 排序并选择 top hits
top_indices = predictions.argsort()[::-1][:1000]
top_hits = [screening_library[i] for i in top_indices]
from sklearn.metrics.pairwise import cosine_similarity
# 查询分子
calc = FPCalculator("ecfp")
query_fp = calc(query_smiles).reshape(1, -1)
# 数据库指纹
transformer = MoleculeTransformer(calc, n_jobs=-1)
database_fps = transformer(database_smiles)
# 计算相似性
similarities = cosine_similarity(query_fp, database_fps)[0]
top_similar = similarities.argsort()[-10:][::-1]
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# 创建端到端流程
pipeline = Pipeline([
('featurizer', MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)),
('classifier', RandomForestClassifier(n_estimators=100))
])
# 直接在 SMILES 上训练和预测
pipeline.fit(smiles_train, y_train)
predictions = pipeline.predict(smiles_test)
featurizers = {
'ECFP': FPCalculator("ecfp"),
'MACCS': FPCalculator("maccs"),
'Descriptors': RDKitDescriptors2D(),
'ChemBERTa': PretrainedMolTransformer("ChemBERTa-77M-MLM")
}
results = {}
for name, feat in featurizers.items():
transformer = MoleculeTransformer(feat, n_jobs=-1)
X = transformer(smiles)
# 使用您的 ML 模型进行评估
score = evaluate_model(X, y)
results[name] = score
使用 ModelStore 探索所有可用的特征化器:
from molfeat.store.modelstore import ModelStore
store = ModelStore()
# 列出所有可用模型
all_models = store.available_models
print(f"特征化器总数:{len(all_models)}")
# 搜索特定模型
chemberta_models = store.search(name="ChemBERTa")
for model in chemberta_models:
print(f"- {model.name}: {model.description}")
# 获取使用信息
model_card = store.search(name="ChemBERTa-77M-MLM")[0]
model_card.usage() # 显示使用示例
# 加载模型
transformer = store.load("ChemBERTa-77M-MLM")
class CustomTransformer(MoleculeTransformer):
def preprocess(self, mol):
"""自定义预处理流程"""
if isinstance(mol, str):
mol = dm.to_mol(mol)
mol = dm.standardize_mol(mol)
mol = dm.remove_salts(mol)
return mol
transformer = CustomTransformer(FPCalculator("ecfp"), n_jobs=-1)
def featurize_in_chunks(smiles_list, transformer, chunk_size=10000):
"""分块处理大型数据集以管理内存"""
all_features = []
for i in range(0, len(smiles_list), chunk_size):
chunk = smiles_list[i:i+chunk_size]
features = transformer(chunk)
all_features.append(features)
return np.vstack(all_features)
import pickle
cache_file = "embeddings_cache.pkl"
transformer = PretrainedMolTransformer("ChemBERTa-77M-MLM", n_jobs=-1)
try:
with open(cache_file, "rb") as f:
embeddings = pickle.load(f)
except FileNotFoundError:
embeddings = transformer(smiles_list)
with open(cache_file, "wb") as f:
pickle.dump(embeddings, f)
设置 n_jobs=-1 以利用所有 CPU 核心;在精度允许时使用 dtype=np.float32;对于大型数据集设置 ignore_errors=True。

常用特征化器的快速参考:
| 特征化器 | 类型 | 维度 | 速度 | 使用场景 |
|---|---|---|---|---|
ecfp | 指纹 | 2048 | 快速 | 通用 |
maccs | 指纹 | 167 | 非常快 | 骨架相似性 |
desc2D | 描述符 | 200+ | 快速 | 可解释模型 |
mordred | 描述符 | 1800+ | 中等 | 全面特征 |
map4 | 指纹 | 1024 | 快速 | 大规模筛选 |
ChemBERTa-77M-MLM | 深度学习 | 768 | 慢* | 迁移学习 |
gin-supervised-masking | GNN | 可变 | 慢* | 基于图的模型 |
*首次运行较慢;后续运行受益于缓存
此技能包含全面的参考文档:
完整的 API 文档,涵盖:
molfeat.calc - 所有计算器类和参数
molfeat.trans - 转换器类和方法
molfeat.store - ModelStore 使用

何时加载: 在实现特定计算器、理解转换器参数或与 scikit-learn/PyTorch 集成时参考。
按类别组织的所有 100 多种特征化器的综合目录:
何时加载: 在为特定任务选择最佳特征化器、探索可用选项或理解特征化器特性时参考。
搜索提示: 使用 grep 查找特定类型的特征化器:
grep -i "chembert" references/available_featurizers.md
grep -i "pharmacophore" references/available_featurizers.md
常见场景的实用代码示例:
何时加载: 在实现特定工作流程、排除问题或学习 molfeat 模式时参考。
启用错误处理以跳过无效的 SMILES:
transformer = MoleculeTransformer(
calc,
ignore_errors=True,
verbose=True
)
对于超过 10 万个分子的数据集,请分块处理或使用流式处理方法。
某些模型需要额外的包。安装特定的额外项:
uv pip install "molfeat[transformer]" # 用于 ChemBERTa/ChemGPT
uv pip install "molfeat[dgl]" # 用于 GIN 模型
保存精确配置并记录版本:
transformer.to_state_yaml_file("config.yml")
import molfeat
print(f"molfeat 版本:{molfeat.__version__}")
每周安装次数
117
仓库
GitHub 星标数
22.6K
首次出现
2026 年 1 月 21 日
安全审计
Gen Agent Trust Hub:通过 · Socket:通过 · Snyk:警告
安装于
claude-code: 102
opencode: 93
cursor: 91
gemini-cli: 89
antigravity: 83
codex: 78
Molfeat is a comprehensive Python library for molecular featurization that unifies 100+ pre-trained embeddings and hand-crafted featurizers. Convert chemical structures (SMILES strings or RDKit molecules) into numerical representations for machine learning tasks including QSAR modeling, virtual screening, similarity searching, and deep learning applications. Features fast parallel processing, scikit-learn compatible transformers, and built-in caching.
This skill should be used when working with:
uv pip install molfeat
# With all optional dependencies
uv pip install "molfeat[all]"
Optional dependencies for specific featurizers:
molfeat[dgl] - GNN models (GIN variants)
molfeat[graphormer] - Graphormer models
molfeat[transformer] - ChemBERTa, ChemGPT, MolT5
molfeat[fcd] - FCD descriptors
molfeat[map4] - MAP4 fingerprints

Molfeat organizes featurization into three hierarchical classes:
Calculators (molfeat.calc): Callable objects that convert individual molecules into feature vectors. Accept RDKit Chem.Mol objects or SMILES strings.
Use calculators for:
Example:
from molfeat.calc import FPCalculator
calc = FPCalculator("ecfp", radius=3, fpSize=2048)
features = calc("CCO") # Returns numpy array (2048,)
Transformers (molfeat.trans): Scikit-learn compatible transformers that wrap calculators for batch processing with parallelization.
Use transformers for:
Example:
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
features = transformer(smiles_list) # Parallel processing
Pretrained transformers (molfeat.trans.pretrained): Specialized transformers for deep learning models with batched inference and caching.
Use pretrained transformers for:
Example:
from molfeat.trans.pretrained import PretrainedMolTransformer
transformer = PretrainedMolTransformer("ChemBERTa-77M-MLM", n_jobs=-1)
embeddings = transformer(smiles_list) # Deep learning embeddings
import datamol as dm
from molfeat.calc import FPCalculator
from molfeat.trans import MoleculeTransformer
# Load molecular data
smiles = ["CCO", "CC(=O)O", "c1ccccc1", "CC(C)O"]
# Create calculator and transformer
calc = FPCalculator("ecfp", radius=3)
transformer = MoleculeTransformer(calc, n_jobs=-1)
# Featurize molecules
features = transformer(smiles)
print(f"Shape: {features.shape}") # (4, 2048)
# Save featurizer configuration for reproducibility
transformer.to_state_yaml_file("featurizer_config.yml")
# Reload exact configuration
loaded = MoleculeTransformer.from_state_yaml_file("featurizer_config.yml")
# Process dataset with potentially invalid SMILES
transformer = MoleculeTransformer(
calc,
n_jobs=-1,
ignore_errors=True, # Continue on failures
verbose=True # Log error details
)
features = transformer(smiles_with_errors)
# Returns None for failed molecules
Start with fingerprints:
# ECFP - Most popular, general-purpose
FPCalculator("ecfp", radius=3, fpSize=2048)
# MACCS - Fast, good for scaffold hopping
FPCalculator("maccs")
# MAP4 - Efficient for large-scale screening
FPCalculator("map4")
For interpretable models:
# RDKit 2D descriptors (200+ named properties)
from molfeat.calc import RDKitDescriptors2D
RDKitDescriptors2D()
# Mordred (1800+ comprehensive descriptors)
from molfeat.calc import MordredDescriptors
MordredDescriptors()
Combine multiple featurizers:
from molfeat.trans import FeatConcat
concat = FeatConcat([
FPCalculator("maccs"), # 167 dimensions
FPCalculator("ecfp") # 2048 dimensions
]) # Result: 2215-dimensional combined features
Transformer-based embeddings:
# ChemBERTa - Pre-trained on 77M PubChem compounds
PretrainedMolTransformer("ChemBERTa-77M-MLM")
# ChemGPT - Autoregressive language model
PretrainedMolTransformer("ChemGPT-1.2B")
Graph neural networks:
# GIN models with different pre-training objectives
PretrainedMolTransformer("gin-supervised-masking")
PretrainedMolTransformer("gin-supervised-infomax")
# Graphormer for quantum chemistry
PretrainedMolTransformer("Graphormer-pcqm4mv2")
# ECFP - General purpose, most widely used
FPCalculator("ecfp")
# MACCS - Fast, scaffold-based similarity
FPCalculator("maccs")
# MAP4 - Efficient for large databases
FPCalculator("map4")
# USR/USRCAT - 3D shape similarity
from molfeat.calc import USRDescriptors
USRDescriptors()
# FCFP - Functional group based
FPCalculator("fcfp")
# CATS - Pharmacophore pair distributions
from molfeat.calc import CATSCalculator
CATSCalculator(mode="2D")
# Gobbi - Explicit pharmacophore features
FPCalculator("gobbi2D")
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Featurize molecules
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
X = transformer(smiles_train)
# Train model
model = RandomForestRegressor(n_estimators=100)
scores = cross_val_score(model, X, y_train, cv=5)
print(f"R² = {scores.mean():.3f}")
# Save configuration for deployment
transformer.to_state_yaml_file("production_featurizer.yml")
from sklearn.ensemble import RandomForestClassifier
# Train on known actives/inactives
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
X_train = transformer(train_smiles)
clf = RandomForestClassifier(n_estimators=500)
clf.fit(X_train, train_labels)
# Screen large library
X_screen = transformer(screening_library) # e.g., 1M compounds
predictions = clf.predict_proba(X_screen)[:, 1]
# Rank and select top hits
top_indices = predictions.argsort()[::-1][:1000]
top_hits = [screening_library[i] for i in top_indices]
from sklearn.metrics.pairwise import cosine_similarity
# Query molecule
calc = FPCalculator("ecfp")
query_fp = calc(query_smiles).reshape(1, -1)
# Database fingerprints
transformer = MoleculeTransformer(calc, n_jobs=-1)
database_fps = transformer(database_smiles)
# Compute similarity
similarities = cosine_similarity(query_fp, database_fps)[0]
top_similar = similarities.argsort()[-10:][::-1]
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Create end-to-end pipeline
pipeline = Pipeline([
('featurizer', MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)),
('classifier', RandomForestClassifier(n_estimators=100))
])
# Train and predict directly on SMILES
pipeline.fit(smiles_train, y_train)
predictions = pipeline.predict(smiles_test)
featurizers = {
'ECFP': FPCalculator("ecfp"),
'MACCS': FPCalculator("maccs"),
'Descriptors': RDKitDescriptors2D(),
'ChemBERTa': PretrainedMolTransformer("ChemBERTa-77M-MLM")
}
results = {}
for name, feat in featurizers.items():
transformer = MoleculeTransformer(feat, n_jobs=-1)
X = transformer(smiles)
# Evaluate with your ML model
score = evaluate_model(X, y)
results[name] = score
Use the ModelStore to explore all available featurizers:
from molfeat.store.modelstore import ModelStore
store = ModelStore()
# List all available models
all_models = store.available_models
print(f"Total featurizers: {len(all_models)}")
# Search for specific models
chemberta_models = store.search(name="ChemBERTa")
for model in chemberta_models:
print(f"- {model.name}: {model.description}")
# Get usage information
model_card = store.search(name="ChemBERTa-77M-MLM")[0]
model_card.usage() # Display usage examples
# Load model
transformer = store.load("ChemBERTa-77M-MLM")
class CustomTransformer(MoleculeTransformer):
def preprocess(self, mol):
"""Custom preprocessing pipeline"""
if isinstance(mol, str):
mol = dm.to_mol(mol)
mol = dm.standardize_mol(mol)
mol = dm.remove_salts(mol)
return mol
transformer = CustomTransformer(FPCalculator("ecfp"), n_jobs=-1)
def featurize_in_chunks(smiles_list, transformer, chunk_size=10000):
"""Process large datasets in chunks to manage memory"""
all_features = []
for i in range(0, len(smiles_list), chunk_size):
chunk = smiles_list[i:i+chunk_size]
features = transformer(chunk)
all_features.append(features)
return np.vstack(all_features)
import pickle
cache_file = "embeddings_cache.pkl"
transformer = PretrainedMolTransformer("ChemBERTa-77M-MLM", n_jobs=-1)
try:
with open(cache_file, "rb") as f:
embeddings = pickle.load(f)
except FileNotFoundError:
embeddings = transformer(smiles_list)
with open(cache_file, "wb") as f:
pickle.dump(embeddings, f)
Set n_jobs=-1 to utilize all CPU cores; use dtype=np.float32 when precision allows; set ignore_errors=True for large datasets.

Quick reference for frequently used featurizers:
| Featurizer | Type | Dimensions | Speed | Use Case |
|---|---|---|---|---|
ecfp | Fingerprint | 2048 | Fast | General purpose |
maccs | Fingerprint | 167 | Very fast | Scaffold similarity |
desc2D | Descriptors | 200+ | Fast | Interpretable models |
mordred | Descriptors | 1800+ | Medium | Comprehensive features |
*First run is slow; subsequent runs benefit from caching
This skill includes comprehensive reference documentation:
Complete API documentation covering:
molfeat.calc - All calculator classes and parameters
molfeat.trans - Transformer classes and methods
molfeat.store - ModelStore usage

When to load: Reference when implementing specific calculators, understanding transformer parameters, or integrating with scikit-learn/PyTorch.
Comprehensive catalog of all 100+ featurizers organized by category:
When to load: Reference when selecting the optimal featurizer for a specific task, exploring available options, or understanding featurizer characteristics.
Search tip: Use grep to find specific featurizer types:
grep -i "chembert" references/available_featurizers.md
grep -i "pharmacophore" references/available_featurizers.md
Practical code examples for common scenarios:
When to load: Reference when implementing specific workflows, troubleshooting issues, or learning molfeat patterns.
Enable error handling to skip invalid SMILES:
transformer = MoleculeTransformer(
calc,
ignore_errors=True,
verbose=True
)
Process in chunks or use streaming approaches for datasets > 100K molecules.
Some models require additional packages. Install specific extras:
uv pip install "molfeat[transformer]" # For ChemBERTa/ChemGPT
uv pip install "molfeat[dgl]" # For GIN models
Save exact configurations and document versions:
transformer.to_state_yaml_file("config.yml")
import molfeat
print(f"molfeat version: {molfeat.__version__}")
Weekly Installs
117
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Warn
Installed on
claude-code: 102
opencode: 93
cursor: 91
gemini-cli: 89
antigravity: 83
codex: 78
超能力技能使用指南:AI助手技能调用优先级与工作流程详解
47,800 周安装
ASP.NET Core 开发指南:Web API、身份验证、中间件与性能优化实战
134 周安装
agent-browser 浏览器自动化工具 - 快速网页交互与测试命令行工具
134 周安装
find-skills技能:AI智能体技能搜索与安装工具,扩展Claude能力
134 周安装
Azure Functions 最佳实践指南:独立工作进程、Node.js/Python 编程模型与反模式详解
134 周安装
gentle-teaching 温和教学框架:AI辅助学习指南,培养独立解决问题能力
134 周安装
Symfony Scheduler 异步任务调度器:实现稳定重试与失败传输的工作流
134 周安装
mordred | Descriptors | 1800+ | Medium | Comprehensive features |
map4 | Fingerprint | 1024 | Fast | Large-scale screening |
ChemBERTa-77M-MLM | Deep learning | 768 | Slow* | Transfer learning |
gin-supervised-masking | GNN | Variable | Slow* | Graph-based models |