datamol by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill datamol

Datamol 是一个 Python 库,为 RDKit 提供了一个轻量级、符合 Python 风格的抽象层,用于分子化学信息学。通过合理的默认设置、高效的并行化和现代化的 I/O 功能,简化复杂的分子操作。所有分子对象都是原生的 rdkit.Chem.Mol 实例,确保与 RDKit 生态系统的完全兼容性。
核心功能:
指导用户安装 datamol:
uv pip install datamol
导入约定:
import datamol as dm
从 SMILES 创建分子:
import datamol as dm
# 单个分子
mol = dm.to_mol("CCO") # 乙醇
# 从 SMILES 列表
smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]
mols = [dm.to_mol(smi) for smi in smiles_list]
# 错误处理
mol = dm.to_mol("invalid_smiles") # 返回 None
if mol is None:
print("Failed to parse SMILES")
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
将分子转换为 SMILES:
# 规范 SMILES
smiles = dm.to_smiles(mol)
# 异构 SMILES(包含立体化学信息)
smiles = dm.to_smiles(mol, isomeric=True)
# 其他格式
inchi = dm.to_inchi(mol)
inchikey = dm.to_inchikey(mol)
selfies = dm.to_selfies(mol)
标准化和净化(始终推荐用于用户提供的分子):
# 净化分子
mol = dm.sanitize_mol(mol)
# 完全标准化(推荐用于数据集)
mol = dm.standardize_mol(
mol,
disconnect_metals=True,
normalize=True,
reionize=True
)
# 直接处理 SMILES 字符串
clean_smiles = dm.standardize_smiles(smiles)
有关全面的 I/O 文档,请参阅 references/io_module.md。
读取文件:
# SDF 文件(化学中最常见)
df = dm.read_sdf("compounds.sdf", mol_column='mol')
# SMILES 文件
df = dm.read_smi("molecules.smi", smiles_column='smiles', mol_column='mol')
# 包含 SMILES 列的 CSV
df = dm.read_csv("data.csv", smiles_column="SMILES", mol_column="mol")
# Excel 文件
df = dm.read_excel("compounds.xlsx", sheet_name=0, mol_column="mol")
# 通用读取器(自动检测格式)
df = dm.open_df("file.sdf") # 适用于 .sdf、.csv、.xlsx、.parquet、.json
写入文件:
# 保存为 SDF
dm.to_sdf(mols, "output.sdf")
# 或从 DataFrame
dm.to_sdf(df, "output.sdf", mol_column="mol")
# 保存为 SMILES 文件
dm.to_smi(mols, "output.smi")
# 包含渲染分子图像的 Excel
dm.to_xlsx(df, "output.xlsx", mol_columns=["mol"])
远程文件支持(S3、GCS、HTTP):
# 从云存储读取
df = dm.read_sdf("s3://bucket/compounds.sdf")
df = dm.read_csv("https://example.com/data.csv")
# 写入云存储
dm.to_sdf(mols, "s3://bucket/output.sdf")
有关详细的描述符文档,请参阅 references/descriptors_viz.md。
计算单个分子的描述符:
# 获取标准描述符集
descriptors = dm.descriptors.compute_many_descriptors(mol)
# 返回:{'mw': 46.07, 'logp': -0.03, 'hbd': 1, 'hba': 1,
# 'tpsa': 20.23, 'n_aromatic_atoms': 0, ...}
批量描述符计算(推荐用于数据集):
# 并行计算所有分子
desc_df = dm.descriptors.batch_compute_many_descriptors(
mols,
n_jobs=-1, # 使用所有 CPU 核心
progress=True # 显示进度条
)
特定描述符:
# 芳香性
n_aromatic = dm.descriptors.n_aromatic_atoms(mol)
aromatic_ratio = dm.descriptors.n_aromatic_atoms_proportion(mol)
# 立体化学
n_stereo = dm.descriptors.n_stereo_centers(mol)
n_unspec = dm.descriptors.n_stereo_centers_unspecified(mol)
# 柔性
n_rigid = dm.descriptors.n_rigid_bonds(mol)
类药性过滤(Lipinski 五规则):
# 过滤化合物
def is_druglike(mol):
desc = dm.descriptors.compute_many_descriptors(mol)
return (
desc['mw'] <= 500 and
desc['logp'] <= 5 and
desc['hbd'] <= 5 and
desc['hba'] <= 10
)
druglike_mols = [mol for mol in mols if is_druglike(mol)]
生成指纹:
# ECFP(扩展连接性指纹,默认)
fp = dm.to_fp(mol, fp_type='ecfp', radius=2, n_bits=2048)
# 其他指纹类型
fp_maccs = dm.to_fp(mol, fp_type='maccs')
fp_topological = dm.to_fp(mol, fp_type='topological')
fp_atompair = dm.to_fp(mol, fp_type='atompair')
相似性计算:
# 集合内的成对距离
distance_matrix = dm.pdist(mols, n_jobs=-1)
# 两个集合之间的距离
distances = dm.cdist(query_mols, library_mols, n_jobs=-1)
# 查找最相似的分子
from scipy.spatial.distance import squareform
dist_matrix = squareform(dm.pdist(mols))
# 距离越小 = 相似度越高(Tanimoto 距离 = 1 - Tanimoto 相似度)
有关聚类详细信息,请参阅 references/core_api.md。
Butina 聚类:
# 按结构相似性聚类分子
clusters = dm.cluster_mols(
mols,
cutoff=0.2, # Tanimoto 距离阈值(0=相同,1=完全不同)
n_jobs=-1 # 并行处理
)
# 每个聚类是一个分子索引列表
for i, cluster in enumerate(clusters):
print(f"Cluster {i}: {len(cluster)} molecules")
cluster_mols = [mols[idx] for idx in cluster]
重要提示:Butina 聚类会构建完整的距离矩阵 - 适用于约 1000 个分子,不适用于 10,000 个以上。
多样性选择:
# 选择多样性子集
diverse_mols = dm.pick_diverse(
mols,
npick=100 # 选择 100 个多样性分子
)
# 选择聚类中心
centroids = dm.pick_centroids(
mols,
npick=50 # 选择 50 个代表性分子
)
有关完整的骨架文档,请参阅 references/fragments_scaffolds.md。
提取 Murcko 骨架:
# 获取 Bemis-Murcko 骨架(核心结构)
scaffold = dm.to_scaffold_murcko(mol)
scaffold_smiles = dm.to_smiles(scaffold)
基于骨架的分析:
# 按骨架分组化合物
from collections import Counter
scaffolds = [dm.to_scaffold_murcko(mol) for mol in mols]
scaffold_smiles = [dm.to_smiles(s) for s in scaffolds]
# 统计骨架频率
scaffold_counts = Counter(scaffold_smiles)
most_common = scaffold_counts.most_common(10)
# 创建骨架到分子的映射
scaffold_groups = {}
for mol, scaf_smi in zip(mols, scaffold_smiles):
if scaf_smi not in scaffold_groups:
scaffold_groups[scaf_smi] = []
scaffold_groups[scaf_smi].append(mol)
基于骨架的训练/测试分割(用于机器学习):
# 确保训练集和测试集具有不同的骨架
scaffold_to_mols = {}
for mol, scaf in zip(mols, scaffold_smiles):
if scaf not in scaffold_to_mols:
scaffold_to_mols[scaf] = []
scaffold_to_mols[scaf].append(mol)
# 将骨架分割为训练/测试集
import random
scaffolds = list(scaffold_to_mols.keys())
random.shuffle(scaffolds)
split_idx = int(0.8 * len(scaffolds))
train_scaffolds = scaffolds[:split_idx]
test_scaffolds = scaffolds[split_idx:]
# 获取每个分割的分子
train_mols = [mol for scaf in train_scaffolds for mol in scaffold_to_mols[scaf]]
test_mols = [mol for scaf in test_scaffolds for mol in scaffold_to_mols[scaf]]
有关片段化详细信息,请参阅 references/fragments_scaffolds.md。
BRICS 片段化(16 种键类型):
# 片段化分子
fragments = dm.fragment.brics(mol)
# 返回:带有连接点(如 '[1*]CCN')的片段 SMILES 集合
RECAP 片段化(11 种键类型):
fragments = dm.fragment.recap(mol)
片段分析:
# 在化合物库中查找常见片段
from collections import Counter
all_fragments = []
for mol in mols:
frags = dm.fragment.brics(mol)
all_fragments.extend(frags)
fragment_counts = Counter(all_fragments)
common_frags = fragment_counts.most_common(20)
# 基于片段的评分
def fragment_score(mol, reference_fragments):
mol_frags = dm.fragment.brics(mol)
overlap = mol_frags.intersection(reference_fragments)
return len(overlap) / len(mol_frags) if mol_frags else 0
有关详细的构象文档,请参阅 references/conformers_module.md。
生成构象:
# 生成 3D 构象
mol_3d = dm.conformers.generate(
mol,
n_confs=50, # 生成数量(如果为 None 则自动)
rms_cutoff=0.5, # 过滤相似构象(埃)
minimize_energy=True, # 使用 UFF 力场最小化
method='ETKDGv3' # 嵌入方法(推荐)
)
# 访问构象
n_conformers = mol_3d.GetNumConformers()
conf = mol_3d.GetConformer(0) # 获取第一个构象
positions = conf.GetPositions() # 原子坐标的 Nx3 数组
构象聚类:
# 按 RMSD 聚类构象
clusters = dm.conformers.cluster(
mol_3d,
rms_cutoff=1.0,
centroids=False
)
# 获取代表性构象
centroids = dm.conformers.return_centroids(mol_3d, clusters)
SASA 计算:
# 计算溶剂可及表面积
sasa_values = dm.conformers.sasa(mol_3d, n_jobs=-1)
# 从构象属性访问 SASA
conf = mol_3d.GetConformer(0)
sasa = conf.GetDoubleProp('rdkit_free_sasa')
有关可视化文档,请参阅 references/descriptors_viz.md。
基本分子网格:
# 可视化分子
dm.viz.to_image(
mols[:20],
legends=[dm.to_smiles(m) for m in mols[:20]],
n_cols=5,
mol_size=(300, 300)
)
# 保存到文件
dm.viz.to_image(mols, outfile="molecules.png")
# 用于出版的 SVG
dm.viz.to_image(mols, outfile="molecules.svg", use_svg=True)
对齐可视化(用于 SAR 分析):
# 按共同子结构对齐分子
dm.viz.to_image(
similar_mols,
align=True, # 启用 MCS 对齐
legends=activity_labels,
n_cols=4
)
高亮显示子结构:
# 高亮特定原子和键
dm.viz.to_image(
mol,
highlight_atom=[0, 1, 2, 3], # 原子索引
highlight_bond=[0, 1, 2] # 键索引
)
构象可视化:
# 显示多个构象
dm.viz.conformers(
mol_3d,
n_confs=10,
align_conf=True,
n_cols=3
)
有关反应文档,请参阅 references/reactions_data.md。
应用反应:
from rdkit.Chem import rdChemReactions
# 从 SMARTS 定义反应
rxn_smarts = '[C:1](=[O:2])[OH:3]>>[C:1](=[O:2])[Cl:3]'
rxn = rdChemReactions.ReactionFromSmarts(rxn_smarts)
# 应用于分子
reactant = dm.to_mol("CC(=O)O") # 乙酸
product = dm.reactions.apply_reaction(
rxn,
(reactant,),
sanitize=True
)
# 转换为 SMILES
product_smiles = dm.to_smiles(product)
批量反应应用:
# 将反应应用于库
products = []
for mol in reactant_mols:
try:
prod = dm.reactions.apply_reaction(rxn, (mol,))
if prod is not None:
products.append(prod)
except Exception as e:
print(f"Reaction failed: {e}")
Datamol 包含许多操作的内置并行化。使用 n_jobs 参数:
n_jobs=1:顺序(无并行化)
n_jobs=-1:使用所有可用的 CPU 核心
n_jobs=4:使用 4 个核心
支持并行化的函数:
dm.read_sdf(..., n_jobs=-1)
dm.descriptors.batch_compute_many_descriptors(..., n_jobs=-1)
dm.cluster_mols(..., n_jobs=-1)
dm.pdist(..., n_jobs=-1)
dm.conformers.sasa(..., n_jobs=-1)
进度条:许多批处理操作支持 progress=True 参数。
import datamol as dm
import pandas as pd
# 1. 加载分子
df = dm.read_sdf("compounds.sdf")
# 2. 标准化
df['mol'] = df['mol'].apply(lambda m: dm.standardize_mol(m) if m else None)
df = df[df['mol'].notna()] # 移除失败的分子
# 3. 计算描述符
desc_df = dm.descriptors.batch_compute_many_descriptors(
df['mol'].tolist(),
n_jobs=-1,
progress=True
)
# 4. 按类药性过滤
druglike = (
(desc_df['mw'] <= 500) &
(desc_df['logp'] <= 5) &
(desc_df['hbd'] <= 5) &
(desc_df['hba'] <= 10)
)
filtered_df = df[druglike]
# 5. 聚类并选择多样性子集
diverse_mols = dm.pick_diverse(
filtered_df['mol'].tolist(),
npick=100
)
# 6. 可视化结果
dm.viz.to_image(
diverse_mols,
legends=[dm.to_smiles(m) for m in diverse_mols],
outfile="diverse_compounds.png",
n_cols=10
)
# 按骨架分组
scaffolds = [dm.to_scaffold_murcko(mol) for mol in mols]
scaffold_smiles = [dm.to_smiles(s) for s in scaffolds]
# 创建包含活性的 DataFrame
sar_df = pd.DataFrame({
'mol': mols,
'scaffold': scaffold_smiles,
'activity': activities # 用户提供的活性数据
})
# 分析每个骨架系列
for scaffold, group in sar_df.groupby('scaffold'):
if len(group) >= 3: # 需要多个示例
print(f"\nScaffold: {scaffold}")
print(f"Count: {len(group)}")
print(f"Activity range: {group['activity'].min():.2f} - {group['activity'].max():.2f}")
# 使用活性作为图例进行可视化
dm.viz.to_image(
group['mol'].tolist(),
legends=[f"Activity: {act:.2f}" for act in group['activity']],
align=True # 按共同子结构对齐
)
# 1. 为查询和库生成指纹
query_fps = [dm.to_fp(mol) for mol in query_actives]
library_fps = [dm.to_fp(mol) for mol in library_mols]
# 2. 计算相似性
from scipy.spatial.distance import cdist
import numpy as np
distances = dm.cdist(query_actives, library_mols, n_jobs=-1)
# 3. 查找最接近的匹配(到任何查询的最小距离)
min_distances = distances.min(axis=0)
similarities = 1 - min_distances # 将距离转换为相似度
# 4. 排序并选择 top hits
top_indices = np.argsort(similarities)[::-1][:100] # Top 100
top_hits = [library_mols[i] for i in top_indices]
top_scores = [similarities[i] for i in top_indices]
# 5. 可视化 hits
dm.viz.to_image(
top_hits[:20],
legends=[f"Sim: {score:.3f}" for score in top_scores[:20]],
outfile="screening_hits.png"
)
有关详细的 API 文档,请查阅这些参考文件:
references/core_api.md:核心命名空间函数(转换、标准化、指纹、聚类)
references/io_module.md:文件 I/O 操作(读取/写入 SDF、CSV、Excel、远程文件)
references/conformers_module.md:3D 构象生成、聚类、SASA 计算
references/descriptors_viz.md:分子描述符和可视化函数
references/fragments_scaffolds.md:骨架提取、BRICS/RECAP 片段化
references/reactions_data.md:化学反应和玩具数据集
始终标准化来自外部源的分子:
mol = dm.standardize_mol(mol, disconnect_metals=True, normalize=True, reionize=True)
检查 None 值在分子解析后:
mol = dm.to_mol(smiles)
if mol is None:
# 处理无效的 SMILES
使用并行处理处理大型数据集:
result = dm.operation(..., n_jobs=-1, progress=True)
利用 fsspec 进行云存储:
df = dm.read_sdf("s3://bucket/compounds.sdf")
使用适当的指纹进行相似性计算:
考虑规模限制:
用于机器学习的骨架分割:确保通过骨架进行适当的训练/测试分离
对齐分子当可视化 SAR 系列时
# 安全的分子创建
def safe_to_mol(smiles):
try:
mol = dm.to_mol(smiles)
if mol is not None:
mol = dm.standardize_mol(mol)
return mol
except Exception as e:
print(f"Failed to process {smiles}: {e}")
return None
# 安全的批处理
valid_mols = []
for smiles in smiles_list:
mol = safe_to_mol(smiles)
if mol is not None:
valid_mols.append(mol)
# 特征生成
X = np.array([dm.to_fp(mol) for mol in mols])
# 或描述符
desc_df = dm.descriptors.batch_compute_many_descriptors(mols, n_jobs=-1)
X = desc_df.values
# 训练模型
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, y_target)
# 预测
predictions = model.predict(X_test)
问题:分子解析失败
解决方案:先使用 dm.standardize_smiles(),或尝试 dm.fix_mol()
问题:聚类时内存错误
解决方案:对大型数据集使用 dm.pick_diverse() 而不是完全聚类
问题:构象生成缓慢
解决方案:减少 n_confs 或增加 rms_cutoff 以生成更少的构象
问题:远程文件访问失败
每周安装次数
132
仓库
GitHub 星标数
23.4K
首次出现
2026年1月21日
安全审计
安装于
claude-code: 116
opencode: 108
gemini-cli: 103
cursor: 103
antigravity: 99
codex: 91
Datamol is a Python library that provides a lightweight, Pythonic abstraction layer over RDKit for molecular cheminformatics. Simplify complex molecular operations with sensible defaults, efficient parallelization, and modern I/O capabilities. All molecular objects are native rdkit.Chem.Mol instances, ensuring full compatibility with the RDKit ecosystem.
Key capabilities :
Guide users to install datamol:
uv pip install datamol
Import convention :
import datamol as dm
Creating molecules from SMILES :
import datamol as dm
# Single molecule
mol = dm.to_mol("CCO") # Ethanol
# From list of SMILES
smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]
mols = [dm.to_mol(smi) for smi in smiles_list]
# Error handling
mol = dm.to_mol("invalid_smiles") # Returns None
if mol is None:
print("Failed to parse SMILES")
Converting molecules to SMILES :
# Canonical SMILES
smiles = dm.to_smiles(mol)
# Isomeric SMILES (includes stereochemistry)
smiles = dm.to_smiles(mol, isomeric=True)
# Other formats
inchi = dm.to_inchi(mol)
inchikey = dm.to_inchikey(mol)
selfies = dm.to_selfies(mol)
Standardization and sanitization (always recommend for user-provided molecules):
# Sanitize molecule
mol = dm.sanitize_mol(mol)
# Full standardization (recommended for datasets)
mol = dm.standardize_mol(
mol,
disconnect_metals=True,
normalize=True,
reionize=True
)
# For SMILES strings directly
clean_smiles = dm.standardize_smiles(smiles)
Refer to references/io_module.md for comprehensive I/O documentation.
Reading files :
# SDF files (most common in chemistry)
df = dm.read_sdf("compounds.sdf", mol_column='mol')
# SMILES files
df = dm.read_smi("molecules.smi", smiles_column='smiles', mol_column='mol')
# CSV with SMILES column
df = dm.read_csv("data.csv", smiles_column="SMILES", mol_column="mol")
# Excel files
df = dm.read_excel("compounds.xlsx", sheet_name=0, mol_column="mol")
# Universal reader (auto-detects format)
df = dm.open_df("file.sdf") # Works with .sdf, .csv, .xlsx, .parquet, .json
Writing files :
# Save as SDF
dm.to_sdf(mols, "output.sdf")
# Or from DataFrame
dm.to_sdf(df, "output.sdf", mol_column="mol")
# Save as SMILES file
dm.to_smi(mols, "output.smi")
# Excel with rendered molecule images
dm.to_xlsx(df, "output.xlsx", mol_columns=["mol"])
Remote file support (S3, GCS, HTTP):
# Read from cloud storage
df = dm.read_sdf("s3://bucket/compounds.sdf")
df = dm.read_csv("https://example.com/data.csv")
# Write to cloud storage
dm.to_sdf(mols, "s3://bucket/output.sdf")
Refer to references/descriptors_viz.md for detailed descriptor documentation.
Computing descriptors for a single molecule :
# Get standard descriptor set
descriptors = dm.descriptors.compute_many_descriptors(mol)
# Returns: {'mw': 46.07, 'logp': -0.03, 'hbd': 1, 'hba': 1,
# 'tpsa': 20.23, 'n_aromatic_atoms': 0, ...}
Batch descriptor computation (recommended for datasets):
# Compute for all molecules in parallel
desc_df = dm.descriptors.batch_compute_many_descriptors(
mols,
n_jobs=-1, # Use all CPU cores
progress=True # Show progress bar
)
Specific descriptors :
# Aromaticity
n_aromatic = dm.descriptors.n_aromatic_atoms(mol)
aromatic_ratio = dm.descriptors.n_aromatic_atoms_proportion(mol)
# Stereochemistry
n_stereo = dm.descriptors.n_stereo_centers(mol)
n_unspec = dm.descriptors.n_stereo_centers_unspecified(mol)
# Flexibility
n_rigid = dm.descriptors.n_rigid_bonds(mol)
Drug-likeness filtering (Lipinski's Rule of Five) :
# Filter compounds
def is_druglike(mol):
desc = dm.descriptors.compute_many_descriptors(mol)
return (
desc['mw'] <= 500 and
desc['logp'] <= 5 and
desc['hbd'] <= 5 and
desc['hba'] <= 10
)
druglike_mols = [mol for mol in mols if is_druglike(mol)]
Generating fingerprints :
# ECFP (Extended Connectivity Fingerprint, default)
fp = dm.to_fp(mol, fp_type='ecfp', radius=2, n_bits=2048)
# Other fingerprint types
fp_maccs = dm.to_fp(mol, fp_type='maccs')
fp_topological = dm.to_fp(mol, fp_type='topological')
fp_atompair = dm.to_fp(mol, fp_type='atompair')
Similarity calculations :
# Pairwise distances within a set
distance_matrix = dm.pdist(mols, n_jobs=-1)
# Distances between two sets
distances = dm.cdist(query_mols, library_mols, n_jobs=-1)
# Find most similar molecules
from scipy.spatial.distance import squareform
dist_matrix = squareform(dm.pdist(mols))
# Lower distance = higher similarity (Tanimoto distance = 1 - Tanimoto similarity)
Refer to references/core_api.md for clustering details.
Butina clustering :
# Cluster molecules by structural similarity
clusters = dm.cluster_mols(
mols,
cutoff=0.2, # Tanimoto distance threshold (0=identical, 1=completely different)
n_jobs=-1 # Parallel processing
)
# Each cluster is a list of molecule indices
for i, cluster in enumerate(clusters):
print(f"Cluster {i}: {len(cluster)} molecules")
cluster_mols = [mols[idx] for idx in cluster]
Important : Butina clustering builds a full distance matrix - suitable for ~1000 molecules, not for 10,000+.
Diversity selection :
# Pick diverse subset
diverse_mols = dm.pick_diverse(
mols,
npick=100 # Select 100 diverse molecules
)
# Pick cluster centroids
centroids = dm.pick_centroids(
mols,
npick=50 # Select 50 representative molecules
)
Refer to references/fragments_scaffolds.md for complete scaffold documentation.
Extracting Murcko scaffolds :
# Get Bemis-Murcko scaffold (core structure)
scaffold = dm.to_scaffold_murcko(mol)
scaffold_smiles = dm.to_smiles(scaffold)
Scaffold-based analysis :
# Group compounds by scaffold
from collections import Counter
scaffolds = [dm.to_scaffold_murcko(mol) for mol in mols]
scaffold_smiles = [dm.to_smiles(s) for s in scaffolds]
# Count scaffold frequency
scaffold_counts = Counter(scaffold_smiles)
most_common = scaffold_counts.most_common(10)
# Create scaffold-to-molecules mapping
scaffold_groups = {}
for mol, scaf_smi in zip(mols, scaffold_smiles):
if scaf_smi not in scaffold_groups:
scaffold_groups[scaf_smi] = []
scaffold_groups[scaf_smi].append(mol)
Scaffold-based train/test splitting (for ML):
# Ensure train and test sets have different scaffolds
scaffold_to_mols = {}
for mol, scaf in zip(mols, scaffold_smiles):
if scaf not in scaffold_to_mols:
scaffold_to_mols[scaf] = []
scaffold_to_mols[scaf].append(mol)
# Split scaffolds into train/test
import random
scaffolds = list(scaffold_to_mols.keys())
random.shuffle(scaffolds)
split_idx = int(0.8 * len(scaffolds))
train_scaffolds = scaffolds[:split_idx]
test_scaffolds = scaffolds[split_idx:]
# Get molecules for each split
train_mols = [mol for scaf in train_scaffolds for mol in scaffold_to_mols[scaf]]
test_mols = [mol for scaf in test_scaffolds for mol in scaffold_to_mols[scaf]]
Refer to references/fragments_scaffolds.md for fragmentation details.
BRICS fragmentation (16 bond types):
# Fragment molecule
fragments = dm.fragment.brics(mol)
# Returns: set of fragment SMILES with attachment points like '[1*]CCN'
RECAP fragmentation (11 bond types):
fragments = dm.fragment.recap(mol)
Fragment analysis :
# Find common fragments across compound library
from collections import Counter
all_fragments = []
for mol in mols:
frags = dm.fragment.brics(mol)
all_fragments.extend(frags)
fragment_counts = Counter(all_fragments)
common_frags = fragment_counts.most_common(20)
# Fragment-based scoring
def fragment_score(mol, reference_fragments):
mol_frags = dm.fragment.brics(mol)
overlap = mol_frags.intersection(reference_fragments)
return len(overlap) / len(mol_frags) if mol_frags else 0
Refer to references/conformers_module.md for detailed conformer documentation.
Generating conformers :
# Generate 3D conformers
mol_3d = dm.conformers.generate(
mol,
n_confs=50, # Number to generate (auto if None)
rms_cutoff=0.5, # Filter similar conformers (Ångströms)
minimize_energy=True, # Minimize with UFF force field
method='ETKDGv3' # Embedding method (recommended)
)
# Access conformers
n_conformers = mol_3d.GetNumConformers()
conf = mol_3d.GetConformer(0) # Get first conformer
positions = conf.GetPositions() # Nx3 array of atom coordinates
Conformer clustering :
# Cluster conformers by RMSD
clusters = dm.conformers.cluster(
mol_3d,
rms_cutoff=1.0,
centroids=False
)
# Get representative conformers
centroids = dm.conformers.return_centroids(mol_3d, clusters)
SASA calculation :
# Calculate solvent accessible surface area
sasa_values = dm.conformers.sasa(mol_3d, n_jobs=-1)
# Access SASA from conformer properties
conf = mol_3d.GetConformer(0)
sasa = conf.GetDoubleProp('rdkit_free_sasa')
Refer to references/descriptors_viz.md for visualization documentation.
Basic molecule grid :
# Visualize molecules
dm.viz.to_image(
mols[:20],
legends=[dm.to_smiles(m) for m in mols[:20]],
n_cols=5,
mol_size=(300, 300)
)
# Save to file
dm.viz.to_image(mols, outfile="molecules.png")
# SVG for publications
dm.viz.to_image(mols, outfile="molecules.svg", use_svg=True)
Aligned visualization (for SAR analysis):
# Align molecules by common substructure
dm.viz.to_image(
similar_mols,
align=True, # Enable MCS alignment
legends=activity_labels,
n_cols=4
)
Highlighting substructures :
# Highlight specific atoms and bonds
dm.viz.to_image(
mol,
highlight_atom=[0, 1, 2, 3], # Atom indices
highlight_bond=[0, 1, 2] # Bond indices
)
Conformer visualization :
# Display multiple conformers
dm.viz.conformers(
mol_3d,
n_confs=10,
align_conf=True,
n_cols=3
)
Refer to references/reactions_data.md for reactions documentation.
Applying reactions :
from rdkit.Chem import rdChemReactions
# Define reaction from SMARTS
rxn_smarts = '[C:1](=[O:2])[OH:3]>>[C:1](=[O:2])[Cl:3]'
rxn = rdChemReactions.ReactionFromSmarts(rxn_smarts)
# Apply to molecule
reactant = dm.to_mol("CC(=O)O") # Acetic acid
product = dm.reactions.apply_reaction(
rxn,
(reactant,),
sanitize=True
)
# Convert to SMILES
product_smiles = dm.to_smiles(product)
Batch reaction application :
# Apply reaction to library
products = []
for mol in reactant_mols:
try:
prod = dm.reactions.apply_reaction(rxn, (mol,))
if prod is not None:
products.append(prod)
except Exception as e:
print(f"Reaction failed: {e}")
Datamol includes built-in parallelization for many operations. Use n_jobs parameter:
n_jobs=1: Sequential (no parallelization)
n_jobs=-1: Use all available CPU cores
n_jobs=4: Use 4 cores
Functions supporting parallelization:
dm.read_sdf(..., n_jobs=-1)
dm.descriptors.batch_compute_many_descriptors(..., n_jobs=-1)
dm.cluster_mols(..., n_jobs=-1)
dm.pdist(..., n_jobs=-1)
dm.conformers.sasa(..., n_jobs=-1)
Progress bars: Many batch operations support the progress=True parameter.
import datamol as dm
import pandas as pd
# 1. Load molecules
df = dm.read_sdf("compounds.sdf")
# 2. Standardize
df['mol'] = df['mol'].apply(lambda m: dm.standardize_mol(m) if m else None)
df = df[df['mol'].notna()] # Remove failed molecules
# 3. Compute descriptors
desc_df = dm.descriptors.batch_compute_many_descriptors(
df['mol'].tolist(),
n_jobs=-1,
progress=True
)
# 4. Filter by drug-likeness
druglike = (
(desc_df['mw'] <= 500) &
(desc_df['logp'] <= 5) &
(desc_df['hbd'] <= 5) &
(desc_df['hba'] <= 10)
)
filtered_df = df[druglike]
# 5. Cluster and select diverse subset
diverse_mols = dm.pick_diverse(
filtered_df['mol'].tolist(),
npick=100
)
# 6. Visualize results
dm.viz.to_image(
diverse_mols,
legends=[dm.to_smiles(m) for m in diverse_mols],
outfile="diverse_compounds.png",
n_cols=10
)
# Group by scaffold
scaffolds = [dm.to_scaffold_murcko(mol) for mol in mols]
scaffold_smiles = [dm.to_smiles(s) for s in scaffolds]
# Create DataFrame with activities
sar_df = pd.DataFrame({
'mol': mols,
'scaffold': scaffold_smiles,
'activity': activities # User-provided activity data
})
# Analyze each scaffold series
for scaffold, group in sar_df.groupby('scaffold'):
if len(group) >= 3: # Need multiple examples
print(f"\nScaffold: {scaffold}")
print(f"Count: {len(group)}")
print(f"Activity range: {group['activity'].min():.2f} - {group['activity'].max():.2f}")
# Visualize with activities as legends
dm.viz.to_image(
group['mol'].tolist(),
legends=[f"Activity: {act:.2f}" for act in group['activity']],
align=True # Align by common substructure
)
# 1. Generate fingerprints for query and library
query_fps = [dm.to_fp(mol) for mol in query_actives]
library_fps = [dm.to_fp(mol) for mol in library_mols]
# 2. Calculate similarities
from scipy.spatial.distance import cdist
import numpy as np
distances = dm.cdist(query_actives, library_mols, n_jobs=-1)
# 3. Find closest matches (min distance to any query)
min_distances = distances.min(axis=0)
similarities = 1 - min_distances # Convert distance to similarity
# 4. Rank and select top hits
top_indices = np.argsort(similarities)[::-1][:100] # Top 100
top_hits = [library_mols[i] for i in top_indices]
top_scores = [similarities[i] for i in top_indices]
# 5. Visualize hits
dm.viz.to_image(
top_hits[:20],
legends=[f"Sim: {score:.3f}" for score in top_scores[:20]],
outfile="screening_hits.png"
)
For detailed API documentation, consult these reference files:
references/core_api.md: Core namespace functions (conversions, standardization, fingerprints, clustering)
references/io_module.md: File I/O operations (read/write SDF, CSV, Excel, remote files)
references/conformers_module.md: 3D conformer generation, clustering, SASA calculations
references/descriptors_viz.md: Molecular descriptors and visualization functions
references/fragments_scaffolds.md: Scaffold extraction, BRICS/RECAP fragmentation
references/reactions_data.md: Chemical reactions and toy datasets
Always standardize molecules from external sources:
mol = dm.standardize_mol(mol, disconnect_metals=True, normalize=True, reionize=True)
Check for None values after molecule parsing:
mol = dm.to_mol(smiles)
if mol is None:
# Handle invalid SMILES
Use parallel processing for large datasets:
result = dm.operation(..., n_jobs=-1, progress=True)
Leverage fsspec for cloud storage:
df = dm.read_sdf("s3://bucket/compounds.sdf")
Use appropriate fingerprints for similarity:
Consider scale limitations :
# Safe molecule creation
def safe_to_mol(smiles):
try:
mol = dm.to_mol(smiles)
if mol is not None:
mol = dm.standardize_mol(mol)
return mol
except Exception as e:
print(f"Failed to process {smiles}: {e}")
return None
# Safe batch processing
valid_mols = []
for smiles in smiles_list:
mol = safe_to_mol(smiles)
if mol is not None:
valid_mols.append(mol)
# Feature generation
X = np.array([dm.to_fp(mol) for mol in mols])
# Or descriptors
desc_df = dm.descriptors.batch_compute_many_descriptors(mols, n_jobs=-1)
X = desc_df.values
# Train model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, y_target)
# Predict
predictions = model.predict(X_test)
Issue: Molecule parsing fails
Solution: Use dm.standardize_smiles() first or try dm.fix_mol()
Issue: Memory errors with clustering
Solution: Use dm.pick_diverse() instead of full clustering for large sets
Issue: Slow conformer generation
Solution: Reduce n_confs or increase rms_cutoff to generate fewer conformers
Issue: Remote file access fails
Weekly Installs
132
Repository
GitHub Stars
23.4K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass
Socket: Pass
Snyk: Warn
Installed on
claude-code: 116
opencode: 108
gemini-cli: 103
cursor: 103
antigravity: 99
codex: 91
FastAPI官方技能:Python Web开发最佳实践与CLI工具使用指南
1,000 周安装
AI 驱动的系统性文献综述写作工具 - 支持 LaTeX/BibTeX 输出,提升学术研究效率
358 周安装
AiCoin Market 加密货币市场数据工具包 - 200+交易所实时价格、K线、空投、新闻
360 周安装
Apollo Federation 模式编写指南:Federation 2 核心指令与最佳实践
359 周安装
如何将研究论文转化为交互式marimo笔记本 - 实现论文复现与可视化
362 周安装
赋能产品团队框架:从功能交付到问题解决的产品管理方法论
377 周安装
Firebase Hosting 入门指南:快速部署静态网站和动态内容托管教程
360 周安装
Scaffold splitting for ML : Ensure proper train/test separation by scaffold
Align molecules when visualizing SAR series