重要前提
安装AI Skills的关键前提是:必须科学上网,且开启TUN模式,这一点至关重要,直接决定安装能否顺利完成,在此郑重提醒三遍:科学上网,科学上网,科学上网。查看完整安装教程 →
rdkit by k-dense-ai/claude-scientific-skills
npx skills add https://github.com/k-dense-ai/claude-scientific-skills --skill rdkit
RDKit 是一个全面的化学信息学库,为分子分析和操作提供 Python API。本技能提供了读取/写入分子结构、计算描述符、指纹生成、子结构搜索、化学反应、2D/3D 坐标生成和分子可视化的指导。将此技能用于药物发现、计算化学和化学信息学研究任务。
读取分子:
从各种格式读取分子结构:
from rdkit import Chem
# 从 SMILES 字符串
mol = Chem.MolFromSmiles('Cc1ccccc1') # 返回 Mol 对象或 None
# 从 MOL 文件
mol = Chem.MolFromMolFile('path/to/file.mol')
# 从 MOL 块(字符串数据)
mol = Chem.MolFromMolBlock(mol_block_string)
# 从 InChI
mol = Chem.MolFromInchi('InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H')
写入分子:
将分子转换为文本表示:
# 转换为规范 SMILES
smiles = Chem.MolToSmiles(mol)
# 转换为 MOL 块
mol_block = Chem.MolToMolBlock(mol)
# 转换为 InChI
inchi = Chem.MolToInchi(mol)
批量处理:
处理多个分子时,使用 Supplier/Writer 对象:
# 读取 SDF 文件
suppl = Chem.SDMolSupplier('molecules.sdf')
for mol in suppl:
if mol is not None: # 检查解析错误
# 处理分子
pass
# 读取 SMILES 文件
suppl = Chem.SmilesMolSupplier('molecules.smi', titleLine=False)
# For large files or compressed data
import gzip

# gzip.open defaults to binary mode, which is what the supplier reads from.
with gzip.open('molecules.sdf.gz') as f:
    suppl = Chem.ForwardSDMolSupplier(f)
    # Iterate inside the `with` block: the supplier streams from the open handle.
    for mol in suppl:
        if mol is None:  # Check for parsing errors
            continue
        # Process molecule
        pass
# 大数据集的多线程处理
suppl = Chem.MultithreadedSDMolSupplier('molecules.sdf')
# 将分子写入 SDF
writer = Chem.SDWriter('output.sdf')
for mol in molecules:
writer.write(mol)
writer.close()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
重要注意事项:
MolFrom* 函数在失败时返回 None 并显示错误消息。在处理分子之前,请始终检查返回值是否为 None。
RDKit 在解析过程中自动清理分子,执行包括化合价检查、芳香性感知和手性分配在内的 13 个步骤。
清理控制:
# 禁用自动清理
mol = Chem.MolFromSmiles('C1=CC=CC=C1', sanitize=False)
# 手动清理
Chem.SanitizeMol(mol)
# 在清理前检测问题
problems = Chem.DetectChemistryProblems(mol)
for problem in problems:
print(problem.GetType(), problem.Message())
# Partial sanitization (skip specific steps).
# The SANITIZE_* flags are attributes of Chem, so no extra import is needed.
Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES)
常见清理问题:
访问分子结构:
# 迭代原子和键
for atom in mol.GetAtoms():
print(atom.GetSymbol(), atom.GetIdx(), atom.GetDegree())
for bond in mol.GetBonds():
print(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType())
# 环信息
ring_info = mol.GetRingInfo()
ring_info.NumRings()
ring_info.AtomRings() # 返回原子索引的元组
# 检查原子是否在环中
atom = mol.GetAtomWithIdx(0)
atom.IsInRing()
atom.IsInRingSize(6) # 检查六元环
# 查找最小环的最小集合(SSSR)
from rdkit.Chem import GetSymmSSSR
rings = GetSymmSSSR(mol)
立体化学:
# 查找手性中心
from rdkit.Chem import FindMolChiralCenters
chiral_centers = FindMolChiralCenters(mol, includeUnassigned=True)
# 返回(原子索引,手性)元组列表
# 从 3D 坐标分配立体化学
from rdkit.Chem import AssignStereochemistryFrom3D
AssignStereochemistryFrom3D(mol)
# 检查键立体化学
bond = mol.GetBondWithIdx(0)
stereo = bond.GetStereo() # STEREONONE、STEREOZ、STEREOE 等
片段分析:
# 获取不连接的片段
frags = Chem.GetMolFrags(mol, asMols=True)
# 在特定键上断裂
from rdkit.Chem import FragmentOnBonds
frag_mol = FragmentOnBonds(mol, [bond_idx1, bond_idx2])
# Get the molecule's Murcko scaffold
from rdkit.Chem.Scaffolds import MurckoScaffold
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
基本描述符:
from rdkit.Chem import Descriptors
# 分子量
mw = Descriptors.MolWt(mol)
exact_mw = Descriptors.ExactMolWt(mol)
# LogP(亲脂性)
logp = Descriptors.MolLogP(mol)
# 拓扑极性表面积
tpsa = Descriptors.TPSA(mol)
# 氢键供体/受体数量
hbd = Descriptors.NumHDonors(mol)
hba = Descriptors.NumHAcceptors(mol)
# 可旋转键数量
rot_bonds = Descriptors.NumRotatableBonds(mol)
# 芳香环数量
aromatic_rings = Descriptors.NumAromaticRings(mol)
批量描述符计算:
# 一次性计算所有描述符
all_descriptors = Descriptors.CalcMolDescriptors(mol)
# 返回字典:{'MolWt': 180.16, 'MolLogP': 1.23, ...}
# 获取可用描述符名称列表
descriptor_names = [desc[0] for desc in Descriptors._descList]
Lipinski 五规则:
# 检查类药性
mw = Descriptors.MolWt(mol) <= 500
logp = Descriptors.MolLogP(mol) <= 5
hbd = Descriptors.NumHDonors(mol) <= 5
hba = Descriptors.NumHAcceptors(mol) <= 10
is_drug_like = mw and logp and hbd and hba
指纹类型:
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import MACCSkeys
# RDKit 拓扑指纹
rdk_gen = rdFingerprintGenerator.GetRDKitFPGenerator(minPath=1, maxPath=7, fpSize=2048)
fp = rdk_gen.GetFingerprint(mol)
# Morgan 指纹(圆形指纹,类似于 ECFP)
# 使用 rdFingerprintGenerator 的现代 API
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fp = morgan_gen.GetFingerprint(mol)
# 计数指纹
fp_count = morgan_gen.GetCountFingerprint(mol)
# MACCS 键(166 位结构键)
fp = MACCSkeys.GenMACCSKeys(mol)
# 原子对指纹
ap_gen = rdFingerprintGenerator.GetAtomPairGenerator()
fp = ap_gen.GetFingerprint(mol)
# 拓扑扭转指纹
tt_gen = rdFingerprintGenerator.GetTopologicalTorsionGenerator()
fp = tt_gen.GetFingerprint(mol)
# Avalon 指纹(如果可用)
from rdkit.Avalon import pyAvalonTools
fp = pyAvalonTools.GetAvalonFP(mol)
相似性计算:
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
# 使用生成器生成指纹
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fp1 = mfpgen.GetFingerprint(mol1)
fp2 = mfpgen.GetFingerprint(mol2)
# 计算 Tanimoto 相似性
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
# 计算多个分子的相似性
fps = [mfpgen.GetFingerprint(m) for m in [mol2, mol3, mol4]]
similarities = DataStructs.BulkTanimotoSimilarity(fp1, fps)
# 其他相似性指标
dice = DataStructs.DiceSimilarity(fp1, fp2)
cosine = DataStructs.CosineSimilarity(fp1, fp2)
聚类与多样性:
# 基于指纹相似性的 Butina 聚类
from rdkit.ML.Cluster import Butina
# 计算距离矩阵
dists = []
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fps = [mfpgen.GetFingerprint(mol) for mol in mols]
for i in range(len(fps)):
sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([1-sim for sim in sims])
# 使用距离截止值进行聚类
clusters = Butina.ClusterData(dists, len(fps), distThresh=0.3, isDistData=True)
基本子结构匹配:
# 使用 SMARTS 定义查询
query = Chem.MolFromSmarts('[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1') # 苯环
# 检查分子是否包含子结构
has_match = mol.HasSubstructMatch(query)
# 获取所有匹配(返回原子索引的元组元组)
matches = mol.GetSubstructMatches(query)
# 仅获取第一个匹配
match = mol.GetSubstructMatch(query)
常见 SMARTS 模式:
# 伯醇
primary_alcohol = Chem.MolFromSmarts('[CH2][OH1]')
# 羧酸
carboxylic_acid = Chem.MolFromSmarts('C(=O)[OH]')
# 酰胺
amide = Chem.MolFromSmarts('C(=O)N')
# 芳香杂环
aromatic_n = Chem.MolFromSmarts('[nR]') # 环中的芳香氮
# 大环(>12 个原子的环)
macrocycle = Chem.MolFromSmarts('[r{12-}]')
匹配规则:
反应 SMARTS:
from rdkit.Chem import AllChem
# 使用 SMARTS 定义反应:反应物 >> 产物
rxn = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1][O:2]') # 酮还原
# 将反应应用于分子
reactants = (mol1,)
products = rxn.RunReactants(reactants)
# 产物是元组的元组(每个产物集一个元组)
for product_set in products:
for product in product_set:
# 清理产物
Chem.SanitizeMol(product)
反应特性:
反应相似性:
# Generate one difference fingerprint per reaction being compared
fp1 = AllChem.CreateDifferenceFingerprintForReaction(rxn1)
fp2 = AllChem.CreateDifferenceFingerprintForReaction(rxn2)
# Compare reactions
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
2D 坐标生成:
from rdkit.Chem import AllChem
# 为描绘生成 2D 坐标
AllChem.Compute2DCoords(mol)
# 将分子与模板结构对齐
template = Chem.MolFromSmiles('c1ccccc1')
AllChem.Compute2DCoords(template)
AllChem.GenerateDepictionMatching2DStructure(mol, template)
3D 坐标生成与构象:
# 使用 ETKDG 生成单个 3D 构象
AllChem.EmbedMolecule(mol, randomSeed=42)
# 生成多个构象
conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=10, randomSeed=42)
# 使用力场优化几何结构
AllChem.UFFOptimizeMolecule(mol) # UFF 力场
AllChem.MMFFOptimizeMolecule(mol) # MMFF94 力场
# 优化所有构象
for conf_id in conf_ids:
AllChem.MMFFOptimizeMolecule(mol, confId=conf_id)
# 计算构象之间的 RMSD
from rdkit.Chem import AllChem
rms = AllChem.GetConformerRMS(mol, conf_id1, conf_id2)
# 对齐分子
AllChem.AlignMol(probe_mol, ref_mol)
约束嵌入:
# 将部分分子约束到特定坐标进行嵌入
AllChem.ConstrainedEmbed(mol, core_mol)
基本绘制:
from rdkit.Chem import Draw
# 将单个分子绘制为 PIL 图像
img = Draw.MolToImage(mol, size=(300, 300))
img.save('molecule.png')
# 直接绘制到文件
Draw.MolToFile(mol, 'molecule.png')
# 在网格中绘制多个分子
mols = [mol1, mol2, mol3, mol4]
img = Draw.MolsToGridImage(mols, molsPerRow=2, subImgSize=(200, 200))
高亮显示子结构:
# 高亮显示子结构匹配
query = Chem.MolFromSmarts('c1ccccc1')
match = mol.GetSubstructMatch(query)
img = Draw.MolToImage(mol, highlightAtoms=match)
# 自定义高亮颜色
highlight_colors = {atom_idx: (1, 0, 0) for atom_idx in match} # 红色
img = Draw.MolToImage(mol, highlightAtoms=match,
highlightAtomColors=highlight_colors)
自定义可视化:
from rdkit.Chem.Draw import rdMolDraw2D
# 使用自定义选项创建绘图器
drawer = rdMolDraw2D.MolDraw2DCairo(300, 300)
opts = drawer.drawOptions()
# 自定义选项
opts.addAtomIndices = True
opts.addStereoAnnotation = True
opts.bondLineWidth = 2
# 绘制分子
drawer.DrawMolecule(mol)
drawer.FinishDrawing()
# 保存到文件
with open('molecule.png', 'wb') as f:
f.write(drawer.GetDrawingText())
Jupyter Notebook 集成:
# 在 Jupyter 中启用内联显示
from rdkit.Chem.Draw import IPythonConsole
# 自定义默认显示
IPythonConsole.ipython_useSVG = True # 使用 SVG 代替 PNG
IPythonConsole.molSize = (300, 300) # 默认大小
# 分子现在自动显示
mol # 显示分子图像
可视化指纹位:
# 显示指纹位代表的分子特征
from rdkit.Chem import Draw
# 对于 Morgan 指纹
bit_info = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bit_info)
# 绘制特定位的环境
img = Draw.DrawMorganBit(mol, bit_id, bit_info)
添加/移除氢原子:
# 添加显式氢原子
mol_h = Chem.AddHs(mol)
# 移除显式氢原子
mol = Chem.RemoveHs(mol_h)
凯库勒化与芳香性:
# 将芳香键转换为交替的单/双键
Chem.Kekulize(mol)
# 设置芳香性
Chem.SetAromaticity(mol)
替换子结构:
# 将子结构替换为另一个结构
query = Chem.MolFromSmarts('c1ccccc1') # 苯
replacement = Chem.MolFromSmiles('C1CCCCC1') # 环己烷
new_mol = Chem.ReplaceSubstructs(mol, query, replacement)[0]
中和电荷:
# 通过添加/移除氢原子移除形式电荷
from rdkit.Chem.MolStandardize import rdMolStandardize
# 使用 Uncharger
uncharger = rdMolStandardize.Uncharger()
mol_neutral = uncharger.uncharge(mol)
分子哈希:
from rdkit.Chem import rdMolHash
# 生成 Murcko 骨架哈希
scaffold_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.MurckoScaffold)
# 规范 SMILES 哈希
canonical_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.CanonicalSmiles)
# 区域异构体哈希(忽略立体化学)
regio_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.Regioisomer)
随机化 SMILES:
# 生成随机 SMILES 表示(用于数据增强)
from rdkit.Chem import MolToRandomSmilesVect
random_smiles = MolToRandomSmilesVect(mol, numSmiles=10, randomSeed=42)
药效团特征:
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os
# 加载特征工厂
fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdef_path)
# 获取药效团特征
features = factory.GetFeaturesForMol(mol)
for feat in features:
print(feat.GetFamily(), feat.GetType(), feat.GetAtomIds())
from rdkit import Chem
from rdkit.Chem import Descriptors
def analyze_druglikeness(smiles):
    """Compute Lipinski-related descriptors for a SMILES string.

    Returns ``None`` when the SMILES cannot be parsed. Otherwise returns a
    dict with the keys ``MW``, ``LogP``, ``HBD``, ``HBA``, ``TPSA`` and
    ``RotBonds``, plus a boolean ``Lipinski`` entry that is true only when
    MW <= 500, LogP <= 5, HBD <= 5 and HBA <= 10 all hold.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    # Descriptor values, stored under the report keys.
    report = {}
    report['MW'] = Descriptors.MolWt(parsed)
    report['LogP'] = Descriptors.MolLogP(parsed)
    report['HBD'] = Descriptors.NumHDonors(parsed)
    report['HBA'] = Descriptors.NumHAcceptors(parsed)
    report['TPSA'] = Descriptors.TPSA(parsed)
    report['RotBonds'] = Descriptors.NumRotatableBonds(parsed)

    # Rule-of-five limits: every criterion must be satisfied.
    criteria = [
        report['MW'] <= 500,
        report['LogP'] <= 5,
        report['HBD'] <= 5,
        report['HBA'] <= 10,
    ]
    report['Lipinski'] = all(criteria)
    return report
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
def similarity_screen(query_smiles, database_smiles, threshold=0.7):
    """Rank database molecules by Tanimoto similarity to a query molecule.

    Args:
        query_smiles: SMILES string of the query molecule.
        database_smiles: Iterable of SMILES strings to screen.
        threshold: Minimum Tanimoto similarity for a molecule to be reported.

    Returns:
        A list of ``(index, smiles, similarity)`` tuples for every database
        entry whose similarity is at least ``threshold``, sorted by
        similarity in descending order. Database entries that fail to parse
        are skipped.

    Raises:
        ValueError: If ``query_smiles`` cannot be parsed.
    """
    from rdkit.Chem import rdFingerprintGenerator

    query_mol = Chem.MolFromSmiles(query_smiles)
    if query_mol is None:
        # Fail early with a clear message instead of passing None onward.
        raise ValueError(f"Failed to parse query SMILES: {query_smiles!r}")

    # One generator is reused for the query and every database molecule.
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    query_fp = morgan_gen.GetFingerprint(query_mol)

    hits = []
    for idx, smiles in enumerate(database_smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        sim = DataStructs.TanimotoSimilarity(query_fp, morgan_gen.GetFingerprint(mol))
        if sim >= threshold:
            hits.append((idx, smiles, sim))
    return sorted(hits, key=lambda hit: hit[2], reverse=True)
from rdkit import Chem
def filter_by_substructure(smiles_list, pattern_smarts):
    """Return the SMILES strings whose molecules contain a SMARTS pattern.

    Args:
        smiles_list: Iterable of SMILES strings to filter.
        pattern_smarts: SMARTS pattern describing the required substructure.

    Returns:
        A list of the input SMILES, in input order, that parse successfully
        and match the pattern. Unparseable SMILES are skipped.

    Raises:
        ValueError: If ``pattern_smarts`` cannot be parsed.
    """
    query = Chem.MolFromSmarts(pattern_smarts)
    if query is None:
        # Fail early with a clear message instead of matching against None.
        raise ValueError(f"Failed to parse SMARTS pattern: {pattern_smarts!r}")

    hits = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None and mol.HasSubstructMatch(query):
            hits.append(smiles)
    return hits
解析分子时始终检查是否为 None:
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Report the bad input and move on to the next SMILES.
        print(f"Failed to parse: {smiles}")
        continue
    # Process the successfully parsed molecule here
使用二进制格式进行存储:
import pickle
# 将分子 pickle 化以便快速加载
with open('molecules.pkl', 'wb') as f:
pickle.dump(mols, f)
# 加载 pickle 化的分子(比重新解析快得多)
with open('molecules.pkl', 'rb') as f:
mols = pickle.load(f)
使用批量操作:
# 一次性计算所有分子的指纹
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
# 使用批量相似性计算
similarities = DataStructs.BulkTanimotoSimilarity(fps[0], fps[1:])
RDKit 操作通常在以下情况下是线程安全的:
非线程安全: 并发访问时的 MolSuppliers。
对于大型数据集:
# Use ForwardSDMolSupplier to avoid loading the entire file.
# The supplier needs a binary file object, so open the file with 'rb'.
with open('large.sdf', 'rb') as f:
    suppl = Chem.ForwardSDMolSupplier(f)
    for mol in suppl:
        # Process one molecule at a time
        pass
# Use MultithreadedSDMolSupplier for parallel processing
suppl = Chem.MultithreadedSDMolSupplier('large.sdf', numWriterThreads=4)
使用 DetectChemistryProblems() 进行调试;计算依赖氢原子的性质时使用 AddHs()。
本技能包含详细的 API 参考文档:
api_reference.md - 按功能组织的 RDKit 模块、函数和类的全面列表
descriptors_reference.md - 可用分子描述符的完整列表及描述
smarts_patterns.md - 官能团和结构特征的常见 SMARTS 模式
需要特定 API 详细信息、参数信息或模式示例时,请加载这些参考文档。
常见 RDKit 工作流程的示例脚本:
molecular_properties.py - 计算全面的分子属性和描述符
similarity_search.py - 执行基于指纹的相似性筛选
substructure_filter.py - 按子结构模式过滤分子
这些脚本可以直接执行或用作自定义工作流程的模板。
每周安装次数
55
代码仓库
GitHub 星标数
17.3K
首次出现
2026年1月20日
安全审计
安装于
opencode48
gemini-cli47
codex47
cursor45
claude-code44
github-copilot43
RDKit is a comprehensive cheminformatics library providing Python APIs for molecular analysis and manipulation. This skill provides guidance for reading/writing molecular structures, calculating descriptors, fingerprinting, substructure searching, chemical reactions, 2D/3D coordinate generation, and molecular visualization. Use this skill for drug discovery, computational chemistry, and cheminformatics research tasks.
Reading Molecules:
Read molecular structures from various formats:
from rdkit import Chem
# From SMILES strings
mol = Chem.MolFromSmiles('Cc1ccccc1') # Returns Mol object or None
# From MOL files
mol = Chem.MolFromMolFile('path/to/file.mol')
# From MOL blocks (string data)
mol = Chem.MolFromMolBlock(mol_block_string)
# From InChI
mol = Chem.MolFromInchi('InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H')
Writing Molecules:
Convert molecules to text representations:
# To canonical SMILES
smiles = Chem.MolToSmiles(mol)
# To MOL block
mol_block = Chem.MolToMolBlock(mol)
# To InChI
inchi = Chem.MolToInchi(mol)
Batch Processing:
For processing multiple molecules, use Supplier/Writer objects:
# Read SDF files
suppl = Chem.SDMolSupplier('molecules.sdf')
for mol in suppl:
if mol is not None: # Check for parsing errors
# Process molecule
pass
# Read SMILES files
suppl = Chem.SmilesMolSupplier('molecules.smi', titleLine=False)
# For large files or compressed data
import gzip

# gzip.open defaults to binary mode, which is what the supplier reads from.
with gzip.open('molecules.sdf.gz') as f:
    suppl = Chem.ForwardSDMolSupplier(f)
    # Iterate inside the `with` block: the supplier streams from the open handle.
    for mol in suppl:
        if mol is None:  # Check for parsing errors
            continue
        # Process molecule
        pass
# Multithreaded processing for large datasets
suppl = Chem.MultithreadedSDMolSupplier('molecules.sdf')
# Write molecules to SDF
writer = Chem.SDWriter('output.sdf')
for mol in molecules:
writer.write(mol)
writer.close()
Important Notes:
MolFrom* functions return None on failure with error messages. Always check for None before processing molecules.
RDKit automatically sanitizes molecules during parsing, executing 13 steps including valence checking, aromaticity perception, and chirality assignment.
Sanitization Control:
# Disable automatic sanitization
mol = Chem.MolFromSmiles('C1=CC=CC=C1', sanitize=False)
# Manual sanitization
Chem.SanitizeMol(mol)
# Detect problems before sanitization
problems = Chem.DetectChemistryProblems(mol)
for problem in problems:
print(problem.GetType(), problem.Message())
# Partial sanitization (skip specific steps).
# The SANITIZE_* flags are attributes of Chem, so no extra import is needed.
Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES)
Common Sanitization Issues:
Accessing Molecular Structure:
# Iterate atoms and bonds
for atom in mol.GetAtoms():
print(atom.GetSymbol(), atom.GetIdx(), atom.GetDegree())
for bond in mol.GetBonds():
print(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType())
# Ring information
ring_info = mol.GetRingInfo()
ring_info.NumRings()
ring_info.AtomRings() # Returns tuples of atom indices
# Check if atom is in ring
atom = mol.GetAtomWithIdx(0)
atom.IsInRing()
atom.IsInRingSize(6) # Check for 6-membered rings
# Find smallest set of smallest rings (SSSR)
from rdkit.Chem import GetSymmSSSR
rings = GetSymmSSSR(mol)
Stereochemistry:
# Find chiral centers
from rdkit.Chem import FindMolChiralCenters
chiral_centers = FindMolChiralCenters(mol, includeUnassigned=True)
# Returns list of (atom_idx, chirality) tuples
# Assign stereochemistry from 3D coordinates
from rdkit.Chem import AssignStereochemistryFrom3D
AssignStereochemistryFrom3D(mol)
# Check bond stereochemistry
bond = mol.GetBondWithIdx(0)
stereo = bond.GetStereo() # STEREONONE, STEREOZ, STEREOE, etc.
Fragment Analysis:
# Get disconnected fragments
frags = Chem.GetMolFrags(mol, asMols=True)
# Fragment on specific bonds
from rdkit.Chem import FragmentOnBonds
frag_mol = FragmentOnBonds(mol, [bond_idx1, bond_idx2])
# Get the molecule's Murcko scaffold
from rdkit.Chem.Scaffolds import MurckoScaffold
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
Basic Descriptors:
from rdkit.Chem import Descriptors
# Molecular weight
mw = Descriptors.MolWt(mol)
exact_mw = Descriptors.ExactMolWt(mol)
# LogP (lipophilicity)
logp = Descriptors.MolLogP(mol)
# Topological polar surface area
tpsa = Descriptors.TPSA(mol)
# Number of hydrogen bond donors/acceptors
hbd = Descriptors.NumHDonors(mol)
hba = Descriptors.NumHAcceptors(mol)
# Number of rotatable bonds
rot_bonds = Descriptors.NumRotatableBonds(mol)
# Number of aromatic rings
aromatic_rings = Descriptors.NumAromaticRings(mol)
Batch Descriptor Calculation:
# Calculate all descriptors at once
all_descriptors = Descriptors.CalcMolDescriptors(mol)
# Returns dictionary: {'MolWt': 180.16, 'MolLogP': 1.23, ...}
# Get list of available descriptor names
descriptor_names = [desc[0] for desc in Descriptors._descList]
Lipinski's Rule of Five:
# Check drug-likeness
mw = Descriptors.MolWt(mol) <= 500
logp = Descriptors.MolLogP(mol) <= 5
hbd = Descriptors.NumHDonors(mol) <= 5
hba = Descriptors.NumHAcceptors(mol) <= 10
is_drug_like = mw and logp and hbd and hba
Fingerprint Types:
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import MACCSkeys
# RDKit topological fingerprint
rdk_gen = rdFingerprintGenerator.GetRDKitFPGenerator(minPath=1, maxPath=7, fpSize=2048)
fp = rdk_gen.GetFingerprint(mol)
# Morgan fingerprints (circular fingerprints, similar to ECFP)
# Modern API using rdFingerprintGenerator
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fp = morgan_gen.GetFingerprint(mol)
# Count-based fingerprint
fp_count = morgan_gen.GetCountFingerprint(mol)
# MACCS keys (166-bit structural key)
fp = MACCSkeys.GenMACCSKeys(mol)
# Atom pair fingerprints
ap_gen = rdFingerprintGenerator.GetAtomPairGenerator()
fp = ap_gen.GetFingerprint(mol)
# Topological torsion fingerprints
tt_gen = rdFingerprintGenerator.GetTopologicalTorsionGenerator()
fp = tt_gen.GetFingerprint(mol)
# Avalon fingerprints (if available)
from rdkit.Avalon import pyAvalonTools
fp = pyAvalonTools.GetAvalonFP(mol)
Similarity Calculation:
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
# Generate fingerprints using generator
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fp1 = mfpgen.GetFingerprint(mol1)
fp2 = mfpgen.GetFingerprint(mol2)
# Calculate Tanimoto similarity
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
# Calculate similarity for multiple molecules
fps = [mfpgen.GetFingerprint(m) for m in [mol2, mol3, mol4]]
similarities = DataStructs.BulkTanimotoSimilarity(fp1, fps)
# Other similarity metrics
dice = DataStructs.DiceSimilarity(fp1, fp2)
cosine = DataStructs.CosineSimilarity(fp1, fp2)
Clustering and Diversity:
# Butina clustering based on fingerprint similarity
from rdkit.ML.Cluster import Butina
# Calculate distance matrix
dists = []
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fps = [mfpgen.GetFingerprint(mol) for mol in mols]
for i in range(len(fps)):
sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([1-sim for sim in sims])
# Cluster with distance cutoff
clusters = Butina.ClusterData(dists, len(fps), distThresh=0.3, isDistData=True)
Basic Substructure Matching:
# Define query using SMARTS
query = Chem.MolFromSmarts('[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1') # Benzene ring
# Check if molecule contains substructure
has_match = mol.HasSubstructMatch(query)
# Get all matches (returns tuple of tuples with atom indices)
matches = mol.GetSubstructMatches(query)
# Get only first match
match = mol.GetSubstructMatch(query)
Common SMARTS Patterns:
# Primary alcohols
primary_alcohol = Chem.MolFromSmarts('[CH2][OH1]')
# Carboxylic acids
carboxylic_acid = Chem.MolFromSmarts('C(=O)[OH]')
# Amides
amide = Chem.MolFromSmarts('C(=O)N')
# Aromatic heterocycles
aromatic_n = Chem.MolFromSmarts('[nR]') # Aromatic nitrogen in ring
# Macrocycles (rings > 12 atoms)
macrocycle = Chem.MolFromSmarts('[r{12-}]')
Matching Rules:
Reaction SMARTS:
from rdkit.Chem import AllChem
# Define reaction using SMARTS: reactants >> products
rxn = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1][O:2]') # Ketone reduction
# Apply reaction to molecules
reactants = (mol1,)
products = rxn.RunReactants(reactants)
# Products is tuple of tuples (one tuple per product set)
for product_set in products:
for product in product_set:
# Sanitize product
Chem.SanitizeMol(product)
Reaction Features:
Reaction Similarity:
# Generate one difference fingerprint per reaction being compared
fp1 = AllChem.CreateDifferenceFingerprintForReaction(rxn1)
fp2 = AllChem.CreateDifferenceFingerprintForReaction(rxn2)
# Compare reactions
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
2D Coordinate Generation:
from rdkit.Chem import AllChem
# Generate 2D coordinates for depiction
AllChem.Compute2DCoords(mol)
# Align molecule to template structure
template = Chem.MolFromSmiles('c1ccccc1')
AllChem.Compute2DCoords(template)
AllChem.GenerateDepictionMatching2DStructure(mol, template)
3D Coordinate Generation and Conformers:
# Generate single 3D conformer using ETKDG
AllChem.EmbedMolecule(mol, randomSeed=42)
# Generate multiple conformers
conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=10, randomSeed=42)
# Optimize geometry with force field
AllChem.UFFOptimizeMolecule(mol) # UFF force field
AllChem.MMFFOptimizeMolecule(mol) # MMFF94 force field
# Optimize all conformers
for conf_id in conf_ids:
AllChem.MMFFOptimizeMolecule(mol, confId=conf_id)
# Calculate RMSD between conformers
from rdkit.Chem import AllChem
rms = AllChem.GetConformerRMS(mol, conf_id1, conf_id2)
# Align molecules
AllChem.AlignMol(probe_mol, ref_mol)
Constrained Embedding:
# Embed with part of molecule constrained to specific coordinates
AllChem.ConstrainedEmbed(mol, core_mol)
Basic Drawing:
from rdkit.Chem import Draw
# Draw single molecule to PIL image
img = Draw.MolToImage(mol, size=(300, 300))
img.save('molecule.png')
# Draw to file directly
Draw.MolToFile(mol, 'molecule.png')
# Draw multiple molecules in grid
mols = [mol1, mol2, mol3, mol4]
img = Draw.MolsToGridImage(mols, molsPerRow=2, subImgSize=(200, 200))
Highlighting Substructures:
# Highlight substructure match
query = Chem.MolFromSmarts('c1ccccc1')
match = mol.GetSubstructMatch(query)
img = Draw.MolToImage(mol, highlightAtoms=match)
# Custom highlight colors
highlight_colors = {atom_idx: (1, 0, 0) for atom_idx in match} # Red
img = Draw.MolToImage(mol, highlightAtoms=match,
highlightAtomColors=highlight_colors)
Customizing Visualization:
from rdkit.Chem.Draw import rdMolDraw2D
# Create drawer with custom options
drawer = rdMolDraw2D.MolDraw2DCairo(300, 300)
opts = drawer.drawOptions()
# Customize options
opts.addAtomIndices = True
opts.addStereoAnnotation = True
opts.bondLineWidth = 2
# Draw molecule
drawer.DrawMolecule(mol)
drawer.FinishDrawing()
# Save to file
with open('molecule.png', 'wb') as f:
f.write(drawer.GetDrawingText())
Jupyter Notebook Integration:
# Enable inline display in Jupyter
from rdkit.Chem.Draw import IPythonConsole
# Customize default display
IPythonConsole.ipython_useSVG = True # Use SVG instead of PNG
IPythonConsole.molSize = (300, 300) # Default size
# Molecules now display automatically
mol # Shows molecule image
Visualizing Fingerprint Bits:
# Show what molecular features a fingerprint bit represents
from rdkit.Chem import Draw
# For Morgan fingerprints
bit_info = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bit_info)
# Draw environment for specific bit
img = Draw.DrawMorganBit(mol, bit_id, bit_info)
Adding/Removing Hydrogens:
# Add explicit hydrogens
mol_h = Chem.AddHs(mol)
# Remove explicit hydrogens
mol = Chem.RemoveHs(mol_h)
Kekulization and Aromaticity:
# Convert aromatic bonds to alternating single/double
Chem.Kekulize(mol)
# Set aromaticity
Chem.SetAromaticity(mol)
Replacing Substructures:
# Replace substructure with another structure
query = Chem.MolFromSmarts('c1ccccc1') # Benzene
replacement = Chem.MolFromSmiles('C1CCCCC1') # Cyclohexane
new_mol = Chem.ReplaceSubstructs(mol, query, replacement)[0]
Neutralizing Charges:
# Remove formal charges by adding/removing hydrogens
from rdkit.Chem.MolStandardize import rdMolStandardize
# Using Uncharger
uncharger = rdMolStandardize.Uncharger()
mol_neutral = uncharger.uncharge(mol)
Molecular Hashing:
from rdkit.Chem import rdMolHash
# Generate Murcko scaffold hash
scaffold_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.MurckoScaffold)
# Canonical SMILES hash
canonical_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.CanonicalSmiles)
# Regioisomer hash (ignores stereochemistry)
regio_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.Regioisomer)
Randomized SMILES:
# Generate random SMILES representations (for data augmentation)
from rdkit.Chem import MolToRandomSmilesVect
random_smiles = MolToRandomSmilesVect(mol, numSmiles=10, randomSeed=42)
Pharmacophore Features:
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os
# Load feature factory
fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdef_path)
# Get pharmacophore features
features = factory.GetFeaturesForMol(mol)
for feat in features:
print(feat.GetFamily(), feat.GetType(), feat.GetAtomIds())
from rdkit import Chem
from rdkit.Chem import Descriptors
def analyze_druglikeness(smiles):
    """Compute Lipinski-related descriptors for a SMILES string.

    Returns ``None`` when the SMILES cannot be parsed. Otherwise returns a
    dict with the keys ``MW``, ``LogP``, ``HBD``, ``HBA``, ``TPSA`` and
    ``RotBonds``, plus a boolean ``Lipinski`` entry that is true only when
    MW <= 500, LogP <= 5, HBD <= 5 and HBA <= 10 all hold.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    # Descriptor values, stored under the report keys.
    report = {}
    report['MW'] = Descriptors.MolWt(parsed)
    report['LogP'] = Descriptors.MolLogP(parsed)
    report['HBD'] = Descriptors.NumHDonors(parsed)
    report['HBA'] = Descriptors.NumHAcceptors(parsed)
    report['TPSA'] = Descriptors.TPSA(parsed)
    report['RotBonds'] = Descriptors.NumRotatableBonds(parsed)

    # Rule-of-five limits: every criterion must be satisfied.
    criteria = [
        report['MW'] <= 500,
        report['LogP'] <= 5,
        report['HBD'] <= 5,
        report['HBA'] <= 10,
    ]
    report['Lipinski'] = all(criteria)
    return report
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
def similarity_screen(query_smiles, database_smiles, threshold=0.7):
    """Rank database molecules by Tanimoto similarity to a query molecule.

    Args:
        query_smiles: SMILES string of the query molecule.
        database_smiles: Iterable of SMILES strings to screen.
        threshold: Minimum Tanimoto similarity for a molecule to be reported.

    Returns:
        A list of ``(index, smiles, similarity)`` tuples for every database
        entry whose similarity is at least ``threshold``, sorted by
        similarity in descending order. Database entries that fail to parse
        are skipped.

    Raises:
        ValueError: If ``query_smiles`` cannot be parsed.
    """
    from rdkit.Chem import rdFingerprintGenerator

    query_mol = Chem.MolFromSmiles(query_smiles)
    if query_mol is None:
        # Fail early with a clear message instead of passing None onward.
        raise ValueError(f"Failed to parse query SMILES: {query_smiles!r}")

    # One generator is reused for the query and every database molecule.
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    query_fp = morgan_gen.GetFingerprint(query_mol)

    hits = []
    for idx, smiles in enumerate(database_smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        sim = DataStructs.TanimotoSimilarity(query_fp, morgan_gen.GetFingerprint(mol))
        if sim >= threshold:
            hits.append((idx, smiles, sim))
    return sorted(hits, key=lambda hit: hit[2], reverse=True)
from rdkit import Chem
def filter_by_substructure(smiles_list, pattern_smarts):
    """Return the SMILES strings whose molecules contain a SMARTS pattern.

    Args:
        smiles_list: Iterable of SMILES strings to filter.
        pattern_smarts: SMARTS pattern describing the required substructure.

    Returns:
        A list of the input SMILES, in input order, that parse successfully
        and match the pattern. Unparseable SMILES are skipped.

    Raises:
        ValueError: If ``pattern_smarts`` cannot be parsed.
    """
    query = Chem.MolFromSmarts(pattern_smarts)
    if query is None:
        # Fail early with a clear message instead of matching against None.
        raise ValueError(f"Failed to parse SMARTS pattern: {pattern_smarts!r}")

    hits = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None and mol.HasSubstructMatch(query):
            hits.append(smiles)
    return hits
Always check for None when parsing molecules:
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Report the bad input and move on to the next SMILES.
        print(f"Failed to parse: {smiles}")
        continue
    # Process the successfully parsed molecule here
Use binary formats for storage:
import pickle
# Pickle molecules for fast loading
with open('molecules.pkl', 'wb') as f:
pickle.dump(mols, f)
# Load pickled molecules (much faster than reparsing)
with open('molecules.pkl', 'rb') as f:
mols = pickle.load(f)
Use bulk operations:
# Calculate fingerprints for all molecules at once
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
# Use bulk similarity calculations
similarities = DataStructs.BulkTanimotoSimilarity(fps[0], fps[1:])
RDKit operations are generally thread-safe for:
Not thread-safe: MolSuppliers when accessed concurrently.
For large datasets:
# Use ForwardSDMolSupplier to avoid loading the entire file.
# The supplier needs a binary file object, so open the file with 'rb'.
with open('large.sdf', 'rb') as f:
    suppl = Chem.ForwardSDMolSupplier(f)
    for mol in suppl:
        # Process one molecule at a time
        pass
# Use MultithreadedSDMolSupplier for parallel processing
suppl = Chem.MultithreadedSDMolSupplier('large.sdf', numWriterThreads=4)
Use DetectChemistryProblems() to debug. Use AddHs() when calculating properties that depend on hydrogen.
This skill includes detailed API reference documentation:
api_reference.md - Comprehensive listing of RDKit modules, functions, and classes organized by functionality
descriptors_reference.md - Complete list of available molecular descriptors with descriptions
smarts_patterns.md - Common SMARTS patterns for functional groups and structural features
Load these references when needing specific API details, parameter information, or pattern examples.
Example scripts for common RDKit workflows:
molecular_properties.py - Calculate comprehensive molecular properties and descriptors
similarity_search.py - Perform fingerprint-based similarity screening
substructure_filter.py - Filter molecules by substructure patterns
These scripts can be executed directly or used as templates for custom workflows.
Weekly Installs
55
Repository
GitHub Stars
17.3K
First Seen
Jan 20, 2026
Security Audits
Gen Agent Trust Hub: Warn · Socket: Pass · Snyk: Pass
Installed on
opencode48
gemini-cli47
codex47
cursor45
claude-code44
github-copilot43
marimo-batch:Python批处理任务神器,Pydantic声明式数据源与UI/CLI双模式
973 周安装
Graphite CLI 分支栈管理指南:堆叠式 PR 与基于主干开发工作流
63 周安装
prompt-xray:AI提示词生成与解析工具,提升Claude等AI开发效率
63 周安装
Zotero MCP 代码执行技能:Python 代码安全搜索 Zotero 文献库,避免崩溃,自动去重排序
45 周安装
GitHub Actions自动化与AI集群协调 - 智能CI/CD工作流与仓库管理
63 周安装
gogcli:Google Workspace命令行工具,高效管理Gmail、日历、Drive等
63 周安装
开发者成长分析技能:基于Claude Code聊天历史,提供个性化编码反馈与学习资源
63 周安装