rdkit by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill rdkit
RDKit 是一个全面的化学信息学库,为分子分析和操作提供 Python API。本技能提供读取/写入分子结构、计算描述符、指纹生成、子结构搜索、化学反应、2D/3D 坐标生成和分子可视化的指导。将此技能用于药物发现、计算化学和化学信息学研究任务。
读取分子:
从各种格式读取分子结构:
from rdkit import Chem
# 从 SMILES 字符串
mol = Chem.MolFromSmiles('Cc1ccccc1') # 返回 Mol 对象或 None
# 从 MOL 文件
mol = Chem.MolFromMolFile('path/to/file.mol')
# 从 MOL 块(字符串数据)
mol = Chem.MolFromMolBlock(mol_block_string)
# 从 InChI
mol = Chem.MolFromInchi('InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H')
写入分子:
将分子转换为文本表示:
# 转换为规范 SMILES
smiles = Chem.MolToSmiles(mol)
# 转换为 MOL 块
mol_block = Chem.MolToMolBlock(mol)
# 转换为 InChI
inchi = Chem.MolToInchi(mol)
批量处理:
处理多个分子时,使用 Supplier/Writer 对象:
# 读取 SDF 文件
suppl = Chem.SDMolSupplier('molecules.sdf')
for mol in suppl:
if mol is not None: # 检查解析错误
# 处理分子
pass
# 读取 SMILES 文件
suppl = Chem.SmilesMolSupplier('molecules.smi', titleLine=False)
# For large or gzip-compressed files, stream records instead of loading everything
import gzip

with gzip.open('molecules.sdf.gz') as f:  # gzip.open defaults to binary mode
    suppl = Chem.ForwardSDMolSupplier(f)
    # Iterate inside the `with` block so the file is still open while reading
    for mol in suppl:
        if mol is None:  # skip records that failed to parse
            continue
        # Process molecule
        pass
# 大数据集的多线程处理
suppl = Chem.MultithreadedSDMolSupplier('molecules.sdf')
# 将分子写入 SDF
writer = Chem.SDWriter('output.sdf')
for mol in molecules:
writer.write(mol)
writer.close()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
重要注意事项:
MolFrom* 函数在失败时返回 None 并带有错误消息
处理分子之前始终检查 None
RDKit 在解析过程中自动清理分子,执行 13 个步骤,包括化合价检查、芳香性识别和手性分配。
清理控制:
# 禁用自动清理
mol = Chem.MolFromSmiles('C1=CC=CC=C1', sanitize=False)
# 手动清理
Chem.SanitizeMol(mol)
# 在清理前检测问题
problems = Chem.DetectChemistryProblems(mol)
for problem in problems:
print(problem.GetType(), problem.Message())
# Partial sanitization (skip specific steps)
# XOR-ing a flag out of SANITIZE_ALL runs every step except that one;
# only `Chem` is needed here, so no extra import is required.
Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES)
常见清理问题:
访问分子结构:
# 迭代原子和键
for atom in mol.GetAtoms():
print(atom.GetSymbol(), atom.GetIdx(), atom.GetDegree())
for bond in mol.GetBonds():
print(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType())
# 环信息
ring_info = mol.GetRingInfo()
ring_info.NumRings()
ring_info.AtomRings() # 返回原子索引的元组
# 检查原子是否在环中
atom = mol.GetAtomWithIdx(0)
atom.IsInRing()
atom.IsInRingSize(6) # 检查 6 元环
# 查找最小环的最小集合(SSSR)
from rdkit.Chem import GetSymmSSSR
rings = GetSymmSSSR(mol)
立体化学:
# 查找手性中心
from rdkit.Chem import FindMolChiralCenters
chiral_centers = FindMolChiralCenters(mol, includeUnassigned=True)
# 返回(原子索引,手性)元组列表
# 从 3D 坐标分配立体化学
from rdkit.Chem import AssignStereochemistryFrom3D
AssignStereochemistryFrom3D(mol)
# 检查键的立体化学
bond = mol.GetBondWithIdx(0)
stereo = bond.GetStereo() # STEREONONE, STEREOZ, STEREOE 等
片段分析:
# 获取不连接的片段
frags = Chem.GetMolFrags(mol, asMols=True)
# 在特定键上断裂
from rdkit.Chem import FragmentOnBonds
frag_mol = FragmentOnBonds(mol, [bond_idx1, bond_idx2])
# 提取 Murcko 骨架(环系统及连接它们的链)
from rdkit.Chem.Scaffolds import MurckoScaffold
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
基本描述符:
from rdkit.Chem import Descriptors
# 分子量
mw = Descriptors.MolWt(mol)
exact_mw = Descriptors.ExactMolWt(mol)
# LogP(亲脂性)
logp = Descriptors.MolLogP(mol)
# 拓扑极性表面积
tpsa = Descriptors.TPSA(mol)
# 氢键供体/受体数量
hbd = Descriptors.NumHDonors(mol)
hba = Descriptors.NumHAcceptors(mol)
# 可旋转键数量
rot_bonds = Descriptors.NumRotatableBonds(mol)
# 芳香环数量
aromatic_rings = Descriptors.NumAromaticRings(mol)
批量描述符计算:
# 一次性计算所有描述符
all_descriptors = Descriptors.CalcMolDescriptors(mol)
# 返回字典:{'MolWt': 180.16, 'MolLogP': 1.23, ...}
# 获取可用描述符名称列表
descriptor_names = [desc[0] for desc in Descriptors._descList]
Lipinski 五规则:
# 检查类药性
mw = Descriptors.MolWt(mol) <= 500
logp = Descriptors.MolLogP(mol) <= 5
hbd = Descriptors.NumHDonors(mol) <= 5
hba = Descriptors.NumHAcceptors(mol) <= 10
is_drug_like = mw and logp and hbd and hba
指纹类型:
from rdkit.Chem import AllChem, RDKFingerprint
from rdkit.Chem.AtomPairs import Pairs, Torsions
from rdkit.Chem import MACCSkeys
# RDKit 拓扑指纹
fp = Chem.RDKFingerprint(mol)
# Morgan 指纹(圆形指纹,类似于 ECFP)
fp = AllChem.GetMorganFingerprint(mol, radius=2)
fp_bits = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
# MACCS 键(166 位结构键)
fp = MACCSkeys.GenMACCSKeys(mol)
# 原子对指纹
fp = Pairs.GetAtomPairFingerprint(mol)
# 拓扑扭转指纹
fp = Torsions.GetTopologicalTorsionFingerprint(mol)
# Avalon 指纹(如果可用)
from rdkit.Avalon import pyAvalonTools
fp = pyAvalonTools.GetAvalonFP(mol)
相似性计算:
from rdkit import DataStructs
# 计算 Tanimoto 相似性
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2)
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2)
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
# 计算多个分子的相似性
similarities = DataStructs.BulkTanimotoSimilarity(fp1, [fp2, fp3, fp4])
# 其他相似性指标
dice = DataStructs.DiceSimilarity(fp1, fp2)
cosine = DataStructs.CosineSimilarity(fp1, fp2)
聚类与多样性:
# 基于指纹相似性的 Butina 聚类
from rdkit.ML.Cluster import Butina
# 计算距离矩阵
dists = []
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
for i in range(len(fps)):
sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([1-sim for sim in sims])
# 使用距离截止值进行聚类
clusters = Butina.ClusterData(dists, len(fps), distThresh=0.3, isDistData=True)
基本子结构匹配:
# 使用 SMARTS 定义查询
query = Chem.MolFromSmarts('[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1') # 苯环
# 检查分子是否包含子结构
has_match = mol.HasSubstructMatch(query)
# 获取所有匹配(返回原子索引的元组元组)
matches = mol.GetSubstructMatches(query)
# 仅获取第一个匹配
match = mol.GetSubstructMatch(query)
常见 SMARTS 模式:
# 伯醇
primary_alcohol = Chem.MolFromSmarts('[CH2][OH1]')
# 羧酸
carboxylic_acid = Chem.MolFromSmarts('C(=O)[OH]')
# 酰胺
amide = Chem.MolFromSmarts('C(=O)N')
# 芳香杂环
aromatic_n = Chem.MolFromSmarts('[nR]') # 环中的芳香氮
# 大环(12 个或更多原子的环;{12-} 范围包含 12)
macrocycle = Chem.MolFromSmarts('[r{12-}]')
匹配规则:
反应 SMARTS:
from rdkit.Chem import AllChem
# 使用 SMARTS 定义反应:反应物 >> 产物
rxn = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1][O:2]') # 酮还原
# 将反应应用于分子
reactants = (mol1,)
products = rxn.RunReactants(reactants)
# 产物是元组的元组(每个产物集一个元组)
for product_set in products:
for product in product_set:
# 清理产物
Chem.SanitizeMol(product)
反应特性:
反应相似性:
# 生成反应指纹
fp = AllChem.CreateDifferenceFingerprintForReaction(rxn)
# 比较反应
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
2D 坐标生成:
from rdkit.Chem import AllChem
# 为描绘生成 2D 坐标
AllChem.Compute2DCoords(mol)
# 将分子与模板结构对齐
template = Chem.MolFromSmiles('c1ccccc1')
AllChem.Compute2DCoords(template)
AllChem.GenerateDepictionMatching2DStructure(mol, template)
3D 坐标生成与构象:
# 使用 ETKDG 生成单个 3D 构象
AllChem.EmbedMolecule(mol, randomSeed=42)
# 生成多个构象
conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=10, randomSeed=42)
# 使用力场优化几何结构
AllChem.UFFOptimizeMolecule(mol) # UFF 力场
AllChem.MMFFOptimizeMolecule(mol) # MMFF94 力场
# 优化所有构象
for conf_id in conf_ids:
AllChem.MMFFOptimizeMolecule(mol, confId=conf_id)
# 计算构象之间的 RMSD
from rdkit.Chem import AllChem
rms = AllChem.GetConformerRMS(mol, conf_id1, conf_id2)
# 对齐分子
AllChem.AlignMol(probe_mol, ref_mol)
约束嵌入:
# 将部分分子约束到特定坐标进行嵌入
AllChem.ConstrainedEmbed(mol, core_mol)
基本绘制:
from rdkit.Chem import Draw
# 将单个分子绘制为 PIL 图像
img = Draw.MolToImage(mol, size=(300, 300))
img.save('molecule.png')
# 直接绘制到文件
Draw.MolToFile(mol, 'molecule.png')
# 在网格中绘制多个分子
mols = [mol1, mol2, mol3, mol4]
img = Draw.MolsToGridImage(mols, molsPerRow=2, subImgSize=(200, 200))
高亮显示子结构:
# 高亮显示子结构匹配
query = Chem.MolFromSmarts('c1ccccc1')
match = mol.GetSubstructMatch(query)
img = Draw.MolToImage(mol, highlightAtoms=match)
# 自定义高亮颜色
highlight_colors = {atom_idx: (1, 0, 0) for atom_idx in match} # 红色
img = Draw.MolToImage(mol, highlightAtoms=match,
highlightAtomColors=highlight_colors)
自定义可视化:
from rdkit.Chem.Draw import rdMolDraw2D
# 使用自定义选项创建绘图器
drawer = rdMolDraw2D.MolDraw2DCairo(300, 300)
opts = drawer.drawOptions()
# 自定义选项
opts.addAtomIndices = True
opts.addStereoAnnotation = True
opts.bondLineWidth = 2
# 绘制分子
drawer.DrawMolecule(mol)
drawer.FinishDrawing()
# 保存到文件
with open('molecule.png', 'wb') as f:
f.write(drawer.GetDrawingText())
Jupyter Notebook 集成:
# 在 Jupyter 中启用内联显示
from rdkit.Chem.Draw import IPythonConsole
# 自定义默认显示
IPythonConsole.ipython_useSVG = True # 使用 SVG 代替 PNG
IPythonConsole.molSize = (300, 300) # 默认大小
# 分子现在自动显示
mol # 显示分子图像
可视化指纹位:
# 显示指纹位代表的分子特征
from rdkit.Chem import Draw
# 对于 Morgan 指纹
bit_info = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bit_info)
# 绘制特定位的环境
img = Draw.DrawMorganBit(mol, bit_id, bit_info)
添加/移除氢原子:
# 添加显式氢原子
mol_h = Chem.AddHs(mol)
# 移除显式氢原子
mol = Chem.RemoveHs(mol_h)
凯库勒化与芳香性:
# 将芳香键转换为交替的单/双键
Chem.Kekulize(mol)
# 设置芳香性
Chem.SetAromaticity(mol)
替换子结构:
# 用另一个结构替换子结构
query = Chem.MolFromSmarts('c1ccccc1') # 苯
replacement = Chem.MolFromSmiles('C1CCCCC1') # 环己烷
new_mol = Chem.ReplaceSubstructs(mol, query, replacement)[0]
中和电荷:
# 通过添加/移除氢原子移除形式电荷
from rdkit.Chem.MolStandardize import rdMolStandardize
# 使用 Uncharger
uncharger = rdMolStandardize.Uncharger()
mol_neutral = uncharger.uncharge(mol)
分子哈希:
from rdkit.Chem import rdMolHash
# 生成 Murcko 骨架哈希
scaffold_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.MurckoScaffold)
# 规范 SMILES 哈希
canonical_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.CanonicalSmiles)
# 区域异构体哈希(忽略立体化学)
regio_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.Regioisomer)
随机化 SMILES:
# 生成随机 SMILES 表示(用于数据增强)
from rdkit.Chem import MolToRandomSmilesVect
random_smiles = MolToRandomSmilesVect(mol, numSmiles=10, randomSeed=42)
药效团特征:
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os
# 加载特征工厂
fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdef_path)
# 获取药效团特征
features = factory.GetFeaturesForMol(mol)
for feat in features:
print(feat.GetFamily(), feat.GetType(), feat.GetAtomIds())
from rdkit import Chem
from rdkit.Chem import Descriptors
def analyze_druglikeness(smiles):
    """Compute drug-likeness descriptors for a molecule given as SMILES.

    Args:
        smiles: SMILES string of the molecule to analyze.

    Returns:
        A dict with the keys 'MW', 'LogP', 'HBD', 'HBA', 'TPSA' and
        'RotBonds', plus a boolean 'Lipinski' entry, or None when the
        SMILES string cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Calculate Lipinski descriptors
    results = {
        'MW': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'HBD': Descriptors.NumHDonors(mol),
        'HBA': Descriptors.NumHAcceptors(mol),
        'TPSA': Descriptors.TPSA(mol),
        'RotBonds': Descriptors.NumRotatableBonds(mol),
    }

    # Check Lipinski's Rule of Five: this implementation requires all four limits to hold
    results['Lipinski'] = (
        results['MW'] <= 500 and
        results['LogP'] <= 5 and
        results['HBD'] <= 5 and
        results['HBA'] <= 10
    )
    return results
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
def similarity_screen(query_smiles, database_smiles, threshold=0.7):
    """Rank database molecules by Tanimoto similarity to a query molecule.

    Args:
        query_smiles: SMILES string of the query molecule.
        database_smiles: Iterable of SMILES strings to screen.
        threshold: Minimum similarity (inclusive) for a molecule to be a hit.

    Returns:
        A list of (index, smiles, similarity) tuples sorted by descending
        similarity. Database entries that fail to parse are skipped.

    Raises:
        ValueError: If query_smiles cannot be parsed.
    """
    query_mol = Chem.MolFromSmiles(query_smiles)
    if query_mol is None:
        # Fail early with a clear message instead of passing None to the
        # fingerprint call.
        raise ValueError(f"Failed to parse query SMILES: {query_smiles!r}")
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2)

    hits = []
    for idx, smiles in enumerate(database_smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # unparseable entry: skip it
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
        sim = DataStructs.TanimotoSimilarity(query_fp, fp)
        if sim >= threshold:
            hits.append((idx, smiles, sim))
    return sorted(hits, key=lambda hit: hit[2], reverse=True)
from rdkit import Chem
def filter_by_substructure(smiles_list, pattern_smarts):
    """Keep the SMILES strings whose molecule contains a SMARTS pattern.

    Args:
        smiles_list: Iterable of SMILES strings to filter.
        pattern_smarts: SMARTS pattern describing the substructure.

    Returns:
        A list of the input SMILES strings that parse successfully and
        match the pattern, in input order.

    Raises:
        ValueError: If pattern_smarts cannot be parsed.
    """
    query = Chem.MolFromSmarts(pattern_smarts)
    if query is None:
        # An invalid pattern would otherwise reach HasSubstructMatch as None.
        raise ValueError(f"Failed to parse SMARTS pattern: {pattern_smarts!r}")

    hits = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None and mol.HasSubstructMatch(query):
            hits.append(smiles)
    return hits
解析分子时始终检查 None:
# `continue` is only valid inside a loop, so show the check in the loop it implies
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Failed to parse: {smiles}")
        continue
    # mol is a valid molecule from here on
使用二进制格式进行存储:
import pickle
# 使用 pickle 序列化分子以便快速加载
with open('molecules.pkl', 'wb') as f:
pickle.dump(mols, f)
# 加载 pickle 序列化的分子(比重新解析快得多)
with open('molecules.pkl', 'rb') as f:
mols = pickle.load(f)
使用批量操作:
# 一次性计算所有分子的指纹
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
# 使用批量相似性计算
similarities = DataStructs.BulkTanimotoSimilarity(fps[0], fps[1:])
RDKit 操作通常对以下情况是线程安全的:
非线程安全: 并发访问时的 MolSuppliers。
对于大型数据集:
# Use ForwardSDMolSupplier to avoid loading entire file
with open('large.sdf', 'rb') as f:  # the supplier needs a binary-mode file object
    suppl = Chem.ForwardSDMolSupplier(f)
    # Iterate inside the `with` block so the file is still open while reading
    for mol in suppl:
        if mol is None:  # skip records that failed to parse
            continue
        # Process one molecule at a time
        pass
# 使用 MultithreadedSDMolSupplier 进行并行处理
suppl = Chem.MultithreadedSDMolSupplier('large.sdf', numWriterThreads=4)
使用 DetectChemistryProblems() 进行调试
计算依赖氢原子的属性时使用 AddHs()
此技能包含详细的 API 参考文档:
api_reference.md - 按功能组织的 RDKit 模块、函数和类的全面列表
descriptors_reference.md - 可用分子描述符的完整列表及描述
smarts_patterns.md - 官能团和结构特征的常见 SMARTS 模式
需要特定 API 详细信息、参数信息或模式示例时,请加载这些参考资料。
常见 RDKit 工作流程的示例脚本:
molecular_properties.py - 计算全面的分子属性和描述符
similarity_search.py - 执行基于指纹的相似性筛选
substructure_filter.py - 按子结构模式过滤分子
这些脚本可以直接执行或用作自定义工作流程的模板。
每周安装次数
134
代码仓库
GitHub 星标数
22.6K
首次出现时间
2026 年 1 月 21 日
安全审计
已安装于
claude-code113
opencode108
gemini-cli98
cursor98
antigravity88
codex88
RDKit is a comprehensive cheminformatics library providing Python APIs for molecular analysis and manipulation. This skill provides guidance for reading/writing molecular structures, calculating descriptors, fingerprinting, substructure searching, chemical reactions, 2D/3D coordinate generation, and molecular visualization. Use this skill for drug discovery, computational chemistry, and cheminformatics research tasks.
Reading Molecules:
Read molecular structures from various formats:
from rdkit import Chem
# From SMILES strings
mol = Chem.MolFromSmiles('Cc1ccccc1') # Returns Mol object or None
# From MOL files
mol = Chem.MolFromMolFile('path/to/file.mol')
# From MOL blocks (string data)
mol = Chem.MolFromMolBlock(mol_block_string)
# From InChI
mol = Chem.MolFromInchi('InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H')
Writing Molecules:
Convert molecules to text representations:
# To canonical SMILES
smiles = Chem.MolToSmiles(mol)
# To MOL block
mol_block = Chem.MolToMolBlock(mol)
# To InChI
inchi = Chem.MolToInchi(mol)
Batch Processing:
For processing multiple molecules, use Supplier/Writer objects:
# Read SDF files
suppl = Chem.SDMolSupplier('molecules.sdf')
for mol in suppl:
if mol is not None: # Check for parsing errors
# Process molecule
pass
# Read SMILES files
suppl = Chem.SmilesMolSupplier('molecules.smi', titleLine=False)
# For large or gzip-compressed files, stream records instead of loading everything
import gzip

with gzip.open('molecules.sdf.gz') as f:  # gzip.open defaults to binary mode
    suppl = Chem.ForwardSDMolSupplier(f)
    # Iterate inside the `with` block so the file is still open while reading
    for mol in suppl:
        if mol is None:  # skip records that failed to parse
            continue
        # Process molecule
        pass
# Multithreaded processing for large datasets
suppl = Chem.MultithreadedSDMolSupplier('molecules.sdf')
# Write molecules to SDF
writer = Chem.SDWriter('output.sdf')
for mol in molecules:
writer.write(mol)
writer.close()
Important Notes:
MolFrom* functions return None on failure with error messages
Always check for None before processing molecules
RDKit automatically sanitizes molecules during parsing, executing 13 steps including valence checking, aromaticity perception, and chirality assignment.
Sanitization Control:
# Disable automatic sanitization
mol = Chem.MolFromSmiles('C1=CC=CC=C1', sanitize=False)
# Manual sanitization
Chem.SanitizeMol(mol)
# Detect problems before sanitization
problems = Chem.DetectChemistryProblems(mol)
for problem in problems:
print(problem.GetType(), problem.Message())
# Partial sanitization (skip specific steps)
# XOR-ing a flag out of SANITIZE_ALL runs every step except that one;
# only `Chem` is needed here, so no extra import is required.
Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES)
Common Sanitization Issues:
Accessing Molecular Structure:
# Iterate atoms and bonds
for atom in mol.GetAtoms():
print(atom.GetSymbol(), atom.GetIdx(), atom.GetDegree())
for bond in mol.GetBonds():
print(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType())
# Ring information
ring_info = mol.GetRingInfo()
ring_info.NumRings()
ring_info.AtomRings() # Returns tuples of atom indices
# Check if atom is in ring
atom = mol.GetAtomWithIdx(0)
atom.IsInRing()
atom.IsInRingSize(6) # Check for 6-membered rings
# Find smallest set of smallest rings (SSSR)
from rdkit.Chem import GetSymmSSSR
rings = GetSymmSSSR(mol)
Stereochemistry:
# Find chiral centers
from rdkit.Chem import FindMolChiralCenters
chiral_centers = FindMolChiralCenters(mol, includeUnassigned=True)
# Returns list of (atom_idx, chirality) tuples
# Assign stereochemistry from 3D coordinates
from rdkit.Chem import AssignStereochemistryFrom3D
AssignStereochemistryFrom3D(mol)
# Check bond stereochemistry
bond = mol.GetBondWithIdx(0)
stereo = bond.GetStereo() # STEREONONE, STEREOZ, STEREOE, etc.
Fragment Analysis:
# Get disconnected fragments
frags = Chem.GetMolFrags(mol, asMols=True)
# Fragment on specific bonds
from rdkit.Chem import FragmentOnBonds
frag_mol = FragmentOnBonds(mol, [bond_idx1, bond_idx2])
# Extract the Murcko scaffold (ring systems plus the linkers connecting them)
from rdkit.Chem.Scaffolds import MurckoScaffold
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
Basic Descriptors:
from rdkit.Chem import Descriptors
# Molecular weight
mw = Descriptors.MolWt(mol)
exact_mw = Descriptors.ExactMolWt(mol)
# LogP (lipophilicity)
logp = Descriptors.MolLogP(mol)
# Topological polar surface area
tpsa = Descriptors.TPSA(mol)
# Number of hydrogen bond donors/acceptors
hbd = Descriptors.NumHDonors(mol)
hba = Descriptors.NumHAcceptors(mol)
# Number of rotatable bonds
rot_bonds = Descriptors.NumRotatableBonds(mol)
# Number of aromatic rings
aromatic_rings = Descriptors.NumAromaticRings(mol)
Batch Descriptor Calculation:
# Calculate all descriptors at once
all_descriptors = Descriptors.CalcMolDescriptors(mol)
# Returns dictionary: {'MolWt': 180.16, 'MolLogP': 1.23, ...}
# Get list of available descriptor names
descriptor_names = [desc[0] for desc in Descriptors._descList]
Lipinski's Rule of Five:
# Check drug-likeness
mw = Descriptors.MolWt(mol) <= 500
logp = Descriptors.MolLogP(mol) <= 5
hbd = Descriptors.NumHDonors(mol) <= 5
hba = Descriptors.NumHAcceptors(mol) <= 10
is_drug_like = mw and logp and hbd and hba
Fingerprint Types:
from rdkit.Chem import AllChem, RDKFingerprint
from rdkit.Chem.AtomPairs import Pairs, Torsions
from rdkit.Chem import MACCSkeys
# RDKit topological fingerprint
fp = Chem.RDKFingerprint(mol)
# Morgan fingerprints (circular fingerprints, similar to ECFP)
fp = AllChem.GetMorganFingerprint(mol, radius=2)
fp_bits = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
# MACCS keys (166-bit structural key)
fp = MACCSkeys.GenMACCSKeys(mol)
# Atom pair fingerprints
fp = Pairs.GetAtomPairFingerprint(mol)
# Topological torsion fingerprints
fp = Torsions.GetTopologicalTorsionFingerprint(mol)
# Avalon fingerprints (if available)
from rdkit.Avalon import pyAvalonTools
fp = pyAvalonTools.GetAvalonFP(mol)
Similarity Calculation:
from rdkit import DataStructs
# Calculate Tanimoto similarity
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2)
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2)
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
# Calculate similarity for multiple molecules
similarities = DataStructs.BulkTanimotoSimilarity(fp1, [fp2, fp3, fp4])
# Other similarity metrics
dice = DataStructs.DiceSimilarity(fp1, fp2)
cosine = DataStructs.CosineSimilarity(fp1, fp2)
Clustering and Diversity:
# Butina clustering based on fingerprint similarity
from rdkit.ML.Cluster import Butina
# Calculate distance matrix
dists = []
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
for i in range(len(fps)):
sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([1-sim for sim in sims])
# Cluster with distance cutoff
clusters = Butina.ClusterData(dists, len(fps), distThresh=0.3, isDistData=True)
Basic Substructure Matching:
# Define query using SMARTS
query = Chem.MolFromSmarts('[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1') # Benzene ring
# Check if molecule contains substructure
has_match = mol.HasSubstructMatch(query)
# Get all matches (returns tuple of tuples with atom indices)
matches = mol.GetSubstructMatches(query)
# Get only first match
match = mol.GetSubstructMatch(query)
Common SMARTS Patterns:
# Primary alcohols
primary_alcohol = Chem.MolFromSmarts('[CH2][OH1]')
# Carboxylic acids
carboxylic_acid = Chem.MolFromSmarts('C(=O)[OH]')
# Amides
amide = Chem.MolFromSmarts('C(=O)N')
# Aromatic heterocycles
aromatic_n = Chem.MolFromSmarts('[nR]') # Aromatic nitrogen in ring
# Macrocycles (rings of 12 or more atoms; the {12-} range is inclusive)
macrocycle = Chem.MolFromSmarts('[r{12-}]')
Matching Rules:
Reaction SMARTS:
from rdkit.Chem import AllChem
# Define reaction using SMARTS: reactants >> products
rxn = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1][O:2]') # Ketone reduction
# Apply reaction to molecules
reactants = (mol1,)
products = rxn.RunReactants(reactants)
# Products is tuple of tuples (one tuple per product set)
for product_set in products:
for product in product_set:
# Sanitize product
Chem.SanitizeMol(product)
Reaction Features:
Reaction Similarity:
# Generate reaction fingerprints
fp = AllChem.CreateDifferenceFingerprintForReaction(rxn)
# Compare reactions
similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
2D Coordinate Generation:
from rdkit.Chem import AllChem
# Generate 2D coordinates for depiction
AllChem.Compute2DCoords(mol)
# Align molecule to template structure
template = Chem.MolFromSmiles('c1ccccc1')
AllChem.Compute2DCoords(template)
AllChem.GenerateDepictionMatching2DStructure(mol, template)
3D Coordinate Generation and Conformers:
# Generate single 3D conformer using ETKDG
AllChem.EmbedMolecule(mol, randomSeed=42)
# Generate multiple conformers
conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=10, randomSeed=42)
# Optimize geometry with force field
AllChem.UFFOptimizeMolecule(mol) # UFF force field
AllChem.MMFFOptimizeMolecule(mol) # MMFF94 force field
# Optimize all conformers
for conf_id in conf_ids:
AllChem.MMFFOptimizeMolecule(mol, confId=conf_id)
# Calculate RMSD between conformers
from rdkit.Chem import AllChem
rms = AllChem.GetConformerRMS(mol, conf_id1, conf_id2)
# Align molecules
AllChem.AlignMol(probe_mol, ref_mol)
Constrained Embedding:
# Embed with part of molecule constrained to specific coordinates
AllChem.ConstrainedEmbed(mol, core_mol)
Basic Drawing:
from rdkit.Chem import Draw
# Draw single molecule to PIL image
img = Draw.MolToImage(mol, size=(300, 300))
img.save('molecule.png')
# Draw to file directly
Draw.MolToFile(mol, 'molecule.png')
# Draw multiple molecules in grid
mols = [mol1, mol2, mol3, mol4]
img = Draw.MolsToGridImage(mols, molsPerRow=2, subImgSize=(200, 200))
Highlighting Substructures:
# Highlight substructure match
query = Chem.MolFromSmarts('c1ccccc1')
match = mol.GetSubstructMatch(query)
img = Draw.MolToImage(mol, highlightAtoms=match)
# Custom highlight colors
highlight_colors = {atom_idx: (1, 0, 0) for atom_idx in match} # Red
img = Draw.MolToImage(mol, highlightAtoms=match,
highlightAtomColors=highlight_colors)
Customizing Visualization:
from rdkit.Chem.Draw import rdMolDraw2D
# Create drawer with custom options
drawer = rdMolDraw2D.MolDraw2DCairo(300, 300)
opts = drawer.drawOptions()
# Customize options
opts.addAtomIndices = True
opts.addStereoAnnotation = True
opts.bondLineWidth = 2
# Draw molecule
drawer.DrawMolecule(mol)
drawer.FinishDrawing()
# Save to file
with open('molecule.png', 'wb') as f:
f.write(drawer.GetDrawingText())
Jupyter Notebook Integration:
# Enable inline display in Jupyter
from rdkit.Chem.Draw import IPythonConsole
# Customize default display
IPythonConsole.ipython_useSVG = True # Use SVG instead of PNG
IPythonConsole.molSize = (300, 300) # Default size
# Molecules now display automatically
mol # Shows molecule image
Visualizing Fingerprint Bits:
# Show what molecular features a fingerprint bit represents
from rdkit.Chem import Draw
# For Morgan fingerprints
bit_info = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bit_info)
# Draw environment for specific bit
img = Draw.DrawMorganBit(mol, bit_id, bit_info)
Adding/Removing Hydrogens:
# Add explicit hydrogens
mol_h = Chem.AddHs(mol)
# Remove explicit hydrogens
mol = Chem.RemoveHs(mol_h)
Kekulization and Aromaticity:
# Convert aromatic bonds to alternating single/double
Chem.Kekulize(mol)
# Set aromaticity
Chem.SetAromaticity(mol)
Replacing Substructures:
# Replace substructure with another structure
query = Chem.MolFromSmarts('c1ccccc1') # Benzene
replacement = Chem.MolFromSmiles('C1CCCCC1') # Cyclohexane
new_mol = Chem.ReplaceSubstructs(mol, query, replacement)[0]
Neutralizing Charges:
# Remove formal charges by adding/removing hydrogens
from rdkit.Chem.MolStandardize import rdMolStandardize
# Using Uncharger
uncharger = rdMolStandardize.Uncharger()
mol_neutral = uncharger.uncharge(mol)
Molecular Hashing:
from rdkit.Chem import rdMolHash
# Generate Murcko scaffold hash
scaffold_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.MurckoScaffold)
# Canonical SMILES hash
canonical_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.CanonicalSmiles)
# Regioisomer hash (ignores stereochemistry)
regio_hash = rdMolHash.MolHash(mol, rdMolHash.HashFunction.Regioisomer)
Randomized SMILES:
# Generate random SMILES representations (for data augmentation)
from rdkit.Chem import MolToRandomSmilesVect
random_smiles = MolToRandomSmilesVect(mol, numSmiles=10, randomSeed=42)
Pharmacophore Features:
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os
# Load feature factory
fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdef_path)
# Get pharmacophore features
features = factory.GetFeaturesForMol(mol)
for feat in features:
print(feat.GetFamily(), feat.GetType(), feat.GetAtomIds())
from rdkit import Chem
from rdkit.Chem import Descriptors
def analyze_druglikeness(smiles):
    """Compute drug-likeness descriptors for a molecule given as SMILES.

    Args:
        smiles: SMILES string of the molecule to analyze.

    Returns:
        A dict with the keys 'MW', 'LogP', 'HBD', 'HBA', 'TPSA' and
        'RotBonds', plus a boolean 'Lipinski' entry, or None when the
        SMILES string cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Calculate Lipinski descriptors
    results = {
        'MW': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'HBD': Descriptors.NumHDonors(mol),
        'HBA': Descriptors.NumHAcceptors(mol),
        'TPSA': Descriptors.TPSA(mol),
        'RotBonds': Descriptors.NumRotatableBonds(mol),
    }

    # Check Lipinski's Rule of Five: this implementation requires all four limits to hold
    results['Lipinski'] = (
        results['MW'] <= 500 and
        results['LogP'] <= 5 and
        results['HBD'] <= 5 and
        results['HBA'] <= 10
    )
    return results
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
def similarity_screen(query_smiles, database_smiles, threshold=0.7):
    """Rank database molecules by Tanimoto similarity to a query molecule.

    Args:
        query_smiles: SMILES string of the query molecule.
        database_smiles: Iterable of SMILES strings to screen.
        threshold: Minimum similarity (inclusive) for a molecule to be a hit.

    Returns:
        A list of (index, smiles, similarity) tuples sorted by descending
        similarity. Database entries that fail to parse are skipped.

    Raises:
        ValueError: If query_smiles cannot be parsed.
    """
    query_mol = Chem.MolFromSmiles(query_smiles)
    if query_mol is None:
        # Fail early with a clear message instead of passing None to the
        # fingerprint call.
        raise ValueError(f"Failed to parse query SMILES: {query_smiles!r}")
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2)

    hits = []
    for idx, smiles in enumerate(database_smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # unparseable entry: skip it
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
        sim = DataStructs.TanimotoSimilarity(query_fp, fp)
        if sim >= threshold:
            hits.append((idx, smiles, sim))
    return sorted(hits, key=lambda hit: hit[2], reverse=True)
from rdkit import Chem
def filter_by_substructure(smiles_list, pattern_smarts):
    """Keep the SMILES strings whose molecule contains a SMARTS pattern.

    Args:
        smiles_list: Iterable of SMILES strings to filter.
        pattern_smarts: SMARTS pattern describing the substructure.

    Returns:
        A list of the input SMILES strings that parse successfully and
        match the pattern, in input order.

    Raises:
        ValueError: If pattern_smarts cannot be parsed.
    """
    query = Chem.MolFromSmarts(pattern_smarts)
    if query is None:
        # An invalid pattern would otherwise reach HasSubstructMatch as None.
        raise ValueError(f"Failed to parse SMARTS pattern: {pattern_smarts!r}")

    hits = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None and mol.HasSubstructMatch(query):
            hits.append(smiles)
    return hits
Always check for None when parsing molecules:
# `continue` is only valid inside a loop, so show the check in the loop it implies
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Failed to parse: {smiles}")
        continue
    # mol is a valid molecule from here on
Use binary formats for storage:
import pickle
# Pickle molecules for fast loading
with open('molecules.pkl', 'wb') as f:
pickle.dump(mols, f)
# Load pickled molecules (much faster than reparsing)
with open('molecules.pkl', 'rb') as f:
mols = pickle.load(f)
Use bulk operations:
# Calculate fingerprints for all molecules at once
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
# Use bulk similarity calculations
similarities = DataStructs.BulkTanimotoSimilarity(fps[0], fps[1:])
RDKit operations are generally thread-safe for:
Not thread-safe: MolSuppliers when accessed concurrently.
For large datasets:
# Use ForwardSDMolSupplier to avoid loading entire file
with open('large.sdf', 'rb') as f:  # the supplier needs a binary-mode file object
    suppl = Chem.ForwardSDMolSupplier(f)
    # Iterate inside the `with` block so the file is still open while reading
    for mol in suppl:
        if mol is None:  # skip records that failed to parse
            continue
        # Process one molecule at a time
        pass
# Use MultithreadedSDMolSupplier for parallel processing
suppl = Chem.MultithreadedSDMolSupplier('large.sdf', numWriterThreads=4)
Use DetectChemistryProblems() to debug
Use AddHs() when calculating properties that depend on hydrogen
This skill includes detailed API reference documentation:
api_reference.md - Comprehensive listing of RDKit modules, functions, and classes organized by functionality
descriptors_reference.md - Complete list of available molecular descriptors with descriptions
smarts_patterns.md - Common SMARTS patterns for functional groups and structural features
Load these references when needing specific API details, parameter information, or pattern examples.
Example scripts for common RDKit workflows:
molecular_properties.py - Calculate comprehensive molecular properties and descriptors
similarity_search.py - Perform fingerprint-based similarity screening
substructure_filter.py - Filter molecules by substructure patterns
These scripts can be executed directly or used as templates for custom workflows.
Weekly Installs
134
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass
Socket: Pass
Snyk: Pass
Installed on
claude-code113
opencode108
gemini-cli98
cursor98
antigravity88
codex88
PPTX 文件处理全攻略:Python 脚本创建、编辑、分析 .pptx 文件内容与结构
877 周安装