RDKit | 化合物库的相似性分析

DrugAI

发布于 2021-01-29 07:42:15

1.3K0

发布于 2021-01-29 07:42:15

文章被收录于专栏：DrugAI

作者丨王建民

单位丨湖南大学

研究方向丨药物设计、生物医药大数据

展示一种小分子数据库的相似性分析策略。

实例中使用SMILES文件，该分析可以以相同的方式从分子的SDF或其他格式文件中加载数据，只需确保使用适当的方法将分子加载到RDKit中。

导入库

import osimport pandas as pdimport numpy as npimport matplotlib.pyplot as pltfrom matplotlib import gridspec
from rdkit import Chem, DataStructsfrom rdkit.Chem.Fingerprints import FingerprintMolsfrom rdkit.Chem import Draw
# clusteringfrom scipy.cluster.hierarchy import dendrogram, linkage

载入数据

该库包含超过8 000 000个SMILES。

database=[]with open('mol_parent.smi','r') as file:    for index,line in enumerate(file):        if 0<index<=1:                   # 查看第一个smiles            print (index, line.split())

1 ['Cn1ncc2cc(CN)ccc12', '10025']

保留1000分子，可将SMILES转换为RDKit分子对象。

database=[]with open('mol_parent.smi','r') as file:    for index,line in enumerate(file):        if 0<index<=1000:                              # 保留1000个smiles            mol=Chem.MolFromSmiles(line.split()[0])    # 转换SMILES 为rdkit分子对象             mol.SetProp('_Name',line.split()[1])       # 为每个分子添加名字            database.append(mol)

可视化

Draw.MolsToGridImage(database,molsPerRow=10,subImgSize=(150,150),legends=[mol.GetProp('_Name') for mol in database])

计算分子指纹

fps= [FingerprintMols.FingerprintMol(mol) for mol in database]

print(len(database))print(len(fps))

1000

相似度计算

DataStructs.FingerprintSimilarity(fps[0],fps[1])

0.2966101694915254

size=len(database)hmap=np.empty(shape=(size,size))table=pd.DataFrame()for index, i in enumerate(fps):    for jndex, j in enumerate(fps):        similarity=DataStructs.FingerprintSimilarity(i,j)        hmap[index,jndex]=similarity        table.loc[database[index].GetProp('_Name'),database[jndex].GetProp('_Name')]=similarity

table.head(10)

层次聚类

linked = linkage(hmap,'single')labelList = [mol.GetProp('_Name') for mol in database]

plt.figure(figsize=(8,15))
ax1=plt.subplot()o=dendrogram(linked,              orientation='left',            labels=labelList,            distance_sort='descending',            show_leaf_counts=True)
ax1.spines['left'].set_visible(False)ax1.spines['top'].set_visible(False)ax1.spines['right'].set_visible(False)plt.title('Similarity clustering',fontsize=20,weight='bold')plt.tick_params ('both',width=2,labelsize=8)plt.tight_layout()plt.show()

# the clusters in order as the last plotnew_data=list(reversed(o['ivl']))
# create a new table with the order of HCLhmap_2=np.empty(shape=(size,size))for index,i in enumerate(new_data):    for jndex,j in enumerate(new_data):        hmap_2[index,jndex]=table.loc[i].at[j]

figure= plt.figure(figsize=(30,30))gs1 = gridspec.GridSpec(2,7)gs1.update(wspace=0.01)ax1 = plt.subplot(gs1[0:-1, :2])dendrogram(linked, orientation='left', distance_sort='descending',show_leaf_counts=True,no_labels=True)ax1.spines['left'].set_visible(False)ax1.spines['top'].set_visible(False)ax1.spines['right'].set_visible(False)
ax2 = plt.subplot(gs1[0:-1,2:6])f=ax2.imshow (hmap_2, cmap='PRGn_r', interpolation='nearest')
ax2.set_title('Fingerprint Similarity',fontsize=20,weight='bold')ax2.set_xticks (range(len(new_data)))ax2.set_yticks (range(len(new_data)))ax2.set_xticklabels (new_data,rotation=90,size=8)ax2.set_yticklabels (new_data,size=8)
ax3 = plt.subplot(gs1[0:-1,6:7])m=plt.colorbar(f,cax=ax3,shrink=0.75,orientation='vertical',spacing='uniform',pad=0.01)m.set_label ('Fingerprint Similarity')
plt.tick_params ('both',width=2)plt.plot()