作者丨王建民
单位丨湖南大学
研究方向丨药物设计、生物医药大数据
展示一种小分子数据库的相似性分析策略。
实例中使用SMILES文件,该分析可以以相同的方式从分子的SDF或其他格式文件中加载数据,只需确保使用适当的方法将分子加载到RDKit中。
导入库
import osimport pandas as pdimport numpy as npimport matplotlib.pyplot as pltfrom matplotlib import gridspec
from rdkit import Chem, DataStructsfrom rdkit.Chem.Fingerprints import FingerprintMolsfrom rdkit.Chem import Draw
# clusteringfrom scipy.cluster.hierarchy import dendrogram, linkage
载入数据
该库包含超过8 000 000个SMILES。
database=[]with open('mol_parent.smi','r') as file: for index,line in enumerate(file): if 0<index<=1: # 查看第一个smiles print (index, line.split())
1 ['Cn1ncc2cc(CN)ccc12', '10025']
保留1000分子,可将SMILES转换为RDKit分子对象。
database=[]with open('mol_parent.smi','r') as file: for index,line in enumerate(file): if 0<index<=1000: # 保留1000个smiles mol=Chem.MolFromSmiles(line.split()[0]) # 转换SMILES 为rdkit分子对象 mol.SetProp('_Name',line.split()[1]) # 为每个分子添加名字 database.append(mol)
可视化
Draw.MolsToGridImage(database,molsPerRow=10,subImgSize=(150,150),legends=[mol.GetProp('_Name') for mol in database])
计算分子指纹
fps= [FingerprintMols.FingerprintMol(mol) for mol in database]
print(len(database))print(len(fps))
1000
1000
相似度计算
DataStructs.FingerprintSimilarity(fps[0],fps[1])
0.2966101694915254
size=len(database)hmap=np.empty(shape=(size,size))table=pd.DataFrame()for index, i in enumerate(fps): for jndex, j in enumerate(fps): similarity=DataStructs.FingerprintSimilarity(i,j) hmap[index,jndex]=similarity table.loc[database[index].GetProp('_Name'),database[jndex].GetProp('_Name')]=similarity
table.head(10)
层次聚类
linked = linkage(hmap,'single')labelList = [mol.GetProp('_Name') for mol in database]
plt.figure(figsize=(8,15))
ax1=plt.subplot()o=dendrogram(linked, orientation='left', labels=labelList, distance_sort='descending', show_leaf_counts=True)
ax1.spines['left'].set_visible(False)ax1.spines['top'].set_visible(False)ax1.spines['right'].set_visible(False)plt.title('Similarity clustering',fontsize=20,weight='bold')plt.tick_params ('both',width=2,labelsize=8)plt.tight_layout()plt.show()
# the clusters in order as the last plotnew_data=list(reversed(o['ivl']))
# create a new table with the order of HCLhmap_2=np.empty(shape=(size,size))for index,i in enumerate(new_data): for jndex,j in enumerate(new_data): hmap_2[index,jndex]=table.loc[i].at[j]
figure= plt.figure(figsize=(30,30))gs1 = gridspec.GridSpec(2,7)gs1.update(wspace=0.01)ax1 = plt.subplot(gs1[0:-1, :2])dendrogram(linked, orientation='left', distance_sort='descending',show_leaf_counts=True,no_labels=True)ax1.spines['left'].set_visible(False)ax1.spines['top'].set_visible(False)ax1.spines['right'].set_visible(False)
ax2 = plt.subplot(gs1[0:-1,2:6])f=ax2.imshow (hmap_2, cmap='PRGn_r', interpolation='nearest')
ax2.set_title('Fingerprint Similarity',fontsize=20,weight='bold')ax2.set_xticks (range(len(new_data)))ax2.set_yticks (range(len(new_data)))ax2.set_xticklabels (new_data,rotation=90,size=8)ax2.set_yticklabels (new_data,size=8)
ax3 = plt.subplot(gs1[0:-1,6:7])m=plt.colorbar(f,cax=ax3,shrink=0.75,orientation='vertical',spacing='uniform',pad=0.01)m.set_label ('Fingerprint Similarity')
plt.tick_params ('both',width=2)plt.plot()