1
简介
当通过深度学习输入有机物质中结构式的二维图像时,需要解决寻找分子式的问题。这是一个回归问题,需要计算结构式图像中包含的碳、氢、氧和氮等原子数。
2
环境
系统 :Win10
工具:RDKit、OpenCV、Keras、TensorFlow
3
实验步骤
使用RDKit将SMILES字符串转换为结构式图像,并进行学习以计算图像中的原子数。
4
数据预处理
提取获得类似如下数据
5
示例代码
导入库
import kerasfrom keras.layers import Input, Concatenate, GlobalAveragePooling2Dfrom keras.models import Modelfrom keras.layers.core import Dense, Dropout, Activation, Flattenfrom keras.callbacks import LearningRateScheduler, ModelCheckpointfrom keras.preprocessing.image import img_to_array, array_to_img
import numpy as npimport pandas as pdfrom sklearn.model_selection import train_test_splitimport matplotlib.pyplot as pltimport os.pathimport reimport tensorflow as tffrom keras import backend as Kimport cv2
from rdkit import Chemfrom rdkit.Chem import Draw, rdDepictor, rdMolDescriptors
import warningswarnings.filterwarnings("ignore")
载入数据
#Load SMILESdf = pd.read_csv('data.csv')SMILES = df['CAN_SMILES'].values
SMILES转numpy
# convert to arraySMILES = np.asarray(SMILES)SMILES_train, SMILES_test = train_test_split(SMILES, test_size=0.30, random_state=110)
print(SMILES_train.shape, SMILES_test.shape)
(210000,) (90000,)
定义DataGenerator数据生成函数
#Release memory for read datadel df, SMILES
class DataGenerator(): def __init__(self, X_input): self.X_input = X_input self.atom = ['C', 'H', 'N', 'O', 'S', 'P', 'Si', 'Na', 'F', 'Cl', 'Br', 'I'] self.reset() def reset(self): self.X = [] self.Y = [] def cal_img(self, mol): # Generate structural formula image img = Chem.Draw.MolToImage(mol, size=(300, 300)) return img_to_array(img)[:,:,:3] def chem_count(self, input): atom = ['C', 'H', 'N', 'O', 'S', 'P', 'Si', 'Na', 'F', 'Cl', 'Br', 'I'] count = {'C': 0, 'H': 0 ,'N': 0, 'O': 0, 'S': 0, 'P': 0, 'Si': 0, 'Na': 0, 'F': 0, 'Cl': 0, 'Br': 0, 'I': 0}
input = input + 'Z' # Add English capital letters at the end for i in range(len(atom)): if re.search(atom[i] + '\d{1,3}[A-Z]', input): #Atom + number (1 to 3 digits) + uppercase English iterator = re.finditer(atom[i] + '\d{1,3}[A-Z]', input) for match in iterator: count[atom[i]] = int(match.group()[len(atom[i]):-1]) elif re.search(atom[i] + '[A-Z]',input): #When containing atoms + uppercase English count[atom[i]] = 1 return count
def make_structural_image(self, batch_size=10): while True: for i in range(len(self.X_input)): # SConvert SMILES to mol mol = Chem.MolFromSmiles(self.X_input[i], sanitize=True) if not mol is None: # mol file is None # Calculate 2D coordinates rdDepictor.Compute2DCoords(mol)
# CalcMolWt MW = Chem.rdMolDescriptors._CalcMolWt(mol) # Calculate molecular formula from mol format chem_Formula = Chem.rdMolDescriptors.CalcMolFormula(mol) # Count atoms from molecular formula count = self.chem_count(chem_Formula)
# Add X, Y if MW < max_MW and count['C'] > 1.0: # A compound containing at least two carbon atoms and having a molar mass of 400 or less. (self.X).append(self.cal_img(mol)) (self.Y).append([count[self.atom[i]] for i in range(len(self.atom))]) # Returns the value of X, Y if len(self.X) == batch_size: inputs = np.asarray(self.X) outputs = np.asarray(self.Y) self.reset() yield inputs, outputs
建立CNN
使用RDKit读取SMILES数据并将其转换为(300 ,300 ,3 )大小的图像。同样,输出分子式['C','H','N','O','S','P','Si','Na','F','Cl','Br' ,'I']是12种原子的数量。
from keras.applications.xception import Xceptioninputs = Input(shape=(300,300,3)) # data_size (300,300,3)x = Xception(weights=None,input_shape=(300,300,3), include_top=False, pooling='avg')(inputs)y = Dense(12, activation='linear')(x)
model = Model(inputs, y)
训练模型并绘制训练曲线
# Learninghistory = model.fit_generator( generator=train_datagen.make_structural_image(batch_size=batchsize), steps_per_epoch=int(len(SMILES_train) / batchsize / 100), epochs=epochs, verbose=1, validation_data=test_datagen.make_structural_image(batch_size=batchsize), validation_steps=100, callbacks=callbacks)
#Plot training graphplt.figure()plt.plot(history.history['loss'])plt.plot(history.history['val_loss'])plt.title('model loss')plt.xlabel('epoch')plt.ylabel('loss')plt.legend(['loss', 'val_loss'], loc='upper right')plt.ylim([0,0.5])plt.show()
预测测试结果输出
#Output 50 test resultsatom = ['C', 'H', 'N', 'O', 'S', 'P', 'Si', 'Na', 'F', 'Cl', 'Br', 'I']for i in range(50): img = array_to_img(X_test[i]) real = '' pred = '' for j in range(len(atom)): if round(Y_test[i,j]) > 0: real += atom[j] + str(round(Y_test[i,j])) if round(Y_predict_test[i,j],1) > 0.5: pred += atom[j] + str(round(Y_predict_test[i,j],1)) img.save('test_result%04d_%s_%s.png' % (i, real, pred))
参考资料
1. http://www.rdkit.org/docs/GettingStartedInPython.html
2.https://blog.csdn.net/u012325865/article/details/82318084utm_source=blogxgwz3
3. https://blog.csdn.net/u012325865/article/details/103044562
4. https://blog.csdn.net/qq_35082030/article/details/79144374
5. https://blog.csdn.net/qq_19332527/article/details/79829087
作者&编辑
王建民