Dr. Pavel Polishchuk
Institute of Molecular and Translational Medicine, Faculty of Medicine and Dentistry, Palacký University and University Hospital in Olomouc, Hněvotínská 1333/5, 779 00 Olomouc, Czech Republic
http://imtm.cz
http://qsar4u.com
pavel_polishchuk@ukr.net
pavlo.polishchuk@upol.cz
# RDKit: molecule I/O, fingerprints and physicochemical descriptors
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
# from rdkit.Chem.Draw import IPythonConsole
# from rdkit.Chem import Draw
# IPythonConsole.ipython_useSVG=True
import numpy as np
# scikit-learn: models, model selection utilities and metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
# NOTE: sklearn.externals.joblib was deprecated in 0.21 and removed in 0.23;
# the standalone joblib package provides the same dump/load API.
import joblib
# Load molecules and their binary logBB class labels from the SDF file.
fname = "data/logBB.sdf"
mols = []
y = []
for mol in Chem.SDMolSupplier(fname):
    if mol is not None:  # skip records RDKit failed to parse
        mols.append(mol)
        y.append(mol.GetIntProp("logBB_class"))

# generate binary Morgan fingerprints with radius 2 (2048 bits by default)
fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
def rdkit_numpy_convert(fp):
    """Convert a list of RDKit bit vectors into a 2D numpy array.

    ConvertToNumpyArray resizes the target array to the fingerprint
    length, so the initial (1,) shape of ``arr`` is just a placeholder.
    Returns an array of shape (n_molecules, n_bits).
    """
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)
x = rdkit_numpy_convert(fp)
x.shape
# check whether the data set is balanced (fraction of the positive class)
sum(y) / len(y)
seed = 42
# randomly select 20% of compounds as the test set
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.20, random_state=seed)
# shuffle=True is required when random_state is set (scikit-learn >= 0.22
# raises a ValueError otherwise)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# print out ids of folds
for i, (train_index, test_index) in enumerate(cv.split(x_tr, y_tr)):
    print("\nFold_" + str(i+1))
    print("TRAIN:", train_index)
    print("TEST:", test_index)
This step may be crucial for certain modeling approaches like SVM. In the case of binary fingerprints it may be less useful.
# Fit a scaler on the training set; the fitted object can later be used
# to put any data onto the training-set scale.
scale = StandardScaler()
x_tr = scale.fit_transform(x_tr)
# it is a good idea to save the fitted scaler for future use
joblib.dump(scale, "data/logBB_scale.pkl", compress=3)
# hyperparameter grid for the random forest
n_feat = x_tr.shape[1]
param_grid = {
    "max_features": [n_feat // 10, n_feat // 7, n_feat // 5, n_feat // 3],
    "n_estimators": [100, 250, 500],
}
# set up the cross-validated grid search
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)
# fit: runs the full grid search and refits the best model
m.fit(x_tr, y_tr)
m.best_params_
m.best_score_
m.cv_results_
m.cv_results_['mean_test_score']
m.cv_results_['params']
# persist the fitted search object (includes the best estimator)
joblib.dump(m, "data/logBB_rf_morgan.pkl", compress=3)
# reload the fitted scaler if needed
scale = joblib.load("data/logBB_scale.pkl")
# put the test-set descriptors onto the training-set scale
scaled_ts = scale.transform(x_ts)
x_ts = scaled_ts
# predict logBB class for the test set
pred_rf = m.predict(x_ts)
pred_rf
accuracy_score(y_ts, pred_rf)
matthews_corrcoef(y_ts, pred_rf)
cohen_kappa_score(y_ts, pred_rf)
# For ensembles or probabilistic models the agreement of the individual
# predictions can be used to estimate the applicability domain.
pred_prob = m.predict_proba(x_ts)
# class probabilities
pred_prob
# setup threshold
threshold = 0.8
# a compound is inside the domain when its top class probability
# exceeds the threshold
da = pred_prob.max(axis=1) > threshold
da
# statistics restricted to in-domain compounds
y_ts_arr = np.asarray(y_ts)
accuracy_score(y_ts_arr[da], pred_rf[da])
matthews_corrcoef(y_ts_arr[da], pred_rf[da])
cohen_kappa_score(y_ts_arr[da], pred_rf[da])
# coverage: fraction of test compounds inside the domain
sum(da) / len(da)
# hyperparameter grid for the RBF-kernel SVM
param_grid = {
    "C": [10 ** i for i in range(5)],
    "gamma": [10 ** i for i in range(-6, 0)],
}
# probability=True is needed for predict_proba (applicability domain below)
svm = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, n_jobs=2, cv=cv, verbose=1)
# run the grid search
svm.fit(x_tr, y_tr)
svm.best_score_
svm.best_params_
# save model
joblib.dump(svm, "data/logBB_svm_morgan.pkl", compress=3)
# predict logBB class for the test set compounds
pred_svm = svm.predict(x_ts)
pred_svm
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred_svm))
print("MCC = ", matthews_corrcoef(y_ts, pred_svm))
print("Kappa = ", cohen_kappa_score(y_ts, pred_svm))
# estimate applicability domain and calc stat
pred_prob = svm.predict_proba(x_ts)
da = np.amax(pred_prob, axis=1) > threshold
y_true = np.asarray(y_ts)
print("Accuracy = ", accuracy_score(y_true[da], pred_svm[da]))
print("MCC = ", matthews_corrcoef(y_true[da], pred_svm[da]))
print("Kappa = ", cohen_kappa_score(y_true[da], pred_svm[da]))
print("Coverage = ", sum(da) / len(da))
# gradient boosting: tune only the number of boosting stages
param_grid = {"n_estimators": [100, 200, 300, 400, 500]}
gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5),
                   param_grid, n_jobs=2, cv=cv, verbose=1)
# run model building
gbm.fit(x_tr, y_tr)
gbm.best_score_
gbm.best_params_
pred_gbm = gbm.predict(x_ts)
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred_gbm))
print("MCC = ", matthews_corrcoef(y_ts, pred_gbm))
print("Kappa = ", cohen_kappa_score(y_ts, pred_gbm))
# consensus prediction: majority vote of the three classifiers
# (mean of the 0/1 votes is >= 0.5 when at least two models predict 1)
pred_c = ((pred_rf + pred_svm + pred_gbm) / 3 >= 0.5).astype(int)
pred_c
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred_c))
print("MCC = ", matthews_corrcoef(y_ts, pred_c))
print("Kappa = ", cohen_kappa_score(y_ts, pred_c))
# calc a small set of physicochemical descriptors for each molecule;
# loop variable is "mol", not "m", so the fitted model "m" is not shadowed
descr = []
for mol in mols:
    descr.append([Descriptors.MolLogP(mol),
                  Descriptors.TPSA(mol),
                  Descriptors.NHOHCount(mol),
                  Descriptors.NOCount(mol),
                  Descriptors.NumHAcceptors(mol),
                  Descriptors.NumHDonors(mol),
                  Descriptors.NumRotatableBonds(mol),
                  Descriptors.NumHeteroatoms(mol),
                  Descriptors.FractionCSP3(mol)])
descr = np.asarray(descr)
descr.shape
# append the descriptor columns to the Morgan fingerprint matrix
x = np.concatenate((x, descr), axis=1)
x.shape
# randomly select 20% of compounds as the test set (same seed as before,
# so the split is reproducible)
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.20, random_state=seed)
scale = StandardScaler().fit(x_tr)
x_tr = scale.transform(x_tr)
# hyperparameter grid for the random forest on the extended feature set
nvar = x_tr.shape[1]
param_grid = {"max_features": [nvar // 10, nvar // 7, nvar // 5, nvar // 3],
              "n_estimators": [100, 250, 500]}
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)
# run model building
m.fit(x_tr, y_tr)
m.best_score_
# scale descriptors of the test set compounds
x_ts = scale.transform(x_ts)
# predict logBB class for the test set compounds
pred = m.predict(x_ts)
pred
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred))
print("MCC = ", matthews_corrcoef(y_ts, pred))
print("Kappa = ", cohen_kappa_score(y_ts, pred))
# estimate applicability domain and calc stat
pred_prob = m.predict_proba(x_ts)
da = pred_prob.max(axis=1) > threshold
labels = np.asarray(y_ts)
print("Accuracy = ", accuracy_score(labels[da], pred[da]))
print("MCC = ", matthews_corrcoef(labels[da], pred[da]))
print("Kappa = ", cohen_kappa_score(labels[da], pred[da]))
print("Coverage = ", sum(da) / len(da))
The model has a better accuracy. The added descriptors improved the predictivity of the model.
# rebuild the RF model manually with the best found parameters so that
# extra information (e.g. feature importances) can be extracted from it
rf = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"],
                            max_features=m.best_params_["max_features"],
                            random_state=seed)
rf.fit(x_tr, y_tr)
imp = rf.feature_importances_
imp
# rank features by importance, descending
indices = np.argsort(imp)[::-1]
print("Feature ranking:")
# print top 10 features
for i in range(10):
    print("%d. feature %d (%f)" % (i + 1, indices[i], imp[indices[i]]))
The printed feature numbers are 0-based numpy indices, so the appended descriptors map as:
2048 - MolLogP
2049 - TPSA
2050 - NHOHCount
2051 - NOCount
2052 - NumHAcceptors
2053 - NumHDonors
2054 - NumRotatableBonds
2055 - NumHeteroatoms
2056 - FractionCSP3
features with numbers 0-2047 are individual Morgan fingerprint bits