# RandomForest.py

# -*- coding: utf-8 -*-
"""Copy of Random Forest model new.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1P156y9bfDbZ8SUsd_wPqLoA7ORZWYY3Y
"""

# Commented out IPython magic to ensure Python compatibility.
# % cd /content/drive/MyDrive/Colab_notebooks/

"""##Application of z-score in Machine learning
To standardize the data as a part of data pre-processing.
To compare the z-score values of different standard distributions for better results.

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
from scipy import stats

# Default figure styling for the plots drawn in this section.
rcParams['figure.figsize'] = (16, 6)
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False

# Activity table; the code below reads its 'Name' and 'pIC50' columns.
df_y = pd.read_csv('cleaned_df.csv')

# Unfiltered target values. Y is reassigned from the filtered table before
# any model is trained.
Y = df_y['pIC50']

# pIC50 distribution before outlier removal. Each histogram gets its own
# figure and an explicit show(); outside a notebook the two histograms (and
# any later plot) would otherwise be drawn on top of each other.
plt.figure()
plt.hist(df_y['pIC50'], bins=100)
plt.show()

# Keep only rows whose pIC50 lies within 3 standard deviations of the mean.
outlier_score = np.abs(stats.zscore(df_y['pIC50']))
df_y = df_y[outlier_score <= 3]

# pIC50 distribution after outlier removal.
plt.figure()
plt.hist(df_y['pIC50'], bins=100)
plt.show()

# Renumber the surviving rows 0..n-1 without keeping the old index.
df_y = df_y.reset_index(drop=True)

# Names of the retained molecules; used to filter the descriptor tables.
df_y1 = df_y[['Name']]

import pandas as pd
import seaborn as sns
import numpy as np
from numpy import sqrt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import BayesianRidge
from sklearn import metrics

"""##Fingerprint

R² score tells us how well our model is fitted to the data by comparing it to the average line of the dependent variable. If the score is closer to 1, then it indicates that our model performs well versus if the score is farther from 1, then it indicates that our model does not perform so well. R-squared (R2) is a statistical measure that represents the proportion of the variance for a dependent variable that’s explained by an independent variable or variables in a regression model. Whereas correlation explains the strength of the relationship between an independent and dependent variable, R-squared explains to what extent the variance of one variable explains the variance of the second variable. So, if the R2 of a model is 0.50, then approximately half of the observed variation can be explained by the model’s inputs.
"""

# --- Fingerprint features -------------------------------------------------
df = pd.read_csv('Fingerprints.csv')

# Join the target onto the features by Name so every feature row is paired
# with its own pIC50. Taking df_y.pIC50 by position is only correct when both
# tables list exactly the same molecules in exactly the same order.
fp_labelled = pd.merge(df, df_y[['Name', 'pIC50']], on='Name')

# df keeps its original meaning: fingerprint columns plus Name, restricted
# to the molecules that survived outlier filtering.
df = fp_labelled.drop(columns='pIC50')

X = df.drop(columns='Name')
Y = fp_labelled['pIC50']

# Splitting dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Creating and training model
np.random.seed(100)
model = RandomForestRegressor(n_estimators=100, max_depth=6,
                              min_samples_leaf=6, min_samples_split=2,
                              random_state=42)
model.fit(X_train, Y_train)

# Model making a prediction on test data
prediction = model.predict(X_test)

# Evaluation of r2 score of the model against the test set
print(f"r2 Score Of Test Set : {r2_score(Y_test, prediction)}")

# Y_pred is the name the plotting code uses for the same test-set
# predictions; reuse them instead of running predict() a second time.
Y_pred = prediction

"""In statistical modeling and particularly regression analyses, a common way of measuring the quality of the fit of the model is the RMSE (also called Root Mean Square Deviation), given by where yi is the ith observation of y and ŷ the predicted y value given the model. If the predicted responses are very close to the true responses the RMSE will be small. If the predicted and true responses differ substantially — at least for some observations — the RMSE will be large. A value of zero would indicate a perfect fit to the data. Since the RMSE is measured on the same scale, with the same units as y, one can expect 68% of the y values to be within 1 RMSE — given the data is normally distributed. 

The mean absolute error (MAE) is a quantity used to measure how close predictions are to the outcomes. The mean absolute error is an average of the all absolute errors. The MSE is a measure of the quality of an estimator, it is always positive, and values which are closer to zero are better. https://gzipwtf.com/what-is-mean-absolute-error-in-python/
"""

# Mean squared error of the fingerprint model on the test split, and its
# square root (RMSE), which is in the same units as pIC50.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(Y_test, prediction)
rmse = sqrt(mse)

# A bare `mse` expression only displays a value inside a notebook cell;
# print both metrics so they are reported when run as a script.
print(f"MSE Of Test Set : {mse}")
print(f"RMSE Of Test Set : {rmse}")

"""Differences among these evaluation metrics Mean Squared Error(MSE) and Root Mean Square Error penalizes the large prediction errors vi-a-vis Mean Absolute Error (MAE). MAE is more robust to data with outliers. The lower value of MAE, MSE, and RMSE implies higher accuracy of a regression model.In statistics, mean absolute error (MAE) is a measure of errors between paired observations expressing the same phenomenon. Examples of Y versus X include comparisons of predicted versus observed, subsequent time versus initial time, and one technique of measurement versus an alternative technique of measurement. https://machinelearningmastery.com/regression-metrics-for-machine-learning/"""

# Mean absolute error of the fingerprint model on the test split.
from sklearn.metrics import mean_absolute_error
errors = mean_absolute_error(Y_test, prediction)
print(errors)

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line.
# - A dedicated 5x5 inch figure is opened so the plot is not drawn onto
#   whatever axes happen to be current.
# - x and y are passed by keyword: seaborn 0.12+ no longer accepts them
#   positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='Fingerprints')
ax.set_xlim(3, 10)
ax.set_ylim(4, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y_test))
plt.scatter(x_ax, Y_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction, lw=0.8, color="red", label="predicted")
plt.title("Fingerprint features")
plt.ylabel("pIC50")
plt.legend()
plt.show()

"""Might be possible that there is a problem with pIC50 score. However the test score might contain values that are lower than in the training. Uneven distribution of activity. Toght binding limit. Membrane bound receptor. Biological measures become challenging due to an assumption that the concentration of molecules are higher than the enzyme.

##2D
"""

# --- 2D descriptors -------------------------------------------------------
df1 = pd.read_csv('2D.csv')

# Join the target onto the descriptors by Name so rows and targets cannot
# drift apart when 2D.csv lists molecules in a different order (or lacks
# some of them) compared with the activity table.
desc2d_labelled = pd.merge(df1, df_y[['Name', 'pIC50']], on='Name')

# df1 keeps its original meaning: descriptor columns plus Name, for the
# retained molecules only, with every column containing a NaN removed.
df1 = desc2d_labelled.drop(columns='pIC50').dropna(axis='columns')

X1 = df1.drop(columns='Name')
Y1 = desc2d_labelled['pIC50']

# Remove low variance features. 0.8 * (1 - 0.8) = 0.16 is the variance of a
# Bernoulli variable with p = 0.8.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X1 = selection.fit_transform(X1)

# Splitting dataset into training and testing sets
X1_train, X1_test, Y1_train, Y1_test = train_test_split(
    X1, Y1, test_size=0.2, random_state=42)

# Creating and training model. No random_state is passed here, so the forest
# draws from NumPy's global generator, which is seeded on the line above it.
np.random.seed(100)
model1 = RandomForestRegressor(n_estimators=100)
model1.fit(X1_train, Y1_train)

# Model making a prediction on test data
prediction1 = model1.predict(X1_test)

# Evaluation of r2 score of the model against the test set
print(f"r2 Score Of Test Set : {r2_score(Y1_test, prediction1)}")

"""Overtraining of the model when there is a big change in the r2 score between the test and training set. Overfitting of data - diagnosis of the problem. """

# Evaluation of the 2D-descriptor model. Every metric and plot below uses
# Y1_test, the targets produced by the same split as X1_test / prediction1,
# rather than Y_test from the fingerprint section.
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mse = mean_squared_error(Y1_test, prediction1)
# A bare `mse` expression shows nothing outside a notebook; print it.
print(f"MSE Of Test Set : {mse}")

# Mean absolute error on the test split.
errors = mean_absolute_error(Y1_test, prediction1)
print(errors)

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line, on its
# own 5x5 inch figure. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y1_test, y=prediction1, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='1D+2D')
ax.set_xlim(3, 10)
ax.set_ylim(4, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y1_test))
plt.scatter(x_ax, Y1_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction1, lw=0.8, color="red", label="predicted")
plt.title("2D")
plt.ylabel("pIC50")
plt.legend()
plt.show()

"""##3D"""

# --- 3D descriptors -------------------------------------------------------
df2 = pd.read_csv('3D.csv')

# Join the target onto the descriptors by Name so each row is paired with
# its own pIC50 regardless of the row order (or coverage) of 3D.csv.
desc3d_labelled = pd.merge(df2, df_y[['Name', 'pIC50']], on='Name')

# df2 keeps its original meaning: descriptor columns plus Name, restricted
# to the retained molecules.
df2 = desc3d_labelled.drop(columns='pIC50')

X2 = df2.drop(columns='Name')
Y2 = desc3d_labelled['pIC50']

# Shuffle features and targets together. A fixed random_state makes the
# permutation, and therefore the reported scores, reproducible between runs.
from sklearn.utils import shuffle
import numpy as np
X2, Y2 = shuffle(X2, Y2, random_state=42)

# Remove low variance features. 0.8 * (1 - 0.8) = 0.16 is the variance of a
# Bernoulli variable with p = 0.8.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X2 = selection.fit_transform(X2)

# Splitting dataset into training and testing sets
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X2, Y2, test_size=0.2, random_state=42)

# Creating and training model
model2 = RandomForestRegressor(n_estimators=100, random_state=0)
model2.fit(X2_train, Y2_train)

# Model making a prediction on test data
prediction2 = model2.predict(X2_test)

# Evaluation of r2 score of the model against the test set (reported here
# as in the other feature-set sections).
print(f"r2 Score Of Test Set : {r2_score(Y2_test, prediction2)}")

# Evaluation of the 3D-descriptor model. X2 / Y2 were shuffled before the
# split, so only Y2_test is in the same row order as prediction2; Y_test
# from the fingerprint section is not.
from sklearn.metrics import mean_absolute_error
errors = mean_absolute_error(Y2_test, prediction2)
print(errors)

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line, on its
# own 5x5 inch figure. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y2_test, y=prediction2, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='3D')
ax.set_xlim(3, 10)
ax.set_ylim(4, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y2_test))
plt.scatter(x_ax, Y2_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction2, lw=0.8, color="red", label="predicted")
plt.title("3D")
plt.ylabel("pIC50")
plt.legend()
plt.show()


"""##2D+3D"""

# --- 2D + 3D descriptors --------------------------------------------------
# Left join: every 2D row is kept; molecules without a 3D row get NaN in the
# 3D columns.
df3 = pd.merge(df1, df2, how="left", on=["Name"])

# Join the target by Name so each combined row is paired with its own pIC50
# instead of relying on df_y's row order.
desc2d3d_labelled = pd.merge(df3, df_y[['Name', 'pIC50']], on='Name')

# df3 keeps its original meaning (descriptor columns plus Name) and is saved
# before any NaN-containing column is removed.
df3 = desc2d3d_labelled.drop(columns='pIC50')
df3.to_csv('2D+3D.csv', index=False)

Y3 = desc2d3d_labelled['pIC50']

# NOTE(review): dropping every column that contains a NaN discards a 3D
# descriptor entirely as soon as one molecule lacks it; confirm this is the
# intended handling of the left join above.
df3 = df3.dropna(axis='columns')

X3 = df3.drop(columns='Name')

# Remove low variance features. 0.8 * (1 - 0.8) = 0.16 is the variance of a
# Bernoulli variable with p = 0.8.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X3 = selection.fit_transform(X3)

# Splitting dataset into training and testing sets
X3_train, X3_test, Y3_train, Y3_test = train_test_split(
    X3, Y3, test_size=0.2, random_state=42)

# Creating and training model
model3 = RandomForestRegressor(n_estimators=100, random_state=0)
model3.fit(X3_train, Y3_train)

# Model making a prediction on test data
prediction3 = model3.predict(X3_test)

# Evaluation of r2 score of the model against the test set
print(f"r2 Score Of Test Set : {r2_score(Y3_test, prediction3)}")

# Evaluation of the 2D+3D model. Metrics and plots all use Y3_test, the
# targets produced by the same split as X3_test / prediction3, rather than
# Y_test from another section.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mse = mean_squared_error(Y3_test, prediction3)
# A bare `mse` expression shows nothing outside a notebook; print it.
print(f"MSE Of Test Set : {mse}")

# Mean absolute error on the test split.
errors = mean_absolute_error(Y3_test, prediction3)
print(errors)

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line, on its
# own 5x5 inch figure. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y3_test, y=prediction3, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='1D+2D+3D')
ax.set_xlim(3, 10)
ax.set_ylim(4, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y3_test))
plt.scatter(x_ax, Y3_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction3, lw=0.8, color="red", label="predicted")
plt.title("1D+2D+3D")
plt.ylabel("pIC50")
plt.legend()
plt.show()

"""##2D+3D+F"""

# --- Fingerprints + 2D + 3D descriptors -----------------------------------
# Left join: every fingerprint row is kept and the descriptor columns are
# attached where a matching Name exists. df4 (features plus Name) is reused
# by the USR section.
df4 = pd.merge(df, df3, how="left", on=["Name"])

# Join the target by Name so each combined row is paired with its own pIC50
# instead of relying on the row order of the global Y.
combined_labelled = pd.merge(df4, df_y[['Name', 'pIC50']], on='Name')
X4 = combined_labelled.drop(columns=['Name', 'pIC50'])
Y4 = combined_labelled['pIC50']

# Remove low variance features. 0.8 * (1 - 0.8) = 0.16 is the variance of a
# Bernoulli variable with p = 0.8.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X4 = selection.fit_transform(X4)

# Splitting dataset into training and testing sets. This rebinds Y_train and
# Y_test, which the evaluation code for this model reads.
X4_train, X4_test, Y_train, Y_test = train_test_split(
    X4, Y4, test_size=0.2, random_state=42)

# Creating and training model
model4 = RandomForestRegressor(n_estimators=100, random_state=0)
model4.fit(X4_train, Y_train)

# Model making a prediction on test data
prediction4 = model4.predict(X4_test)

# Evaluation of r2 score of the model against the test set
print(f"r2 Score Of Test Set : {r2_score(Y_test, prediction4)}")

# Evaluation of the fingerprint + 2D + 3D model. Y_test here is the split
# produced together with X4_test / prediction4.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mse = mean_squared_error(Y_test, prediction4)
# A bare `mse` expression shows nothing outside a notebook; print it.
print(f"MSE Of Test Set : {mse}")

# Mean absolute error on the test split.
errors = mean_absolute_error(Y_test, prediction4)
print(errors)

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y_test))
plt.scatter(x_ax, Y_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction4, lw=0.8, color="red", label="predicted")
plt.title("1D+2D+3D+CF")
plt.ylabel("pIC50")
plt.legend()
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line, on its
# own 5x5 inch figure. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y_test, y=prediction4, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='1D+2D+3D+CF')
ax.set_xlim(3, 10)
ax.set_ylim(3, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

"""##2D+3D+F+USR"""

# --- Fingerprints + 2D + 3D + USR descriptors -----------------------------
df5 = pd.read_csv('USR_df.csv')

# Keep only molecules present in both the USR table and the combined table.
df5 = pd.merge(df5, df4, on=["Name"])

# Name/target lookup table.
df_y2 = df_y[['Name', 'pIC50']]

# Attach the target once and derive both X5 and Y5 from this single frame.
# Building them through separate chained merges on Name can duplicate rows
# when a Name repeats, leaving features and targets out of step.
df5 = pd.merge(df5, df_y2, on=["Name"])

X5 = df5.drop(columns=['Name', 'pIC50'])
Y5 = df5['pIC50']

# Remove low variance features. 0.8 * (1 - 0.8) = 0.16 is the variance of a
# Bernoulli variable with p = 0.8.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X5 = selection.fit_transform(X5)

# Splitting dataset into training and testing sets
X5_train, X5_test, Y5_train, Y5_test = train_test_split(
    X5, Y5, test_size=0.2, random_state=42)

# Creating and training model. `fit` holds the value returned by
# model5.fit(), as in the original code.
model5 = RandomForestRegressor(n_estimators=100, random_state=0)
fit = model5.fit(X5_train, Y5_train)

# Model making a prediction on test data
prediction5 = model5.predict(X5_test)

# Evaluation of r2 score of the model against the test set
print(f"r2 Score Of Test Set : {r2_score(Y5_test, prediction5)}")

# Save the full table: Name, all feature columns and pIC50.
df5.to_csv('Descriptors.csv', index=False)

from sklearn import model_selection
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import BaggingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn import svm

# Import libraries
import matplotlib.pyplot as plt
import numpy as np
 
 
# Box plot comparing hard-coded R2 scores of five models.
# NOTE(review): the abbreviations presumably stand for decision-tree
# regressor, random forest, ridge, LassoCV and Bayesian ridge; confirm, and
# record where the numbers come from.
np.random.seed(10)  # kept from the original; nothing in this block is random

data_DTR = 0.55, 0.32, 0.41, 0.3, 0.3, 0.46, 0.55
data_RFM = 0.66, 0.64, 0.61, 0.66, 0.62, 0.68, 0.7
data_R = 0.8, 0.3, 0.2, 0.4, 0.7, 0.56, 0.72
data_LCV = 0.7, 0.3, 0.225, 0.111, 0.1, 0.223, 0.44
data_BR = 0.67, 0.22, 0.45, 0.4, 0.5, 0.48, 0.66

data = [data_DTR, data_RFM, data_R, data_LCV, data_BR]

fig = plt.figure(figsize=(10, 7))
fig.suptitle('Comparison of models', fontsize=14, fontweight='bold')

# Create the axes on this figure before plotting. Calling ax.boxplot()
# first would draw onto whichever axes `ax` still referred to from an
# earlier plot, and the data would then be plotted a second time here.
ax = fig.add_subplot(111)
bp = ax.boxplot(data)

ax.set_ylabel('R2')
plt.xticks([1, 2, 3, 4, 5], ['DTR', 'RF', 'R', 'LCV', 'BR'])
plt.show()

"""Mse for above 7 0.35, Mae is 0.47 and R2 id 0.20"""

# Evaluation of the fingerprint + 2D + 3D + USR model on its own test split.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(Y5_test, prediction5)
# A bare `mse` expression shows nothing outside a notebook; print it.
print(f"MSE Of Test Set : {mse}")

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line, on its
# own 5x5 inch figure. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y5_test, y=prediction5, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='1D+2D+3D+F+USR')
ax.set_xlim(3, 10)
ax.set_ylim(3, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y5_test))
plt.scatter(x_ax, Y5_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction5, lw=0.8, color="red", label="predicted")
plt.title("1D+2D+3D+CF+USR")
plt.ylabel("pIC50")
plt.legend()
plt.show()

"""The positive improvement of the higher ones has affected the predictions of the lower one. Calculate the mse and mae for above 7 and below 7 for test set. Compare 2D+USR instead of 3D+2D. Some smiles that the descriptors and the 3D can not calculate.

## 2D+ USR
"""

# --- 2D + USR descriptors -------------------------------------------------
# Reload the raw USR table (df5 was extended with other columns earlier).
df5 = pd.read_csv('USR_df.csv')

# Molecules that have both USR and 2D descriptors.
df6 = pd.merge(df5, df1, on=["Name"])

# Attach the target to exactly the rows of df6 and derive X6 and Y6 from the
# same frame. Building Y6 from the USR names alone can give a different row
# set than X6 whenever a molecule has USR descriptors but no 2D descriptors.
usr2d_labelled = pd.merge(df6, df_y[['Name', 'pIC50']], on='Name')

X6 = usr2d_labelled.drop(columns=['Name', 'pIC50'])
Y6 = usr2d_labelled['pIC50']

# Remove low variance features. 0.8 * (1 - 0.8) = 0.16 is the variance of a
# Bernoulli variable with p = 0.8.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X6 = selection.fit_transform(X6)

# Splitting dataset into training and testing sets
X6_train, X6_test, Y6_train, Y6_test = train_test_split(
    X6, Y6, test_size=0.2, random_state=42)

# Creating and training model
model6 = RandomForestRegressor(n_estimators=100, random_state=0)
model6.fit(X6_train, Y6_train)

# Model making a prediction on test data
prediction6 = model6.predict(X6_test)

# Evaluation of r2 score of the model against the test set
print(f"r2 Score Of Test Set : {r2_score(Y6_test, prediction6)}")

# Evaluation of the 2D + USR model on its own test split.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(Y6_test, prediction6)
# A bare `mse` expression shows nothing outside a notebook; print it.
print(f"MSE Of Test Set : {mse}")

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Experimental vs. predicted scatter with a fitted regression line, on its
# own 5x5 inch figure. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally.
plt.figure(figsize=(5, 5))
ax = sns.regplot(x=Y6_test, y=prediction6, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(title='1D+2D+USR')
ax.set_xlim(3, 10)
ax.set_ylim(3, 10)
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Test-set values (points) and predictions (line), in test-set order.
plt.figure()
x_ax = range(len(Y6_test))
plt.scatter(x_ax, Y6_test, s=5, color="blue", label="original")
plt.plot(x_ax, prediction6, lw=0.8, color="red", label="predicted")
plt.title("1D+2D+USR")
plt.ylabel("pIC50")
plt.legend()
plt.show()