- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- from statsmodels.graphics.regressionplots import influence_plot
- import statsmodels.formula.api as smf
- import numpy as np
# Read the data
cars = pd.read_csv("Cars.csv")

# First rows, dtypes and non-null counts.
# Bare expressions are discarded when run as a script (unlike a notebook),
# so print the results explicitly; info() prints on its own.
print(cars.head())
cars.info()

# Check for missing values per column
print(cars.isna().sum())

# Correlation matrix between the variables
print(cars.corr())

# Scatterplot between variables along with histograms
# Format the plot background and scatter plots for all the variables
sns.set_style(style='darkgrid')
sns.pairplot(cars)
# Preparing a model: multiple linear regression of MPG on all four predictors.
# (smf is already imported at the top of the file; the duplicate import was removed.)
model = smf.ols('MPG~WT+VOL+SP+HP', data=cars).fit()

# Coefficients
print(model.params)

# t and p-values for each coefficient
print(model.tvalues, '\n', model.pvalues)

# R-squared and adjusted R-squared (bare tuples print nothing in a script)
print((model.rsquared, model.rsquared_adj))
# Simple Linear Regression Models
# Fit single-predictor (and one two-predictor) models to examine how the
# weak variables behave on their own.
ml_v = smf.ols('MPG~VOL', data=cars).fit()
print(ml_v.tvalues, '\n', ml_v.pvalues)

ml_w = smf.ols('MPG~WT', data=cars).fit()
print(ml_w.tvalues, '\n', ml_w.pvalues)

ml_wv = smf.ols('MPG~WT+VOL', data=cars).fit()
print(ml_wv.tvalues, '\n', ml_wv.pvalues)
# Calculating VIF
# A variance inflation factor (VIF) measures multicollinearity:
# VIF_x = 1 / (1 - R^2) of the regression of x on the other predictors.
def _vif(formula):
    """Return the VIF for the left-hand variable of `formula` regressed on the rest."""
    rsq = smf.ols(formula, data=cars).fit().rsquared
    return 1 / (1 - rsq)

vif_hp = _vif('HP~WT+VOL+SP')    # ~16.33
vif_wt = _vif('WT~HP+VOL+SP')    # ~564.98
vif_vol = _vif('VOL~WT+SP+HP')   # ~564.84
vif_sp = _vif('SP~WT+VOL+HP')    # ~16.35

# Storing vif values in a data frame (label fixed: 'HP', not 'Hp', for consistency)
d1 = {'Variables': ['HP', 'WT', 'VOL', 'SP'],
      'VIF': [vif_hp, vif_wt, vif_vol, vif_sp]}
Vif_frame = pd.DataFrame(d1)
print(Vif_frame)
- #Residual Analysis
# Test for Normality of Residuals (Q-Q Plot)
import statsmodels.api as sm

# Q-Q plot of residuals to test normality.
# line='q' fits a line through the data quartiles; use line='45' to draw
# the 45-degree diagonal instead (the old comment wrongly described '45').
qqplot = sm.qqplot(model.resid, line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

# Indices of observations with unusually large residuals (> 10);
# print explicitly since bare expressions are no-ops in a script.
print(list(np.where(model.resid > 10)))
# Residual Plot for Homoscedasticity
def get_standardized_values(vals):
    """Return `vals` centered to zero mean and scaled to unit standard deviation."""
    centered = vals - vals.mean()
    return centered / vals.std()
# Residual plot: standardized residuals against standardized fitted values
# (a random scatter around zero supports homoscedasticity).
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)
plt.scatter(std_fitted, std_resid)
plt.title('Residual Plot')
plt.xlabel('Standardized Fitted values')
plt.ylabel('Standardized residual values')
plt.show()
# Residuals vs each regressor.
# The four per-variable diagnostic figures were copy-pasted; loop over the
# regressors instead so adding/removing a variable is a one-line change.
for regressor in ("VOL", "SP", "HP", "WT"):
    fig = plt.figure(figsize=(15, 8))
    fig = sm.graphics.plot_regress_exog(model, regressor, fig=fig)
    plt.show()
# Model Deletion Diagnostics
# Detecting Influencers/Outliers with Cook's Distance
model_influence = model.get_influence()
(c, _) = model_influence.cooks_distance

# Stem plot of Cook's distance per observation.
# plt.subplots returns a (figure, axes) tuple; unpack it instead of
# binding the tuple to a variable named `fig`.
fig, ax = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(cars)), np.round(c, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')
plt.show()

# Index and value of the largest Cook's distance (influential if > .5)
print((np.argmax(c), np.max(c)))

# High-influence points (influence_plot is already imported at the top of the file)
influence_plot(model)
plt.show()

# Leverage cutoff: 3*(k+1)/n where k is the number of PREDICTORS.
# cars.shape[1] counts the response column MPG as well, so subtract 1.
k = cars.shape[1] - 1
n = cars.shape[0]
leverage_cutoff = 3 * ((k + 1) / n)
print(leverage_cutoff)
# From the above plot, it is evident that data points 70 and 76 are the influencers
# Rows 70 and 76 were flagged as influencers in the influence plot
cars.loc[cars.index.isin([70, 76])]
# See the differences in HP and other variable values
cars.head()
# Improving the model
# Reload the data with the first column used as the row index
cars_new = pd.read_csv("Cars.csv", index_col=0)

# Discard the influential data points and renumber the rows (reset_index())
influencer_rows = cars_new.index[[70, 76]]
car1 = cars_new.drop(influencer_rows, axis=0).reset_index()
car1

# Candidate model 1: exclude "WT"; compare on R-squared and AIC
final_ml_V = smf.ols('MPG~VOL+SP+HP', data=car1).fit()
(final_ml_V.rsquared, final_ml_V.aic)

# Candidate model 2: exclude "VOL"; compare on R-squared and AIC
final_ml_W = smf.ols('MPG~WT+SP+HP', data=car1).fit()
(final_ml_W.rsquared, final_ml_W.aic)
- #Comparing above R-Square and AIC values, model 'final_ml_V' has high R- square and low AIC value hence #include variable 'VOL' so that multi collinearity problem would be resolved.
# Cook's distance diagnostics for the reduced model
diag_V = final_ml_V.get_influence()
c_V, _ = diag_V.cooks_distance

fig = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(car1)), np.round(c_V, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')

# Row index and value of the largest Cook's distance (> .5 flags an influencer)
(np.argmax(c_V), np.max(c_V))
# Drop observations 76 and 77 flagged by Cook's distance on the reduced model
# (row labels shifted after the earlier reset_index, hence 76/77 here).
car2 = car1.drop(car1.index[[76, 77]], axis=0)

# Reset the row numbering in one step: reset_index(drop=True) replaces the
# reset_index() + drop(['index']) two-step and never creates the throwaway
# 'index' column in the first place.
car4 = car2.reset_index(drop=True)
print(car4)
# Build the model on the cleaned data
final_ml_V = smf.ols('MPG~VOL+SP+HP', data=car4).fit()

# Again check for influencers via Cook's distance
model_influence_V = final_ml_V.get_influence()
c_V, _ = model_influence_V.cooks_distance

fig = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(car4)), np.round(c_V, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')

# Largest Cook's distance and its row index (> .5 would flag an influencer)
(np.argmax(c_V), np.max(c_V))
# Since all Cook's distance values are < 1, stop the diagnostic process
# and finalize the model; check its accuracy.
final_ml_V = smf.ols('MPG~VOL+SP+HP', data=car4).fit()
(final_ml_V.rsquared, final_ml_V.aic)

# Predicting for new data: one new observation
new_data = pd.DataFrame({'HP': [40], 'VOL': [95], 'SP': [102]}, index=[1])
new_data
final_ml_V.predict(new_data)

# In-sample predictions over the whole cleaned data set
pred_y = final_ml_V.predict(car4)
pred_y