- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- from statsmodels.graphics.regressionplots import influence_plot
- import statsmodels.formula.api as smf
- import numpy as np
# Read the data
cars = pd.read_csv("Cars.csv")

# First rows, dtypes and non-null counts.
# Bare expressions are discarded when run as a script (unlike a notebook),
# so print the results explicitly; info() prints on its own.
print(cars.head())
cars.info()

# Check for missing values per column
print(cars.isna().sum())

# Correlation matrix between the variables
print(cars.corr())

# Scatterplot between variables along with histograms
# Format the plot background and scatter plots for all the variables
sns.set_style(style='darkgrid')
sns.pairplot(cars)
# Preparing a model: multiple linear regression of MPG on all four predictors.
# (smf is already imported at the top of the file; the duplicate import was removed.)
model = smf.ols('MPG~WT+VOL+SP+HP', data=cars).fit()

# Coefficients
print(model.params)

# t and p-values for each coefficient
print(model.tvalues, '\n', model.pvalues)

# R-squared and adjusted R-squared (bare tuples print nothing in a script)
print((model.rsquared, model.rsquared_adj))
# Simple Linear Regression Models
# Fit single-predictor (and one two-predictor) models to examine how the
# weak variables behave on their own.
ml_v = smf.ols('MPG~VOL', data=cars).fit()
print(ml_v.tvalues, '\n', ml_v.pvalues)

ml_w = smf.ols('MPG~WT', data=cars).fit()
print(ml_w.tvalues, '\n', ml_w.pvalues)

ml_wv = smf.ols('MPG~WT+VOL', data=cars).fit()
print(ml_wv.tvalues, '\n', ml_wv.pvalues)
# Calculating VIF
# A variance inflation factor (VIF) measures multicollinearity:
# VIF_x = 1 / (1 - R^2) of the regression of x on the other predictors.
def _vif(formula):
    """Return the VIF for the left-hand variable of `formula` regressed on the rest."""
    rsq = smf.ols(formula, data=cars).fit().rsquared
    return 1 / (1 - rsq)

vif_hp = _vif('HP~WT+VOL+SP')    # ~16.33
vif_wt = _vif('WT~HP+VOL+SP')    # ~564.98
vif_vol = _vif('VOL~WT+SP+HP')   # ~564.84
vif_sp = _vif('SP~WT+VOL+HP')    # ~16.35

# Storing vif values in a data frame (label fixed: 'HP', not 'Hp', for consistency)
d1 = {'Variables': ['HP', 'WT', 'VOL', 'SP'],
      'VIF': [vif_hp, vif_wt, vif_vol, vif_sp]}
Vif_frame = pd.DataFrame(d1)
print(Vif_frame)
- #Residual Analysis
# Test for Normality of Residuals (Q-Q Plot)
import statsmodels.api as sm

# Q-Q plot of residuals to test normality.
# line='q' fits a line through the data quartiles; use line='45' to draw
# the 45-degree diagonal instead (the old comment wrongly described '45').
qqplot = sm.qqplot(model.resid, line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

# Indices of observations with unusually large residuals (> 10);
# print explicitly since bare expressions are no-ops in a script.
print(list(np.where(model.resid > 10)))
# Residual Plot for Homoscedasticity
def get_standardized_values(vals):
    """Return `vals` centered to zero mean and scaled to unit standard deviation."""
    centered = vals - vals.mean()
    return centered / vals.std()
# Residual plot: standardized residuals against standardized fitted values
# (a random scatter around zero supports homoscedasticity).
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)
plt.scatter(std_fitted, std_resid)
plt.title('Residual Plot')
plt.xlabel('Standardized Fitted values')
plt.ylabel('Standardized residual values')
plt.show()
# Residuals vs each regressor.
# The four per-variable diagnostic figures were copy-pasted; loop over the
# regressors instead so adding/removing a variable is a one-line change.
for regressor in ("VOL", "SP", "HP", "WT"):
    fig = plt.figure(figsize=(15, 8))
    fig = sm.graphics.plot_regress_exog(model, regressor, fig=fig)
    plt.show()
# Model Deletion Diagnostics
# Detecting Influencers/Outliers with Cook's Distance
model_influence = model.get_influence()
(c, _) = model_influence.cooks_distance

# Stem plot of Cook's distance per observation.
# plt.subplots returns a (figure, axes) tuple; unpack it instead of
# binding the tuple to a variable named `fig`.
fig, ax = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(cars)), np.round(c, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')
plt.show()

# Index and value of the largest Cook's distance (influential if > .5)
print((np.argmax(c), np.max(c)))

# High-influence points (influence_plot is already imported at the top of the file)
influence_plot(model)
plt.show()

# Leverage cutoff: 3*(k+1)/n where k is the number of PREDICTORS.
# cars.shape[1] counts the response column MPG as well, so subtract 1.
k = cars.shape[1] - 1
n = cars.shape[0]
leverage_cutoff = 3 * ((k + 1) / n)
print(leverage_cutoff)
# From the above plot, it is evident that data points 70 and 76 are the influencers
# Rows 70 and 76 were flagged as influencers in the influence plot
cars.loc[cars.index.isin([70, 76])]
# See the differences in HP and other variable values
cars.head()
# Improving the model
# Reload the data with the first column used as the row index
cars_new = pd.read_csv("Cars.csv", index_col=0)

# Discard the influential data points and renumber the rows (reset_index())
influencer_rows = cars_new.index[[70, 76]]
car1 = cars_new.drop(influencer_rows, axis=0).reset_index()
car1

# Candidate model 1: exclude "WT"; compare on R-squared and AIC
final_ml_V = smf.ols('MPG~VOL+SP+HP', data=car1).fit()
(final_ml_V.rsquared, final_ml_V.aic)

# Candidate model 2: exclude "VOL"; compare on R-squared and AIC
final_ml_W = smf.ols('MPG~WT+SP+HP', data=car1).fit()
(final_ml_W.rsquared, final_ml_W.aic)
- #Comparing above R-Square and AIC values, model 'final_ml_V' has high R- square and low AIC value hence #include variable 'VOL' so that multi collinearity problem would be resolved.
# Cook's distance diagnostics for the reduced model
diag_V = final_ml_V.get_influence()
c_V, _ = diag_V.cooks_distance

fig = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(car1)), np.round(c_V, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')

# Row index and value of the largest Cook's distance (> .5 flags an influencer)
(np.argmax(c_V), np.max(c_V))
# Drop observations 76 and 77 flagged by Cook's distance on the reduced model
# (row labels shifted after the earlier reset_index, hence 76/77 here).
car2 = car1.drop(car1.index[[76, 77]], axis=0)

# Reset the row numbering in one step: reset_index(drop=True) replaces the
# reset_index() + drop(['index']) two-step and never creates the throwaway
# 'index' column in the first place.
car4 = car2.reset_index(drop=True)
print(car4)
# Build the model on the cleaned data
final_ml_V = smf.ols('MPG~VOL+SP+HP', data=car4).fit()

# Again check for influencers via Cook's distance
model_influence_V = final_ml_V.get_influence()
c_V, _ = model_influence_V.cooks_distance

fig = plt.subplots(figsize=(20, 7))
plt.stem(np.arange(len(car4)), np.round(c_V, 3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')

# Largest Cook's distance and its row index (> .5 would flag an influencer)
(np.argmax(c_V), np.max(c_V))
# Since all Cook's distance values are < 1, stop the diagnostic process
# and finalize the model; check its accuracy.
final_ml_V = smf.ols('MPG~VOL+SP+HP', data=car4).fit()
(final_ml_V.rsquared, final_ml_V.aic)

# Predicting for new data: one new observation
new_data = pd.DataFrame({'HP': [40], 'VOL': [95], 'SP': [102]}, index=[1])
new_data
final_ml_V.predict(new_data)

# In-sample predictions over the whole cleaned data set
pred_y = final_ml_V.predict(car4)
pred_y