The text below is selected — press Ctrl+C (⌘+C on Mac) to copy it to your clipboard. Line numbers will not be copied.
Guest
Sdfgg
By Guest on 1st February 2024 05:27:18 AM | Syntax: PYTHON | Views: 104



New paste | Download | Toggle line numbers | Copy text to clipboard
  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. import seaborn as sns
  4. from statsmodels.graphics.regressionplots import influence_plot
  5. import statsmodels.formula.api as smf
  6. import numpy as np
  7. #Read the data
  8. cars = pd.read_csv("Cars.csv")
  9. cars.head()
  10. cars.info()
  11. #check for missing values
  12. cars.isna().sum()
  13.  
  14. #Correlation Matrix
  15. cars.corr()
  16.  
  17. #Scatterplot between variables along with histograms
  18. #Format the plot background and scatter plots for all the variables
  19. sns.set_style(style='darkgrid')
  20. sns.pairplot(cars)
  21.  
  22. #Preparing a model
  23. #Build model
  24. import statsmodels.formula.api as smf
  25. model = smf.ols('MPG~WT+VOL+SP+HP',data=cars).fit()
  26. #Coefficients
  27. model.params
  28. #t and p-Values
  29. print(model.tvalues, '\n', model.pvalues)
  30. #R squared values
  31. (model.rsquared,model.rsquared_adj)
  32. Simple Linear Regression Models
  33. ml_v=smf.ols('MPG~VOL',data = cars).fit()  
  34. #t and p-Values
  35. print(ml_v.tvalues, '\n', ml_v.pvalues)  
  36. ml_w=smf.ols('MPG~WT',data = cars).fit()  
  37. print(ml_w.tvalues, '\n', ml_w.pvalues)  
  38. ml_wv=smf.ols('MPG~WT+VOL',data = cars).fit()  
  39. print(ml_wv.tvalues, '\n', ml_wv.pvalues)
  40.  
  41. #Calculating VIF
  42. #A variance inflation factor (VIF) is a measure of the amount of multicollinearity in regression analysis.
  43. rsq_hp = smf.ols('HP~WT+VOL+SP',data=cars).fit().rsquared  
  44. vif_hp = 1/(1-rsq_hp) # 16.33
  45.  
  46. rsq_wt = smf.ols('WT~HP+VOL+SP',data=cars).fit().rsquared  
  47. vif_wt = 1/(1-rsq_wt) # 564.98
  48.  
  49. rsq_vol = smf.ols('VOL~WT+SP+HP',data=cars).fit().rsquared  
  50. vif_vol = 1/(1-rsq_vol) #  564.84
  51.  
  52. rsq_sp = smf.ols('SP~WT+VOL+HP',data=cars).fit().rsquared  
  53. vif_sp = 1/(1-rsq_sp) #  16.35
  54.  
  55. # Storing vif values in a data frame
  56. d1 = {'Variables':['Hp','WT','VOL','SP'],'VIF':[vif_hp,vif_wt,vif_vol,vif_sp]}
  57. Vif_frame = pd.DataFrame(d1)  
  58. Vif_frame
  59.  
  60. #Residual Analysis
  61. Test for Normality of Residuals (Q-Q Plot)
  62. import statsmodels.api as sm
  63. qqplot=sm.qqplot(model.resid,line='q') # line = 45 to draw the diagnoal line
  64. plt.title("Normal Q-Q plot of residuals")
  65. plt.show()
  66. list(np.where(model.resid>10))
  67. Residual Plot for Homoscedasticity
  68. def get_standardized_values( vals ):
  69.     return (vals - vals.mean())/vals.std()
  70. plt.scatter(get_standardized_values(model.fittedvalues),
  71.             get_standardized_values(model.resid))
  72.  
  73. plt.title('Residual Plot')
  74. plt.xlabel('Standardized Fitted values')
  75. plt.ylabel('Standardized residual values')
  76. plt.show()
  77.  
  78. #Residual Vs Regressors
  79. fig = plt.figure(figsize=(15,8))
  80. fig = sm.graphics.plot_regress_exog(model, "VOL", fig=fig)
  81. plt.show()
  82. fig = plt.figure(figsize=(15,8))
  83. fig = sm.graphics.plot_regress_exog(model, "SP", fig=fig)
  84. plt.show()
  85. fig = plt.figure(figsize=(15,8))
  86. fig = sm.graphics.plot_regress_exog(model, "HP", fig=fig)
  87. plt.show()
  88. fig = plt.figure(figsize=(15,8))
  89. fig = sm.graphics.plot_regress_exog(model, "WT", fig=fig)
  90. plt.show()
  91.  
  92. #Model Deletion Diagnostics
  93. #Detecting Influencers/Outliers
  94. #Cook’s Distance
  95.  
  96. model_influence = model.get_influence()
  97. (c,_ ) = model_influence.cooks_distance
  98. #Plot the influencers values using stem plot
  99. fig = plt.subplots(figsize=(20, 7))
  100. plt.stem(np.arange(len(cars)), np.round(c, 3))
  101. plt.xlabel('Row index')
  102. plt.ylabel('Cooks Distance')
  103. plt.show()
  104. #index and value of influencer where c is more than .5
  105. (np.argmax(c),np.max(c))
  106.  
  107. #High Influence points
  108. from statsmodels.graphics.regressionplots import influence_plot
  109. influence_plot(model)
  110. plt.show()
  111. k = cars.shape[1]
  112. n = cars.shape[0]
  113. leverage_cutoff = 3*((k + 1)/n)
  114. leverage_cutoff
  115. 3From the above plot, it is evident that data point 70 and 76 are the influencers
  116. cars[cars.index.isin([70, 76])]
  117. #See the differences in HP and other variable values
  118. cars.head()
  119.  
  120.  
  121. #Improving the model
  122. #Load the data
  123. cars_new = pd.read_csv("Cars.csv",index_col=0)
  124. #Discard the data points which are influencers and reasign the row number (reset_index())
  125. car1=cars_new.drop(cars_new.index[[70,76]],axis=0).reset_index()
  126. car1
  127.  
  128. #Build Model
  129. #Exclude variable "WT" and generate R-Squared and AIC values
  130. final_ml_V= smf.ols('MPG~VOL+SP+HP',data = car1).fit()
  131. (final_ml_V.rsquared,final_ml_V.aic)
  132. #Exclude variable "VOL" and generate R-Squared and AIC values
  133. final_ml_W= smf.ols('MPG~WT+SP+HP',data = car1).fit()
  134. (final_ml_W.rsquared,final_ml_W.aic)
  135. #Comparing above R-Square and AIC values, model 'final_ml_V' has high R- square and low AIC value hence #include variable 'VOL' so that multi collinearity problem would be resolved.
  136.  
  137. #Cook’s Distance
  138. model_influence_V = final_ml_V.get_influence()
  139. (c_V, _) = model_influence_V.cooks_distance
  140. fig= plt.subplots(figsize=(20,7))
  141. plt.stem(np.arange(len(car1)),np.round(c_V,3));
  142. plt.xlabel('Row index')
  143. plt.ylabel('Cooks Distance');
  144. #index of the data points where c is more than .5
  145. (np.argmax(c_V),np.max(c_V))
  146. #Drop 76 and 77 observations
  147. car2=car1.drop(car1.index[[76,77]],axis=0)
  148. car2
  149. #Reset the index and re arrange the row values
  150. car3=car2.reset_index()
  151. car4=car3.drop(['index'],axis=1)
  152. car4
  153. #Build the model on the new data
  154. final_ml_V= smf.ols('MPG~VOL+SP+HP',data = car4).fit()
  155. #Again check for influencers
  156. model_influence_V = final_ml_V.get_influence()
  157. (c_V, _) = model_influence_V.cooks_distance
  158. fig= plt.subplots(figsize=(20,7))
  159. plt.stem(np.arange(len(car4)),np.round(c_V,3));
  160. plt.xlabel('Row index')
  161. plt.ylabel('Cooks Distance');
  162. #index of the data points where c is more than .5
  163. (np.argmax(c_V),np.max(c_V))
  164. #Since the value is <1 , we can stop the diagnostic process and finalize the model
  165. #Check the accuracy of the mode
  166. final_ml_V= smf.ols('MPG~VOL+SP+HP',data = car4).fit()
  167. (final_ml_V.rsquared,final_ml_V.aic)
  168.  
  169. #Predicting for new data
  170. #New data for prediction
  171. new_data=pd.DataFrame({'HP':40,"VOL":95,"SP":102},index=[1])
  172. new_data
  173. final_ml_V.predict(new_data)
  174. pred_y = final_ml_V.predict(car4)
  175. pred_y





sdfgg