linear
February 10, 2025
[1]: import pandas as pd
import numpy as np
import seaborn as sns
import [Link] as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from [Link] import r2_score,mean_squared_error,mean_absolute_error
[2]: data=pd.read_csv('[Link]')
[3]: [Link]()
<class '[Link]'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 TV 200 non-null float64
1 Radio 200 non-null float64
2 Newspaper 200 non-null float64
3 Sales 200 non-null float64
dtypes: float64(4)
memory usage: 6.4 KB
[4]: [Link](data)
plt.tight_layout()
[Link]()
1
[5]: [Link]([Link](),annot=True,cmap='coolwarm',linewidth=2)
[Link]()
2
[6]: fig,ax=[Link](1,3,figsize=(18,8))
[Link](data=data,x='TV',y='Sales',ax=ax[0])
ax[0].set_title('TV Vs. Sales')
[Link](data=data,x='Radio',y='Sales',ax=ax[1])
ax[1].set_title('Radio Vs. Sales')
[Link](data=data,x='Newspaper',y='Sales',ax=ax[2])
ax[2].set_title('Newspaper Vs. Sales')
[Link]()
3
[7]: [Link](data)
[Link]()
4
[8]: X=[Link]('Sales',axis=1)
Y=data['Sales']
[9]: !pip install statsmodels
Requirement already satisfied: statsmodels in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages
(0.14.4)
Requirement already satisfied: numpy<3,>=1.22.3 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
statsmodels) (2.2.1)
Requirement already satisfied: scipy!=1.9.2,>=1.8 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
statsmodels) (1.15.1)
Requirement already satisfied: pandas!=2.1.0,>=1.4 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
statsmodels) (2.2.3)
Requirement already satisfied: patsy>=0.5.6 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
statsmodels) (1.0.1)
Requirement already satisfied: packaging>=21.3 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
statsmodels) (24.2)
Requirement already satisfied: python-dateutil>=2.8.2 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
pandas!=2.1.0,>=1.4->statsmodels) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
pandas!=2.1.0,>=1.4->statsmodels) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
pandas!=2.1.0,>=1.4->statsmodels) (2024.2)
Requirement already satisfied: six>=1.5 in
c:\users\itzsh\appdata\local\programs\python\python313\lib\site-packages (from
python-dateutil>=2.8.2->pandas!=2.1.0,>=1.4->statsmodels) (1.17.0)
[10]: print([Link])
print([Link])
(200, 3)
(200,)
[11]: import [Link] as sm
X_const=sm.add_constant(X)
model=[Link](Y,X_const).fit()
predictions=[Link](X_const)
residuals=[Link]
[Link]()
5
[11]:
Dep. Variable: Sales R-squared: 0.903
Model: OLS Adj. R-squared: 0.901
Method: Least Squares F-statistic: 605.4
Date: Mon, 20 Jan 2025 Prob (F-statistic): 8.13e-99
Time: [Link] Log-Likelihood: -383.34
No. Observations: 200 AIC: 774.7
Df Residuals: 196 BIC: 787.9
Df Model: 3
Covariance Type: nonrobust
coef std err t P> |t| [0.025 0.975]
const 4.6251 0.308 15.041 0.000 4.019 5.232
TV 0.0544 0.001 39.592 0.000 0.052 0.057
Radio 0.1070 0.008 12.604 0.000 0.090 0.124
Newspaper 0.0003 0.006 0.058 0.954 -0.011 0.012
Omnibus: 16.081 Durbin-Watson: 2.251
Prob(Omnibus): 0.000 Jarque-Bera (JB): 27.655
Skew: -0.431 Prob(JB): 9.88e-07
Kurtosis: 4.605 Cond. No. 454.
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
**Interpretation of the OLS summary**

- **R-squared**: 90.3% of the variance in Sales is explained by the model.
- **Adj. R-squared**: 90.1%. This is R-squared adjusted for the number of predictors relative to the sample size, so models with different numbers of predictors can be compared fairly; it is also used to check whether the model is overfitting.
- **F-statistic**: 605.4 with p-value < 0.05, so the model as a whole is statistically significant.
- **Fitted equation**: $\text{Sales} = 4.6251 + 0.0544\,\text{TV} + 0.1070\,\text{Radio} + 0.0003\,\text{Newspaper}$
- **Feature significance**: a feature is statistically significant for predicting Sales when its t-statistic is large in absolute value and its p-value is < 0.05. Therefore TV and Radio are statistically significant; Newspaper is not statistically significant (p = 0.954).

**Model diagnostics**

- **Omnibus and Jarque-Bera**: both p-values are < 0.05, so the residuals are not normally distributed.
- **Durbin-Watson**: 2.25. This checks for autocorrelation in the residuals: ≈ 2 means no autocorrelation, < 2 positive autocorrelation, > 2 negative autocorrelation. There is no autocorrelation here since the value is close to 2.
- **Cond. No.**: 454, which is > 30 and therefore suggests multicollinearity.
[13]: #normality plot
fig,ax=[Link](1,2,figsize=(18,8))
[Link](residuals,line='45',fit=True,ax=ax[0])
ax[0].set_title('QQ Plot')
[Link](residuals,kde=True,ax=ax[1])
ax[1].set_title('Histogram')
[Link]()
6
[14]: X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.
↪2,random_state=42)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)
(160, 3)
(40, 3)
(160,)
(40,)
[18]: model_lin=LinearRegression()
model_lin.fit(X_train,Y_train)
y_pred=model_lin.predict(X_test)
[19]: model_lin.intercept_
[19]: np.float64(4.714126402214127)
[20]: model_lin.coef_
[20]: array([0.05450927, 0.10094536, 0.00433665])
[21]: r2=r2_score(Y_test,y_pred)
print('R Squared is',r2)
mse=mean_squared_error(Y_test,y_pred)
print('MSE',mse)
mae=mean_absolute_error(Y_test,y_pred)
print('MAE',mae)
7
R Squared is 0.9059011844150826
MSE 2.9077569102710896
MAE 1.2748262109549338
[22]: rmse=[Link](mse)
print('RMSE',rmse)
RMSE 1.7052146229349223
[23]: [Link](x=predictions,y=residuals)
[Link](y=0,color='r',linestyle='--')
[Link]()
[24]: #K fold cross validation for limited dataset
from sklearn.model_selection import cross_val_score,cross_val_predict
# 5-fold cross-validation of a fresh LinearRegression on the full X, Y (not just the train split).
cv_score=cross_val_score(LinearRegression(),X,Y,cv=5,
scoring='neg_mean_squared_error')
# Other scoring options: 'r2' or 'explained_variance'.
print(cv_score) # one score per fold (5 with cv=5); scores are negated MSE, so values closer to 0 are better
print(cv_score.mean())
[-3.05606897 -2.02676065 -1.85105212 -4.72039259 -2.63694072]
8
-2.8582430099910106
[25]: cv_pred=cross_val_predict(LinearRegression(),X,Y,cv=10)
# Cross-validated predictions from a fresh LinearRegression on the full X, Y.
# NOTE(review): this uses cv=10 while the cross_val_score cell uses cv=5 — confirm the intended fold count.
cv_pred.shape
[25]: (200,)
[26]: from sklearn.linear_model import Ridge,Lasso
# Ridge regression fitted on the same train split as model_lin and scored on the same test split.
model_ridge=Ridge(alpha=1.0)# regularisation intended to address multicollinearity
model_ridge.fit(X_train,Y_train)
y_pred_ridge=model_ridge.predict(X_test)
# NOTE: r2 and mse are rebound here, replacing the LinearRegression values computed earlier;
# the rmse variable from the earlier cell is not recomputed.
r2=r2_score(Y_test,y_pred_ridge)
print(r2)
mse=mean_squared_error(Y_test,y_pred_ridge)
print(mse)
# Plot of Ridge predictions (x) against the actual test targets (y); the plotting function's name was lost in export.
[Link](x=y_pred_ridge,y=Y_test,color='red',label='Ridge Regression')
0.9058999159458062
2.907796107367048
[26]: <Axes: ylabel='Sales'>
9
[27]: # For high-dimensional data: performs feature selection by penalizing unimportant features
# Lasso regression with alpha=0.01, using the same train/test split and metrics as the Ridge cell.
# Lasso itself is imported in the Ridge cell, so that cell must run first.
model_lasso=Lasso(alpha=0.01)
model_lasso.fit(X_train,Y_train)
y_pred_lasso=model_lasso.predict(X_test)
# NOTE: r2 and mse are rebound again here, replacing the Ridge values.
r2=r2_score(Y_test,y_pred_lasso)
print(r2)
mse=mean_squared_error(Y_test,y_pred_lasso)
print(mse)
# Plot of Lasso predictions (x) against the actual test targets (y); the plotting function's name was lost in export.
[Link](x=y_pred_lasso,y=Y_test,color='blue',label='Lasso')
0.9058967115743171
2.9078951259708705
[27]: <Axes: ylabel='Sales'>
FEATURE SELECTION REGRESSION
[28]: import pandas as pd
import numpy as np
from [Link].outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_regression,RFE
10
[32]: df=pd.read_csv('Admission_Predict.csv')
[Link]
[32]: Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
dtype='object')
[33]: [Link]()
<class '[Link]'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Hours Studied 10000 non-null int64
1 Previous Scores 10000 non-null int64
2 Sleep Hours 10000 non-null int64
3 Sample Question Papers Practiced 10000 non-null int64
4 Performance Index 10000 non-null float64
dtypes: float64(1), int64(4)
memory usage: 390.8 KB
[34]: #clean the column
[Link]=[Link]()
[Link]
[34]: Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
'LOR', 'CGPA', 'Research', 'Chance of Admit'],
dtype='object')
[35]: [Link]()
[35]: Serial No. GRE Score TOEFL Score University Rating SOP LOR CGPA \
0 1 337 118 4 4.5 4.5 9.65
1 2 324 107 4 4.0 4.5 8.87
2 3 316 104 3 3.0 3.5 8.00
3 4 322 110 3 3.5 2.5 8.67
4 5 314 103 2 2.0 3.0 8.21
Research Chance of Admit
0 1 0.92
1 1 0.76
2 1 0.72
3 1 0.80
4 0 0.65
[36]: import [Link] as sm
X=[Link]('Chance of Admit',axis=1)
11
# Build a variance-inflation-factor (VIF) table to screen the predictors for multicollinearity.
# A constant column is added first, so the table also contains a 'const' row.
# NOTE(review): X still contains the row identifier 'Serial No.' (it appears in the table below),
# so it is treated as a predictor here and in the later cells — consider dropping it from X.
X_const=sm.add_constant(X)
vif_features=[Link]()
vif_features['features']=X_const.columns
# One VIF per column of X_const, computed by column position.
vif_features['VIF']= [variance_inflation_factor(X_const.values,i)
for i in range(X_const.shape[1])]
vif_features
[36]: features VIF
0 const 1533.435012
1 Serial No. 1.087496
2 GRE Score 4.617117
3 TOEFL Score 4.357634
4 University Rating 2.959083
5 SOP 3.113171
6 LOR 2.432982
7 CGPA 5.419519
8 Research 1.543329
[37]: from sklearn.model_selection import train_test_split
# Target column for the admission data; X was built in the VIF cell.
Y = df['Chance of Admit']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,    # hold out 20% of the rows for testing
    random_state=42,  # fixed seed so the split is reproducible
)
# Show the shape of each partition, one per line, in the order train-X, test-X, train-Y, test-Y.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep='\n')
(320, 8)
(80, 8)
(320,)
(80,)
[38]: from sklearn.linear_model import LinearRegression
# Filter method: f_regression scores each feature against the target on its own,
# returning one F-statistic and one p-value per column of X_train.
f_score,p_value=f_regression(X_train,Y_train)
# Wrapper method: recursive feature elimination around LinearRegression, keeping 5 features.
rfe=RFE(LinearRegression(),n_features_to_select=5)
# Call name lost in export — presumably fits rfe on the training split; confirm.
[Link](X_train,Y_train)
# One row per feature combining both methods; the selected features get RFE ranking 1.
features_selected=[Link]({
'Features':X_train.columns,
'F_Score':f_score,
'P_Value':p_value,
'RFE Ranking':rfe.ranking_
})
# Display with the RFE-selected features first.
features_selected.sort_values(by='RFE Ranking')
12
[38]: Features F_Score P_Value RFE Ranking
3 University Rating 336.841085 8.199160e-52 1
2 TOEFL Score 508.463462 6.344140e-68 1
6 CGPA 980.157945 3.782644e-99 1
5 LOR 226.292491 5.341324e-39 1
7 Research 135.906150 2.174883e-26 1
1 GRE Score 578.757756 1.429869e-73 2
4 SOP 263.411578 1.424603e-43 3
0 Serial No. 1.871572 1.722610e-01 4
[39]: print(X_train.columns)
print(rfe.support_)
Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
'LOR', 'CGPA', 'Research'],
dtype='object')
[False False True True False True True True]
[40]: import seaborn as sns
import [Link] as plt
[Link]([Link](),annot=True,cmap='coolwarm',linewidths=1)
[Link]()
13
[ ]:
14