Linear Regression
# data handling and visualisation libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)
# model selection and regression metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
%matplotlib inline
Import Data
Load the CSV and check the total number of rows and columns of the imported data.
df_train = pd.read_csv('/content/reg.csv')
df_train.shape
(3576, 5)
df_train.describe(include="all")
 | UsedCar | logofCost | logofkmrun | VolofEngine | Year |
---|---|---|---|---|---|
count | 3576 | 3576.000000 | 3576.000000 | 3576.000000 | 3576.000000 |
unique | 6 | NaN | NaN | NaN | NaN |
top | maruthi | NaN | NaN | NaN | NaN |
freq | 875 | NaN | NaN | NaN | NaN |
mean | NaN | 5.868672 | 5.218116 | 2.462321 | 2009.259508 |
std | NaN | 0.410362 | 0.631483 | 0.975121 | 6.817812 |
min | NaN | 4.560000 | 3.204120 | 0.600000 | 1972.000000 |
25% | NaN | 5.620000 | 5.158362 | 1.800000 | 2005.000000 |
50% | NaN | 5.840000 | 5.410946 | 2.200000 | 2010.000000 |
75% | NaN | 6.130000 | 5.575188 | 3.000000 | 2015.000000 |
max | NaN | 7.260000 | 6.195346 | 6.500000 | 2019.000000 |
Now check for null values in the data.
df_train.isnull().sum()
UsedCar        0
logofCost      0
logofkmrun     0
VolofEngine    0
Year           0
dtype: int64
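There are no missing values here, but if the check had found any, a typical handling step looks like this (a sketch only; neither line is needed for this dataset):

df_train = df_train.dropna()  # drop rows with any missing value
# or fill a numeric gap with the column median instead:
# df_train['VolofEngine'] = df_train['VolofEngine'].fillna(df_train['VolofEngine'].median())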
Preview the first rows to look for unnecessary data.
df_train.head()
 | UsedCar | logofCost | logofkmrun | VolofEngine | Year |
---|---|---|---|---|---|
0 | Audi | 4.56 | 5.253338 | 1.6 | 1983 |
1 | Skoda | 4.56 | 4.204120 | 1.5 | 1982 |
2 | Duster | 4.69 | 5.505150 | 1.9 | 2000 |
3 | Duster | 4.78 | 5.837588 | 2.0 | 1991 |
4 | Skoda | 4.83 | 5.350248 | 2.0 | 1988 |
Check for nulls again.
df_train.isnull().sum()
UsedCar        0
logofCost      0
logofkmrun     0
VolofEngine    0
Year           0
dtype: int64
Check the data types of each column in the CSV.
df_train.dtypes
UsedCar         object
logofCost      float64
logofkmrun     float64
VolofEngine    float64
Year             int64
dtype: object
Renaming the Columns
df_train.head()
df_train.columns =['UsedCarBrands','logofCost','logofkmrun','VolofEngine','Year']
df_train.shape
(3576, 5)
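Assigning the full column list works, but a targeted rename is safer if the column order ever changes; a sketch of the equivalent call:

df_train = df_train.rename(columns={'UsedCar': 'UsedCarBrands'})  # rename only the brand column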
Plot each feature against the target and visualise the relationships.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
df_train.plot(kind='scatter', x='VolofEngine', y='logofCost', ax=axs[0])
df_train.plot(kind='scatter', x='logofkmrun', y='logofCost', ax=axs[1])
df_train.plot(kind='scatter', x='Year', y='logofCost', ax=axs[2])
<matplotlib.axes._subplots.AxesSubplot at 0x7f5dda38f908>
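The scatter_matrix helper imported at the top gives the same pairwise view in a single call; a minimal sketch:

# pairwise scatter plots of all numeric columns
scatter_matrix(df_train[['logofCost', 'logofkmrun', 'VolofEngine', 'Year']], figsize=(12, 12))
plt.show()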
Let's define the independent variable and the dependent variable:
feature_cols =['logofkmrun']
x = df_train[feature_cols]
y = df_train.logofCost
Importing Linear Regression Model
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
lm = LinearRegression()
lm.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
The model is initialised and fitted to x and y above; now print the intercept and slope:
print(lm.intercept_)
print(lm.coef_)
7.653466556267104 [-0.34203819]
From the intercept and coefficient above: a one-unit increase in the log of kilometres run is associated with a decrease of about 0.342 in the log of cost.
Let us predict for a new X value.
X_new = pd.DataFrame({'logofkmrun': [4]})
X_new.head()
 | logofkmrun |
---|---|
0 | 4 |
Now run the prediction:
lm.predict(X_new)
array([6.28531378])
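This matches a hand calculation with the intercept and coefficient printed earlier; a quick sketch of the check:

# manual check of the fitted line: y = intercept + coef * x
intercept, coef = 7.653466556267104, -0.34203819
print(intercept + coef * 4)  # ≈ 6.28531, same as lm.predict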
X_new = pd.DataFrame({'logofkmrun':[df_train.logofkmrun.min(),
df_train.logofkmrun.max()]})
X_new.head()
 | logofkmrun |
---|---|
0 | 3.204120 |
1 | 6.195346 |
preds =lm.predict(X_new)
preds
array([6.55753514, 5.53442157])
df_train.plot(kind='scatter', x='logofkmrun', y='logofCost')
plt.plot(X_new, preds, color='red', linewidth=2)
[<matplotlib.lines.Line2D at 0x7f5dda42e208>]
lm = smf.ols(formula='logofCost ~ logofkmrun', data=df_train).fit()
lm.conf_int()
 | 0 | 1 |
---|---|---|
Intercept | 7.558220 | 7.748714 |
logofkmrun | -0.360159 | -0.323917 |
Print the p-values and R-squared.
lm.pvalues
Intercept     0.000000e+00
logofkmrun    4.404297e-254
dtype: float64
lm.rsquared
0.2770372701711742
lm.rsquared_adj
0.2768349862512445
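As a reminder, these follow the standard definitions (not specific to this dataset), where n is the number of observations and p the number of predictors:

$$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}, \qquad R^2_{adj} = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}$$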
With an R-squared of only about 0.28, this single-variable model explains little of the variance in price; it is a weak model.
Now let's use all the numeric features and fit a multiple linear regression.
feature_cols =['logofkmrun','Year','VolofEngine']
X=df_train[feature_cols]
y=df_train.logofCost
from sklearn import model_selection
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(X,y,test_size=0.3)
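Note that this split is random, so results will vary between runs; fixing a seed makes them reproducible (a sketch; the value 42 is arbitrary):

xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)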
lm = LinearRegression()
lm.fit(X,y)
print(lm.intercept_)
print(lm.coef_)
-71.23282112101279 [-0.1233409 0.03847625 0.17720291]
lm = LinearRegression()
lm.fit(xtrain,ytrain)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
print(lm.intercept_)
print(lm.coef_)
-71.33567662158299 [-0.11894068 0.03851991 0.17461923]
Prediction & Root Mean Squared Error
predictions = lm.predict(xtest)
print(sqrt(mean_squared_error(ytest,predictions)))
0.20406986899812207
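As an aside, scikit-learn versions from 0.22 onwards (an assumption about your installed version) can return the RMSE directly, without math.sqrt:

rmse = mean_squared_error(ytest, predictions, squared=False)  # squared=False gives RMSE
print(rmse)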
Let's look at the full OLS summary.
lm = smf.ols(formula='logofCost~logofkmrun + VolofEngine + Year',
data=df_train).fit()
lm.conf_int()
lm.summary()
Dep. Variable: | logofCost | R-squared: | 0.766 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.766 |
Method: | Least Squares | F-statistic: | 3902. |
Date: | Thu, 07 Jan 2021 | Prob (F-statistic): | 0.00 |
Time: | 19:16:32 | Log-Likelihood: | 709.91 |
No. Observations: | 3576 | AIC: | -1412. |
Df Residuals: | 3572 | BIC: | -1387. |
Df Model: | 3 | | |
Covariance Type: | nonrobust | | |
 | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | -71.2328 | 1.147 | -62.103 | 0.000 | -73.482 | -68.984 |
logofkmrun | -0.1233 | 0.006 | -20.292 | 0.000 | -0.135 | -0.111 |
VolofEngine | 0.1772 | 0.003 | 52.002 | 0.000 | 0.171 | 0.184 |
Year | 0.0385 | 0.001 | 68.362 | 0.000 | 0.037 | 0.040 |
Omnibus: | 222.623 | Durbin-Watson: | 1.343 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 983.005 |
Skew: | -0.065 | Prob(JB): | 3.49e-214 |
Kurtosis: | 5.565 | Cond. No. | 6.94e+05 |
All p-values are significant. Kilometres run is negatively associated with price, and the R-squared is far better than the single-variable model. Let's add the variables back one at a time to see how the R-squared value grows.
lm =smf.ols(formula='logofCost~logofkmrun',data=df_train).fit()
lm.rsquared
0.2770372701711742
lm =smf.ols(formula='logofCost~logofkmrun + VolofEngine',
data=df_train).fit()
lm.rsquared
0.460264660530304
lm =smf.ols(formula='logofCost~logofkmrun + VolofEngine + Year',
data=df_train).fit()
lm.rsquared
0.7661806312261705
As we keep adding variables, the R-squared increases. In fact, plain R-squared can never decrease when a variable is added, so adjusted R-squared, which penalises extra terms, is the fairer comparison; see the sketch below.
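A minimal sketch, reusing the formulas fitted above, to compare adjusted R-squared across the three nested models:

# compare adjusted R-squared across the nested models
formulas = ['logofCost ~ logofkmrun',
            'logofCost ~ logofkmrun + VolofEngine',
            'logofCost ~ logofkmrun + VolofEngine + Year']
for f in formulas:
    model = smf.ols(formula=f, data=df_train).fit()
    print(f, '->', round(model.rsquared_adj, 4))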
Now the categorical variable, the brand name of the car, is introduced.
df_train.head()
 | UsedCarBrands | logofCost | logofkmrun | VolofEngine | Year |
---|---|---|---|---|---|
0 | Audi | 4.56 | 5.253338 | 1.6 | 1983 |
1 | Skoda | 4.56 | 4.204120 | 1.5 | 1982 |
2 | Duster | 4.69 | 5.505150 | 1.9 | 2000 |
3 | Duster | 4.78 | 5.837588 | 2.0 | 1991 |
4 | Skoda | 4.83 | 5.350248 | 2.0 | 1988 |
# create brand dummies, dropping the first column (Audi) to act as the baseline
usedcar_dummies = pd.get_dummies(df_train.UsedCarBrands, prefix="Dummy").iloc[:, 1:]
df_train = pd.concat([df_train, usedcar_dummies], axis=1)
df_train.head()
 | UsedCarBrands | logofCost | logofkmrun | VolofEngine | Year | Dummy_BMW | Dummy_Benz | Dummy_Duster | Dummy_Skoda | Dummy_maruthi |
---|---|---|---|---|---|---|---|---|---|---|
0 | Audi | 4.56 | 5.253338 | 1.6 | 1983 | 0 | 0 | 0 | 0 | 0 |
1 | Skoda | 4.56 | 4.204120 | 1.5 | 1982 | 0 | 0 | 0 | 1 | 0 |
2 | Duster | 4.69 | 5.505150 | 1.9 | 2000 | 0 | 0 | 1 | 0 | 0 |
3 | Duster | 4.78 | 5.837588 | 2.0 | 1991 | 0 | 0 | 1 | 0 | 0 |
4 | Skoda | 4.83 | 5.350248 | 2.0 | 1988 | 0 | 0 | 0 | 1 | 0 |
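The same baseline drop can also be done with pandas' built-in option instead of iloc; a sketch of the equivalent call:

# drop_first=True drops the first category level (Audi) automatically
usedcar_dummies = pd.get_dummies(df_train.UsedCarBrands, prefix="Dummy", drop_first=True)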
Let's include the brand dummies in the model.
feature_cols=['logofkmrun','VolofEngine','Year','Dummy_BMW','Dummy_Benz','Dummy_Duster','Dummy_Skoda','Dummy_maruthi']
X = df_train[feature_cols]
y = df_train.logofCost
lm = LinearRegression()
lm.fit(X,y)
print(feature_cols,lm.coef_)
['logofkmrun', 'VolofEngine', 'Year', 'Dummy_BMW', 'Dummy_Benz', 'Dummy_Duster', 'Dummy_Skoda', 'Dummy_maruthi'] [-0.10777545 0.13929154 0.04100395 0.07065918 0.06576832 -0.19842845 0.0027215 -0.03791353]
These are the coefficients for all the variables. The brand coefficients are interpreted relative to Audi, the baseline category dropped when the dummies were created.
lm.intercept_
-76.29257871920358
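To see the full model in action, we can predict for a hypothetical car; the row values below (a 2015 Skoda with a 2.0 L engine and logofkmrun of 5) are invented for illustration:

# hypothetical example row, ordered to match feature_cols
example = pd.DataFrame([[5.0, 2.0, 2015, 0, 0, 0, 1, 0]], columns=feature_cols)
print(lm.predict(example))  # predicted logofCost for this car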
The values above are in line with the same calculations performed in other tools.
plt.figure(figsize=(5,5))
sns.heatmap(df_train.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f5dda28ddd8>
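A slightly richer version of the heatmap, as a sketch; numeric_only=True is needed on pandas 2.0+, where corr() no longer silently drops the string column:

plt.figure(figsize=(6, 5))
sns.heatmap(df_train.corr(numeric_only=True), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()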