Linear Regression
# data handling and visualisation libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)
# model selection and regression metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
%matplotlib inline
Import Data
Load the CSV and check the total number of rows and columns of the imported data.
df_train = pd.read_csv('/content/reg.csv')
df_train.shape
(3576, 5)
df_train.describe(include="all")
 | UsedCar | logofCost | logofkmrun | VolofEngine | Year |
---|---|---|---|---|---|
count | 3576 | 3576.000000 | 3576.000000 | 3576.000000 | 3576.000000 |
unique | 6 | NaN | NaN | NaN | NaN |
top | maruthi | NaN | NaN | NaN | NaN |
freq | 875 | NaN | NaN | NaN | NaN |
mean | NaN | 5.868672 | 5.218116 | 2.462321 | 2009.259508 |
std | NaN | 0.410362 | 0.631483 | 0.975121 | 6.817812 |
min | NaN | 4.560000 | 3.204120 | 0.600000 | 1972.000000 |
25% | NaN | 5.620000 | 5.158362 | 1.800000 | 2005.000000 |
50% | NaN | 5.840000 | 5.410946 | 2.200000 | 2010.000000 |
75% | NaN | 6.130000 | 5.575188 | 3.000000 | 2015.000000 |
max | NaN | 7.260000 | 6.195346 | 6.500000 | 2019.000000 |
Now check for null values in the data.
df_train.isnull().sum()
UsedCar        0
logofCost      0
logofkmrun     0
VolofEngine    0
Year           0
dtype: int64
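There are no missing values here, but if the check had found any, a typical handling step looks like this (a sketch only; neither line is needed for this dataset):

df_train = df_train.dropna()  # drop rows with any missing value
# or fill a numeric gap with the column median instead:
# df_train['VolofEngine'] = df_train['VolofEngine'].fillna(df_train['VolofEngine'].median())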
Preview the first rows to look for unnecessary data.
df_train.head()
 | UsedCar | logofCost | logofkmrun | VolofEngine | Year |
---|---|---|---|---|---|
0 | Audi | 4.56 | 5.253338 | 1.6 | 1983 |
1 | Skoda | 4.56 | 4.204120 | 1.5 | 1982 |
2 | Duster | 4.69 | 5.505150 | 1.9 | 2000 |
3 | Duster | 4.78 | 5.837588 | 2.0 | 1991 |
4 | Skoda | 4.83 | 5.350248 | 2.0 | 1988 |
Check for nulls again.
df_train.isnull().sum()
UsedCar        0
logofCost      0
logofkmrun     0
VolofEngine    0
Year           0
dtype: int64
Check the data types of each column in the CSV.
df_train.dtypes
UsedCar         object
logofCost      float64
logofkmrun     float64
VolofEngine    float64
Year             int64
dtype: object
Renaming the Columns
df_train.head()
df_train.columns =['UsedCarBrands','logofCost','logofkmrun','VolofEngine','Year']
df_train.shape
(3576, 5)
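Assigning the full column list works, but a targeted rename is safer if the column order ever changes; a sketch of the equivalent call:

df_train = df_train.rename(columns={'UsedCar': 'UsedCarBrands'})  # rename only the brand column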
Plot each feature against the target and visualise the relationships.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
df_train.plot(kind='scatter', x='VolofEngine', y='logofCost', ax=axs[0])
df_train.plot(kind='scatter', x='logofkmrun', y='logofCost', ax=axs[1])
df_train.plot(kind='scatter', x='Year', y='logofCost', ax=axs[2])
<matplotlib.axes._subplots.AxesSubplot at 0x7f5dda38f908>
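The scatter_matrix helper imported at the top gives the same pairwise view in a single call; a minimal sketch:

# pairwise scatter plots of all numeric columns
scatter_matrix(df_train[['logofCost', 'logofkmrun', 'VolofEngine', 'Year']], figsize=(12, 12))
plt.show()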
Let's define the independent variable and the dependent variable:
feature_cols =['logofkmrun']
x = df_train[feature_cols]
y = df_train.logofCost
Importing Linear Regression Model
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
lm = LinearRegression()
lm.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
The model is initialised and fitted to x and y above; now print the intercept and slope:
print(lm.intercept_)
print(lm.coef_)
7.653466556267104 [-0.34203819]
From the intercept and coefficient above: a one-unit increase in the log of kilometres run is associated with a decrease of about 0.342 in the log of cost.
Let us predict for a new X value.
X_new = pd.DataFrame({'logofkmrun': [4]})
X_new.head()
 | logofkmrun |
---|---|
0 | 4 |
Now run the prediction:
lm.predict(X_new)
array([6.28531378])
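This matches a hand calculation with the intercept and coefficient printed earlier; a quick sketch of the check:

# manual check of the fitted line: y = intercept + coef * x
intercept, coef = 7.653466556267104, -0.34203819
print(intercept + coef * 4)  # ≈ 6.28531, same as lm.predict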
X_new = pd.DataFrame({'logofkmrun':[df_train.logofkmrun.min(),
df_train.logofkmrun.max()]})
X_new.head()
 | logofkmrun |
---|---|
0 | 3.204120 |
1 | 6.195346 |
preds =lm.predict(X_new)
preds
array([6.55753514, 5.53442157])
df_train.plot(kind='scatter', x='logofkmrun', y='logofCost')
plt.plot(X_new, preds, color='red', linewidth=2)
[<matplotlib.lines.Line2D at 0x7f5dda42e208>]
lm = smf.ols(formula='logofCost ~ logofkmrun', data=df_train).fit()
lm.conf_int()
 | 0 | 1 |
---|---|---|
Intercept | 7.558220 | 7.748714 |
logofkmrun | -0.360159 | -0.323917 |
Print the p-values and R-squared.
lm.pvalues
Intercept     0.000000e+00
logofkmrun    4.404297e-254
dtype: float64
lm.rsquared
0.2770372701711742
lm.rsquared_adj
0.2768349862512445
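As a reminder, these follow the standard definitions (not specific to this dataset), where n is the number of observations and p the number of predictors:

$$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}, \qquad R^2_{adj} = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}$$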
With an R-squared of only about 0.28, this single-variable model explains little of the variance in price; it is a weak model.
Now let's use all the numeric features and fit a multiple linear regression.
feature_cols =['logofkmrun','Year','VolofEngine']
X=df_train[feature_cols]
y=df_train.logofCost
from sklearn import model_selection
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(X,y,test_size=0.3)
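Note that this split is random, so results will vary between runs; fixing a seed makes them reproducible (a sketch; the value 42 is arbitrary):

xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)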
lm = LinearRegression()
lm.fit(X,y)
print(lm.intercept_)
print(lm.coef_)
-71.23282112101279 [-0.1233409 0.03847625 0.17720291]
lm = LinearRegression()
lm.fit(xtrain,ytrain)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
print(lm.intercept_)
print(lm.coef_)
-71.33567662158299 [-0.11894068 0.03851991 0.17461923]
Prediction & Root Mean Squared Error
predictions = lm.predict(xtest)
print(sqrt(mean_squared_error(ytest,predictions)))
0.20406986899812207
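As an aside, scikit-learn versions from 0.22 onwards (an assumption about your installed version) can return the RMSE directly, without math.sqrt:

rmse = mean_squared_error(ytest, predictions, squared=False)  # squared=False gives RMSE
print(rmse)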
Let's look at the full OLS summary.
lm = smf.ols(formula='logofCost~logofkmrun + VolofEngine + Year',
data=df_train).fit()
lm.conf_int()
lm.summary()
Dep. Variable: | logofCost | R-squared: | 0.766 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.766 |
Method: | Least Squares | F-statistic: | 3902. |
Date: | Thu, 07 Jan 2021 | Prob (F-statistic): | 0.00 |
Time: | 19:16:32 | Log-Likelihood: | 709.91 |
No. Observations: | 3576 | AIC: | -1412. |
Df Residuals: | 3572 | BIC: | -1387. |
Df Model: | 3 | | |
Covariance Type: | nonrobust | | |
 | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | -71.2328 | 1.147 | -62.103 | 0.000 | -73.482 | -68.984 |
logofkmrun | -0.1233 | 0.006 | -20.292 | 0.000 | -0.135 | -0.111 |
VolofEngine | 0.1772 | 0.003 | 52.002 | 0.000 | 0.171 | 0.184 |
Year | 0.0385 | 0.001 | 68.362 | 0.000 | 0.037 | 0.040 |
Omnibus: | 222.623 | Durbin-Watson: | 1.343 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 983.005 |
Skew: | -0.065 | Prob(JB): | 3.49e-214 |
Kurtosis: | 5.565 | Cond. No. | 6.94e+05 |
All p-values are significant. Kilometres run is negatively associated with price, and the R-squared is far better than the single-variable model. Let's add the variables back one at a time to see how the R-squared value grows.
lm =smf.ols(formula='logofCost~logofkmrun',data=df_train).fit()
lm.rsquared
0.2770372701711742
lm =smf.ols(formula='logofCost~logofkmrun + VolofEngine',
data=df_train).fit()
lm.rsquared
0.460264660530304
lm =smf.ols(formula='logofCost~logofkmrun + VolofEngine + Year',
data=df_train).fit()
lm.rsquared
0.7661806312261705
As we keep adding variables, the R-squared increases. In fact, plain R-squared can never decrease when a variable is added, so adjusted R-squared, which penalises extra terms, is the fairer comparison; see the sketch below.
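A minimal sketch, reusing the formulas fitted above, to compare adjusted R-squared across the three nested models:

# compare adjusted R-squared across the nested models
formulas = ['logofCost ~ logofkmrun',
            'logofCost ~ logofkmrun + VolofEngine',
            'logofCost ~ logofkmrun + VolofEngine + Year']
for f in formulas:
    model = smf.ols(formula=f, data=df_train).fit()
    print(f, '->', round(model.rsquared_adj, 4))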
Now the categorical variable, the brand name of the car, is introduced.
df_train.head()
 | UsedCarBrands | logofCost | logofkmrun | VolofEngine | Year |
---|---|---|---|---|---|
0 | Audi | 4.56 | 5.253338 | 1.6 | 1983 |
1 | Skoda | 4.56 | 4.204120 | 1.5 | 1982 |
2 | Duster | 4.69 | 5.505150 | 1.9 | 2000 |
3 | Duster | 4.78 | 5.837588 | 2.0 | 1991 |
4 | Skoda | 4.83 | 5.350248 | 2.0 | 1988 |
# create brand dummies, dropping the first column (Audi) to act as the baseline
usedcar_dummies = pd.get_dummies(df_train.UsedCarBrands, prefix="Dummy").iloc[:, 1:]
df_train = pd.concat([df_train, usedcar_dummies], axis=1)
df_train.head()
 | UsedCarBrands | logofCost | logofkmrun | VolofEngine | Year | Dummy_BMW | Dummy_Benz | Dummy_Duster | Dummy_Skoda | Dummy_maruthi |
---|---|---|---|---|---|---|---|---|---|---|
0 | Audi | 4.56 | 5.253338 | 1.6 | 1983 | 0 | 0 | 0 | 0 | 0 |
1 | Skoda | 4.56 | 4.204120 | 1.5 | 1982 | 0 | 0 | 0 | 1 | 0 |
2 | Duster | 4.69 | 5.505150 | 1.9 | 2000 | 0 | 0 | 1 | 0 | 0 |
3 | Duster | 4.78 | 5.837588 | 2.0 | 1991 | 0 | 0 | 1 | 0 | 0 |
4 | Skoda | 4.83 | 5.350248 | 2.0 | 1988 | 0 | 0 | 0 | 1 | 0 |
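The same baseline drop can also be done with pandas' built-in option instead of iloc; a sketch of the equivalent call:

# drop_first=True drops the first category level (Audi) automatically
usedcar_dummies = pd.get_dummies(df_train.UsedCarBrands, prefix="Dummy", drop_first=True)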
Let's include the brand dummies in the model.
feature_cols=['logofkmrun','VolofEngine','Year','Dummy_BMW','Dummy_Benz','Dummy_Duster','Dummy_Skoda','Dummy_maruthi']
X = df_train[feature_cols]
y = df_train.logofCost
lm = LinearRegression()
lm.fit(X,y)
print(feature_cols,lm.coef_)
['logofkmrun', 'VolofEngine', 'Year', 'Dummy_BMW', 'Dummy_Benz', 'Dummy_Duster', 'Dummy_Skoda', 'Dummy_maruthi'] [-0.10777545 0.13929154 0.04100395 0.07065918 0.06576832 -0.19842845 0.0027215 -0.03791353]
These are the coefficients for all the variables. The brand coefficients are interpreted relative to Audi, the baseline category dropped when the dummies were created.
lm.intercept_
-76.29257871920358
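To see the full model in action, we can predict for a hypothetical car; the row values below (a 2015 Skoda with a 2.0 L engine and logofkmrun of 5) are invented for illustration:

# hypothetical example row, ordered to match feature_cols
example = pd.DataFrame([[5.0, 2.0, 2015, 0, 0, 0, 1, 0]], columns=feature_cols)
print(lm.predict(example))  # predicted logofCost for this car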
The values above are in line with the same calculations performed in other tools.
plt.figure(figsize=(5,5))
sns.heatmap(df_train.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f5dda28ddd8>
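A slightly richer version of the heatmap, as a sketch; numeric_only=True is needed on pandas 2.0+, where corr() no longer silently drops the string column:

plt.figure(figsize=(6, 5))
sns.heatmap(df_train.corr(numeric_only=True), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()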