# Fixed: the original first line was the bare expression
# ``sklearn.linear_model.LinearRegression`` — a NameError at runtime, since
# ``sklearn`` itself was never imported. Replaced with the proper import.
from sklearn.linear_model import LinearRegression

import warnings

warnings.filterwarnings("ignore")  # NOTE(review): blanket-suppresses ALL warnings; consider narrowing

import pandas as pd

# Load the housing data; columns 1-4 are the features, house_value is the target.
data = pd.read_csv('house_price.csv', encoding='utf-8')
X = data[data.columns[1:5]]
y = data[["house_value"]]

from sklearn.model_selection import train_test_split

# Default 75/25 split; fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, to avoid test-set leakage.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

import statsmodels.api as sm

# statsmodels OLS needs an explicit intercept column (raw, unscaled features here).
x_train_new = sm.add_constant(X_train)
x_test_new = sm.add_constant(X_test)
x_train_new.head()  # bare expression: only displays in a notebook, no effect in a script
const income bedrooms households rooms
17235 1.0 2.0577 0.185449 3.945455 6.372727
14220 1.0 4.0000 0.171566 2.741497 6.363946
3280 1.0 5.8904 0.154485 2.969325 6.651840
15279 1.0 0.9393 0.241460 3.257256 4.518470
14727 1.0 2.7143 0.194977 2.679287 6.385301
# OLS on the raw training features (with the intercept column added above).
train_ols = sm.OLS(y_train, x_train_new)
multi_model = train_ols.fit()
print(multi_model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: house_value R-squared: 0.546
Model: OLS Adj. R-squared: 0.545
Method: Least Squares F-statistic: 3980.
Date: Mon, 18 Oct 2021 Prob (F-statistic): 0.00
Time: 09:32:54 Log-Likelihood: -1.6570e+05
No. Observations: 13266 AIC: 3.314e+05
Df Residuals: 13261 BIC: 3.315e+05
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -2.849e+04 8884.093 -3.206 0.001 -4.59e+04 -1.11e+04
income 5.588e+04 500.997 111.538 0.000 5.49e+04 5.69e+04
bedrooms 5.586e+05 2.02e+04 27.637 0.000 5.19e+05 5.98e+05
households -2.586e+04 775.128 -33.356 0.000 -2.74e+04 -2.43e+04
rooms -5810.6069 834.780 -6.961 0.000 -7446.896 -4174.318
==============================================================================
Omnibus: 1975.541 Durbin-Watson: 2.016
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4568.878
Skew: 0.866 Prob(JB): 0.00
Kurtosis: 5.294 Cond. No. 284.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Refit OLS on the hold-out split to compare coefficients / R^2 with training.
test_ols = sm.OLS(y_test, x_test_new)
multi_model2 = test_ols.fit()
print(multi_model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: house_value R-squared: 0.563
Model: OLS Adj. R-squared: 0.562
Method: Least Squares F-statistic: 1421.
Date: Mon, 18 Oct 2021 Prob (F-statistic): 0.00
Time: 09:37:12 Log-Likelihood: -55169.
No. Observations: 4423 AIC: 1.103e+05
Df Residuals: 4418 BIC: 1.104e+05
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -2.196e+04 1.48e+04 -1.483 0.138 -5.1e+04 7075.709
income 5.57e+04 838.452 66.426 0.000 5.41e+04 5.73e+04
bedrooms 5.402e+05 3.44e+04 15.713 0.000 4.73e+05 6.08e+05
households -2.603e+04 1270.717 -20.484 0.000 -2.85e+04 -2.35e+04
rooms -6039.8888 1344.918 -4.491 0.000 -8676.601 -3403.177
==============================================================================
Omnibus: 688.606 Durbin-Watson: 1.968
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1499.714
Skew: 0.915 Prob(JB): 0.00
Kurtosis: 5.188 Cond. No. 284.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the min-max-scaled training features
# (``fit`` returns the estimator, so construction and fitting chain together).
model = LinearRegression().fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)  # training R^2; bare expression, displays only in a notebook
0.5455724996358273
pred_test=model.predict(X_scaled_test)  # predictions on the scaled hold-out features
model.score(X_scaled_test, y_test)  # hold-out R^2; bare expression, displays only in a notebook
0.562684388358716
# RMSE (Root Mean Squared Error) — same units as house_value, one per split.
import numpy as np
from sklearn.metrics import mean_squared_error

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
rmse_train = np.sqrt(MSE_train)
rmse_test = np.sqrt(MSE_test)
print("훈련 데이터 RMSE:", rmse_train)
print("테스트 데이터 RMSE:", rmse_test)
훈련 데이터 RMSE: 64340.33927728243
테스트 데이터 RMSE: 63220.79672157402
# Other linear-model evaluation metric: MAE (Mean Absolute Error)
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred_test)  # bare expression; result shown only in a notebook
47230.874701637375
# Other linear-model evaluation metric: MSE (Mean Squared Error)
from sklearn.metrics import mean_squared_error  # NOTE(review): redundant — already imported for RMSE above
mean_squared_error(y_test, pred_test)  # bare expression; result shown only in a notebook
3996869138.1105847
# Other linear-model evaluation metric: MAPE (Mean Absolute Percentage Error)
def MAPE(y_test, y_pred):
    """Return the Mean Absolute Percentage Error, in percent.

    Fixed: the original body referenced the module-level global ``pred_test``
    instead of the ``y_pred`` argument, so the second parameter was silently
    ignored. Also restored the indentation lost in the pasted transcript.

    Note: undefined (division by zero) when any element of ``y_test`` is 0.
    """
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
MAPE(y_test, pred_test)
house_value 30.571439
dtype: float64
# Other linear-model evaluation metric: MPE (Mean Percentage Error)
def MAE(y_test, y_pred):
    """Return the Mean Percentage Error, in percent.

    NOTE(review): the function is misnamed — per its own heading comment it
    computes the *signed* MPE (positive and negative errors cancel, so it
    measures bias, not magnitude). The name ``MAE`` is kept so existing
    callers keep working.

    Fixed: the original body referenced the module-level global ``pred_test``
    instead of the ``y_pred`` argument, so the second parameter was silently
    ignored. Also restored the indentation lost in the pasted transcript.
    """
    return np.mean((y_test - y_pred) / y_test) * 100
MAE(y_test, pred_test)
house_value -12.37266
dtype: float64