교재 : 파이썬 머신러닝 완벽 가이드, 위키북스
회귀
캐글 주택 가격 (House Prices: Ames, Iowa 데이터셋)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.read_csv('houseprice.csv')
|
Id |
MSSubClass |
MSZoning |
LotFrontage |
LotArea |
Street |
Alley |
LotShape |
LandContour |
Utilities |
... |
PoolArea |
PoolQC |
Fence |
MiscFeature |
MiscVal |
MoSold |
YrSold |
SaleType |
SaleCondition |
SalePrice |
0 |
1 |
60 |
RL |
65.0 |
8450 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
2 |
2008 |
WD |
Normal |
208500 |
1 |
2 |
20 |
RL |
80.0 |
9600 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
5 |
2007 |
WD |
Normal |
181500 |
2 |
3 |
60 |
RL |
68.0 |
11250 |
Pave |
NaN |
IR1 |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
9 |
2008 |
WD |
Normal |
223500 |
3 |
4 |
70 |
RL |
60.0 |
9550 |
Pave |
NaN |
IR1 |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
2 |
2006 |
WD |
Abnorml |
140000 |
4 |
5 |
60 |
RL |
84.0 |
14260 |
Pave |
NaN |
IR1 |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
12 |
2008 |
WD |
Normal |
250000 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
1455 |
1456 |
60 |
RL |
62.0 |
7917 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
8 |
2007 |
WD |
Normal |
175000 |
1456 |
1457 |
20 |
RL |
85.0 |
13175 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
MnPrv |
NaN |
0 |
2 |
2010 |
WD |
Normal |
210000 |
1457 |
1458 |
70 |
RL |
66.0 |
9042 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
GdPrv |
Shed |
2500 |
5 |
2010 |
WD |
Normal |
266500 |
1458 |
1459 |
20 |
RL |
68.0 |
9717 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
4 |
2010 |
WD |
Normal |
142125 |
1459 |
1460 |
20 |
RL |
75.0 |
9937 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
6 |
2008 |
WD |
Normal |
147500 |
1460 rows × 81 columns
df = pd.read_csv('houseprice.csv')
df.head(2)
|
Id |
MSSubClass |
MSZoning |
LotFrontage |
LotArea |
Street |
Alley |
LotShape |
LandContour |
Utilities |
... |
PoolArea |
PoolQC |
Fence |
MiscFeature |
MiscVal |
MoSold |
YrSold |
SaleType |
SaleCondition |
SalePrice |
0 |
1 |
60 |
RL |
65.0 |
8450 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
2 |
2008 |
WD |
Normal |
208500 |
1 |
2 |
20 |
RL |
80.0 |
9600 |
Pave |
NaN |
Reg |
Lvl |
AllPub |
... |
0 |
NaN |
NaN |
NaN |
0 |
5 |
2007 |
WD |
Normal |
181500 |
2 rows × 81 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
isnull_series = df.isnull().sum()
isnull_series[isnull_series>0].sort_values(ascending=False)
PoolQC 1453
MiscFeature 1406
Alley 1369
Fence 1179
FireplaceQu 690
LotFrontage 259
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
BsmtExposure 38
BsmtFinType2 38
BsmtFinType1 37
BsmtCond 37
BsmtQual 37
MasVnrArea 8
MasVnrType 8
Electrical 1
dtype: int64
plt.xticks(rotation=5)
sns.histplot(df['SalePrice'],kde=True)
<AxesSubplot:xlabel='SalePrice', ylabel='Count'>

log_saleprice = np.log1p(df['SalePrice'])
sns.histplot(log_saleprice,kde=True)
<AxesSubplot:xlabel='SalePrice', ylabel='Count'>

# Keep a reference to the untransformed SalePrice column before it is replaced.
original_saleprice = df['SalePrice']
# Replace the target with log(1 + price); the histograms above show the raw
# target is right-skewed and the log1p version is closer to symmetric.
df['SalePrice'] = np.log1p(df['SalePrice'])
# Drop the five columns with the most missing values (see the null counts
# above) together with the 'Id' column.
df.drop(columns=['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','Id'],inplace=True)
# Fill the remaining numeric NaNs with the column mean.  numeric_only=True is
# required: the frame still contains object columns, and on pandas >= 2.0
# DataFrame.mean() raises TypeError for them instead of silently skipping.
df.fillna(df.mean(numeric_only=True),inplace=True)
# Columns that still contain NaN after the mean fill, and their dtypes.
null_counts = df.isnull().sum()
null_column_count = null_counts[null_counts>0]
df.dtypes[null_column_count.index]
MasVnrType object
BsmtQual object
BsmtCond object
BsmtExposure object
BsmtFinType1 object
BsmtFinType2 object
Electrical object
GarageType object
GarageFinish object
GarageQual object
GarageCond object
dtype: object
df.shape
(1460, 75)
df_ohe=pd.get_dummies(df)
df_ohe.shape
(1460, 271)
def get_rmse(model):
    """Print and return the RMSE of a fitted model on the test split.

    Reads the module-level ``X_test`` and ``y_test``.  The target in this
    notebook is log1p-transformed, so the value is an RMSE in log space.

    Args:
        model: A fitted estimator exposing ``predict``.

    Returns:
        The root mean squared error between ``y_test`` and the predictions.
    """
    from sklearn.metrics import mean_squared_error
    import numpy as np

    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print(model.__class__.__name__,' 로그 변환된 RMSE:',np.round(rmse,3))
    return rmse
def get_rmses(models):
    """Return the test-split RMSE of every model, in input order.

    Each model is evaluated with ``get_rmse``, which also prints its score.

    Args:
        models: Iterable of fitted estimators.

    Returns:
        A list with one RMSE value per model.
    """
    return [get_rmse(model) for model in models]
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
df_ohe['SalePrice']
0 12.247699
1 12.109016
2 12.317171
3 11.849405
4 12.429220
...
1455 12.072547
1456 12.254868
1457 12.493133
1458 11.864469
1459 11.901590
Name: SalePrice, Length: 1460, dtype: float64
y = df_ohe['SalePrice']
X = df_ohe.drop(columns=['SalePrice'])
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=156)
lr_reg = LinearRegression()
lr_reg.fit(X_train,y_train)
LinearRegression()
ridge_reg = Ridge()
ridge_reg.fit(X_train,y_train)
Ridge()
lasso_reg =Lasso()
lasso_reg.fit(X_train,y_train)
Lasso()
models=[lr_reg,ridge_reg,lasso_reg]
get_rmses(models)
LinearRegression 로그 변환된 RMSE: 0.132
Ridge 로그 변환된 RMSE: 0.128
Lasso 로그 변환된 RMSE: 0.176
[0.13189576579154494, 0.12750846334052998, 0.17628250556471403]
np.expm1(0.13189576579154494)
0.14098938294353924
def get_top_bottom_coef(model,n=10):
    """Return the model's coefficients as a Series labelled by feature name.

    Uses the columns of the module-level ``X`` as the index.

    Args:
        model: A fitted linear estimator exposing ``coef_``.
        n: Accepted but not used by this version of the function.

    Returns:
        A pandas Series of all coefficients indexed by feature name.
    """
    return pd.Series(data=model.coef_, index=X.columns)
get_top_bottom_coef(lr_reg)
MSSubClass -0.000488
LotFrontage 0.000390
LotArea 0.000002
OverallQual 0.041866
OverallCond 0.032077
...
SaleCondition_AdjLand -0.076029
SaleCondition_Alloca 0.028081
SaleCondition_Family -0.063272
SaleCondition_Normal -0.000464
SaleCondition_Partial 0.167346
Length: 270, dtype: float64
def get_top_bottom_coef(model,n=10):
    """Return the ``n`` largest and ``n`` smallest coefficients of a model.

    Coefficients are labelled with the columns of the module-level ``X``.

    Args:
        model: A fitted linear estimator exposing ``coef_``.
        n: Number of coefficients to take from each end.

    Returns:
        A tuple ``(coef_high, coef_low)`` of pandas Series, both sorted in
        descending order of coefficient value.
    """
    # Sort once in descending order; head is the top n, tail the bottom n.
    ordered = pd.Series(model.coef_, index=X.columns).sort_values(ascending=False)
    return ordered.head(n), ordered.tail(n)
get_top_bottom_coef(lr_reg,5)
(RoofMatl_Membran 0.528057
RoofMatl_Metal 0.414453
RoofMatl_WdShngl 0.345254
RoofMatl_Roll 0.311983
RoofStyle_Shed 0.292647
dtype: float64,
Electrical_Mix -0.300984
MSZoning_C (all) -0.323040
Functional_Maj2 -0.426389
Condition2_PosN -0.663744
RoofMatl_ClyTile -2.372268
dtype: float64)
def visualize_coefficient(models):
    """Plot the top and bottom coefficients of each model side by side.

    One subplot is created per model (the figure is 8 inches wide per model,
    i.e. 24x10 for three models).  Each subplot is a horizontal bar chart of
    the coefficients returned by ``get_top_bottom_coef``.

    Args:
        models: Iterable of fitted linear estimators exposing ``coef_``.
            An empty iterable draws nothing.
    """
    models = list(models)
    if not models:
        # Nothing to plot; plt.subplots cannot build a zero-column grid.
        return
    n_models = len(models)
    # squeeze=False keeps a 2-D axes array, so a single model is handled
    # the same way as several.
    fig, axs = plt.subplots(figsize=(8 * n_models, 10), nrows=1,
                            ncols=n_models, squeeze=False)
    fig.tight_layout()
    for ax, model in zip(axs[0], models):
        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat([coef_high, coef_low])
        ax.set_title(model.__class__.__name__+ ' Coefficient',size=25)
        # Negative pad with direction='in' draws the y tick labels inside the axes.
        ax.tick_params(axis='y',direction='in',pad=-120)
        for label in (ax.get_xticklabels()+ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values,y=coef_concat.index,ax=ax)
visualize_coefficient(models)

from sklearn.model_selection import cross_val_score
def get_avg_rmse_cv(models):
    """Print per-fold and mean 5-fold cross-validated RMSE for each model.

    Cross-validation runs on the module-level ``X`` and ``y`` (the full data,
    not the train/test split).

    Args:
        models: Iterable of estimators to evaluate.
    """
    for model in models:
        # The scorer returns negated MSE, so flip the sign before the root.
        neg_mse_scores = cross_val_score(
            model, X, y, scoring='neg_mean_squared_error', cv=5
        )
        rmse_list = np.sqrt(-neg_mse_scores)
        rmse_avg = np.mean(rmse_list)
        print(f'{model.__class__.__name__} cv rmse 값 리스트 : {np.round(rmse_list, 3)}')
        print(f'{model.__class__.__name__} cv 평균 rmse 값 : {np.round(rmse_avg, 3)}')
get_avg_rmse_cv(models)
LinearRegression cv rmse 값 리스트 : [0.135 0.165 0.168 0.111 0.198]
LinearRegression cv 평균 rmse 값 : 0.155
Ridge cv rmse 값 리스트 : [0.117 0.154 0.142 0.117 0.189]
Ridge cv 평균 rmse 값 : 0.144
Lasso cv rmse 값 리스트 : [0.161 0.204 0.177 0.181 0.265]
Lasso cv 평균 rmse 값 : 0.198
from sklearn.model_selection import GridSearchCV
def print_best_params(model, params):
    """Grid-search ``params`` with 5-fold CV and print the best RMSE and setting.

    The search is fitted on the module-level ``X`` and ``y``.

    Args:
        model: Estimator to tune.
        params: Parameter grid passed to ``GridSearchCV``.
    """
    grid_model = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    grid_model.fit(X, y)
    # best_score_ is a negated MSE; negate it back before taking the root.
    best_mse = -1 * grid_model.best_score_
    rmse = np.sqrt(best_mse)
    print(f'{model.__class__.__name__} 5 cv시 최적 평균 rmse 값:{np.round(rmse,4)}, 최적 alpha값:{grid_model.best_params_}')
ridge_params = {
'alpha':[0.05,0.1,1,5,8,12,15,20]
}
print_best_params(ridge_reg,ridge_params)
Ridge 5 cv시 최적 평균 rmse 값:0.1418, 최적 alpha값:{'alpha': 12}
lasso_params = {
'alpha':[0.001,0.005,0.008,0.05,0.03,0.1,0.5,1,5,10]
}
print_best_params(lasso_reg,lasso_params)
Lasso 5 cv시 최적 평균 rmse 값:0.142, 최적 alpha값:{'alpha': 0.001}
lr_reg = LinearRegression()
lr_reg.fit(X_train,y_train)
ridge_reg = Ridge(alpha=12)
ridge_reg.fit(X_train,y_train)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train,y_train)
models=[lr_reg,ridge_reg,lasso_reg]
get_rmses(models)
visualize_coefficient(models)
LinearRegression 로그 변환된 RMSE: 0.132
Ridge 로그 변환된 RMSE: 0.124
Lasso 로그 변환된 RMSE: 0.12

from scipy.stats import skew
# Select the non-object columns of the (not yet one-hot-encoded) frame.
column_dtypes = df.dtypes
features_index = column_dtypes[column_dtypes != 'object'].index
# Skewness of every selected column.
skew_features = df[features_index].apply(skew)
# Keep only the columns whose skewness exceeds 1.
skew_features_top = skew_features.loc[skew_features > 1]
skew_features_top.sort_values(ascending=False)
MiscVal 24.451640
PoolArea 14.813135
LotArea 12.195142
3SsnPorch 10.293752
LowQualFinSF 9.002080
KitchenAbvGr 4.483784
BsmtFinSF2 4.250888
ScreenPorch 4.117977
BsmtHalfBath 4.099186
EnclosedPorch 3.086696
MasVnrArea 2.673661
LotFrontage 2.382499
OpenPorchSF 2.361912
BsmtFinSF1 1.683771
WoodDeckSF 1.539792
TotalBsmtSF 1.522688
MSSubClass 1.406210
1stFlrSF 1.375342
GrLivArea 1.365156
dtype: float64
np.log1p(df[skew_features_top.index])
|
MSSubClass |
LotFrontage |
LotArea |
MasVnrArea |
BsmtFinSF1 |
BsmtFinSF2 |
TotalBsmtSF |
1stFlrSF |
LowQualFinSF |
GrLivArea |
BsmtHalfBath |
KitchenAbvGr |
WoodDeckSF |
OpenPorchSF |
EnclosedPorch |
3SsnPorch |
ScreenPorch |
PoolArea |
MiscVal |
0 |
4.110874 |
4.189655 |
9.042040 |
5.283204 |
6.561031 |
0.000000 |
6.753438 |
6.753438 |
0.0 |
7.444833 |
0.000000 |
0.693147 |
0.000000 |
4.127134 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
1 |
3.044522 |
4.394449 |
9.169623 |
0.000000 |
6.886532 |
0.000000 |
7.141245 |
7.141245 |
0.0 |
7.141245 |
0.693147 |
0.693147 |
5.700444 |
0.000000 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
2 |
4.110874 |
4.234107 |
9.328212 |
5.093750 |
6.188264 |
0.000000 |
6.825460 |
6.825460 |
0.0 |
7.488294 |
0.000000 |
0.693147 |
0.000000 |
3.761200 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
3 |
4.262680 |
4.110874 |
9.164401 |
0.000000 |
5.379897 |
0.000000 |
6.629363 |
6.869014 |
0.0 |
7.448916 |
0.000000 |
0.693147 |
0.000000 |
3.583519 |
5.609472 |
0.0 |
0.0 |
0.0 |
0.000000 |
4 |
4.110874 |
4.442651 |
9.565284 |
5.860786 |
6.486161 |
0.000000 |
7.044033 |
7.044033 |
0.0 |
7.695758 |
0.000000 |
0.693147 |
5.262690 |
4.442651 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
1455 |
4.110874 |
4.143135 |
8.976894 |
0.000000 |
0.000000 |
0.000000 |
6.860664 |
6.860664 |
0.0 |
7.407318 |
0.000000 |
0.693147 |
0.000000 |
3.713572 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
1456 |
3.044522 |
4.454347 |
9.486152 |
4.787492 |
6.673298 |
5.099866 |
7.341484 |
7.637234 |
0.0 |
7.637234 |
0.000000 |
0.693147 |
5.857933 |
0.000000 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
1457 |
4.262680 |
4.204693 |
9.109746 |
0.000000 |
5.620401 |
0.000000 |
7.050123 |
7.080868 |
0.0 |
7.758333 |
0.000000 |
0.693147 |
0.000000 |
4.110874 |
0.000000 |
0.0 |
0.0 |
0.0 |
7.824446 |
1458 |
3.044522 |
4.234107 |
9.181735 |
0.000000 |
3.912023 |
6.937314 |
6.983790 |
6.983790 |
0.0 |
6.983790 |
0.000000 |
0.693147 |
5.905362 |
0.000000 |
4.727388 |
0.0 |
0.0 |
0.0 |
0.000000 |
1459 |
3.044522 |
4.330733 |
9.204121 |
0.000000 |
6.722630 |
5.673323 |
7.136483 |
7.136483 |
0.0 |
7.136483 |
0.000000 |
0.693147 |
6.602588 |
4.234107 |
0.000000 |
0.0 |
0.0 |
0.0 |
0.000000 |
1460 rows × 19 columns
df[skew_features_top.index] = np.log1p(df[skew_features_top.index])
df_ohe=pd.get_dummies(df)
y = df_ohe['SalePrice']
X = df_ohe.drop(columns=['SalePrice'])
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=156)
ridge_params = {'alpha':[0.05,0.1,1,5,8,10,12,15,20]}
print_best_params(ridge_reg,ridge_params)
lasso_params = {'alpha':[0.001,0.005,0.008,0.05,0.03,0.1,0.5,1,5,10]}
print_best_params(lasso_reg,lasso_params)
Ridge 5 cv시 최적 평균 rmse 값:0.1275, 최적 alpha값:{'alpha': 10}
Lasso 5 cv시 최적 평균 rmse 값:0.1252, 최적 alpha값:{'alpha': 0.001}
lr_reg = LinearRegression()
lr_reg.fit(X_train,y_train)
ridge_reg = Ridge(alpha=10)
ridge_reg.fit(X_train,y_train)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train,y_train)
models=[lr_reg,ridge_reg,lasso_reg]
get_rmses(models)
visualize_coefficient(models)
LinearRegression 로그 변환된 RMSE: 0.128
Ridge 로그 변환된 RMSE: 0.122
Lasso 로그 변환된 RMSE: 0.119

df_org = pd.read_csv('houseprice.csv')
plt.scatter(x=df_org['GrLivArea'],y=df_org['SalePrice'])
<matplotlib.collections.PathCollection at 0x247de310730>

cond1 = df_ohe['GrLivArea'] > np.log1p(4000)
cond2 = df_ohe['SalePrice'] < np.log1p(500000)
df_ohe[cond1 & cond2]
|
MSSubClass |
LotFrontage |
LotArea |
OverallQual |
OverallCond |
YearBuilt |
YearRemodAdd |
MasVnrArea |
BsmtFinSF1 |
BsmtFinSF2 |
... |
SaleType_ConLw |
SaleType_New |
SaleType_Oth |
SaleType_WD |
SaleCondition_Abnorml |
SaleCondition_AdjLand |
SaleCondition_Alloca |
SaleCondition_Family |
SaleCondition_Normal |
SaleCondition_Partial |
523 |
4.110874 |
4.875197 |
10.599007 |
10 |
5 |
2007 |
2008 |
6.637258 |
7.723562 |
0.0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1298 |
4.110874 |
5.749393 |
11.064887 |
10 |
5 |
2008 |
2008 |
6.680855 |
8.638525 |
0.0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
2 rows × 271 columns
outlier_index = df_ohe[cond1 & cond2].index
df_ohe.shape
(1460, 271)
df_ohe.drop(index=outlier_index,inplace=True)
df_ohe.shape
(1458, 271)
y = df_ohe['SalePrice']
X = df_ohe.drop(columns=['SalePrice'])
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=156)
ridge_params = {'alpha':[0.05,0.1,1,5,8,10,12,15,20]}
print_best_params(ridge_reg,ridge_params)
lasso_params = {'alpha':[0.001,0.005,0.008,0.05,0.03,0.1,0.5,1,5,10]}
print_best_params(lasso_reg,lasso_params)
Ridge 5 cv시 최적 평균 rmse 값:0.1125, 최적 alpha값:{'alpha': 8}
Lasso 5 cv시 최적 평균 rmse 값:0.1122, 최적 alpha값:{'alpha': 0.001}
# Refit the three linear models on the train split built from the
# outlier-free data, then report test RMSE and plot the coefficients.
lr_reg = LinearRegression()
lr_reg.fit(X_train,y_train)
# The grid search just above selected alpha=8 for Ridge on this data;
# the earlier value of 10 belonged to the search before outlier removal.
ridge_reg = Ridge(alpha=8)
ridge_reg.fit(X_train,y_train)
# Lasso keeps alpha=0.001, which the grid search selected again.
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train,y_train)
models=[lr_reg,ridge_reg,lasso_reg]
get_rmses(models)
visualize_coefficient(models)
LinearRegression 로그 변환된 RMSE: 0.129
Ridge 로그 변환된 RMSE: 0.103
Lasso 로그 변환된 RMSE: 0.1

차원 축소
- 일반적으로 적은 차원에서 학습된 모델일수록 신뢰도가 높아짐.
- 피처 선택(feature selection)과 피처 추출(feature extraction)을 일반적으로 사용
주 성분 분석(Principal Component Analysis)
- PCA는 가장 대표적인 차원 축소 기법.
- 축 생성 -> 축으로 데이터 투영 -> 데이터를 축으로 만들어 표현
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
iris = load_iris(as_frame=True)
iris.data.columns
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
'petal width (cm)'],
dtype='object')
iris.data.columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
iris.data
|
sepal length |
sepal width |
petal length |
petal width |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
... |
... |
... |
... |
... |
145 |
6.7 |
3.0 |
5.2 |
2.3 |
146 |
6.3 |
2.5 |
5.0 |
1.9 |
147 |
6.5 |
3.0 |
5.2 |
2.0 |
148 |
6.2 |
3.4 |
5.4 |
2.3 |
149 |
5.9 |
3.0 |
5.1 |
1.8 |
150 rows × 4 columns
iris.target
0 0
1 0
2 0
3 0
4 0
..
145 2
146 2
147 2
148 2
149 2
Name: target, Length: 150, dtype: int32
iris.data['target'] = iris.target
iris.data.head(2)
|
sepal length |
sepal width |
petal length |
petal width |
target |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
0 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
0 |
# Scatter sepal length against sepal width, one marker shape per class.
markers=['^','s','o']
# From here on ``df`` refers to the iris frame (feature columns plus 'target').
df = iris.data
for i, marker in enumerate(markers):
    is_class = df['target']==i
    x = df.loc[is_class, 'sepal length']
    y = df.loc[is_class, 'sepal width']
    plt.scatter(x,y,marker=marker,label=iris.target_names[i])
plt.legend()
plt.show()

from sklearn.preprocessing import StandardScaler
df_scaled = StandardScaler().fit_transform(df.iloc[:,:-1])
df_scaled
array([[-9.00681170e-01, 1.01900435e+00, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.38535265e+00, 3.28414053e-01, -1.39706395e+00,
-1.31544430e+00],
[-1.50652052e+00, 9.82172869e-02, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.24920112e+00, -1.34022653e+00,
-1.31544430e+00],
[-5.37177559e-01, 1.93979142e+00, -1.16971425e+00,
-1.05217993e+00],
[-1.50652052e+00, 7.88807586e-01, -1.34022653e+00,
-1.18381211e+00],
[-1.02184904e+00, 7.88807586e-01, -1.28338910e+00,
-1.31544430e+00],
[-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, 9.82172869e-02, -1.28338910e+00,
-1.44707648e+00],
[-5.37177559e-01, 1.47939788e+00, -1.28338910e+00,
-1.31544430e+00],
[-1.26418478e+00, 7.88807586e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.26418478e+00, -1.31979479e-01, -1.34022653e+00,
-1.44707648e+00],
[-1.87002413e+00, -1.31979479e-01, -1.51073881e+00,
-1.44707648e+00],
[-5.25060772e-02, 2.16998818e+00, -1.45390138e+00,
-1.31544430e+00],
[-1.73673948e-01, 3.09077525e+00, -1.28338910e+00,
-1.05217993e+00],
[-5.37177559e-01, 1.93979142e+00, -1.39706395e+00,
-1.05217993e+00],
[-9.00681170e-01, 1.01900435e+00, -1.34022653e+00,
-1.18381211e+00],
[-1.73673948e-01, 1.70959465e+00, -1.16971425e+00,
-1.18381211e+00],
[-9.00681170e-01, 1.70959465e+00, -1.28338910e+00,
-1.18381211e+00],
[-5.37177559e-01, 7.88807586e-01, -1.16971425e+00,
-1.31544430e+00],
[-9.00681170e-01, 1.47939788e+00, -1.28338910e+00,
-1.05217993e+00],
[-1.50652052e+00, 1.24920112e+00, -1.56757623e+00,
-1.31544430e+00],
[-9.00681170e-01, 5.58610819e-01, -1.16971425e+00,
-9.20547742e-01],
[-1.26418478e+00, 7.88807586e-01, -1.05603939e+00,
-1.31544430e+00],
[-1.02184904e+00, -1.31979479e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.02184904e+00, 7.88807586e-01, -1.22655167e+00,
-1.05217993e+00],
[-7.79513300e-01, 1.01900435e+00, -1.28338910e+00,
-1.31544430e+00],
[-7.79513300e-01, 7.88807586e-01, -1.34022653e+00,
-1.31544430e+00],
[-1.38535265e+00, 3.28414053e-01, -1.22655167e+00,
-1.31544430e+00],
[-1.26418478e+00, 9.82172869e-02, -1.22655167e+00,
-1.31544430e+00],
[-5.37177559e-01, 7.88807586e-01, -1.28338910e+00,
-1.05217993e+00],
[-7.79513300e-01, 2.40018495e+00, -1.28338910e+00,
-1.44707648e+00],
[-4.16009689e-01, 2.63038172e+00, -1.34022653e+00,
-1.31544430e+00],
[-1.14301691e+00, 9.82172869e-02, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 3.28414053e-01, -1.45390138e+00,
-1.31544430e+00],
[-4.16009689e-01, 1.01900435e+00, -1.39706395e+00,
-1.31544430e+00],
[-1.14301691e+00, 1.24920112e+00, -1.34022653e+00,
-1.44707648e+00],
[-1.74885626e+00, -1.31979479e-01, -1.39706395e+00,
-1.31544430e+00],
[-9.00681170e-01, 7.88807586e-01, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.01900435e+00, -1.39706395e+00,
-1.18381211e+00],
[-1.62768839e+00, -1.74335684e+00, -1.39706395e+00,
-1.18381211e+00],
[-1.74885626e+00, 3.28414053e-01, -1.39706395e+00,
-1.31544430e+00],
[-1.02184904e+00, 1.01900435e+00, -1.22655167e+00,
-7.88915558e-01],
[-9.00681170e-01, 1.70959465e+00, -1.05603939e+00,
-1.05217993e+00],
[-1.26418478e+00, -1.31979479e-01, -1.34022653e+00,
-1.18381211e+00],
[-9.00681170e-01, 1.70959465e+00, -1.22655167e+00,
-1.31544430e+00],
[-1.50652052e+00, 3.28414053e-01, -1.34022653e+00,
-1.31544430e+00],
[-6.58345429e-01, 1.47939788e+00, -1.28338910e+00,
-1.31544430e+00],
[-1.02184904e+00, 5.58610819e-01, -1.34022653e+00,
-1.31544430e+00],
[ 1.40150837e+00, 3.28414053e-01, 5.35408562e-01,
2.64141916e-01],
[ 6.74501145e-01, 3.28414053e-01, 4.21733708e-01,
3.95774101e-01],
[ 1.28034050e+00, 9.82172869e-02, 6.49083415e-01,
3.95774101e-01],
[-4.16009689e-01, -1.74335684e+00, 1.37546573e-01,
1.32509732e-01],
[ 7.95669016e-01, -5.92373012e-01, 4.78571135e-01,
3.95774101e-01],
[-1.73673948e-01, -5.92373012e-01, 4.21733708e-01,
1.32509732e-01],
[ 5.53333275e-01, 5.58610819e-01, 5.35408562e-01,
5.27406285e-01],
[-1.14301691e+00, -1.51316008e+00, -2.60315415e-01,
-2.62386821e-01],
[ 9.16836886e-01, -3.62176246e-01, 4.78571135e-01,
1.32509732e-01],
[-7.79513300e-01, -8.22569778e-01, 8.07091462e-02,
2.64141916e-01],
[-1.02184904e+00, -2.43394714e+00, -1.46640561e-01,
-2.62386821e-01],
[ 6.86617933e-02, -1.31979479e-01, 2.51221427e-01,
3.95774101e-01],
[ 1.89829664e-01, -1.97355361e+00, 1.37546573e-01,
-2.62386821e-01],
[ 3.10997534e-01, -3.62176246e-01, 5.35408562e-01,
2.64141916e-01],
[-2.94841818e-01, -3.62176246e-01, -8.98031345e-02,
1.32509732e-01],
[ 1.03800476e+00, 9.82172869e-02, 3.64896281e-01,
2.64141916e-01],
[-2.94841818e-01, -1.31979479e-01, 4.21733708e-01,
3.95774101e-01],
[-5.25060772e-02, -8.22569778e-01, 1.94384000e-01,
-2.62386821e-01],
[ 4.32165405e-01, -1.97355361e+00, 4.21733708e-01,
3.95774101e-01],
[-2.94841818e-01, -1.28296331e+00, 8.07091462e-02,
-1.30754636e-01],
[ 6.86617933e-02, 3.28414053e-01, 5.92245988e-01,
7.90670654e-01],
[ 3.10997534e-01, -5.92373012e-01, 1.37546573e-01,
1.32509732e-01],
[ 5.53333275e-01, -1.28296331e+00, 6.49083415e-01,
3.95774101e-01],
[ 3.10997534e-01, -5.92373012e-01, 5.35408562e-01,
8.77547895e-04],
[ 6.74501145e-01, -3.62176246e-01, 3.08058854e-01,
1.32509732e-01],
[ 9.16836886e-01, -1.31979479e-01, 3.64896281e-01,
2.64141916e-01],
[ 1.15917263e+00, -5.92373012e-01, 5.92245988e-01,
2.64141916e-01],
[ 1.03800476e+00, -1.31979479e-01, 7.05920842e-01,
6.59038469e-01],
[ 1.89829664e-01, -3.62176246e-01, 4.21733708e-01,
3.95774101e-01],
[-1.73673948e-01, -1.05276654e+00, -1.46640561e-01,
-2.62386821e-01],
[-4.16009689e-01, -1.51316008e+00, 2.38717193e-02,
-1.30754636e-01],
[-4.16009689e-01, -1.51316008e+00, -3.29657076e-02,
-2.62386821e-01],
[-5.25060772e-02, -8.22569778e-01, 8.07091462e-02,
8.77547895e-04],
[ 1.89829664e-01, -8.22569778e-01, 7.62758269e-01,
5.27406285e-01],
[-5.37177559e-01, -1.31979479e-01, 4.21733708e-01,
3.95774101e-01],
[ 1.89829664e-01, 7.88807586e-01, 4.21733708e-01,
5.27406285e-01],
[ 1.03800476e+00, 9.82172869e-02, 5.35408562e-01,
3.95774101e-01],
[ 5.53333275e-01, -1.74335684e+00, 3.64896281e-01,
1.32509732e-01],
[-2.94841818e-01, -1.31979479e-01, 1.94384000e-01,
1.32509732e-01],
[-4.16009689e-01, -1.28296331e+00, 1.37546573e-01,
1.32509732e-01],
[-4.16009689e-01, -1.05276654e+00, 3.64896281e-01,
8.77547895e-04],
[ 3.10997534e-01, -1.31979479e-01, 4.78571135e-01,
2.64141916e-01],
[-5.25060772e-02, -1.05276654e+00, 1.37546573e-01,
8.77547895e-04],
[-1.02184904e+00, -1.74335684e+00, -2.60315415e-01,
-2.62386821e-01],
[-2.94841818e-01, -8.22569778e-01, 2.51221427e-01,
1.32509732e-01],
[-1.73673948e-01, -1.31979479e-01, 2.51221427e-01,
8.77547895e-04],
[-1.73673948e-01, -3.62176246e-01, 2.51221427e-01,
1.32509732e-01],
[ 4.32165405e-01, -3.62176246e-01, 3.08058854e-01,
1.32509732e-01],
[-9.00681170e-01, -1.28296331e+00, -4.30827696e-01,
-1.30754636e-01],
[-1.73673948e-01, -5.92373012e-01, 1.94384000e-01,
1.32509732e-01],
[ 5.53333275e-01, 5.58610819e-01, 1.27429511e+00,
1.71209594e+00],
[-5.25060772e-02, -8.22569778e-01, 7.62758269e-01,
9.22302838e-01],
[ 1.52267624e+00, -1.31979479e-01, 1.21745768e+00,
1.18556721e+00],
[ 5.53333275e-01, -3.62176246e-01, 1.04694540e+00,
7.90670654e-01],
[ 7.95669016e-01, -1.31979479e-01, 1.16062026e+00,
1.31719939e+00],
[ 2.12851559e+00, -1.31979479e-01, 1.61531967e+00,
1.18556721e+00],
[-1.14301691e+00, -1.28296331e+00, 4.21733708e-01,
6.59038469e-01],
[ 1.76501198e+00, -3.62176246e-01, 1.44480739e+00,
7.90670654e-01],
[ 1.03800476e+00, -1.28296331e+00, 1.16062026e+00,
7.90670654e-01],
[ 1.64384411e+00, 1.24920112e+00, 1.33113254e+00,
1.71209594e+00],
[ 7.95669016e-01, 3.28414053e-01, 7.62758269e-01,
1.05393502e+00],
[ 6.74501145e-01, -8.22569778e-01, 8.76433123e-01,
9.22302838e-01],
[ 1.15917263e+00, -1.31979479e-01, 9.90107977e-01,
1.18556721e+00],
[-1.73673948e-01, -1.28296331e+00, 7.05920842e-01,
1.05393502e+00],
[-5.25060772e-02, -5.92373012e-01, 7.62758269e-01,
1.58046376e+00],
[ 6.74501145e-01, 3.28414053e-01, 8.76433123e-01,
1.44883158e+00],
[ 7.95669016e-01, -1.31979479e-01, 9.90107977e-01,
7.90670654e-01],
[ 2.24968346e+00, 1.70959465e+00, 1.67215710e+00,
1.31719939e+00],
[ 2.24968346e+00, -1.05276654e+00, 1.78583195e+00,
1.44883158e+00],
[ 1.89829664e-01, -1.97355361e+00, 7.05920842e-01,
3.95774101e-01],
[ 1.28034050e+00, 3.28414053e-01, 1.10378283e+00,
1.44883158e+00],
[-2.94841818e-01, -5.92373012e-01, 6.49083415e-01,
1.05393502e+00],
[ 2.24968346e+00, -5.92373012e-01, 1.67215710e+00,
1.05393502e+00],
[ 5.53333275e-01, -8.22569778e-01, 6.49083415e-01,
7.90670654e-01],
[ 1.03800476e+00, 5.58610819e-01, 1.10378283e+00,
1.18556721e+00],
[ 1.64384411e+00, 3.28414053e-01, 1.27429511e+00,
7.90670654e-01],
[ 4.32165405e-01, -5.92373012e-01, 5.92245988e-01,
7.90670654e-01],
[ 3.10997534e-01, -1.31979479e-01, 6.49083415e-01,
7.90670654e-01],
[ 6.74501145e-01, -5.92373012e-01, 1.04694540e+00,
1.18556721e+00],
[ 1.64384411e+00, -1.31979479e-01, 1.16062026e+00,
5.27406285e-01],
[ 1.88617985e+00, -5.92373012e-01, 1.33113254e+00,
9.22302838e-01],
[ 2.49201920e+00, 1.70959465e+00, 1.50164482e+00,
1.05393502e+00],
[ 6.74501145e-01, -5.92373012e-01, 1.04694540e+00,
1.31719939e+00],
[ 5.53333275e-01, -5.92373012e-01, 7.62758269e-01,
3.95774101e-01],
[ 3.10997534e-01, -1.05276654e+00, 1.04694540e+00,
2.64141916e-01],
[ 2.24968346e+00, -1.31979479e-01, 1.33113254e+00,
1.44883158e+00],
[ 5.53333275e-01, 7.88807586e-01, 1.04694540e+00,
1.58046376e+00],
[ 6.74501145e-01, 9.82172869e-02, 9.90107977e-01,
7.90670654e-01],
[ 1.89829664e-01, -1.31979479e-01, 5.92245988e-01,
7.90670654e-01],
[ 1.28034050e+00, 9.82172869e-02, 9.33270550e-01,
1.18556721e+00],
[ 1.03800476e+00, 9.82172869e-02, 1.04694540e+00,
1.58046376e+00],
[ 1.28034050e+00, 9.82172869e-02, 7.62758269e-01,
1.44883158e+00],
[-5.25060772e-02, -8.22569778e-01, 7.62758269e-01,
9.22302838e-01],
[ 1.15917263e+00, 3.28414053e-01, 1.21745768e+00,
1.44883158e+00],
[ 1.03800476e+00, 5.58610819e-01, 1.10378283e+00,
1.71209594e+00],
[ 1.03800476e+00, -1.31979479e-01, 8.19595696e-01,
1.44883158e+00],
[ 5.53333275e-01, -1.28296331e+00, 7.05920842e-01,
9.22302838e-01],
[ 7.95669016e-01, -1.31979479e-01, 8.19595696e-01,
1.05393502e+00],
[ 4.32165405e-01, 7.88807586e-01, 9.33270550e-01,
1.44883158e+00],
[ 6.86617933e-02, -1.31979479e-01, 7.62758269e-01,
7.90670654e-01]])
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(df_scaled)
iris_pca
array([[-2.26470281, 0.4800266 ],
[-2.08096115, -0.67413356],
[-2.36422905, -0.34190802],
[-2.29938422, -0.59739451],
[-2.38984217, 0.64683538],
[-2.07563095, 1.48917752],
[-2.44402884, 0.0476442 ],
[-2.23284716, 0.22314807],
[-2.33464048, -1.11532768],
[-2.18432817, -0.46901356],
[-2.1663101 , 1.04369065],
[-2.32613087, 0.13307834],
[-2.2184509 , -0.72867617],
[-2.6331007 , -0.96150673],
[-2.1987406 , 1.86005711],
[-2.26221453, 2.68628449],
[-2.2075877 , 1.48360936],
[-2.19034951, 0.48883832],
[-1.898572 , 1.40501879],
[-2.34336905, 1.12784938],
[-1.914323 , 0.40885571],
[-2.20701284, 0.92412143],
[-2.7743447 , 0.45834367],
[-1.81866953, 0.08555853],
[-2.22716331, 0.13725446],
[-1.95184633, -0.62561859],
[-2.05115137, 0.24216355],
[-2.16857717, 0.52714953],
[-2.13956345, 0.31321781],
[-2.26526149, -0.3377319 ],
[-2.14012214, -0.50454069],
[-1.83159477, 0.42369507],
[-2.61494794, 1.79357586],
[-2.44617739, 2.15072788],
[-2.10997488, -0.46020184],
[-2.2078089 , -0.2061074 ],
[-2.04514621, 0.66155811],
[-2.52733191, 0.59229277],
[-2.42963258, -0.90418004],
[-2.16971071, 0.26887896],
[-2.28647514, 0.44171539],
[-1.85812246, -2.33741516],
[-2.5536384 , -0.47910069],
[-1.96444768, 0.47232667],
[-2.13705901, 1.14222926],
[-2.0697443 , -0.71105273],
[-2.38473317, 1.1204297 ],
[-2.39437631, -0.38624687],
[-2.22944655, 0.99795976],
[-2.20383344, 0.00921636],
[ 1.10178118, 0.86297242],
[ 0.73133743, 0.59461473],
[ 1.24097932, 0.61629765],
[ 0.40748306, -1.75440399],
[ 1.0754747 , -0.20842105],
[ 0.38868734, -0.59328364],
[ 0.74652974, 0.77301931],
[-0.48732274, -1.85242909],
[ 0.92790164, 0.03222608],
[ 0.01142619, -1.03401828],
[-0.11019628, -2.65407282],
[ 0.44069345, -0.06329519],
[ 0.56210831, -1.76472438],
[ 0.71956189, -0.18622461],
[-0.0333547 , -0.43900321],
[ 0.87540719, 0.50906396],
[ 0.35025167, -0.19631173],
[ 0.15881005, -0.79209574],
[ 1.22509363, -1.6222438 ],
[ 0.1649179 , -1.30260923],
[ 0.73768265, 0.39657156],
[ 0.47628719, -0.41732028],
[ 1.2341781 , -0.93332573],
[ 0.6328582 , -0.41638772],
[ 0.70266118, -0.06341182],
[ 0.87427365, 0.25079339],
[ 1.25650912, -0.07725602],
[ 1.35840512, 0.33131168],
[ 0.66480037, -0.22592785],
[-0.04025861, -1.05871855],
[ 0.13079518, -1.56227183],
[ 0.02345269, -1.57247559],
[ 0.24153827, -0.77725638],
[ 1.06109461, -0.63384324],
[ 0.22397877, -0.28777351],
[ 0.42913912, 0.84558224],
[ 1.04872805, 0.5220518 ],
[ 1.04453138, -1.38298872],
[ 0.06958832, -0.21950333],
[ 0.28347724, -1.32932464],
[ 0.27907778, -1.12002852],
[ 0.62456979, 0.02492303],
[ 0.33653037, -0.98840402],
[-0.36218338, -2.01923787],
[ 0.28858624, -0.85573032],
[ 0.09136066, -0.18119213],
[ 0.22771687, -0.38492008],
[ 0.57638829, -0.1548736 ],
[-0.44766702, -1.54379203],
[ 0.25673059, -0.5988518 ],
[ 1.84456887, 0.87042131],
[ 1.15788161, -0.69886986],
[ 2.20526679, 0.56201048],
[ 1.44015066, -0.04698759],
[ 1.86781222, 0.29504482],
[ 2.75187334, 0.8004092 ],
[ 0.36701769, -1.56150289],
[ 2.30243944, 0.42006558],
[ 2.00668647, -0.71143865],
[ 2.25977735, 1.92101038],
[ 1.36417549, 0.69275645],
[ 1.60267867, -0.42170045],
[ 1.8839007 , 0.41924965],
[ 1.2601151 , -1.16226042],
[ 1.4676452 , -0.44227159],
[ 1.59007732, 0.67624481],
[ 1.47143146, 0.25562182],
[ 2.42632899, 2.55666125],
[ 3.31069558, 0.01778095],
[ 1.26376667, -1.70674538],
[ 2.0377163 , 0.91046741],
[ 0.97798073, -0.57176432],
[ 2.89765149, 0.41364106],
[ 1.33323218, -0.48181122],
[ 1.7007339 , 1.01392187],
[ 1.95432671, 1.0077776 ],
[ 1.17510363, -0.31639447],
[ 1.02095055, 0.06434603],
[ 1.78834992, -0.18736121],
[ 1.86364755, 0.56229073],
[ 2.43595373, 0.25928443],
[ 2.30492772, 2.62632347],
[ 1.86270322, -0.17854949],
[ 1.11414774, -0.29292262],
[ 1.2024733 , -0.81131527],
[ 2.79877045, 0.85680333],
[ 1.57625591, 1.06858111],
[ 1.3462921 , 0.42243061],
[ 0.92482492, 0.0172231 ],
[ 1.85204505, 0.67612817],
[ 2.01481043, 0.61388564],
[ 1.90178409, 0.68957549],
[ 1.15788161, -0.69886986],
[ 2.04055823, 0.8675206 ],
[ 1.9981471 , 1.04916875],
[ 1.87050329, 0.38696608],
[ 1.56458048, -0.89668681],
[ 1.5211705 , 0.26906914],
[ 1.37278779, 1.01125442],
[ 0.96065603, -0.02433167]])
pca_columns = ['pca_com_1','pca_com_2']
df_pca = pd.DataFrame(iris_pca,columns=pca_columns)
df_pca['target']=iris.target
df_pca.head(2)
|
pca_com_1 |
pca_com_2 |
target |
0 |
-2.264703 |
0.480027 |
0 |
1 |
-2.080961 |
-0.674134 |
0 |
# Scatter the two PCA components, one marker shape per class.
markers=['^','s','o']
# ``df`` is rebound to the iris frame as before; the loop itself only reads df_pca.
df = iris.data
for i, marker in enumerate(markers):
    is_class = df_pca['target']==i
    x = df_pca.loc[is_class, 'pca_com_1']
    y = df_pca.loc[is_class, 'pca_com_2']
    plt.scatter(x,y,marker=marker,label=iris.target_names[i])
plt.legend()
plt.show()

pca.explained_variance_ratio_
array([0.72962445, 0.22850762])
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
# Baseline: 3-fold cross-validated accuracy of a random forest on the iris
# feature columns.  iloc[:, :-1] drops the last column of iris.data, which is
# the 'target' column appended to the frame earlier.
rcf = RandomForestClassifier(random_state=156)
scores = cross_val_score(rcf, iris.data.iloc[:,:-1],iris.target,scoring='accuracy',cv=3)
print(f'개별 정확도:{scores}, 평균 정확도 : {np.mean(scores)}')
개별 정확도:[0.98 0.94 0.96], 평균 정확도 : 0.96
# Same 3-fold evaluation on the two PCA components only; iloc[:, :-1] drops
# the 'target' column of df_pca.
# NOTE(review): the scaler and PCA were fitted on all rows before
# cross-validation, so the folds are not fully independent of the transform.
rcf = RandomForestClassifier(random_state=156)
scores = cross_val_score(rcf, df_pca.iloc[:,:-1],iris.target,scoring='accuracy',cv=3)
print(f'개별 정확도:{scores}, 평균 정확도 : {np.mean(scores)}')
개별 정확도:[0.88 0.88 0.88], 평균 정확도 : 0.88