지도학습(데이터 학습)

이주현·2023년 12월 12일

머신러닝

목록 보기

10/14

학습 순서 및 방법

1.1 특성(X)과 타깃(y) 분리

# Split the DataFrame into the target series (y) and the feature matrix (X).
target_column = '타깃데이터'
y = df[target_column]
X = df.drop(columns=[target_column])

1.2 학습/검증 데이터 분리

# Split features and target into training and validation subsets.
from sklearn.model_selection import train_test_split

# The call is wrapped in parentheses so the statement can span several lines;
# a bare line break directly after "=" is a SyntaxError.
# test_size=0.2 holds out 20% of the rows; random_state=42 fixes the shuffle.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

2. 모델학습

from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split.
# fit() returns the estimator itself, so creation and training are chained.
model = LinearRegression().fit(X_tr, y_tr)

3. 모델 검증 및 성능 평가

# Validate the fitted model on the held-out split and report its metrics.
# The metric functions and numpy are imported here because nothing earlier
# in the script brings them into scope.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_val)
print(y_pred)

# R^2: 1.0 is a perfect fit; the score can be negative when the model does
# worse than always predicting the mean of y_val.
r2 = r2_score(y_val, y_pred)
print('R2:', r2)

# RMSE: square root of the mean squared error (lower is better).
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print('RMSE:', rmse)

# MAE: mean absolute error (lower is better).
mae = mean_absolute_error(y_val, y_pred)
print('MAE:', mae)

4. 하이퍼파라미터 최적화(그리드서치)

# Tune the Ridge regularisation strength (alpha) with a grid search.
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate alpha values tried by the search.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
model = Ridge()

# 5-fold cross-validation over every candidate, run on the training split.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_tr, y_tr)

# Inspect the winning parameters and the corresponding estimator.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(best_model)
print(best_params)

5. 최적값으로 다시 모델 훈련


# Retrain a Ridge model with the alpha selected by the grid search.
from sklearn.linear_model import Ridge

# Read alpha from the search result instead of hard-coding it, so this step
# stays in sync with whatever value the grid search actually selected.
ridge_model = Ridge(alpha=grid_search.best_params_['alpha'])

# Fit on the training split.
ridge_model.fit(X_tr, y_tr)

6. 다시 성능평가

# Re-evaluate the retrained Ridge model on the validation split.
# r2_score and numpy are imported here because nothing earlier in the
# script brings them into scope.
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the validation split.
y_pred = ridge_model.predict(X_val)

# Evaluation metrics.
r2 = r2_score(y_val, y_pred)
print('R2:', r2)

# Compute MSE once and derive RMSE from it instead of recomputing.
mse = mean_squared_error(y_val, y_pred)
print('MSE:', mse)

rmse = np.sqrt(mse)
print('RMSE:', rmse)

7. 시각화

# Scatter plot comparing actual and predicted values on the validation split.
# pyplot is imported here because this snippet runs before the script's
# later matplotlib import.
import matplotlib.pyplot as plt

plt.scatter(y_val, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()


import matplotlib.pyplot as plt
import numpy as np

# Coefficients learned by the retrained Ridge model.
ridge_coefficients = ridge_model.coef_

# Feature names come from the training frame, which the split step names
# X_tr (X_train is never defined in this script).
feature_names = X_tr.columns

# Pair each coefficient with its feature, largest absolute value first.
sorted_coefficients = sorted(
    zip(ridge_coefficients, feature_names),
    key=lambda pair: abs(pair[0]),
    reverse=True,
)

# Horizontal bar chart of the sorted coefficients.
coefficients, names = zip(*sorted_coefficients)
plt.figure(figsize=(10, 6))
plt.barh(range(len(names)), coefficients, align='center')
plt.yticks(range(len(names)), names)
plt.gca().invert_yaxis()  # show the largest-magnitude coefficient at the top