Topics covered
One hot encoding
pd.get_dummies(data, prefix='X')
pd.get_dummies(data, drop_first = True)
from category_encoders import OneHotEncoder   # use_cat_names comes from category_encoders, not sklearn
encoder = OneHotEncoder(use_cat_names=True)
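A minimal runnable sketch of both approaches; the toy DataFrame and its 'city' / 'price' columns are made up for illustration:

import pandas as pd
from category_encoders import OneHotEncoder

df = pd.DataFrame({'city': ['seoul', 'busan', 'seoul'], 'price': [3, 2, 4]})

# pandas: one indicator column per category, names prefixed with 'X'
dummies = pd.get_dummies(df['city'], prefix='X')

# drop_first=True drops one level per feature to avoid redundant columns
dummies_dropped = pd.get_dummies(df, columns=['city'], drop_first=True)

# category_encoders: use_cat_names=True keeps readable names like 'city_seoul'
encoder = OneHotEncoder(use_cat_names=True, cols=['city'])
encoded = encoder.fit_transform(df)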
Pandas profiling
import pandas_profiling
from pandas_profiling import ProfileReport
df.profile_report()
ProfileReport(df)
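A short sketch of generating a report outside a notebook (the CSV path and output filename are placeholders):

import pandas as pd
from pandas_profiling import ProfileReport

df = pd.read_csv('data.csv')                 # placeholder: any DataFrame works
report = ProfileReport(df, minimal=True)     # minimal=True skips the slower computations
report.to_file('profile_report.html')        # in a notebook, df.profile_report() renders inline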
Distribution plot
sns.displot: newer figure-level API (kind='hist', 'kde', or 'ecdf')
sns.distplot: older axes-level API, deprecated since seaborn 0.11 (use displot / histplot)
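A minimal sketch of both calls on synthetic data (the 'price' column is generated here just for the plot):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame({'price': np.random.lognormal(mean=10, sigma=0.5, size=1000)})

sns.displot(data=df, x='price', kind='kde')   # figure-level replacement for distplot
# sns.distplot(df['price'])                   # older call, emits a deprecation warning
plt.show()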
Removing outliers with np.percentile
# q is on a 0-100 scale: 0.05 means the 0.05th percentile, 99.5 the 99.5th
df = df[(df['price'] >= np.percentile(df['price'], 0.05)) &
        (df['price'] <= np.percentile(df['price'], 99.5))]
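The same trim written as a self-contained sketch, with an equivalent pandas version (the lognormal 'price' data is made up; pandas' quantile() expects 0-1 rather than 0-100):

import numpy as np
import pandas as pd

df = pd.DataFrame({'price': np.random.lognormal(10, 1, 1000)})

low, high = np.percentile(df['price'], [0.05, 99.5])
trimmed = df[df['price'].between(low, high)]

# equivalent with pandas quantiles
trimmed_alt = df[df['price'].between(df['price'].quantile(0.0005),
                                     df['price'].quantile(0.995))]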
K best feature selection
from sklearn.feature_selection import f_regression, SelectKBest

selector = SelectKBest(score_func=f_regression, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)   # fit on the training split only
X_test_selected = selector.transform(X_test)

mark = selector.get_support()        # boolean mask, only valid after fitting
all_names = X_train.columns          # mask lines up with the feature columns, not the full df
select_names = all_names[mark]
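An end-to-end sketch on synthetic regression data (feature names, sizes, and k=10 are arbitrary):

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=20, n_informative=10, random_state=0)
X = pd.DataFrame(X, columns=[f'f{i}' for i in range(20)])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

selector = SelectKBest(score_func=f_regression, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
print(X_train.columns[selector.get_support()])   # names of the 10 selected features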
Ridge
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_absolute_error, r2_score
ridge = Ridge(alpha=alpha, normalize=True)   # Ridge takes a single alpha; normalize= was removed in scikit-learn 1.2
RidgeCV
alphas = [0, 0.001, 0.01, 0.1, 1]
ridge = RidgeCV(alphas = alphas, normalize = True, cv = 5)
ridge.fit(X_train_selected, y_train)
print('best alpha : ', ridge.alpha_)
print('best score : ', ridge.best_score_)
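A self-contained sketch of the RidgeCV workflow with the metrics imported above (synthetic data, arbitrary alphas; it skips alpha=0 and normalize=, which recent scikit-learn releases no longer accept):

from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

alphas = [0.001, 0.01, 0.1, 1]
ridge = RidgeCV(alphas=alphas, cv=5)
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)
print('best alpha :', ridge.alpha_)
print('best score :', ridge.best_score_)          # mean CV score for the best alpha
print('test MAE   :', mean_absolute_error(y_test, y_pred))
print('test R^2   :', r2_score(y_test, y_pred))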