[KT AIVLE] Pt1. 인공지능 총 정리!

누디·2023년 7월 3일

AIVLE

목록 보기

8/8

에이블스쿨 Pt1 인공지능 과정에서 배운 내용을 총 정리해 두었다.
노션에 바로바로 업로드하다 보니 벨로그로 옮기기가 쉽지 않네 (...)
이미지 데이터(CNN, YOLO), 자연어 데이터(RNN, LSTM) 부분을 제외하면 모든 부분이 정리되어 있다!

데이터 탐색

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

1. 데이터 불러오기

# Read 'data.csv' into the DataFrame `df` used by the rest of the notes.
df = pd.read_csv('data.csv')

2. 데이터 정보 확인

# First five rows
df.head()

# DataFrame summary (column info, null presence, dtypes)
df.info()

# Column labels of the DataFrame
df.columns

# Summary statistics for the values that can be computed on
df.describe()

# Count of null entries in each DataFrame column
df.isnull().sum()

3. 통계적 특성 파악

# Summary statistics of the DataFrame (same call as in the inspection step).
df.describe()

4. 필요한 컬럼 추출

# Keep only the listed columns as a separate DataFrame `cust`.
# NOTE(review): "col1".."col3" look like placeholder names — later snippets
# read cust['age'] and cust['class'], so substitute the real column names.
cust=df[["col1", "col2", "col3"]]

5. 데이터 타입 변경

# Cast the 'age' column to int; the other columns keep their dtypes.
df = df.astype({'age' : int})

데이터 전처리

1. 결측치 처리

## Filling missing values
# NOTE: the statements below are a catalogue of alternatives. Run top to
# bottom, the first fillna(15) already removes every NaN, so pick the one
# technique that fits the data instead of executing all of them.

# Keep an untouched copy of cust
cust_fix = cust.copy()

# Fill every missing value with a fixed number or string
cust = cust.fillna(15)

# Fill each missing value from the row that follows it (backward fill).
# fillna(method='backfill') is deprecated since pandas 2.1; bfill() is the
# supported spelling and gives the same result.
cust = cust.bfill()

# Fill each missing value from the row that precedes it (forward fill).
# Same deprecation as above: use ffill() instead of fillna(method='ffill').
cust = cust.ffill()

# Fill missing ages with the column median via replace()
cust['age'] = cust['age'].replace(np.nan, cust['age'].median())

# Fill missing values with interpolate() (linear by default)
cust = cust.interpolate()

## Dropping missing values
# Listwise deletion: drop every row that has at least one NaN
cust = cust.dropna()

# Drop only the rows in which every column is NaN
# (the original note labels this "pairwise" deletion)
cust = cust.dropna(how='all')

# Threshold: keep only rows with at least 10 non-NaN values
cust = cust.dropna(thresh=10)

# Drop rows that are NaN in one specific column only
cust = cust.dropna(subset=['class'])

2. 이상치 처리

## Categorical column
# Remove outliers: keep only the rows whose class is not 'H'.
# .copy() makes cust_data an independent DataFrame; without it the column
# assignment below writes into a slice of cust and pandas emits
# SettingWithCopyWarning.
cust_data = cust[cust['class'] != 'H'].copy()
print(cust_data['class'].value_counts())

# Replace outliers: relabel class 'H' as 'F'.
# NOTE: this is the alternative to removal. Because the 'H' rows were
# already filtered out above, running both in sequence makes this a no-op.
cust_data['class'] = cust_data['class'].replace('H', 'F')
print(cust_data['class'].value_counts())

3. 데이터 시각화

## Matplotlib
# Draw a simple line chart with Matplotlib
plt.figure()
plt.plot([1, 2, 3], [100, 120, 110])
plt.show()

# Scatter plot to look at the relationship between bill and age
plt.figure(figsize=(16, 6))
plt.scatter(y=df["bill"], x=df["age"])
plt.show()

# Histogram of bill split into 10 bins
# (the original snippet passed bins=20, contradicting this stated goal)
plt.figure()
plt.hist(df["bill"], bins=10)
plt.show()

# Box plot of an arbitrary list of values
x = [5, 3, 7, 10, 9, 5, 3.5, 8]
plt.boxplot(x=x)
plt.show()

# NOTE(review): the original note also announced a horizontal box plot
# here, but no code for it was included.

# Box plot of the bill distribution for each age group
df.boxplot(by="age", column="bill", figsize=(16, 8))
plt.show()

# Bar chart of an arbitrary list of values, one bar per list position
y = [5, 3, 7, 10, 9, 5, 3.5, 8]
x = list(range(len(y)))
plt.figure()
plt.bar(x, y)
plt.show()

## Seaborn
# scatterplot
sns.scatterplot(x='age', y='bill', data=df)
plt.show()

# catplot: how a distribution changes across three or more category values
# Relationship between age and bill, one panel per class
sns.catplot(x='age', y='bill', data=df, col="class", col_wrap=2)
plt.show()

# lmplot: scatter plot with a fitted regression line
# Check the correlation between bill and Bbill.
# lmplot is figure-level and creates its own figure, so a preceding
# plt.figure(figsize=(10, 5)) only leaves an empty figure behind; the size
# is set with height/aspect instead (height=5, aspect=2 -> 10 x 5).
sns.lmplot(x='bill', y='Bbill', data=df, line_kws={'color': 'red'},
           height=5, aspect=2)
plt.show()

# countplot: number of rows per category
# Distribution of bill_rating within each age group
plt.figure(figsize=(10, 5))
sns.countplot(x="age", hue="bill_rating", data=df)
plt.show()

# jointplot: scatter plot plus per-axis distribution plots
# Relationship between bill and age
sns.jointplot(x="bill", y="age", data=df)
plt.show()

# heatmap: inspect correlations
# numeric_only=True (pandas >= 1.5) restricts the correlation to numeric
# columns; in pandas 2.x a bare df.corr() raises when df still holds
# object columns such as 'class'.
corr = df.corr(numeric_only=True)
# Draw the column-by-column correlations as a heatmap
sns.heatmap(corr)
plt.show()

# boxplot
plt.figure(figsize=(16, 8))
sns.boxplot(y=df["bill"], x=df["age"], width=0.9)
plt.show()

# violinplot
# Distribution of Abill for each class
# (the original comment said "per age group", but the x axis is class)
plt.figure(figsize=(16, 8))
sns.violinplot(y=df["Abill"], x=df["class"], width=1)
plt.show()

4. Feature Engineering

## Scaling
# Standardization
cust_data_num = cust_data[['bill', 'Abill', 'Bbill']]
# z-score: subtract the column mean, divide by the column std
Standardization_df = (cust_data_num - cust_data_num.mean())/cust_data_num.std()
Standardization_df.head()

# Normalization
# Scale with scikit-learn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
normalization_df=cust_data_num.copy()
# Assigning through [:] keeps the DataFrame's index and column labels
normalization_df[:]=scaler.fit_transform(normalization_df[:])
normalization_df.head()

## Label Encoding
from sklearn.preprocessing import LabelEncoder
labelencoder_df=cust_data.copy()

le = LabelEncoder()
labelencoder_df['class'] = le.fit_transform(labelencoder_df['class'])

## One-Hot Encoding
# Option A: one-hot encode selected object columns with pandas.get_dummies,
# dropping the first level of each.
# The original passed data=df1, but df1 is not assigned anywhere before this
# line (NameError), so df is used as the source.
# NOTE(review): confirm 'cont_cd', 'cust_cd', 'yn' exist in df.
df1 = pd.get_dummies(data=df, columns=['cont_cd', 'cust_cd', 'yn'], drop_first=True)

## Option B: one-hot encode every object column
# This reassigns df1, so it is the result that later sections see.
cal_cols = df.select_dtypes('object').columns.values
df1 = pd.get_dummies(data=df, columns=cal_cols)

ML 모델 구현

1. Train Test Split

from sklearn.model_selection import train_test_split

# Target is the 't_Y' column; features are every other column of df1.
# Both are taken as plain NumPy arrays.
y = df1['t_Y'].values
X = df1.drop(columns=['t_Y']).values

# Hold out 20% as the test set, stratified on y, with a fixed seed.
splits = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = splits

2. 데이터 스케일링(표준화, 정규화)

# 라이브러리 임포트
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

3. ML 모델 구현

라이브러리 임포트
모델 학습, 예측, 성능 평가

## Classification: confusion_matrix, classification_report
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Fixed: the original imported `accuracy_socre`, which raises ImportError
# and left accuracy_score (used below) undefined.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

lg = LogisticRegression()
lg.fit(X_train, y_train)

lg.score(X_test, y_test)

lg_pred = lg.predict(X_test)

confusion_matrix(y_test, lg_pred)  # ground truth is the first argument
accuracy_score(y_test, lg_pred)
print(classification_report(y_test, lg_pred))

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# fit() hands back the estimator, so creation and training are chained.
dt = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Confusion matrix first, then the per-class report.
dt_confusion = confusion_matrix(y_test, y_pred)
print(dt_confusion)
dt_report = classification_report(y_test, y_pred)
print(dt_report)

## Regression: mean_squared_error, r2_score
# KNN
# Three fixes relative to the original snippet:
#  - mean_squared_error / r2_score were never imported (NameError);
#  - the metrics were computed on y_pred (the decision-tree output) instead
#    of knn_pred;
#  - a KNeighborsClassifier was used although this section is about
#    regression, so the regressor is used instead.
# NOTE(review): this requires a numeric, continuous y — confirm the target
# before reusing the classification split from above.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

knn_pred = knn.predict(X_test)

print(mean_squared_error(y_test, knn_pred))
print(r2_score(y_test, knn_pred))

## Ensemble
# Bagging: Random Forest
from sklearn.ensemble import RandomForestClassifier

# Three trees, fixed seed; fit() returns the estimator so it is chained.
rfc = RandomForestClassifier(random_state=42, n_estimators=3).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

# NOTE(review): recall_eval is not defined anywhere in this post —
# presumably a helper from the course notebook; define it before running.
recall_eval('RandomForest Ensemble', rfc_pred, y_test)

# Boosting: XGBoost, LGBM
# XGBoost
# The leading "!" is notebook shell syntax (Jupyter/Colab); it is not valid
# in a plain .py script.
!pip install xgboost
from xgboost import XGBClassifier

# Three boosting rounds with a fixed seed
xgb = XGBClassifier(n_estimators=3, random_state=42)  
xgb.fit(X_train, y_train)

xgb_pred = xgb.predict(X_test)

# NOTE(review): recall_eval is not defined anywhere in this post —
# presumably a helper from the course notebook; define it before running.
recall_eval('XGBoost', xgb_pred, y_test)

# LGBM
# The leading "!" is notebook shell syntax (Jupyter/Colab); it is not valid
# in a plain .py script.
!pip install lightgbm
from lightgbm import LGBMClassifier

# Three boosting rounds with a fixed seed
lgbm = LGBMClassifier(n_estimators=3, random_state=42)  
lgbm.fit(X_train, y_train)

lgbm_pred = lgbm.predict(X_test)

# NOTE(review): recall_eval is not defined anywhere in this post —
# presumably a helper from the course notebook; define it before running.
recall_eval('LGBM', lgbm_pred, y_test)

# Score of the fitted model on the test split
lgbm.score(X_test, y_test)

DL 모델 구현

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

1. 데이터 스케일링 (MinMaxScaler)

# 라이브러리 임포트
from sklearn.preprocessing import MinMaxScaler

# Fit a fresh scaler on the current X_train, then transform both splits
# with it (X_train and X_test are overwritten in place).
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

2. 모델 생성

## Binary classification
batch_size = 16
epochs = 20

# Read the input width from the data instead of hard-coding 39, so the
# snippet still works when the number of feature columns differs.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(4, activation='relu', input_shape=(n_features,)))
model.add(Dense(3, activation='relu'))
# Fixed: the original line was missing its closing parenthesis (SyntaxError).
model.add(Dense(1, activation='sigmoid'))

model.summary()

## Multi-class classification
# NOTE: this rebinds `model`, replacing the binary model built above —
# build only the variant that matches the task.
model = Sequential()
# Fixed: the original line was missing its closing parenthesis (SyntaxError).
model.add(Dense(5, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.3))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.3))
# One softmax output unit per class (2 classes here)
model.add(Dense(2, activation='softmax'))

model.summary()

3. 모델 컴파일, 학습

# NOTE(review): the three compile() calls below all target the same `model`;
# they read as alternatives — presumably only one should be kept per task.

## Binary classification
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Multi-class classification = one probability per class
# y is one-hot encoded     : loss='categorical_crossentropy'
# y is NOT one-hot encoded : loss='sparse_categorical_crossentropy'
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Prediction (regression) model
model.compile(loss='mse', optimizer='adam')

# Train for 10 epochs in batches of 10, validating on the test split.
fit_options = dict(validation_data=(X_test, y_test), epochs=10, batch_size=10)
history = model.fit(X_train, y_train, **fit_options)

4. Callbacks 함수 설정

Early Stopping, ModelCheckpoint

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## Early Stopping
# Watch val_loss (lower is better) with a patience of 5 epochs.
es = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)

## ModelCheckpoint
# Write 'best_model.h5', keeping only the best model by val_loss.
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train with both callbacks attached.
# Fixed: the original passed epochs=epocs, an undefined name (NameError);
# the variable assigned in the model-creation section is `epochs`.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[es, mc],
                    validation_data=(X_test, y_test),
                    verbose=1)

5. 모델 성능 평가

# Plot the recorded training and validation accuracy curves.
# NOTE(review): these history keys presumably exist only when the model was
# compiled with metrics=['accuracy'] — confirm against the compile step used.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.legend(['acc', 'val_acc'])
plt.ylabel('Acc')
plt.xlabel('Epochs')
plt.title('Accuracy')
plt.show()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Raw model outputs for the test split, then the index of the largest
# output in each row as the predicted label.
pred = model.predict(X_test)
y_pred = pred.argmax(axis=1)

# Accuracy (the author's run: 80%)
accuracy_score(y_test, y_pred)

# Recall (the author's run: poor)
recall_score(y_test, y_pred)

# Accuracy, recall and precision in a single report
report = classification_report(y_test, y_pred)
print(report)

6. 모델 저장

# Save the model to the file 'model.h5'.
model.save('model.h5')

누디

이전 포스트

[KT AIVLE] Pt1. 인공지능 총 정리!

AIVLE

데이터 탐색

1. 데이터 불러오기

2. 데이터 정보 확인

3. 통계적 특성 파악

4. 필요한 컬럼 추출

5. 데이터 타입 변경

데이터 전처리

1. 결측치 처리

2. 이상치 처리

3. 데이터 시각화

4. Feature Engineering

ML 모델 구현

1. Train Test Split

2. 데이터 스케일링(표준화, 정규화)

3. ML 모델 구현

DL 모델 구현

1. 데이터 스케일링 (MinMaxScaler)

2. 모델 생성

3. 모델 컴파일, 학습

4. Callbacks 함수 설정

5. 모델 성능 평가

6. 모델 저장

[AIVLE] Week 07 정리

0개의 댓글