- Installing and importing the Seaborn package
!pip install seaborn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("your file name", encoding='cp949')  # cp949 handles the Korean-encoded CSV
- Checking the correlations in the data with a heatmap
plt.rc('font', family='Malgun Gothic')  # set the Korean font before drawing
sns.heatmap(df.corr(), annot=True)      # annot=True prints the coefficient in each cell
plt.show()
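- (Sketch) To see which variables move most with drinking status, the correlations against the target can be sorted; this assumes the target column is named 'drunk', as it is later in these notes.
corr_with_target = df.corr()['drunk']  # correlation of every numeric column with the assumed target 'drunk'
print(corr_with_target.reindex(corr_with_target.abs().sort_values(ascending=False).index))  # strongest first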
- Get the average height for each age group as a bar chart
df.columns  # check the column names first
plt.rc('font', family='Malgun Gothic')  # set the font before plotting
sns.barplot(x='gentercode', y='drinkingornot', data=df)
plt.show()
- Handling Missing Values
- Deleting the dental-related data (columns 27…, 28…, 29…, 30…)
df.head(1)
pd.set_option('display.max_columns', None)  # print every column
pd.set_option('display.max_rows', None)     # print every row
df.head(1)
df.columns
df = df.drop(['column name 1', 'column name 2', 'column name 3', 'column name 4'], axis=1)  # drop the four dental-related columns stated above
df['base year'].nunique()                   # how many distinct values does this column have?
df['subscriber serial number'].nunique()
df = df.drop(['base year', 'subscriber serial number'], axis=1)  # drop these columns, which are not informative features
df.head()
df = df.fillna(0)  # simple zero-fill for missing values; an alternative imputation is sketched below
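- (Sketch) The comment above flags zero-filling as a placeholder; one common alternative is to fill each numeric column with its median instead, as sketched here (use it in place of the fillna(0) line if preferred).
num_cols = df.select_dtypes(include='number').columns      # numeric feature columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())  # median imputation instead of 0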
- Convert the gender code and age code data with One-Hot Encoding
df_dummies = pd.get_dummies(df[['gender code', 'trial code']], drop_first=True)  # preview of the encoding
df_dummies.head(1)
df.head(1)
cat_cols = ['gender code', 'trial code']  # the categorical columns to one-hot encode
df1 = pd.get_dummies(data=df, columns=cat_cols)
df.info()
df = pd.get_dummies(df, columns=['trial code', 'gender code'])
df.head(1)
- Set whether drunk or not ('drunk') as the target, use the remaining columns as features, and split the data 7:3
y = df.drunk
X = df.drop(['drunk'], axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)  # 70/30 split, stratified on the target
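- (Sketch) stratify=y keeps the drinker/non-drinker ratio roughly the same in both splits; a quick check on the split above:
print(y_train.value_counts(normalize=True))  # class proportions in the training set
print(y_test.value_counts(normalize=True))   # should be nearly identical in the test set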
- Apply scaling to the data (StandardScaler)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)                  # fit the scaler on the training data only, to avoid leakage
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)    # reuse the training statistics for the test set
- Train with a RandomForest model
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=30, max_features=3, max_depth=7, random_state=21)
rfc.fit(X_train, y_train)
rfc.score(X_train, y_train)  # accuracy on the training data; the test set is evaluated below
- Evaluating the performance (Get the confusion matrix and visualize it with a heatmap graph)
from sklearn.metrics import confusion_matrix, classification_report
y_pred = rfc.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='d')  # fmt='d' prints the counts as integers
plt.show()
print("classification report:\n", classification_report(y_test, y_pred))
- Making a deep-learning model (more than X hidden layers, Dropout configuration required, apply EarlyStopping and ModelCheckpoint)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
X_train.shape  # input dimension for the first Dense layer
model = Sequential()
model.add(Dense(128, activation = 'relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
mc = ModelCheckpoint(filepath='best.h5', verbose=1, monitor='val_loss', mode='auto', save_best_only=True)
es = EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=5)
history = model.fit(X_train, y_train, epochs=50,
validation_data=(X_test,y_test),
# validation_split=0.1,
verbose=1,
callbacks=[es,mc]
)
print(history.history['acc'])  # the metric values are stored in history.history, not on the History object itself
plt.rc('font', family='Malgun Gothic')  # set the font before creating the figure
plt.figure(figsize=(10, 5))
plt.plot(history.history['acc'], color='red', label='acc')
plt.plot(history.history['val_acc'], color='blue', label='val_acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
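- (Sketch) ModelCheckpoint saved the weights with the lowest validation loss to best.h5, so the final evaluation is usually done on that restored model rather than on the state after the last epoch.
from tensorflow.keras.models import load_model
best_model = load_model('best.h5')  # restore the best checkpoint written during training
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=0)
print(f"test loss: {test_loss:.4f}, test accuracy: {test_acc:.4f}")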