🔗 github.com/hayannn/AIFFEL_MAIN_QUEST/MiniProject3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the classic AirPassengers monthly-totals dataset from a mounted
# Google Drive (Colab path) — requires drive.mount to have been run first.
ap = pd.read_csv('/content/drive/MyDrive/시계열/AirPassengers.csv')
# Notebook cell output: peek at the first rows.
ap.head()
Month 컬럼 삭제
ap.drop('Month', axis=1, inplace=True)
def _show(series):
    # Draw a quick line chart of *series* in its own figure.
    plt.figure()
    plt.plot(series)
    plt.show()

ap
_show(ap)

# Log transform to stabilise the growing variance.
log_transformed = np.log(ap)
log_transformed
_show(log_transformed)

# First-order difference to remove the trend (drop the leading NaN row).
diffed = log_transformed.diff()[1:]
diffed
_show(diffed)

# Lag-12 difference to remove the yearly seasonality.
seasonally_diffed = diffed.diff(12)
seasonally_diffed.dropna(inplace=True)
seasonally_diffed
_show(seasonally_diffed)
관찰값
adf
- 단위근 검정
p-value
- 유의 검정
- 관찰 데이터가 귀무가설이 맞을 경우 -> 통계값 1이 실제 관측 값 이상일 확률을 구함
usedlag
- 시차 수(사용된 경우를 뜻함)
nobs
- ADF 회귀, 임계값 계산에 사용된 관측치 수
- critical values
임계값
- 1%, 5%, 10% 수준
icbest
- lag 길이 자동 결정: autolag
- autolag를 None으로 지정하지 않으면 정보 기준(AIC/BIC)에 따라 lag 길이를 자동으로 결정함
from statsmodels.tsa.stattools import adfuller

def adf_test(x):
    """Print the ADF test statistic and p-value for series *x*."""
    result = adfuller(x)
    print('ADF statistics')
    print(result[0])
    print('P-value')
    print(result[1])

adf_test(seasonally_diffed)
# tsfresh's demo dataset: robot execution failures.
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
# Fetch the raw files (network I/O; cached after the first run).
download_robot_execution_failures()
# timeseries: long-format frame with 'id'/'time' columns (see extract_features below);
# y: boolean target per id — its index is matched against timeseries['id'] later.
timeseries, y = load_robot_execution_failures()
# Notebook cell outputs.
timeseries
y
def custom_classification_split(x, y, test_size=0.3):
    """Deterministic, class-stratified train/test split.

    Takes the first ``test_size`` fraction of ids of each class (in index
    order) as the test set; everything else is the training set.

    Parameters
    ----------
    x : DataFrame with an 'id' column (long-format time series).
    y : boolean Series of targets, indexed by id.
    test_size : fraction of each class routed to the test split.

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    num_true = int(y.sum() * test_size)
    num_false = int((len(y) - y.sum()) * test_size)
    # Test ids: the first num_false negative ids plus the first num_true positive ids.
    id_list = (y[y == False].head(num_false).index.to_list()
               + y[y == True].head(num_true).index.to_list())
    y_train = y.drop(id_list)
    # .loc, not .iloc: id_list holds index *labels*, not positions — the
    # original positional lookup selected the wrong rows whenever ids
    # were not 0-based positions (they start at 1 in this dataset).
    y_test = y.loc[id_list].sort_index()
    # Use the parameter x, not the global `timeseries` the original closed over.
    X_train = x[~x['id'].isin(id_list)]
    X_test = x[x['id'].isin(id_list)]
    return X_train, y_train, X_test, y_test
X_train, y_train, X_test, y_test = custom_classification_split(timeseries, y, test_size=0.25)
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
# EfficientFCParameters: tsfresh's standard calculator set minus the most expensive ones.
settings = EfficientFCParameters()
# One feature row per 'id', with observations ordered by 'time'.
comprehensive_features_train = extract_features(
X_train,
column_id="id",
column_sort="time",
default_fc_parameters=settings
)
comprehensive_features_test = extract_features(
X_test,
column_id="id",
column_sort="time",
default_fc_parameters=settings
)
# train data: impute replaces NaN/±inf produced by extraction, in place.
impute(comprehensive_features_train)
# test data
impute(comprehensive_features_test)
from sklearn.ensemble import RandomForestClassifier
# Small random-forest baseline on the extracted features.
# NOTE(review): no random_state is set, so this score varies between runs.
rf_clf = RandomForestClassifier(n_estimators = 10, max_depth = 3)
rf_clf.fit(comprehensive_features_train, y_train)
# Mean accuracy on the held-out ids (bare expression: notebook cell output).
rf_clf.score(comprehensive_features_test, y_test)
import xgboost as xgb
# Gradient-boosted-tree baseline with deliberately small capacity (10 trees, depth 3).
xgb_clf = xgb.XGBClassifier(n_estimators = 10, max_depth = 3)
xgb_clf.fit(comprehensive_features_train, y_train)
# Mean accuracy on the held-out ids.
xgb_clf.score(comprehensive_features_test, y_test)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # NOTE(review): imported but never used in this chunk
# Linear baseline on the same feature matrices; max_iter raised for convergence.
log_reg_clf = LogisticRegression(max_iter=1000)
log_reg_clf.fit(comprehensive_features_train, y_train)
# Mean accuracy on the held-out ids.
log_reg_clf.score(comprehensive_features_test, y_test)
# Plot per-feature importance measured by total gain across splits.
xgb.plot_importance(xgb_clf, importance_type = 'gain')
plt.show()
# Count how many features received any non-zero importance.
sum(xgb_clf.feature_importances_ != 0)
F_X_abs_energy가 가장 중요한 feature임을 알 수 있음
from sklearn.metrics import classification_report
classification_report(y_test, xgb_clf.predict(comprehensive_features_test), target_names = ['true', 'false'], output_dict = True)
Precision
Recall
F1-Score
Accuracy
support
Macro Avg
Weighted Avg
시계열 데이터
정상성 확인
모델 적용부터는 크게 어려운 내용은 없었음