# Connect Google Drive to the Colab runtime so the data files stored there can be read.
from google.colab import drive
drive.mount("/content/gdrive") # recorded output: Mounted at /content/gdrive
# Analysis, plotting and modelling packages used by the notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Shared imputer, configured to fill np.nan entries using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Load the train and test CSV files from the mounted Google Drive.
# Absolute paths are used because Drive is mounted at /content/gdrive; the
# relative './gdrive/...' form only resolves while the working directory
# happens to be /content.
trainset = pd.read_csv('/content/gdrive/MyDrive/train 2.csv')
testset = pd.read_csv('/content/gdrive/MyDrive/test 2.csv')
postfix bin : binary features
postfix cat : categorical features
features without the bin or cat postfix: continuous or ordinal values
missing value: -1
prediction: target column
id : input ordinal input
# Preview the first five rows of the training data.
trainset.head(n=5)
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03 ps_car_01_cat ps_car_02_cat ps_car_03_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 7 0 2 2 5 1 0 0 1 0 0 0 0 0 0 0 11 0 1 0 0.7 0.2 0.718070 10 1 -1 0 1 4 1 0 0 1 12 2 0.400000 0.883679 0.370810 3.605551 0.6 0.5 0.2 3 1 10 1 10 1 5 9 1 5 8 0 1 1 0 0 1
1 9 0 1 1 7 0 0 0 0 1 0 0 0 0 0 0 3 0 0 1 0.8 0.4 0.766078 11 1 -1 0 -1 11 1 1 2 1 19 3 0.316228 0.618817 0.388716 2.449490 0.3 0.1 0.3 2 1 9 5 8 1 7 3 1 1 9 0 1 1 0 1 0
2 13 0 5 4 9 1 0 0 0 1 0 0 0 0 0 0 12 1 0 0 0.0 0.0 -1.000000 7 1 -1 0 -1 14 1 1 2 1 60 1 0.316228 0.641586 0.347275 3.316625 0.5 0.7 0.1 2 2 9 1 8 2 7 4 2 7 7 0 1 1 0 1 0
3 16 0 0 1 2 0 0 1 0 0 0 0 0 0 0 0 8 1 0 0 0.9 0.2 0.580948 7 1 0 0 1 11 1 1 3 1 104 1 0.374166 0.542949 0.294958 2.000000 0.6 0.9 0.1 2 4 7 1 8 4 2 2 2 4 9 0 0 0 0 0 0
4 17 0 0 2 0 1 0 1 0 0 0 0 0 0 0 0 9 1 0 0 0.7 0.6 0.840759 11 1 -1 0 -1 14 1 1 2 1 82 3 0.316070 0.565832 0.365103 2.000000 0.4 0.6 0.0 2 2 6 3 10 2 12 3 1 1 3 0 0 0 1 1 0
# Report the dimensions of both datasets. The second label previously lacked
# the word "Test", which made the printed line ambiguous.
print("Train dataset (rows, cols):", trainset.shape, "\nTest dataset (rows, cols):", testset.shape)
#Train dataset (rows, cols): (595212, 59)
#Test dataset (rows, cols): (892816, 58)
# Columns present in the training data but absent from the test data.
print(
    "Columns in train and not in test dataset:",
    set(trainset.columns).difference(testset.columns),
)
#Columns in train and not in test dataset: {'target'}
use: input, ID, target
type: nominal, interval, ordinal, binary
preserve: True or False
dataType: int, float, char
category, ind, reg, car, calc
def _describe_feature(name, dtype):
    """Build the metadata record for a single column.

    Parameters
    ----------
    name : str
        Column name; the role, type and category are derived from it.
    dtype : numpy or pandas dtype
        Storage dtype of the column, used to tell real from integer columns
        when the name carries no 'bin' / 'cat' marker.

    Returns
    -------
    dict
        Keys: 'varname', 'use', 'type', 'preserve', 'dtype', 'category'.
    """
    # Role of the column.
    if name == 'target':
        use = 'target'
    elif name == 'id':
        use = 'id'
    else:
        use = 'input'

    # Measurement level. The name markers take precedence over the dtype.
    if 'bin' in name or name == 'target':
        level = 'binary'
    elif 'cat' in name or name == 'id':
        level = 'categorical'
    elif pd.api.types.is_float_dtype(dtype):
        # Covers every float width, not only the platform default.
        level = 'real'
    elif pd.api.types.is_integer_dtype(dtype):
        # Covers every integer width, not only the platform default.
        level = 'integer'
    else:
        # Explicit fallback so an unmatched column never inherits the level
        # computed for the previous column.
        level = 'unknown'

    # Feature family, taken from the first marker found in the name; the
    # order of the markers below is the order in which they are tested.
    markers = (
        ('ind', 'individual'),
        ('reg', 'registration'),
        ('car', 'car'),
        ('calc', 'calculated'),
    )
    category = next((label for token, label in markers if token in name), 'none')

    return {
        'varname': name,
        'use': use,
        'type': level,
        # Every variable is preserved except the id.
        'preserve': name != 'id',
        'dtype': dtype,
        'category': category,
    }


# One metadata record per column of the training set.
data = [_describe_feature(column, dtype) for column, dtype in trainset.dtypes.items()]
metadata = pd.DataFrame(data, columns=['varname', 'use', 'type', 'preserve', 'dtype', 'category'])
metadata.set_index('varname', inplace=True)
metadata
use type preserve dtype category
varname
id id categorical False int64 none
target target binary True int64 none
ps_ind_01 input integer True int64 individual
ps_ind_02_cat input categorical True int64 individual
ps_ind_03 input integer True int64 individual
ps_ind_04_cat input categorical True int64 individual
ps_ind_05_cat input categorical True int64 individual
ps_ind_06_bin input binary True int64 individual
ps_ind_07_bin input binary True int64 individual
ps_ind_08_bin input binary True int64 individual
ps_ind_09_bin input binary True int64 individual
ps_ind_10_bin input binary True int64 individual
ps_ind_11_bin input binary True int64 individual
ps_ind_12_bin input binary True int64 individual
ps_ind_13_bin input binary True int64 individual
ps_ind_14 input integer True int64 individual
ps_ind_15 input integer True int64 individual
ps_ind_16_bin input binary True int64 individual
ps_ind_17_bin input binary True int64 individual
ps_ind_18_bin input binary True int64 individual
ps_reg_01 input real True float64 registration
ps_reg_02 input real True float64 registration
ps_reg_03 input real True float64 registration
ps_car_01_cat input categorical True int64 car
ps_car_02_cat input categorical True int64 car
ps_car_03_cat input categorical True int64 car
ps_car_04_cat input categorical True int64 car
ps_car_05_cat input categorical True int64 car
ps_car_06_cat input categorical True int64 car
ps_car_07_cat input categorical True int64 car
ps_car_08_cat input categorical True int64 car
ps_car_09_cat input categorical True int64 car
ps_car_10_cat input categorical True int64 car
ps_car_11_cat input categorical True int64 car
ps_car_11 input integer True int64 car
ps_car_12 input real True float64 car
ps_car_13 input real True float64 car
ps_car_14 input real True float64 car
ps_car_15 input real True float64 car
ps_calc_01 input real True float64 calculated
ps_calc_02 input real True float64 calculated
ps_calc_03 input real True float64 calculated
ps_calc_04 input integer True int64 calculated
ps_calc_05 input integer True int64 calculated
ps_calc_06 input integer True int64 calculated
ps_calc_07 input integer True int64 calculated
ps_calc_08 input integer True int64 calculated
ps_calc_09 input integer True int64 calculated
ps_calc_10 input integer True int64 calculated
ps_calc_11 input integer True int64 calculated
ps_calc_12 input integer True int64 calculated
ps_calc_13 input integer True int64 calculated
ps_calc_14 input integer True int64 calculated
ps_calc_15_bin input binary True int64 calculated
ps_calc_16_bin input binary True int64 calculated
ps_calc_17_bin input binary True int64 calculated
ps_calc_18_bin input binary True int64 calculated
ps_calc_19_bin input binary True int64 calculated
ps_calc_20_bin input binary True int64 calculated
# Names of the categorical variables that are flagged to be preserved.
metadata.loc[(metadata['type'] == 'categorical') & metadata['preserve']].index
#Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
'ps_car_10_cat', 'ps_car_11_cat'],
dtype='object', name='varname')
# Number of variables in each category.
metadata.groupby('category').size().reset_index(name='count')
category count
0 calculated 20
1 car 16
2 individual 18
3 none 2
4 registration 3
# Number of variables for each (use, type) combination.
metadata.groupby(['use', 'type']).size().reset_index(name='count')
use type count
0 id categorical 1
1 input binary 17
2 input categorical 14
3 input integer 16
4 input real 10
5 target binary 1