Porto_Seguro_Exploratory_Analysis_and_Prediction

매일 공부(ML) · April 12, 2022
#mount Google Drive in Colab to access the data files
from google.colab import drive

drive.mount("/content/gdrive") #Mounted at /content/gdrive
#import analysis packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 100)
#load the data
trainset = pd.read_csv('./gdrive/MyDrive/train 2.csv')
testset = pd.read_csv('./gdrive/MyDrive/test 2.csv')

Rules

postfix _bin : binary features

postfix _cat : categorical features

features without the _bin or _cat postfix: continuous or ordinal values

missing values: encoded as -1 (a quick count is sketched right after these rules)

prediction target: the target column

id : ordinal record identifier (not used as an input feature)

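Because missing values are encoded as -1 rather than NaN, a quick way to gauge missingness is to count the -1 entries per column. The lines below are a minimal sketch, assuming the trainset loaded above; the mean SimpleImputer instantiated in the imports could then be applied to a real-valued column such as ps_reg_03 once its -1 values are converted to NaN.

#count missing values (encoded as -1) per column, keeping only columns that have any
missing_counts = (trainset == -1).sum()
print(missing_counts[missing_counts > 0].sort_values(ascending=False))

#example: mean-impute ps_reg_03 with the imputer defined above (it expects NaN, so replace -1 first)
ps_reg_03_filled = imputer.fit_transform(trainset[['ps_reg_03']].replace(-1, np.nan))
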
trainset.head()
	id	target	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_05_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	ps_ind_09_bin	ps_ind_10_bin	ps_ind_11_bin	ps_ind_12_bin	ps_ind_13_bin	ps_ind_14	ps_ind_15	ps_ind_16_bin	ps_ind_17_bin	ps_ind_18_bin	ps_reg_01	ps_reg_02	ps_reg_03	ps_car_01_cat	ps_car_02_cat	ps_car_03_cat	ps_car_04_cat	ps_car_05_cat	ps_car_06_cat	ps_car_07_cat	ps_car_08_cat	ps_car_09_cat	ps_car_10_cat	ps_car_11_cat	ps_car_11	ps_car_12	ps_car_13	ps_car_14	ps_car_15	ps_calc_01	ps_calc_02	ps_calc_03	ps_calc_04	ps_calc_05	ps_calc_06	ps_calc_07	ps_calc_08	ps_calc_09	ps_calc_10	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	0	2	2	5	1	0	0	1	0	0	0	0	0	0	0	11	0	1	0	0.7	0.2	0.718070	10	1	-1	0	1	4	1	0	0	1	12	2	0.400000	0.883679	0.370810	3.605551	0.6	0.5	0.2	3	1	10	1	10	1	5	9	1	5	8	0	1	1	0	0	1
1	9	0	1	1	7	0	0	0	0	1	0	0	0	0	0	0	3	0	0	1	0.8	0.4	0.766078	11	1	-1	0	-1	11	1	1	2	1	19	3	0.316228	0.618817	0.388716	2.449490	0.3	0.1	0.3	2	1	9	5	8	1	7	3	1	1	9	0	1	1	0	1	0
2	13	0	5	4	9	1	0	0	0	1	0	0	0	0	0	0	12	1	0	0	0.0	0.0	-1.000000	7	1	-1	0	-1	14	1	1	2	1	60	1	0.316228	0.641586	0.347275	3.316625	0.5	0.7	0.1	2	2	9	1	8	2	7	4	2	7	7	0	1	1	0	1	0
3	16	0	0	1	2	0	0	1	0	0	0	0	0	0	0	0	8	1	0	0	0.9	0.2	0.580948	7	1	0	0	1	11	1	1	3	1	104	1	0.374166	0.542949	0.294958	2.000000	0.6	0.9	0.1	2	4	7	1	8	4	2	2	2	4	9	0	0	0	0	0	0
4	17	0	0	2	0	1	0	1	0	0	0	0	0	0	0	0	9	1	0	0	0.7	0.6	0.840759	11	1	-1	0	-1	14	1	1	2	1	82	3	0.316070	0.565832	0.365103	2.000000	0.4	0.6	0.0	2	2	6	3	10	2	12	3	1	1	3	0	0	0	1	1	0
print("Train dataset (rows, cols):", trainset.shape, "\nTest dataset (rows, cols):", testset.shape)

#Train dataset (rows, cols): (595212, 59) 
#Test dataset (rows, cols): (892816, 58)

#train has one extra column than test: the target

print("Columns in train and not in test dataset:",set(trainset.columns)- set(testset.columns))

#Columns in train and not in test dataset: {'target'}
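The single extra column is the prediction target. Since target is binary, its class balance is worth a quick look; a minimal sketch (the dataset is heavily imbalanced, which is why StratifiedKFold appears among the imports):

#share of each target class (0 = no claim filed, 1 = claim filed)
print(trainset['target'].value_counts(normalize=True))
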

Metadata

use: input, id, target

type: binary, categorical, integer, real

preserve: True or False

dtype: int64, float64

category: individual (ind), registration (reg), car, calculated (calc)

data = []
for feature in trainset.columns:
    # Defining the role
    if feature == 'target':
        use = 'target'
    elif feature == 'id':
        use = 'id'
    else:
        use = 'input'
         
    # Defining the type
    if 'bin' in feature or feature == 'target':
        type = 'binary'
    elif 'cat' in feature or feature == 'id':
        type = 'categorical'
    elif trainset[feature].dtype == np.float64:
        type = 'real'
    elif trainset[feature].dtype == np.int64:
        type = 'integer'
        
    # Initialize preserve to True for all variables except for id
    preserve = True
    if feature == 'id':
        preserve = False
    
    # Defining the data type 
    dtype = trainset[feature].dtype
    
    category = 'none'
    # Defining the category
    if 'ind' in feature:
        category = 'individual'
    elif 'reg' in feature:
        category = 'registration'
    elif 'car' in feature:
        category = 'car'
    elif 'calc' in feature:
        category = 'calculated'
    
    
    # Creating a Dict that contains all the metadata for the variable
    feature_dictionary = {
        'varname': feature,
        'use': use,
        'type': type,
        'preserve': preserve,
        'dtype': dtype,
        'category' : category
    }
    data.append(feature_dictionary)
    
metadata = pd.DataFrame(data, columns=['varname', 'use', 'type', 'preserve', 'dtype', 'category'])
metadata.set_index('varname', inplace=True)
metadata
	use	type	preserve	dtype	category
varname					
id	id	categorical	False	int64	none
target	target	binary	True	int64	none
ps_ind_01	input	integer	True	int64	individual
ps_ind_02_cat	input	categorical	True	int64	individual
ps_ind_03	input	integer	True	int64	individual
ps_ind_04_cat	input	categorical	True	int64	individual
ps_ind_05_cat	input	categorical	True	int64	individual
ps_ind_06_bin	input	binary	True	int64	individual
ps_ind_07_bin	input	binary	True	int64	individual
ps_ind_08_bin	input	binary	True	int64	individual
ps_ind_09_bin	input	binary	True	int64	individual
ps_ind_10_bin	input	binary	True	int64	individual
ps_ind_11_bin	input	binary	True	int64	individual
ps_ind_12_bin	input	binary	True	int64	individual
ps_ind_13_bin	input	binary	True	int64	individual
ps_ind_14	input	integer	True	int64	individual
ps_ind_15	input	integer	True	int64	individual
ps_ind_16_bin	input	binary	True	int64	individual
ps_ind_17_bin	input	binary	True	int64	individual
ps_ind_18_bin	input	binary	True	int64	individual
ps_reg_01	input	real	True	float64	registration
ps_reg_02	input	real	True	float64	registration
ps_reg_03	input	real	True	float64	registration
ps_car_01_cat	input	categorical	True	int64	car
ps_car_02_cat	input	categorical	True	int64	car
ps_car_03_cat	input	categorical	True	int64	car
ps_car_04_cat	input	categorical	True	int64	car
ps_car_05_cat	input	categorical	True	int64	car
ps_car_06_cat	input	categorical	True	int64	car
ps_car_07_cat	input	categorical	True	int64	car
ps_car_08_cat	input	categorical	True	int64	car
ps_car_09_cat	input	categorical	True	int64	car
ps_car_10_cat	input	categorical	True	int64	car
ps_car_11_cat	input	categorical	True	int64	car
ps_car_11	input	integer	True	int64	car
ps_car_12	input	real	True	float64	car
ps_car_13	input	real	True	float64	car
ps_car_14	input	real	True	float64	car
ps_car_15	input	real	True	float64	car
ps_calc_01	input	real	True	float64	calculated
ps_calc_02	input	real	True	float64	calculated
ps_calc_03	input	real	True	float64	calculated
ps_calc_04	input	integer	True	int64	calculated
ps_calc_05	input	integer	True	int64	calculated
ps_calc_06	input	integer	True	int64	calculated
ps_calc_07	input	integer	True	int64	calculated
ps_calc_08	input	integer	True	int64	calculated
ps_calc_09	input	integer	True	int64	calculated
ps_calc_10	input	integer	True	int64	calculated
ps_calc_11	input	integer	True	int64	calculated
ps_calc_12	input	integer	True	int64	calculated
ps_calc_13	input	integer	True	int64	calculated
ps_calc_14	input	integer	True	int64	calculated
ps_calc_15_bin	input	binary	True	int64	calculated
ps_calc_16_bin	input	binary	True	int64	calculated
ps_calc_17_bin	input	binary	True	int64	calculated
ps_calc_18_bin	input	binary	True	int64	calculated
ps_calc_19_bin	input	binary	True	int64	calculated
ps_calc_20_bin	input	binary	True	int64	calculated
metadata[(metadata.type == 'categorical') & (metadata.preserve)].index

#Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
#       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
#       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
#       'ps_car_10_cat', 'ps_car_11_cat'],
#      dtype='object', name='varname')
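The same metadata filter can pull any other slice of the features; a minimal sketch, selecting the real-valued inputs and summarizing them:

#select the real-valued (float) input features via the metadata and summarize them
real_features = metadata[(metadata.type == 'real') & (metadata.preserve)].index
print(trainset[real_features].describe())
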
pd.DataFrame({'count' : metadata.groupby(['category'])['category'].size()}).reset_index()

   category 	count
0	calculated	20
1	car	16
2	individual	18
3	none	2
4	registration	3
pd.DataFrame({'count' : metadata.groupby(['use','type'])['use'].size()}).reset_index()

    use	type	 count
0	id	categorical	1
1	input	binary	17
2	input	categorical	14
3	input	integer	16
4	input	real	10
5	target	binary	1
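As a cross-check against the naming rules above, the binary and categorical counts can also be derived straight from the column names; a small sketch:

#binary features end with '_bin' and categorical features with '_cat' (id and target carry neither postfix)
bin_cols = [c for c in trainset.columns if c.endswith('_bin')]
cat_cols = [c for c in trainset.columns if c.endswith('_cat')]
print(len(bin_cols), 'binary input features,', len(cat_cols), 'categorical input features')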