Porto_Seguro_Exploratory_Analysis_and_Prediction(Data analysis and statistics)

매일 공부(ML)·2022년 4월 12일
0

캐글 필사

목록 보기
23/34

Data analysis and statistics

# Target variable

# Count the rows per target class once and reuse the result for both the
# bar positions (class labels) and the bar heights (row counts).
target_counts = trainset['target'].value_counts()
x = target_counts.index.values
y = target_counts.values

# Bar plot of the number of rows per target value.
# plt.subplots() already creates the figure, so no separate plt.figure()
# call is made (it only produced an additional empty figure).
fig, ax = plt.subplots(figsize=(6,6))
sns.barplot(ax=ax, x=x, y=y)
plt.ylabel("Number of values", fontsize=12)
plt.xlabel('Target value', fontsize = 12)
plt.tick_params(axis='both', which='major', labelsize=12)
# plt.show must be *called*; the bare attribute reference was a no-op.
plt.show()

#Real features

# Pick the metadata rows typed 'real' whose 'preserve' flag is set, then
# summarise the corresponding trainset columns.
variable = metadata.loc[(metadata['type'] == 'real') & metadata['preserve']].index
trainset[variable].describe()
	ps_reg_01	ps_reg_02	ps_reg_03	ps_car_12	ps_car_13	ps_car_14	ps_car_15	ps_calc_01	ps_calc_02	ps_calc_03
count	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000
mean	0.610991	0.439184	0.551102	0.379945	0.813265	0.276256	3.065899	0.449756	0.449589	0.449849
std	0.287643	0.404264	0.793506	0.058327	0.224588	0.357154	0.731366	0.287198	0.286893	0.287153
min	0.000000	0.000000	-1.000000	-1.000000	0.250619	-1.000000	0.000000	0.000000	0.000000	0.000000
25%	0.400000	0.200000	0.525000	0.316228	0.670867	0.333167	2.828427	0.200000	0.200000	0.200000
50%	0.700000	0.300000	0.720677	0.374166	0.765811	0.368782	3.316625	0.500000	0.400000	0.500000
75%	0.900000	0.600000	1.000000	0.400000	0.906190	0.396485	3.605551	0.700000	0.700000	0.700000
max	0.900000	1.800000	4.037945	1.264911	3.720626	0.636396	3.741657	0.900000	0.900000	0.900000

# Scale ps_car_12 by 10 and square it; show the first ten values
((trainset['ps_car_12'] * 10) ** 2).head(10)
0    16.00
1    10.00
2    10.00
3    14.00
4     9.99
5    19.89
6    10.00
7    19.98
8    16.00
9    20.00
Name: ps_car_12, dtype: float64
# Square ps_car_15; show the first ten values
(trainset['ps_car_15'] ** 2).head(10)
0    13.0
1     6.0
2    11.0
3     4.0
4     4.0
5     9.0
6    10.0
7    11.0
8     8.0
9    13.0
Name: ps_car_15, dtype: float64

  • Features with missing values

    • ps_reg_03, ps_car_12, ps_car_14: contain missing values (their minimum is -1)
  • Registration feature

    • ps_reg_01, ps_reg_02: fractions with denominator 10
  • Car features

    • ps_car_12: (approximately) square roots of natural numbers, divided by 10

    • ps_car_15: square roots of natural numbers

# Pair plot of two car features, coloured by target, drawn on a random
# 5% subsample of trainset.
var = ['ps_car_12', 'ps_car_15', 'target']
sample = trainset.sample(frac=0.05)[var]
sns.pairplot(sample, hue='target', palette='Set1', diag_kind='kde')
plt.show()

#Calculated features

# Features typed 'real' in the metadata table whose 'preserve' flag is set.
var = metadata[(metadata.type == 'real') & (metadata.preserve)].index
# Split the rows by target so each group gets its own density curve.
t1 = trainset.loc[trainset['target'] != 0]
t0 = trainset.loc[trainset['target'] == 0]

sns.set_style('whitegrid')
# plt.subplots() already creates the figure; the former extra plt.figure()
# call only produced an additional empty figure.
fig, ax = plt.subplots(3,4,figsize=(16,12))

# One panel per feature, filled row by row in the 3x4 grid (1-based index).
for i, feature in enumerate(var, start=1):
    panel = plt.subplot(3,4,i)
    # `bw` is deprecated in seaborn; `bw_method` is the replacement keyword.
    sns.kdeplot(t1[feature], bw_method=0.5, label="target = 1", ax=panel)
    sns.kdeplot(t0[feature], bw_method=0.5, label='target = 0', ax=panel)
    plt.ylabel('Density plot', fontsize=12)
    plt.xlabel(feature, fontsize=12)
    plt.tick_params(axis='both', which='major', labelsize=12)
    # The curves carry labels, so request the legend explicitly to make the
    # two target groups distinguishable.
    panel.legend()
plt.show()

#heatmap

def corr_heatmap(var):
    """Draw an annotated correlation heatmap for the given trainset columns.

    Args:
        var: Column labels used to select columns from the module-level
            ``trainset`` DataFrame before computing pairwise correlations.

    Returns:
        None. The heatmap is rendered with ``plt.show()``.
    """
    correlations = trainset[var].corr()

    # Create color map ranging between two colors
    cmap = sns.diverging_palette(50, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10,10))
    # Draw on the axes created above, and use seaborn's documented
    # `linewidths` keyword for the lines separating the cells.
    sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=5, annot=True,
                cbar_kws={"shrink":.75}, ax=ax)
    plt.show()
# Correlation heatmap over the features typed 'real' whose 'preserve' flag is set
var = metadata.loc[(metadata['type'] == 'real') & metadata['preserve']].index
corr_heatmap(var)

# Pair plot of selected registration and car features, coloured by target,
# drawn on a random 5% subsample of trainset.
var = ['ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_15', 'target']
sample = trainset.sample(frac=0.05)[var]
sns.pairplot(sample, hue='target', palette = 'Set1', diag_kind='kde')
plt.show()

profile
성장을 도울 아카이빙 블로그

0개의 댓글