# Target variable
# Bar plot of how many rows fall into each target class.
# value_counts() is evaluated once and returns the classes ordered by
# descending frequency; x holds the class labels, y the row counts.
target_counts = trainset['target'].value_counts()
x = target_counts.index.values
y = target_counts.values

# plt.subplots already creates the figure, so no separate plt.figure() call
# is needed (it would only leave an empty extra figure behind).
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(ax=ax, x=x, y=y)
ax.set_ylabel("Number of values", fontsize=12)
ax.set_xlabel('Target value', fontsize=12)
ax.tick_params(axis='both', which='major', labelsize=12)
# Call show(); a bare `plt.show` only references the function and renders nothing.
plt.show()
# Real features
# Select the metadata rows flagged as type 'real' and marked to be preserved;
# their index labels are used as the column names to summarise in trainset.
real_and_kept = (metadata['type'] == 'real') & metadata['preserve']
variable = metadata.loc[real_and_kept].index
trainset.loc[:, variable].describe()
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03
count 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000
mean 0.610991 0.439184 0.551102 0.379945 0.813265 0.276256 3.065899 0.449756 0.449589 0.449849
std 0.287643 0.404264 0.793506 0.058327 0.224588 0.357154 0.731366 0.287198 0.286893 0.287153
min 0.000000 0.000000 -1.000000 -1.000000 0.250619 -1.000000 0.000000 0.000000 0.000000 0.000000
25% 0.400000 0.200000 0.525000 0.316228 0.670867 0.333167 2.828427 0.200000 0.200000 0.200000
50% 0.700000 0.300000 0.720677 0.374166 0.765811 0.368782 3.316625 0.500000 0.400000 0.500000
75% 0.900000 0.600000 1.000000 0.400000 0.906190 0.396485 3.605551 0.700000 0.700000 0.700000
max 0.900000 1.800000 4.037945 1.264911 3.720626 0.636396 3.741657 0.900000 0.900000 0.900000
# Scale ps_car_12 by 10 and square it, then show the first ten values.
(trainset['ps_car_12'] * 10).pow(2).head(10)
0 16.00
1 10.00
2 10.00
3 14.00
4 9.99
5 19.89
6 10.00
7 19.98
8 16.00
9 20.00
Name: ps_car_12, dtype: float64
# Square ps_car_15 and show the first ten values.
trainset['ps_car_15'].pow(2).head(10)
0 13.0
1 6.0
2 11.0
3 4.0
4 4.0
5 9.0
6 10.0
7 11.0
8 8.0
9 13.0
Name: ps_car_15, dtype: float64
Features with missing values
Registration feature
Car features
ps_car_12: (with some approximations, e.g. 9.99 or 19.89 after rescaling) square roots of natural numbers, divided by 10
ps_car_15: square roots of natural numbers
# Pairwise scatter/KDE plots of the two car features, coloured by target,
# drawn on a random 5% sample of the training set.
var = ['ps_car_12', 'ps_car_15', 'target']
sample = trainset.sample(frac=0.05)[var]
sns.pairplot(data=sample, hue='target', diag_kind='kde', palette='Set1')
plt.show()
# Calculated features
# Per-feature density plots comparing rows with target != 0 against rows with
# target == 0, one panel per real-valued feature that metadata marks as preserved.
var = metadata[(metadata.type == 'real') & (metadata.preserve)].index
t1 = trainset.loc[trainset['target'] != 0]
t0 = trainset.loc[trainset['target'] == 0]

sns.set_style('whitegrid')

# Size the grid from the number of features (4 per row) instead of a fixed
# 3x4 layout, so extra features do not overflow the grid. Ten features still
# give the 3x4 grid at 16x12 inches.
n_cols = 4
n_rows = max(1, -(-len(var) // n_cols))  # ceiling division, at least one row
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
panels = ax.ravel()

for panel, feature in zip(panels, var):
    # NOTE(review): `bw` is deprecated in seaborn >= 0.11 in favour of
    # `bw_method` / `bw_adjust`; kept as-is until the installed version is confirmed.
    sns.kdeplot(t1[feature], bw=0.5, label="target = 1", ax=panel)
    sns.kdeplot(t0[feature], bw=0.5, label='target = 0', ax=panel)
    panel.set_ylabel('Density plot', fontsize=12)
    panel.set_xlabel(feature, fontsize=12)
    panel.tick_params(axis='both', which='major', labelsize=12)
    # Draw the legend explicitly so the two curves are always identifiable.
    panel.legend()

# Hide grid cells that have no feature assigned to them.
for panel in panels[len(var):]:
    panel.set_visible(False)

plt.show()
# Heatmap
def corr_heatmap(var):
    """Draw an annotated correlation heatmap for the given trainset columns.

    Args:
        var: Column labels of the module-level ``trainset`` DataFrame whose
            pairwise correlations (``DataFrame.corr()``) are plotted.

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    # A new 10x10 inch figure becomes the current axes that heatmap draws on.
    fig, ax = plt.subplots(figsize=(10, 10))

    # Diverging colour map between the two hues, centred on zero below.
    palette = sns.diverging_palette(50, 10, as_cmap=True)
    corr_matrix = trainset[var].corr()

    heatmap_options = dict(
        cmap=palette,
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidth=5,
        annot=True,
        cbar_kws={"shrink": .75},
    )
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.show()
# Correlation heatmap over the real-valued features that metadata marks as preserved.
real_and_kept = (metadata['type'] == 'real') & metadata['preserve']
var = metadata.loc[real_and_kept].index
corr_heatmap(var)

# Pair plots for a hand-picked subset of those features, coloured by target,
# drawn on a random 5% sample of the training set.
var = ['ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_15', 'target']
sample = trainset.sample(frac=0.05)[var]
sns.pairplot(data=sample, hue='target', diag_kind='kde', palette='Set1')
plt.show()