import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
os.environ['KAGGLE_USERNAME'] = 'YOUR ID'
os.environ['KAGGLE_KEY'] = 'YOUR PASSWORD'
# !rm *.* #deleting all files in this root
!kaggle datasets download -d austinreese/craigslist-carstrucks-data
!unzip '*.zip'
df = pd.read_csv('vehicles.csv')
df
id | url | region | region_url | price | year | manufacturer | model | condition | cylinders | ... | size | type | paint_color | image_url | description | county | state | lat | long | posting_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7222695916 | https://prescott.craigslist.org/cto/d/prescott... | prescott | https://prescott.craigslist.org | 6000 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | az | NaN | NaN | NaN |
1 | 7218891961 | https://fayar.craigslist.org/ctd/d/bentonville... | fayetteville | https://fayar.craigslist.org | 11900 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | ar | NaN | NaN | NaN |
2 | 7221797935 | https://keys.craigslist.org/cto/d/summerland-k... | florida keys | https://keys.craigslist.org | 21000 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | fl | NaN | NaN | NaN |
3 | 7222270760 | https://worcester.craigslist.org/cto/d/west-br... | worcester / central MA | https://worcester.craigslist.org | 1500 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | ma | NaN | NaN | NaN |
4 | 7210384030 | https://greensboro.craigslist.org/cto/d/trinit... | greensboro | https://greensboro.craigslist.org | 4900 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nc | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
426875 | 7301591192 | https://wyoming.craigslist.org/ctd/d/atlanta-2... | wyoming | https://wyoming.craigslist.org | 23590 | 2019.0 | nissan | maxima s sedan 4d | good | 6 cylinders | ... | NaN | sedan | NaN | https://images.craigslist.org/00o0o_iiraFnHg8q... | Carvana is the safer way to buy a car During t... | NaN | wy | 33.786500 | -84.445400 | 2021-04-04T03:21:31-0600 |
426876 | 7301591187 | https://wyoming.craigslist.org/ctd/d/atlanta-2... | wyoming | https://wyoming.craigslist.org | 30590 | 2020.0 | volvo | s60 t5 momentum sedan 4d | good | NaN | ... | NaN | sedan | red | https://images.craigslist.org/00x0x_15sbgnxCIS... | Carvana is the safer way to buy a car During t... | NaN | wy | 33.786500 | -84.445400 | 2021-04-04T03:21:29-0600 |
426877 | 7301591147 | https://wyoming.craigslist.org/ctd/d/atlanta-2... | wyoming | https://wyoming.craigslist.org | 34990 | 2020.0 | cadillac | xt4 sport suv 4d | good | NaN | ... | NaN | hatchback | white | https://images.craigslist.org/00L0L_farM7bxnxR... | Carvana is the safer way to buy a car During t... | NaN | wy | 33.779214 | -84.411811 | 2021-04-04T03:21:17-0600 |
426878 | 7301591140 | https://wyoming.craigslist.org/ctd/d/atlanta-2... | wyoming | https://wyoming.craigslist.org | 28990 | 2018.0 | lexus | es 350 sedan 4d | good | 6 cylinders | ... | NaN | sedan | silver | https://images.craigslist.org/00z0z_bKnIVGLkDT... | Carvana is the safer way to buy a car During t... | NaN | wy | 33.786500 | -84.445400 | 2021-04-04T03:21:11-0600 |
426879 | 7301591129 | https://wyoming.craigslist.org/ctd/d/atlanta-2... | wyoming | https://wyoming.craigslist.org | 30590 | 2019.0 | bmw | 4 series 430i gran coupe | good | NaN | ... | NaN | coupe | NaN | https://images.craigslist.org/00Y0Y_lEUocjyRxa... | Carvana is the safer way to buy a car During t... | NaN | wy | 33.779214 | -84.411811 | 2021-04-04T03:21:07-0600 |
426880 rows × 26 columns
df.columns
Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
'image_url', 'description', 'county', 'state', 'lat', 'long',
'posting_date'],
dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 426880 non-null int64
1 url 426880 non-null object
2 region 426880 non-null object
3 region_url 426880 non-null object
4 price 426880 non-null int64
5 year 425675 non-null float64
6 manufacturer 409234 non-null object
7 model 421603 non-null object
8 condition 252776 non-null object
9 cylinders 249202 non-null object
10 fuel 423867 non-null object
11 odometer 422480 non-null float64
12 title_status 418638 non-null object
13 transmission 424324 non-null object
14 VIN 265838 non-null object
15 drive 296313 non-null object
16 size 120519 non-null object
17 type 334022 non-null object
18 paint_color 296677 non-null object
19 image_url 426812 non-null object
20 description 426810 non-null object
21 county 0 non-null float64
22 state 426880 non-null object
23 lat 420331 non-null float64
24 long 420331 non-null float64
25 posting_date 426812 non-null object
dtypes: float64(5), int64(2), object(19)
memory usage: 84.7+ MB
df.describe()
id | price | year | odometer | county | lat | long | |
---|---|---|---|---|---|---|---|
count | 4.268800e+05 | 4.268800e+05 | 425675.000000 | 4.224800e+05 | 0.0 | 420331.000000 | 420331.000000 |
mean | 7.311487e+09 | 7.519903e+04 | 2011.235191 | 9.804333e+04 | NaN | 38.493940 | -94.748599 |
std | 4.473170e+06 | 1.218228e+07 | 9.452120 | 2.138815e+05 | NaN | 5.841533 | 18.365462 |
min | 7.207408e+09 | 0.000000e+00 | 1900.000000 | 0.000000e+00 | NaN | -84.122245 | -159.827728 |
25% | 7.308143e+09 | 5.900000e+03 | 2008.000000 | 3.770400e+04 | NaN | 34.601900 | -111.939847 |
50% | 7.312621e+09 | 1.395000e+04 | 2013.000000 | 8.554800e+04 | NaN | 39.150100 | -88.432600 |
75% | 7.315254e+09 | 2.648575e+04 | 2017.000000 | 1.335425e+05 | NaN | 42.398900 | -80.832039 |
max | 7.317101e+09 | 3.736929e+09 | 2022.000000 | 1.000000e+07 | NaN | 82.390818 | 173.885502 |
df.isna().sum()
id 0
url 0
region 0
region_url 0
price 0
year 1205
manufacturer 17646
model 5277
condition 174104
cylinders 177678
fuel 3013
odometer 4400
title_status 8242
transmission 2556
VIN 161042
drive 130567
size 306361
type 92858
paint_color 130203
image_url 68
description 70
county 426880
state 0
lat 6549
long 6549
posting_date 68
dtype: int64
df.columns
Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
'image_url', 'description', 'county', 'state', 'lat', 'long',
'posting_date'],
dtype='object')
df.drop(['id', 'url', 'region_url', 'VIN', 'image_url',
'description', 'county', 'state', 'lat', 'long',
'posting_date'], axis = 1, inplace = True)
df['age'] = 2023 - df['year']
df.drop(['year'], axis = 1, inplace = True)
df
region | price | manufacturer | model | condition | cylinders | fuel | odometer | title_status | transmission | drive | size | type | paint_color | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | prescott | 6000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | fayetteville | 11900 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | florida keys | 21000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | worcester / central MA | 1500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | greensboro | 4900 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
426875 | wyoming | 23590 | nissan | maxima s sedan 4d | good | 6 cylinders | gas | 32226.0 | clean | other | fwd | NaN | sedan | NaN | 4.0 |
426876 | wyoming | 30590 | volvo | s60 t5 momentum sedan 4d | good | NaN | gas | 12029.0 | clean | other | fwd | NaN | sedan | red | 3.0 |
426877 | wyoming | 34990 | cadillac | xt4 sport suv 4d | good | NaN | diesel | 4174.0 | clean | other | NaN | NaN | hatchback | white | 3.0 |
426878 | wyoming | 28990 | lexus | es 350 sedan 4d | good | 6 cylinders | gas | 30112.0 | clean | other | fwd | NaN | sedan | silver | 5.0 |
426879 | wyoming | 30590 | bmw | 4 series 430i gran coupe | good | NaN | gas | 22716.0 | clean | other | rwd | NaN | coupe | NaN | 4.0 |
426880 rows × 15 columns
df.describe()
price | odometer | age | |
---|---|---|---|
count | 4.268800e+05 | 4.224800e+05 | 425675.000000 |
mean | 7.519903e+04 | 9.804333e+04 | 11.764809 |
std | 1.218228e+07 | 2.138815e+05 | 9.452120 |
min | 0.000000e+00 | 0.000000e+00 | 1.000000 |
25% | 5.900000e+03 | 3.770400e+04 | 6.000000 |
50% | 1.395000e+04 | 8.554800e+04 | 10.000000 |
75% | 2.648575e+04 | 1.335425e+05 | 15.000000 |
max | 3.736929e+09 | 1.000000e+07 | 123.000000 |
df.columns
Index(['region', 'price', 'manufacturer', 'model', 'condition', 'cylinders',
'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
'type', 'paint_color', 'age'],
dtype='object')
#범주형 컬럼
cat_list = ['region', 'manufacturer', 'model', 'condition', 'cylinders',
'fuel', 'title_status', 'transmission', 'drive', 'size',
'type', 'paint_color']
#수치형 컬럼
num_list = ['price', 'odometer', 'age']
df['manufacturer'].unique()
array([nan, 'gmc', 'chevrolet', 'toyota', 'ford', 'jeep', 'nissan', 'ram',
'mazda', 'cadillac', 'honda', 'dodge', 'lexus', 'jaguar', 'buick',
'chrysler', 'volvo', 'audi', 'infiniti', 'lincoln', 'alfa-romeo',
'subaru', 'acura', 'hyundai', 'mercedes-benz', 'bmw', 'mitsubishi',
'volkswagen', 'porsche', 'kia', 'rover', 'ferrari', 'mini',
'pontiac', 'fiat', 'tesla', 'saturn', 'mercury', 'harley-davidson',
'datsun', 'aston-martin', 'land rover', 'morgan'], dtype=object)
df['manufacturer'].value_counts()
ford 70985
chevrolet 55064
toyota 34202
honda 21269
nissan 19067
jeep 19014
ram 18342
gmc 16785
bmw 14699
dodge 13707
mercedes-benz 11817
hyundai 10338
subaru 9495
volkswagen 9345
kia 8457
lexus 8200
audi 7573
cadillac 6953
chrysler 6031
acura 5978
buick 5501
mazda 5427
infiniti 4802
lincoln 4220
volvo 3374
mitsubishi 3292
mini 2376
pontiac 2288
rover 2113
jaguar 1946
porsche 1384
mercury 1184
saturn 1090
alfa-romeo 897
tesla 868
fiat 792
harley-davidson 153
ferrari 95
datsun 63
aston-martin 24
land rover 21
morgan 3
Name: manufacturer, dtype: int64
sns.countplot(x = 'manufacturer', data = df.fillna('n/a'), order = df.fillna('n/a')['manufacturer'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
len(df['model'].value_counts())
29667
for model, num in zip(df['model'].value_counts().index, df['model'].value_counts()):
print(model, '\t', num)
sns.countplot(x = 'model', data = df.fillna('n/a'), order = df.fillna('n/a')['model'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'condition', data = df.fillna('n/a'), order = df.fillna('n/a')['condition'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'cylinders', data = df.fillna('n/a'), order = df.fillna('n/a')['cylinders'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'fuel', data = df.fillna('n/a'), order = df.fillna('n/a')['fuel'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'transmission', data = df.fillna('n/a'), order = df.fillna('n/a')['transmission'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'drive', data = df.fillna('n/a'), order = df.fillna('n/a')['drive'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'size', data = df.fillna('n/a'), order = df.fillna('n/a')['size'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'type', data = df.fillna('n/a'), order = df.fillna('n/a')['type'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.countplot(x = 'paint_color', data = df.fillna('n/a'), order = df.fillna('n/a')['paint_color'].value_counts().index)
plt.xticks(rotation = 90)
plt.show()
sns.boxplot(x = 'manufacturer', y = 'price', data = df.fillna('n/a'))
df.columns
Index(['region', 'price', 'manufacturer', 'model', 'condition', 'cylinders',
'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
'type', 'paint_color', 'age'],
dtype='object')
sns.boxplot(x = 'price', data = df)
sns.rugplot(x = 'price', data = df, height = 1)
sns.histplot(x = 'odometer', data = df)
sns.histplot(x = 'age', data = df, bins = 18, kde = True)
범주형 데이터를 아래 방법 중 적절히 판단하여 처리
1. 결손데이터가 포함된 row 제거
2. 결손데이터를 others 범주로 변경
3. 지나치게 소수로 이루어진 범주르 others 범주로 변경
4. classifier 를 학습, 결손데이터 추정해서 채우기
df.columns
Index(['region', 'price', 'manufacturer', 'model', 'condition', 'cylinders',
'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
'type', 'paint_color', 'age'],
dtype='object')
col = 'manufacturer'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts)), counts)
n_categorical = 10
df[col] = df[col].apply(lambda s: s if s not in counts.index[n_categorical:] else 'others')
df[col].value_counts()
others 139807
ford 70985
chevrolet 55064
toyota 34202
honda 21269
nissan 19067
jeep 19014
ram 18342
gmc 16785
bmw 14699
Name: manufacturer, dtype: int64
col = 'region'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts)), counts)
plt.grid()
n_categorical = 5
df[col] = df[col].apply(lambda s: s if s not in counts.index[n_categorical:] else 'others')
df[col].value_counts()
others 410754
columbus 3608
jacksonville 3562
spokane / coeur d'alene 2988
eugene 2985
fresno / madera 2983
Name: region, dtype: int64
col = 'model'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 10
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
others 386690
f-150 8009
silverado 1500 5140
1500 4211
camry 3135
silverado 3023
accord 2969
wrangler 2848
civic 2799
altima 2779
Name: model, dtype: int64
col = 'condition'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 3
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
good 121456
excellent 101467
others 29853
Name: condition, dtype: int64
col = 'cylinders'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 4
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
6 cylinders 94169
4 cylinders 77642
8 cylinders 72062
others 5329
Name: cylinders, dtype: int64
col = 'fuel'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 2
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
gas 356209
others 36930
other 30728
Name: fuel, dtype: int64
col = 'title_status'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 2
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
clean 405117
others 13521
Name: title_status, dtype: int64
col = 'transmission'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 1
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
automatic 336524
others 87800
Name: transmission, dtype: int64
col = 'drive'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
df[col].fillna('others', inplace = True)
df[col].value_counts()
4wd 131904
others 130567
fwd 105517
rwd 58892
Name: drive, dtype: int64
n_categorical = 1
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
others 294976
4wd 131904
Name: drive, dtype: int64
col = 'size'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 2
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
full-size 63465
others 57054
Name: size, dtype: int64
col = 'type'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 8
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
sedan 87056
SUV 77284
pickup 43510
truck 35279
others 32981
other 22110
coupe 19204
hatchback 16598
Name: type, dtype: int64
df.loc[df[col] == 'other', col] = 'others'
df[col].value_counts()
sedan 87056
SUV 77284
others 55091
pickup 43510
truck 35279
coupe 19204
hatchback 16598
Name: type, dtype: int64
col = 'paint_color'
counts = df[col].fillna('others').value_counts()
plt.plot(range(len(counts))[:20], counts[:20])
plt.grid()
n_categorical = 7
list_ = counts.index[n_categorical:] #efficient
df[col] = df[col].apply(lambda s: s if s not in list_ else 'others')
df[col].value_counts()
white 79285
black 62861
silver 42970
blue 31223
red 30473
others 25449
grey 24416
Name: paint_color, dtype: int64
df.loc[df[col] == 'other', col] = 'others'
df[col].value_counts()
white 79285
black 62861
silver 42970
blue 31223
red 30473
others 25449
grey 24416
Name: paint_color, dtype: int64
df.drop(['title_status'], axis = 1, inplace = True)
sns.boxplot(x = 'manufacturer', y = 'price', data = df)
sns.boxplot(x = 'model', y = 'price', data = df)
sns.boxplot(x = 'price', data = df)
sns.rugplot(x = 'price', data = df, height = 1)
sns.histplot(x = 'odometer', data = df)
sns.histplot(x = 'age', data = df, kde = True)
p1 = df['price'].quantile(0.99)
p2 = df['price'].quantile(0.10)
print(p1, p2)
66995.0 500.0
df = df[(p1 > df['price']) & (p2 < df['price'])]
o1 = df['odometer'].quantile(0.99)
o2 = df['odometer'].quantile(0.10)
print(o1, o2)
280000.0 15812.0
df = df[(o1 > df['odometer']) & (o2 < df['odometer'])]
sns.boxplot(x = 'manufacturer', y = 'price', data = df)
sns.boxplot(x = 'model', y = 'price', data = df)
sns.boxplot(x = 'price', data = df)
df.describe()
price | odometer | age | |
---|---|---|---|
count | 335851.000000 | 335851.000000 | 335248.000000 |
mean | 17534.838914 | 101335.798690 | 12.239056 |
std | 12645.528274 | 56884.388094 | 8.469817 |
min | 501.000000 | 15813.000000 | 1.000000 |
25% | 7425.500000 | 53000.000000 | 7.000000 |
50% | 14500.000000 | 96936.000000 | 10.000000 |
75% | 25226.500000 | 140337.000000 | 15.000000 |
max | 66992.000000 | 279974.000000 | 123.000000 |
df.corr()['price']
price 1.000000
odometer -0.527293
age -0.401429
Name: price, dtype: float64
sns.heatmap(df.corr()[['price']], annot = True, cmap = 'YlOrRd')
from sklearn.preprocessing import StandardScaler
X_num = df[['odometer', 'age']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_num)
X_scaled = pd.DataFrame(X_scaled, index = X_num.index, columns = X_num.columns)
X_cat = df.drop(['odometer', 'age', 'price'], axis = 1)
X_cat = pd.get_dummies(X_cat)
X = pd.concat([X_scaled, X_cat], axis = 1)
y = df['price']
X.head()
odometer | age | region_columbus | region_eugene | region_fresno / madera | region_jacksonville | region_others | region_spokane / coeur d'alene | manufacturer_bmw | manufacturer_chevrolet | ... | type_pickup | type_sedan | type_truck | paint_color_black | paint_color_blue | paint_color_grey | paint_color_others | paint_color_red | paint_color_silver | paint_color_white | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
27 | -0.763177 | -0.382424 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
28 | -0.529264 | 0.089842 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
29 | -1.444613 | -1.090823 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
30 | -1.058496 | -0.736623 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
31 | 0.468744 | -0.264357 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 58 columns
X.isna().sum()
odometer 0
age 603
region_columbus 0
region_eugene 0
region_fresno / madera 0
region_jacksonville 0
region_others 0
region_spokane / coeur d'alene 0
manufacturer_bmw 0
manufacturer_chevrolet 0
manufacturer_ford 0
manufacturer_gmc 0
manufacturer_honda 0
manufacturer_jeep 0
manufacturer_nissan 0
manufacturer_others 0
manufacturer_ram 0
manufacturer_toyota 0
model_1500 0
model_accord 0
model_altima 0
model_camry 0
model_civic 0
model_f-150 0
model_others 0
model_silverado 0
model_silverado 1500 0
model_wrangler 0
condition_excellent 0
condition_good 0
condition_others 0
cylinders_4 cylinders 0
cylinders_6 cylinders 0
cylinders_8 cylinders 0
cylinders_others 0
fuel_gas 0
fuel_other 0
fuel_others 0
transmission_automatic 0
transmission_others 0
drive_4wd 0
drive_others 0
size_full-size 0
size_others 0
type_SUV 0
type_coupe 0
type_hatchback 0
type_others 0
type_pickup 0
type_sedan 0
type_truck 0
paint_color_black 0
paint_color_blue 0
paint_color_grey 0
paint_color_others 0
paint_color_red 0
paint_color_silver 0
paint_color_white 0
dtype: int64
X['age'].mean()
-6.816164267063434e-17
X.fillna(0.0, inplace = True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
ex) linear regression, random forest, neural network etc.