โถ olist_orders_dataset ํ
์ด๋ธ
1-1. order_status = 'canceled' ํ ์ญ์
1-2. ํ๋งค ์ฌ์ดํด์ ๋ง์ง ์๋ ๋ฐ์ดํฐ ์ญ์
SELECT * FROM olist_orders_dataset WHERE order_status = 'canceled';
DELECT FROM olist_orders_dataset WHERE order_status = 'canceled';
โถ olist_sellers_dataset ํ
์ด๋ธ
2. seller_city๊ฐ seller_state์ ํฌํจ๋์ง ์์ ์ด์ ๋ฐ์
SELECT SD.seller_id, SD.seller_zip_code_prefix, GD.geolocation_state
FROM olist_sellers_dataset SD
JOIN
(SELECT
DISTINCT geolocation_zip_code_prefix AS zip_code,
geolocation_state FROM olist_geolocation_dataset) GD
ON SD.seller_zip_code_prefix = GD.zip_code
-> ์ด์ ํด๊ฒฐ
โถ ๋ชจ๋ ๋ฐ์ดํฐ ์กฐ์ธ
# ๋ฐ์ดํฐ ํฉ์น๊ธฐ
olist_df = pd.merge(orders_df, order_payments_df, on = 'order_id')
olist_df = olist_df.merge(customer_df, on = 'customer_id')
olist_df = olist_df.merge(order_items_df, on = 'order_id')
olist_df = olist_df.merge(products_df, on = 'product_id')
olist_df = olist_df.merge(category_name_df, on = 'product_category_name')
olist_df = olist_df.merge(order_reviews_df, on = 'order_id')
olist_df = olist_df.merge(sellers_df, on = 'seller_id')
olist_df.info()
# ์ค๋ณต ํ ์ ๊ฑฐ
olist_df.duplicated(subset = 'order_id').value_counts()
olist_df = olist_df[~olist_df.duplicated(['order_id'])]
# ์ธ๋ฑ์ค ์ด๊ธฐํ
olist_df = olist_df.reset_index()
olist_df = olist_df.drop(columns = {'index'})
โถ ๋ ์ง ์ปฌ๋ผ ํ๋ณํํ ์ฐ, ์, ์์ผ๋ณ๋ก ๋ถํด
# datetime ํ์์ผ๋ก ๊ตฌ๋งค ์ผ์ ๋ณ๊ฒฝ
date_time = olist_df['order_purchase_timestamp'].str.split()
date_list = []
time_list = []
for x in range(date_time.shape[0]) :
date_list.append(date_time[x][0])
time_list.append(date_time[x][1])
olist_df['purchase_date'], olist_df['purchase_time'] = date_list, time_list
olist_df = olist_df.drop(columns = {'order_purchase_timestamp'})
olist_df['purchase_date'] = pd.to_datetime(olist_df['purchase_date'])
# ๊ตฌ๋งค ์ผ์๋ฅผ ์ฐ, ์, ์์ผ๋ณ๋ก ๋ถํด
olist_df['year'] = olist_df['purchase_date'].dt.year
olist_df['month'] = olist_df['purchase_date'].dt.month
olist_df['day_of_week'] = olist_df['purchase_date'].dt.day_name()
# olist_df['purchase_time']์ ์๊ฐ ๊ฐ๋ง ์ ์ฅ
olist_df['purchase_time'] = olist_df['purchase_time'].str.slice(0, 2)
โถ ๋ธ๋ผ์ง์ ์ฃผ ์ด๋ฆ์ ํ๊ธ ๋ช ์ ์ปฌ๋ผ์ผ๋ก ๋ฐ๋ก ์ ์ฅ(์๊ฐํ์ ์ฉ์ดํ๊ธฐ ์ํด)
# ์ฃผ ์ด๋ฆ์ ํ๊ธ๋ก ๋ฐ๊พธ๋ ํจ์
def get_kor_state(state) :
if state == 'AC' :
state_kor_name = '์ํฌ๋ฆฌ์ฃผ'
elif state == 'AL' :
state_kor_name = '์๋ผ๊ณ ์์ค์ฃผ'
elif state == 'AP' :
state_kor_name = '์๋งํ์ฃผ'
elif state == 'AM' :
state_kor_name = '์๋ง์กฐ๋์ค์ฃผ'
elif state == 'BA' :
state_kor_name = '๋ฐ์ด์์ฃผ'
elif state == 'CE' :
state_kor_name = '์ธ์๋ผ์ฃผ'
elif state == 'DF' :
state_kor_name = '์ฐ๋ฐฉ๊ตฌ'
elif state == 'ES' :
state_kor_name = '์ด์คํผ๋ฆฌํฌ์ฐํฌ์ฃผ'
elif state == 'GO' :
state_kor_name = '๊ณ ์ด์์ค์ฃผ'
elif state == 'MA' :
state_kor_name = '๋ง๋ผ๋ฅ์ฃผ'
elif state == 'MT' :
state_kor_name = '๋งํฌ๊ทธ๋ก์์ฃผ'
elif state == 'MG' :
state_kor_name = '๋ฏธ๋์ค์ ๋ผ์ด์ค์ฃผ'
elif state == 'PA' :
state_kor_name = 'ํ๋ผ์ฃผ'
elif state == 'PB' :
state_kor_name = 'ํ๋ผ์ด๋ฐ์ฃผ'
elif state == 'PR' :
state_kor_name = 'ํ๋ผ๋์ฃผ'
elif state == 'PE' :
state_kor_name = 'ํ๋ฅด๋จ๋ถ์ฟ ์ฃผ'
elif state == 'PI' :
state_kor_name = 'ํผ์์ฐ์ด์ฃผ'
elif state == 'RJ' :
state_kor_name = '๋ฆฌ์ฐ๋ฐ์๋ค์ด๋ฃจ์ฃผ'
elif state == 'RN' :
state_kor_name = 'ํ์ฐ๊ทธ๋์ง๋๋
ธ๋ฅด์น์ฃผ'
elif state == 'RS' :
state_kor_name = 'ํ์ฐ๊ทธ๋์ง๋์ ์ฃผ'
elif state == 'RO' :
state_kor_name = 'ํผ๋๋์์ฃผ'
elif state == 'RR' :
state_kor_name = 'ํธ๋ผ์ด๋ง์ฃผ'
elif state == 'SC' :
state_kor_name = '์ฐํ์นดํ๋ฆฌ๋์ฃผ'
elif state == 'SP' :
state_kor_name = '์ํ์ธ๋ฃจ์ฃผ'
elif state == 'SE' :
state_kor_name = '์ธ๋ฅด์งํผ์ฃผ'
elif state == 'MS' :
state_kor_name = '๋งํฌ๊ทธ๋ก์๋์ '
else :
state_kor_name = 'ํ ์นธ์นญ์ค์ฃผ'
return state_kor_name
# ์ฃผ์ ํ๊ธ ์ด๋ฆ ์ปฌ๋ผ ์ถ๊ฐ
for index, row in olist_df.iterrows():
state = row['customer_state']
kor_name = get_kor_state(state)
olist_df.at[index,'kor_state'] = kor_name
โถ ์ฃผ์ ์์น์ ๋ฐ๋ผ ์ง์ญ๋ณ๋ก ๋๋์ด ์๋ก์ด ์ปฌ๋ผ์ ์ ์ฅ
# state๋ฅผ ์
๋ ฅ๋ฐ์ผ๋ฉด ์ง์ญ์ returnํ๋ ํจ์
def get_region(state) :
if (state == 'SP' or state == 'MG' or state == 'ES' or state == 'RJ') :
region = '๋จ๋๋ถ'
elif (state == 'PR' or state == 'SC' or state == 'RS') :
region = '๋จ๋ถ'
elif (state == 'BA' or state == 'PE' or state == 'CE' or state == 'RN' or state == 'PI' or state == 'MA' or state == 'SE' or state == 'AL' or state == 'PB') :
region = '๋ถ๋๋ถ'
elif (state == 'GO' or state == 'MT' or state == 'MS' or state == 'DF') :
region = '์ค์๋ถ'
else :
region = '๋ถ๋ถ'
return region
# ํ๋งค์ ๊ฑฐ์ฃผ ์ง์ญ ์ปฌ๋ผ ์ถ๊ฐ
for index, row in olist_df.iterrows():
state = row['seller_state']
region = get_region(state)
olist_df.at[index,'seller_region'] = region
# ๊ตฌ๋งค์ ๊ฑฐ์ฃผ ์ง์ญ ์ปฌ๋ผ ์ถ๊ฐ
for index, row in olist_df.iterrows():
state = row['customer_state']
region = get_region(state)
olist_df.at[index,'customer_region'] = region
customer_state_df = olist_df['customer_state']
customer_state_df = pd.DataFrame(customer_state_df.value_counts())
customer_state_df = customer_state_df.reset_index().rename(columns = {'index' : 'state'})
px.pie(customer_state_df, values='customer_state', names='state',
color_discrete_sequence=px.colors.qualitative.Pastel1 + px.colors.qualitative.Pastel2)
category_df = pd.DataFrame(olist_df['product_category_name_english'].value_counts())
category_df = category_df.reset_index().rename(columns = {'index' : 'category', 'product_category_name_english' : 'count'})
category_df
px.bar(category_df, x="category", y='count',
labels={"category":"์นดํ
๊ณ ๋ฆฌ","count":"์ด ์ฃผ๋ฌธ๊ฑด์"},
title='์นดํ
๊ณ ๋ฆฌ๋ณ ์ฃผ๋ฌธ ๊ฑด์', color = px.colors.qualitative.Pastel1 + px.colors.qualitative.Pastel2 + px.colors.qualitative.Pastel + px.colors.qualitative.Light24 + px.colors.qualitative.Safe + px.colors.qualitative.Set2,
color_discrete_map='identity')
date_purchase_df = olist_df[['payment_value', 'purchase_date', 'year', 'month', 'day_of_week']]
date_purchase_df['purchase_date'].describe()
date_purchase_pt = pd.pivot_table(data = date_purchase_df,
index = 'purchase_date',
values = 'payment_value',
aggfunc = 'sum').reset_index()
date_purchase_pt
px.line(date_purchase_pt, x = 'purchase_date', y = 'payment_value',
labels={"purchase_date":"๋ ์ง","payment_value":"๋งค์ถ์ก"},
title='๋งค์ถ์ก ์ถ์ด')
# ์ฐ๋๋ณ ๋งค์ถ์ก์ ํฉ๊ณ ํผ๋ฒํ
์ด๋ธ ์์ฑ
year_purchase_pt = pd.pivot_table(data = date_purchase_df,
index = 'year',
values = 'payment_value',
aggfunc = 'sum').reset_index()
# ์ฐ๋๋ณ ์ฃผ๋ฌธ ๊ฑด์ ํผ๋ฒํ
์ด๋ธ ์์ฑ
year_purchase_pt2 = pd.pivot_table(data = date_purchase_df,
index = 'year',
values = 'payment_value',
aggfunc = 'count').reset_index()
year_purchase_pt = pd.merge(year_purchase_pt, year_purchase_pt2, on = 'year')
year_purchase_pt = year_purchase_pt.rename(columns = {'payment_value_x' : 'value_sum', 'payment_value_y' : 'value_count'})
year_purchase_pt
# ๋ง๋๊ทธ๋ํ = ์ฐ๋๋ณ ์ฃผ๋ฌธ ๊ฑด์, ์ ๊ทธ๋ํ = ์ฐ๋๋ณ ๋งค์ถ์ก
matplotlib.rc_file_defaults()
ax1 = sns.set_style(style=None, rc=None )
fig, ax1 = plt.subplots(figsize=(12,6))
sns.lineplot(data = year_purchase_pt['value_sum'], marker='o', sort = False, ax=ax1)
ax2 = ax1.twinx()
sns.barplot(year_purchase_pt, x='year', y='value_count', alpha=0.5, ax=ax2)
# ์๋ณ ๋งค์ถ์ก์ ํฉ๊ณ ํผ๋ฒํ
์ด๋ธ ์์ฑ
month_purchase_pt = pd.pivot_table(data = date_purchase_df,
index = 'month',
values = 'payment_value',
aggfunc = 'sum').reset_index()
# ์๋ณ ์ฃผ๋ฌธ ๊ฑด์ ํผ๋ฒํ
์ด๋ธ ์์ฑ
month_purchase_pt2 = pd.pivot_table(data = date_purchase_df,
index = 'month',
values = 'payment_value',
aggfunc = 'count').reset_index()
month_purchase_pt = pd.merge(month_purchase_pt, month_purchase_pt2, on = 'month')
month_purchase_pt = month_purchase_pt.rename(columns = {'payment_value_x' : 'value_sum', 'payment_value_y' : 'value_count'})
month_purchase_pt
# ๋ง๋๊ทธ๋ํ = ์๋ณ ์ฃผ๋ฌธ ๊ฑด์, ์ ๊ทธ๋ํ = ์๋ณ ๋งค์ถ์ก
matplotlib.rc_file_defaults()
ax1 = sns.set_style(style=None, rc=None )
fig, ax1 = plt.subplots(figsize=(12,6))
sns.lineplot(data = month_purchase_pt['value_sum'], marker='o', sort = False, ax=ax1)
ax2 = ax1.twinx()
sns.barplot(month_purchase_pt, x='month', y='value_count', alpha=0.5, ax=ax2)
# ์์ผ๋ณ ๋งค์ถ์ก์ ํฉ๊ณ ํผ๋ฒํ
์ด๋ธ ์์ฑ
dayofweek_purchase_pt = pd.pivot_table(data = date_purchase_df,
index = 'day_of_week',
values = 'payment_value',
aggfunc = 'sum').reset_index()
# ์์ผ๋ณ ์ฃผ๋ฌธ ๊ฑด์ ํผ๋ฒํ
์ด๋ธ ์์ฑ
dayofweek_purchase_pt2 = pd.pivot_table(data = date_purchase_df,
index = 'day_of_week',
values = 'payment_value',
aggfunc = 'count').reset_index()
dayofweek_purchase_pt = pd.merge(dayofweek_purchase_pt, dayofweek_purchase_pt2, on = 'day_of_week')
dayofweek_purchase_pt = dayofweek_purchase_pt.rename(columns = {'payment_value_x' : 'value_sum', 'payment_value_y' : 'value_count'})
# ์ธ๋ฑ์ค ์ฌ๋ฐฐ์ด
dayofweek_purchase_pt = dayofweek_purchase_pt.reindex([3, 1, 5, 6, 4, 0, 2])
dayofweek_purchase_pt = dayofweek_purchase_pt.reset_index()
dayofweek_purchase_pt = dayofweek_purchase_pt.drop(columns = {'index'})
dayofweek_purchase_pt
# ๋ง๋๊ทธ๋ํ = ์์ผ๋ณ ์ฃผ๋ฌธ ๊ฑด์, ์ ๊ทธ๋ํ = ์์ผ๋ณ ๋งค์ถ์ก
matplotlib.rc_file_defaults()
ax1 = sns.set_style(style=None, rc=None )
fig, ax1 = plt.subplots(figsize=(12,6))
sns.lineplot(data = dayofweek_purchase_pt['value_sum'], marker='o', sort = False, ax=ax1)
ax2 = ax1.twinx()
sns.barplot(dayofweek_purchase_pt, x='day_of_week', y='value_count', alpha=0.5, ax=ax2)
time_purchase_df = olist_df[['payment_value', 'purchase_time']]
# ์๊ฐ๋ณ ๋งค์ถ์ก์ ํฉ๊ณ ํผ๋ฒํ
์ด๋ธ ์์ฑ
time_purchase_pt = pd.pivot_table(data = time_purchase_df,
index = 'purchase_time',
values = 'payment_value',
aggfunc = 'sum').reset_index()
# ์๊ฐ๋ณ ์ฃผ๋ฌธ ๊ฑด์ ํผ๋ฒํ
์ด๋ธ ์์ฑ
time_purchase_pt2 = pd.pivot_table(data = time_purchase_df,
index = 'purchase_time',
values = 'payment_value',
aggfunc = 'count').reset_index()
time_purchase_pt = pd.merge(time_purchase_pt, time_purchase_pt2, on = 'purchase_time')
time_purchase_pt = time_purchase_pt.rename(columns = {'payment_value_x' : 'value_sum', 'payment_value_y' : 'value_count'})
time_purchase_pt
# ๋ง๋๊ทธ๋ํ = ์๊ฐ๋๋ณ ์ฃผ๋ฌธ ๊ฑด์, ์ ๊ทธ๋ํ = ์๊ฐ๋๋ณ ๋งค์ถ์ก
matplotlib.rc_file_defaults()
ax1 = sns.set_style(style=None, rc=None )
fig, ax1 = plt.subplots(figsize=(12,6))
sns.lineplot(data = time_purchase_pt['value_sum'], marker='o', sort = False, ax=ax1)
ax2 = ax1.twinx()
sns.barplot(time_purchase_pt, x='purchase_time', y='value_count', alpha=0.5, ax=ax2)
category_delivery_df = olist_df[['product_category_name_english', 'purchase_date', 'order_delivered_customer_date']].dropna(axis = 0).reset_index().drop(columns = {'index'})
# ๋ฐฐ์ก๋ ๋ ์ง๋ง ์ถ์ถ
delivered_date = category_delivery_df['order_delivered_customer_date'].str.split()
delivered_date_list = []
for x in range(delivered_date.shape[0]) :
delivered_date_list.append(delivered_date[x][0])
# ์ปฌ๋ผ์ ์ ์ฅ
category_delivery_df['order_delivered_customer_date'] = pd.to_datetime(delivered_date_list)
# ์ฃผ๋ฌธ ๋ ์ง์ ๋ฐฐ์ก ๋์ฐฉ ๋ ์ง์ ์ฐจ์ด๋ฅผ ๊ณ์ฐํ์ฌ ์ปฌ๋ผ์ ์ ์ฅ
category_delivery_df['delivered_date'] = category_delivery_df['order_delivered_customer_date'] - category_delivery_df['purchase_date']
category_delivery_df = category_delivery_df.drop(columns = {'order_delivered_customer_date', 'purchase_date'})
# ์นดํ
๊ณ ๋ฆฌ๋ณ ๋ฐฐ์ก๊น์ง ๊ฑธ๋ฆฐ ๋ ์ง์ ํ๊ท ํผ๋ฒํ
์ด๋ธ ์์ฑ
category_delivery_pt = pd.pivot_table(data = category_delivery_df,
index = 'product_category_name_english',
values = 'delivered_date',
aggfunc = 'mean').reset_index()
# ๋ฐฐ์ก๊น์ง ๊ฑธ๋ฆฐ ๋ ์ง๋ฅผ ๊ธฐ์ค์ผ๋ก ์ค๋ฆ์ฐจ์์ผ๋ก ์ ๋ ฌ
category_delivery_pt = category_delivery_pt.sort_values(by = 'delivered_date', ascending=True)
category_delivery_pt['delivered_date'] = category_delivery_pt['delivered_date'].dt.days
px.line(category_delivery_pt,
x = 'product_category_name_english',
y = 'delivered_date',
labels={'product_category_name_english' : '์นดํ
๊ณ ๋ฆฌ ๋ช
', 'delivered_date' : 'ํ๊ท ๋ฐฐ์ก ์๊ฐ'},
title = '์นดํ
๊ณ ๋ฆฌ๋ณ ํ๊ท ๋ฐฐ์ก ์๊ฐ')
delivery_score_df = olist_df[['order_delivered_customer_date', 'purchase_date', 'review_score']].dropna(axis = 0).reset_index()
delivery_score_df = delivery_score_df.drop(columns = {'index'})
# ๋ฐฐ์ก๋ ๋ ์ง๋ง ์ถ์ถ
delivered_date = delivery_score_df['order_delivered_customer_date'].str.split()
# ๋ฆฌ์คํธ์ ์ ์ฅ
delivered_date_list = []
for x in range(delivered_date.shape[0]) :
delivered_date_list.append(delivered_date[x][0])
# datetime ํ์์ผ๋ก ์ปฌ๋ผ์ ์ ์ฅ
delivery_score_df['order_delivered_customer_date'] = pd.to_datetime(delivered_date_list)
# ์ฃผ๋ฌธ ์๊ฐ๊ณผ ๋ฐฐ์ก ๋์ฐฉ ๋ ์ง์ ์ฐจ์ด๋ฅผ ์ปฌ๋ผ์ ์ ์ฅํ๊ณ ๋๋จธ์ง ์ปฌ๋ผ ์ญ์
delivery_score_df['delivered_date'] = delivery_score_df['order_delivered_customer_date'] - delivery_score_df['purchase_date']
delivery_score_df = delivery_score_df.drop(columns = {'order_delivered_customer_date', 'purchase_date'})
delivery_score_df['delivered_date'] = delivery_score_df['delivered_date'].dt.days
delivery_score_df
# ๋ฐฐ์ก๊น์ง ๊ฑธ๋ฆฐ ๋ ์ง ๋ฒ์ฃผํ
delivery_dic = {1 : '0~5days', 2 : '5~10days', 3 : '10~15days', 4 : '15~20days', 5 : '20days~'}
delivery_legend_list = []
for date in delivery_score_df['delivered_date'] :
if 0 <= date < 5 :
delivery_legend_list.append(1)
elif 5 <= date < 10 :
delivery_legend_list.append(2)
elif 10 <= date < 15 :
delivery_legend_list.append(3)
elif 15 <= date < 20 :
delivery_legend_list.append(4)
else :
delivery_legend_list.append(5)
delivery_score_df['delivery_legend'] = delivery_legend_list
# ๋ ์ง ๋ฒ์ฃผํ๋น ํ๊ท ๋ฆฌ๋ทฐ ์ ์๋ฅผ ๊ตฌํ๋ ํผ๋ฒํ
์ด๋ธ ์์ฑ
delivery_score_pt = pd.pivot_table(data = delivery_score_df,
index = 'delivery_legend',
values = 'review_score',
aggfunc = 'mean').reset_index()
# ๋ฐ์ดํฐ๋ฅผ delivery_dic์ value ๊ฐ์ผ๋ก ๋ณ๊ฒฝ
delivery_score_pt['delivery_legend'] = delivery_dic.values()
delivery_score_pt
px.bar(delivery_score_pt, x = 'delivery_legend', y = 'review_score',
labels = {'delivery_legend' : '๋ฐฐ์ก์๊ฐ', 'review_score' : 'ํ๊ท ํ์ '},
title='๋ฐฐ์ก์๊ฐ๋ณ ํ๊ท ํ์ ', color=["rgb(239,209,159)", "rgb(255,185,144)", "rgb(255,190,159)", "rgb(255,163,139)", "rgb(255,179,171)"],
color_discrete_map="identity")
# ์๋์ ๊ฒฝ๋๋ฅผ ํตํด ๊ฑฐ๋ฆฌ(๋จ์ : km) ๊ตฌํ๋ ํจ์
def haversine_distance(lat1, lon1, lat2, lon2):
# ์ง๊ตฌ์ ๋ฐ์ง๋ฆ (ํ๊ท ๋ฐ์ง๋ฆ์ ์ฝ 6,371 km)
R = 6371.0
# ์๋์ ๊ฒฝ๋๋ฅผ ๋ผ๋์์ผ๋ก ๋ณํ
lat1 = math.radians(lat1)
lon1 = math.radians(lon1)
lat2 = math.radians(lat2)
lon2 = math.radians(lon2)
# ์๋์ ๊ฒฝ๋ ๊ฐ์ ์ฐจ์ด ๊ณ์ฐ
dlat = lat2 - lat1
dlon = lon2 - lon1
# Haversine ๊ณต์์ ์ฌ์ฉํ์ฌ ๊ฑฐ๋ฆฌ ๊ณ์ฐ
a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
distance = R * c
return distance
๋ค์๊ณผ ๊ฐ์ ํจ์๋ฅผ ๋ง๋ค์ด ๊ตฌ๋งค์์ ์๋, ๊ฒฝ๋์ ํ๋งค์์ ์๋, ๊ฒฝ๋๋ฅผ ํตํ ๊ฑฐ๋ฆฌ๋ฅผ ์์๋ณด์๋ค.
px.bar(distance_dif_df, x = '๋ฐฐ์ก ์๊ฐ', y = 'ํ๊ท ๊ฑฐ๋ฆฌ(km)', color = '๋ฐฐ์ก ์๊ฐ')
# ๊ตฌ๋งค์ ์ฃผ์ ๋ฐฐ์ก๋น ์ปฌ๋ผ๋ง ์ถ์ถ
state_freight_value_df = olist_df[['customer_state', 'freight_value']]
# ๊ตฌ๋งค์ ์ฃผ๋ณ๋ก ํ๊ท ๋ฐฐ์ก๋น ํผ๋ฒํ
์ด๋ธ ์์ฑ
state_freight_value_pt = pd.pivot_table(data = state_freight_value_df,
values = 'freight_value', index = 'customer_state', aggfunc='mean')
state_freight_value_pt = state_freight_value_pt.reset_index()
map_freight_value = folium.Map(location=[-15.7801, -47.9292], tiles="cartodbpositron", zoom_start=4)
folium.Choropleth(
geo_data=geo,
data=state_freight_value_pt,
columns=['customer_state', 'freight_value'],
key_on='id',
fill_color='PuRd',
fill_opacity=0.7,
line_opacity=0.5,
).add_to(map_freight_value)
map_freight_value
# ํ๋งค์์ ์ฃผ์ ๊ตฌ๋งค์์ ์ฃผ๊ฐ ๊ฐ์ ๊ฒ๋ง ์ถ์ถํ๊ธฐ ์ํด mask ์์ฑ
mask_same = (olist_df['seller_state'] == olist_df['customer_state'])
# ํ๋งค์์ ์ฃผ์ ๊ตฌ๋งค์์ ์ฃผ๊ฐ ๊ฐ์ ๋ฐ์ดํฐ
cus_sell_same = olist_df[mask_same].reset_index()
# ํ๋งค์์ ์ฃผ์ ๊ตฌ๋งค์์ ์ฃผ๊ฐ ๋ค๋ฅธ ๋ฐ์ดํฐ
cus_sell_dif = olist_df[~mask_same].reset_index()
cus_sell_same['state'] = '๊ฐ์ ์ฃผ๋ก ๋ฐฐ์ก'
cus_sell_dif['state'] = '๋ค๋ฅธ ์ฃผ๋ก ๋ฐฐ์ก'
cus_sell_df = pd.concat([cus_sell_same, cus_sell_dif])
cus_sell_df = cus_sell_df.reset_index().drop(columns = 'index')
px.histogram(cus_sell_df, x = 'freight_value', color = 'state')
๋ฐ๋ผ์ ๋์ผ ์ง์ญ์ ํ๋งค์๊ฐ ์์ ๊ฒฝ์ฐ ๋ฐฐ์ก๋น๊ฐ ์ฆ๊ฐํ๋ฉฐ ๊ทธ์ ๋ฐ๋ฅธ ๊ณ ๊ฐ์ ์ง์ถ ๊ธ์ก์ด ์ฆ๊ฐํ๊ณ ๋ฐฐ์ก ์๊ฐ์ด ์ฆ๊ฐํ์ฌ ๊ตฌ๋งค์์ ์๋น์ค ๋ง์กฑ๋๊ฐ ๋ฎ์์ง ๊ฒ์ด๋ค.
์ด๋ฅผ ํด๊ฒฐํ๊ธฐ ์ํด ์ง์ญ๋ณ ์ ํธ ์นดํ ๊ณ ๋ฆฌ๋ฅผ ์กฐ์ฌํ์ฌ ์ ํธ ์นดํ ๊ณ ๋ฆฌ์ ํ๋งค์๊ฐ ๋ถ์กฑํ ๊ฒฝ์ฐ ํ๋งค์๋ฅผ ์ ์ ์ํฌ ์ ์๋ ๋ง์ผํ ์ ์งํํ ์ ์๋๋ก ์ง์ญ๊ณผ ์ ํธ ์นดํ ๊ณ ๋ฆฌ๋ฅผ ๋ถ์ํ๊ณ ์ ํ์ผ๋ฉฐ, ์ต์ ์ ๋ฌผ๋ฅ์ผํฐ ์์น๋ฅผ ๋ถ์ํด ๋ฐฐ์ก ์๊ฐ์ ์ต์ํํ ์ ์๋ ๋ฐฉ๋ฒ์ ๊ณ ๋ฏผํ๋ค.
ํ๋งค์๊ฐ ๋ง์ ๋จ๋ถ์ ๋จ๋๋ถ๋ฅผ ์ ์ธํ๊ณ ๋๋จธ์ง ์ง์ญ์ ๋ํ ์ ํธ ์นดํ
๊ณ ๋ฆฌ์ ํ๋งค์ ์๋ฅผ ๊ฐ์ด ์์๋ณด๊ณ ์ ํ๋ค.
๋์ฒด๋ก ์ฃผ๋ฌธ ๊ฑด์์ ๋นํด ํ๋งค์์๋ ๋ฎ๊ฒ ๋ํ๋๋ ๊ฒ์ ํ์ธํ ์ ์๋ค.
๋ฐ๋ผ์ ์ฃผ๋ฌธ ๊ฑด์์ ํ๋งค์ ์๋ฅผ ๊ณ ๋ คํ์ ๋
1. ๋ถ๋๋ถ์์๋ health_beauty ์นดํ
๊ณ ๋ฆฌ ์ํ์ ์ทจ๊ธํ๋ ํ๋งค์ ์ ์
๋ง์ผํ
์ ์งํํ๋ ๊ฒ์ด ๊ฐ์ฅ ํจ๊ณผ์ ์ผ ๊ฒ์ผ๋ก ์์๋๋ค.
2. ์ค์๋ถ๋ health_beauty ์ sports_leisure ์นดํ
๊ณ ๋ฆฌ ์ํ์ ๋ํ ํ๋งค์ ์ ์
๋ง์ผํ
์ ์งํํ๋ ๊ฒ์ด ๊ฐ์ฅ ํจ๊ณผ์ ์ผ ๊ฒ์ผ๋ก ์์๋๋ค.
3. ๋ถ๋ถ์์๋ health_beauty ์นดํ
๊ณ ๋ฆฌ ์ํ์ ์ทจ๊ธํ๋ ํ๋งค์ ์ ์
๋ง์ผํ
์ ์งํํ๋ ๊ฒ์ด ๊ฐ์ฅ ํจ๊ณผ์ ์ผ ๊ฒ์ผ๋ก ์์๋๋ค.
# ํน์ ์ปฌ๋ผ๋ง ์ถ์ถ
geo_seller_customer_df = olist_df[['customer_id', 'seller_id', 'order_approved_at', 'order_delivered_customer_date', 'delivery_date', 'customer_zip_code_prefix', 'seller_zip_code_prefix']]
geo_df = geolocation_df[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']]
geo_df = geo_df.rename(columns = {'geolocation_zip_code_prefix' : 'zip_code_prefix'})
# ์ค๋ณตํ ์ ๊ฑฐ
geo_df = geo_df[~geo_df.duplicated(['zip_code_prefix'])]
# ์ธ๋ฑ์ค ์ด๊ธฐํ
geo_df = geo_df.reset_index().drop(columns = {'index'})
# ๋ฐฐ์ก์ผ์ด 20์ผ ์ด์ ๊ฑธ๋ฆฐ ๊ฐ๋ง ์ถ์ถํ๊ธฐ ์ํด ํํฐ ์์ฑ
mask_delivery = (geo_seller_customer_df['delivery_date'] >= 20) == True
# ๋ฐฐ์ก์ผ์ด 20์ผ ์ด์ ๊ฑธ๋ฆฐ ๋ฐ์ดํฐ์ ๊ทธ๋ ์ง ์์ ๋ฐ์ดํฐ ๋ถ๋ฆฌ
geo_seller_customer_df_20 = geo_seller_customer_df[mask_delivery]
geo_seller_customer_df_0 = geo_seller_customer_df[~mask_delivery]
# geo ๋ฐ์ดํฐ์
๊ณผ joinํ์ฌ ๊ตฌ๋งค์์ ์๋, ๊ฒฝ๋ ๊ฐ ๋ถ๋ฌ์ค๊ธฐ
geo_seller_customer_df_20 = geo_seller_customer_df_20.merge(geo_df, how = 'inner', on = 'customer_zip_code_prefix')
# geo ๋ฐ์ดํฐ์
๊ณผ joinํ์ฌ ํ๋งค์์ ์๋, ๊ฒฝ๋ ๊ฐ ๋ถ๋ฌ์ค๊ธฐ
geo_seller_customer_df_20 = geo_seller_customer_df_20.merge(geo_df, how = 'inner', on = 'seller_zip_code_prefix')
# ์ปฌ๋ผ๋ช
๋ณ๊ฒฝ
geo_seller_customer_df_20 = geo_seller_customer_df_20.rename(columns = {'geolocation_lat_x' : 'customer_lat', 'geolocation_lng_x' : 'customer_lng',
'geolocation_lat_y' : 'seller_lat', 'geolocation_lng_y' : 'seller_lng'})
# ๊ตฌ๋งค์์ ํ๋งค์์ ์์น ์ค์๊ฐ ๊ณ์ฐ
geo_seller_customer_df_20['middle_location_lat'] = (geo_seller_customer_df_20['customer_lat'] + geo_seller_customer_df_20['seller_lat']) / 2
geo_seller_customer_df_20['middle_location_lng'] = (geo_seller_customer_df_20['customer_lng'] + geo_seller_customer_df_20['seller_lng']) / 2
import folium
import json
# ์ง๋ ์์ฑ
map = folium.Map(location=[-15.7801, -47.9292], tiles="cartodbpositron", zoom_start=4)
geo = json.load(open('./brazil_geo.json'))
# FeatureGroups ์์ฑ
customer_group = folium.FeatureGroup(name='Customer Locations')
seller_group = folium.FeatureGroup(name='Seller Locations')
middle_location_group = folium.FeatureGroup(name='Middle Locations')
pax_group = folium.FeatureGroup(name='Pax Hubs')
# ๊ตฌ๋งค์ ์์น ํ์
for index, row in geo_seller_customer_df_20.iterrows():
customer_lat = row['customer_lat']
customer_lng = row['customer_lng']
folium.Circle(location=[customer_lat, customer_lng], radius=1, color = 'rgb(6, 224, 208)').add_to(customer_group)
# ํ๋งค์ ์์น ํ์
for index, row in geo_seller_customer_df_20.iterrows():
seller_lat = row['seller_lat']
seller_lng = row['seller_lng']
folium.Circle(location=[seller_lat, seller_lng], radius=1, color='rgb(255, 0, 255)').add_to(seller_group)
# ๊ตฌ๋งค์์ ํ๋งค์์ ์์น ์ค์๊ฐ ํ์
for index, row in geo_seller_customer_df_20.iterrows():
middle_lat = row['middle_location_lat']
middle_lng = row['middle_location_lng']
folium.Circle(location=[middle_lat, middle_lng], radius=1, color='rgb(0, 0, 128)').add_to(middle_location_group)
# Pax์ ํ๋ธ ์์น ๋ง์ปค๋ก ํ์
for index, row in pax_hub_df.iterrows():
pax_lat = row['pax_hub_lat']
pax_lng = row['pax_hub_lng']
folium.Marker(location=[pax_lat, pax_lng], icon=folium.Icon(color='beige')).add_to(pax_group)
# ๊ฐ ์ฃผ๋ณ ๊ฒฝ๊ณ์ ํ์
folium.Choropleth(geo_data = geo, line_color='black', line_opacity = 1, fill_opacity=0).add_to(map)
customer_group.add_to(map)
seller_group.add_to(map)
middle_location_group.add_to(map)
pax_group.add_to(map)
# ๋ ์ด์ด ์ปจํธ๋กค์ ๋งต์ ๋ถ์ฐฉ
folium.LayerControl().add_to(map)
map
๋ฐฐ์ก์ด 20์ผ ์ด์ ๊ฑธ๋ฆฐ ๊ตฌ๋งค์์ ํ๋งค์์ ์์น ๋ถํฌ๋ฅผ ์ดํด๋ณด๋ฉด ์์ ๊ฐ๋ค.
๊ทธ์ ๋ฐ๋ฅธ ์์น ์ค์๊ฐ์ ์์ ๊ฐ๋ค.
๊ฐ๊ฐ์ ๋จ๋ถ์ ๋จ๋๋ถ๋ฅผ ์ ์ธํ ๋๋จธ์ง ์ง์ญ์์ ํ๋ํด๋ณด๋ฉด
ํด๋น ์ฃผ์์ ์ค์๊ฐ์ด ๋ง์ด ๋ถํฌํด ์๋ ๊ฒ์ ํ์ธํ ์ ์๋ค.
์ฃผํฉ์ ๋ง์ปค๋ ์ค์ Pax ํ์ฌ์ ๋ฌผ๋ฅ ์ผํฐ์ ์์น์ด๊ณ ๊ฐ์ ์ง์ญ์์ ๋น๊ตํ์ ๋ ๋นจ๊ฐ ์ ๋ถ๋ถ์์ ์ ์ฌํ๋ค๋ ๊ฒ์ ํ์ธํ๋ค.
1. ๋ถ๋๋ถ์์๋ health_beauty ์นดํ ๊ณ ๋ฆฌ ์ํ์ ์ทจ๊ธํ๋ ํ๋งค์ ์ ์
๋ถ๋๋ถ์ ๋ฐ์ด์ ์ฃผ์ ์๋ ๋นํ ๋ฆฌ์๋ค ์ฝฉ๊ธฐ์คํ์ ํ๋ธ ์์น ์ ์
2. ์ค์๋ถ์์๋ bed_bath_table, sports_leisure ์นดํ ๊ณ ๋ฆฌ ์ํ์ ์ทจ๊ธํ๋ ํ๋งค์ ์ ์
์ค์๋ถ์ ๋งํ ๊ทธ๋ก์ ์ฃผ์ ์๋ ์๋ ธํ์ ํ๋ธ ์์น ์ ์
3. ๋ถ๋ถ์์๋ health_beauty ์นดํ ๊ณ ๋ฆฌ ์ํ์ ์ทจ๊ธํ๋ ํ๋งค์ ์ ์
๋ถ๋ถ์ ํ ์นธ์นญ์ค ์ฃผ์ ์๋ ํ๋ผ๋์ ํ๋ธ ์์น ์ ์
๊ต์ฅํ ์๋ฏธ์๋ ๊ฒฐ๊ณผ๊ฐ ๋์์ ๋ง์กฑ์ค๋ฌ์ด ํ๋ก์ ํธ์๋ค.
์ค์ EDA๋ฅผ ํตํด ๋ฐ์ดํฐ์ ํน์ง์ ์์๋ณด๊ณ ๊ทธ๊ฒ์ ๊น์ด ๋ถ์ํด ์ค์ ๋ก ์ ์ํ ์ ์๋ ๊ฒฐ๊ณผ๊น์ง ๋์ถํ๋ค๋ ์ ์์ ํ๋ก์ ํธ ์งํ ๊ณผ์ ๋ ๋งค๋๋ฝ๊ฒ ํ๋ฌ๊ฐ๋ ๊ฒ ๊ฐ๋ค.
๋ง์ด ์ฑ์ฅํ ์ ์๋ ๊ธฐํ์๊ณ , ๋ ์ด๋ฒ ํ๋ก์ ํธ๋ฅผ ํ๋ฉด์ ๋ ์์ ์ ์ฑ์ฐฐํ ์ ์๋ ๊ธฐํ๋ ๋ ๊ฒ๊ฐ๋ค.
๋ ์์ ์ ๋ํด ์ฑ์ฐฐํ ๋ถ๋ถ์ ์์ ํ์๋ฉด
- ํ๋์ ๋ฐ์ดํฐ๋ ๋ณด๋ ๊ด์ ์ ๋ฐ๋ผ ๋ค๋ฅด๊ฒ ํด์๋ ์ ์๋ค. ์์ผ๋ฅผ ๋ํ์ ํ์ธํด์ผ๋๋ค. (์ด๋ฏธ ๋์ด๋ ๋ ๋ํ์ผ ๋๋ค.)
- ๋ฐ์ดํฐ ์๊ฐํ๋ ๋ณด๋ ์ฌ๋์ ๋ง์ถฐ์ ์งํํด์ผ๋๋ค. ํ๋กฏ์ ์ ํํ๋ ๊ฒ ๋ฟ๋ง ์๋๋ผ ์ธ์ธํ ๋จ์๊น์ง ๋ญ ์๋ฏธํ๋ ์ง ๋ช ํํ๊ฒ ๋ณด์ฌ์ฃผ์ด์ผ ํ๊ธฐ ๋๋ฌธ์ ๊ผผ๊ผผํ๊ฒ ํ์ธํด์ผ๋๋ค.
- ๋ฐ์ดํฐ ์๊ฐํ๋ ์ฌ๋ฌ ํด์ ์ฌ์ฉํด ๋ณด๋ ๊ฒ์ด ์ข๋ค. ๊ฐ๊ฐ์ ํด๋ง๋ค ์ฅ์ ์ด ์๊ณ ์ ๋ฌ๋ ฅ๋ ๋ฐ์ดํฐ๋ง๋ค ํด๋ง๋ค ๋ฌ๋ผ์ง๋ค.(์ด์ ์๋ ์๊ฐํ๋ฅผ ํ ๋ python๋ง ๊ณ ์งํ๋ค..)
- ์ ๋ง ๋ฐ์ดํฐ๋ ๋ด ์ ๋ง์ ๋ง๊ฒ ์กด์ฌํ์ง ์๋๋ค.
- ํ ํ๋์์๋ ํ๋ ฅ์ด ์ ์ผ ์ค์ํ ๊ฒ ๊ฐ๋ค. ํ๋ ฅ์ ํ๊ฒ ๋๋ฉด ๋ด๊ฐ ๋ณด์ง ๋ชปํ๋ ๊ฒ์ ํ์์ด ๋ณด์ฌ์ฃผ๊ณ ํ์์ด ๋ณด์ง ๋ชปํ๋ ๊ฒ์ ๋ด๊ฐ ์๋ ค์ค ์ ์๋ค. ๊ทธ์ ๋ฐ๋ผ ํ๋ฆฌํฐ๋ ๋์์ง๊ฒ ๋์ด์๋ค.
์๋ ํ์ธ์! ์ข์ ์ ๋ณด ๊ฐ์ฌํฉ๋๋ค. ๋ค๋ฆ์ด ์๋๋ผ ๋ฐ์ดํฐ๋ถํธ์บ ํ์์ ํ๋ก์ ํธ๋ฅผ ์งํ ์ค์ธ๋ฐ ๋ฌผ๋ฅ์ผํฐ ๊ด๋ จ ์ ๋ณด๋ฅผ ์ฐพ์ ์ ์์ด์ ์ง๋ฌธ ๋จ๊น๋๋ค. ํน์ ๊ด์ฐฎ์ผ์๋ค๋ฉด ์ด๋์ Pax ๋ฌผ๋ฅ์ผํฐ์ ์ ๋ณด๋ฅผ ์ฐพ์ผ์ จ๋์ง ์ฌ์ญค๋ด๋ ๊ด์ฐฎ์๊น์?