๐Ÿ“š [Python] ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ

์ „์ฃผ์€ยท2023๋…„ 2์›” 22์ผ
0
post-thumbnail

Pandas

  • ๋ฐ์ดํ„ฐ ์ž‘์—…์„ ์‰ฝ๊ณ  ์ง๊ด€์ ์œผ๋กœ ํ•  ์ˆ˜ ์žˆ๋„๋ก ์„ค๊ณ„๋œ ๋น ๋ฅด๊ณ  ์œ ์—ฐํ•œ ๋ฐ์ดํ„ฐ ๊ตฌ์กฐ๋ฅผ ์ œ๊ณตํ•˜๋Š” ๋ชจ๋“ˆ

๊ธฐ๋ณธ๊ธฐ๋Šฅ

์„ค์น˜ ๋ฐ import

!pip install Pandas
import pandas as pd

Series์™€ DateFrame

  • 1์ฐจ์›, 1๊ฐœ์˜ column ์€ Series๋ผ๊ณ  ๋ถ€๋ฆ„
  • ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„: ๊ฐ€๋กœ์ถ•, ์„ธ๋กœ์ถ•์ด ์žˆ๋Š” 2์ฐจ์›์˜ ๋‹ค์–‘ํ•œ ํƒ€์ž… ๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•˜๋Š” ์ž๋ฃŒ๊ตฌ์กฐ
# ๋ฆฌ์ŠคํŠธ๋ฅผ ์‹œ๋ฆฌ์ฆˆ๋กœ
pd.Series([1, 2, 3, 4]) 
# 2์ฐจ์› ๋ฆฌ์ŠคํŠธ ์‹œ๋ฆฌ์ฆˆ ๊ฐ€๋Šฅ!
product1 = [['์‚ผ์„ฑ',900000, '๊ฐค๋Ÿญ์‹œ'],['์• ํ”Œ',1400000, '์•„์ดํฐ14'],['์—˜์ง€',1500000,'์—˜์ง€๊ทธ๋žจ']] 
# ์‹œ๋ฆฌ์ฆˆ๋ฅผ dataframe์œผ๋กœ 
pd.DataFrame(product1)

๋ฐ์ดํ„ฐ ๊ฐ€์ ธ์˜ค๊ธฐ

# ์ฝ”๋žฉ์— ๋„ฃ์€ ๋ฐ์ดํ„ฐ
pd.read_csv('korean-idol.csv')
# ์™ธ๋ถ€ ๋ฐ์ดํ„ฐ
pd.read_csv('https://bit.ly/ds-house-price')
# ๊ตฌ๊ธ€ ๋“œ๋ผ์ด๋ธŒ์— ์žˆ๋Š” ๋ฐ์ดํ„ฐ
from google.colab import drive 
drive.mount('/content/drive')
pd.read_csv('ํŒŒ์ผ๊ฒฝ๋กœ')
# ์—‘์…€ํŒŒ์ผ ๊ฐ€์ ธ์˜ค๊ธฐ
pd.read_excel('ํŒŒ์ผ๊ฒฝ๋กœ')

๋ฐ์ดํ„ฐ ์ •๋ณด ํ™•์ธํ•˜๊ธฐ

# ๊ธฐ๋ณธ๋•์ธ ํ–‰์˜ ์ •๋ณด์™€ ๋ฐ์ดํ„ฐ ํƒ€์ž…์„ ํ™•์ธ
df.info()
# ์ปฌ๋Ÿผ(์—ด) ํ™•์ธ
df.columns
# ์ปฌ๋Ÿผ์ด๋ฆ„ ๋ณ€๊ฒฝ
df.columns = ๋ฆฌ์ŠคํŠธ
# ํ†ต๊ณ„์ •๋ณด ํ™•์ธ - ๊ฐœ์ˆ˜, ํ‰๊ท , ํ‘œ์ค€ํŽธ์ฐจ, ์ตœ์†Œ, ์‚ฌ๋ถ„์œ„๊ฐ’, ์ตœ๋Œ€
df.describe()
# ์˜ค๋ธŒ์ ํŠธ ํฌํ•จ์‹œ - ๊ฐœ์ˆ˜, uniqueํ•œ ๊ฐ’ ๊ฐœ์ˆ˜, ๋นˆ๋„์ˆ˜ ๋†’์€๊ฒƒ, ๊ทธ ๊ฐœ์ˆ˜
df.describe(include=object)
# ์›ํ•˜๋Š” ๊ฐœ์ˆ˜์˜ ๋ฐ์ดํ„ฐ ๋ณด๊ธฐ
df.head() # ์ƒ์œ„ nav 5๊ฐœ์˜ row ์ถœ๋ ฅ
df.head(n) # ์ƒ์œ„ nav m๊ฐœ์˜๋ฐ์ดํ„ฐ ์ถœ๋ ฅ
df.tail(n) # ํ•˜์œ„ nav m๊ฐœ์˜๋ฐ์ดํ„ฐ ์ถœ๋ ฅ

df['ํ‚ค'].sum()
df['ํ‚ค'].count()
df['ํ‚ค'].mean()
df['ํ‚ค'].max()
df['ํ‚ค'].var()
df['ํ‚ค'].std() 

์ •๋ ฌํ•˜๊ธฐ

# ์ธ๋ฑ์Šค๋กœ ๋‚ด๋ฆผ์ฐจ์ˆœ ์ •๋ ฌ
df.sort_index(ascending=False)
# ํŠน์ • ์ปฌ๋Ÿผ ์˜ค๋ฆ„์ฐจ์ˆœ ์ •๋ ฌ
df.sort_values(by='์ปฌ๋Ÿผ๋ช…')
# ํŠน์ • ์ปฌ๋Ÿผ ๋‚ด๋ฆผ์ฐจ์ˆœ ์ •๋ ฌ
df.sort_values(by='์ปฌ๋Ÿผ๋ช…',ascending=False)
# NaN์„ ๊ฐ€์žฅ ์œ„๋กœ
df.sort_values(by='์ปฌ๋Ÿผ๋ช…', na_position='first')
# ๋‘๋ฒˆ ์ •๋ ฌ
df.sort_values(by=['์ปฌ๋Ÿผ๋ช…1', '์ปฌ๋Ÿผ๋ช…2'], ascending=[False,True],  na_position='first')

์ปฌ๋Ÿผ์œผ๋กœ ๋ฐ์ดํ„ฐ ๋‹ค๋ฃจ๊ธฐ

์ปฌ๋Ÿผ์œผ๋กœ ํ–‰ ๋ฒ”์œ„ ์„ ํƒํ•˜๊ธฐ

df.loc[:,์ปฌ๋Ÿผ๋ฆฌ์ŠคํŠธ]
df.loc[3:8,['name','height']]
# ์ธ๋ฑ์Šค ํฌํ•จ
df.iloc[1:4,0:2]

Boolean indexing

df['height']>=180 # ๊ฒฐ๊ณผ๊ฐ€ True False๋กœ ๋‚˜์˜ด
df[df['height']>=180] # True์ธ ๊ฒƒ๋งŒ ์ถœ๋ ฅ
df[df['height']>=180][['name','gender','height']] # ๊ทธ์ค‘์— ์›ํ•˜๋Š” ์ปฌ๋Ÿผ๋งŒ ์ถœ๋ ฅ ๊ฐ€๋Šฅ

isin ํ™œ์šฉํ•˜๊ธฐ

  • isin์€ ์กฐ๊ฑด์„ ๊ฑธ๊ณ ์ž ํ•˜๋Š” ๊ฐ’์ด ์ •์˜ํ•œ list์— ์žˆ์„ ๋•Œ๋งŒ ์ƒ‰์ธํ•˜๋ ค๋Š” ๊ฒฝ์šฐ์— ์‚ฌ์šฉ
company = ['ํ”Œ๋ ˆ๋”์Šค','SM'] # ์—ฌ๊ธฐ์— ํฌํ•จ๋˜๋ฉด True
df['company'].isin(company) # True False๋กœ ๋‚˜์˜ด
df[df['company'].isin(company)] # True์ธ ๊ฒƒ๋งŒ ์ถœ๋ ฅ
df.loc[df['company '].isin(company),['name','company ']] # ๊ทธ ์ค‘์— ์›ํ•˜๋Š” ์ปฌ๋Ÿฌ๋งŒ ์ถœ๋ ฅ ๊ฐ€๋Šฅ

๊ฒฐ์ธก๊ฐ’๊ณผ Null๊ฐ’ ์ฒ˜๋ฆฌํ•˜๊ธฐ

  • NaN์€ ๋น„์–ด์žˆ๋Š” ๊ฐ’ ๋˜๋Š” ๊ฒฐ์ธก๊ฐ’์ด๋ผ๊ณ  ๋ถ€๋ฆ„
  • pandas์—์„œ๋Š” null์„ NaN(Not a Number)์œผ๋กœ ํ‘œ๊ธฐ๋จ
df.isna() # Nan ์—ฌ๋ถ€ True False DataFrame์œผ๋กœ ์ถœ๋ ฅ
df.isnull() # Null ์—ฌ๋ถ€
df[df['group'].isnull()] # True์ธ ๊ฒƒ๋งŒ ์ถœ๋ ฅ
df[df['group'].isnull()]['name']
df['name'][df['group'].isnull()] # ์ˆœ์„œ ๋ฐ”๊ฟ”๋„ ์ƒ๊ด€์—†์Œ
df['name'][df['group'].notnull()] # notnull๋„ ์žˆ์Œ

๊ฒฐ์ธก๊ฐ’ ์ฒ˜๋ฆฌํ•˜๊ธฐ

# fillna(): ๊ฒฐ์ธก๊ฐ’์„ ์ฑ„์›Œ์ฃผ๋Š” ํ•จ์ˆ˜
df['ํ‚ค'].fillna(-1) # ๋ฏธ๋ฆฌ๋ณด๊ธฐ
df2['ํ‚ค'].fillna(-1, inplace=True ) # ๋ฐ”๊พธ๋Š” ๊ฒƒ๊นŒ์ง€
height = df2['ํ‚ค'].mean()
df2['ํ‚ค'] = df2['ํ‚ค'].fillna(height)

# ๊ฒฐ์ธก๊ฐ’์ด ์žˆ๋Š” ํ–‰์„ ์ œ๊ฑฐ
df.dropna() # ๊ธฐ๋ณธ๊ฐ’์€ ํ–‰์„ ์ œ๊ฑฐ #df.dropna(axis=0)

#๊ฒฐ์ธก๊ฐ’์ด ์žˆ๋Š” ํ–‰์„ ์ œ๊ฑฐ
df.dropna(axis=1) #๊ธฐ๋ณธ๊ฐ’์€ ํ–‰์„ ์ œ๊ฑฐ, ๊ฒฐ์ธก๊ฐ’์ด ํ•œ๊ฐœ๋ผ๋„ ์žˆ์„ ๊ฒฝ์šฐ ์‚ญ์ œ df.dropna(axis=0,how='any')
# ํ–‰์˜ ๋ฐ์ดํ„ฐ๊ฐ€ ๋ชจ๋‘ Nan์ธ ๊ฒฝ์šฐ๋งŒ ์‚ญ์ œ
df.dropna(axis=0,how='all') 

๋ฐ์ดํ„ฐ ๋ณต์‚ฌํ•˜๊ธฐ

copy_df = df.copy()

row, column ์ถ”๊ฐ€ ๋ฐ ์‚ญ์ œํ•˜๊ธฐ

row ์ถ”๊ฐ€ํ•˜๊ธฐ

  • dict ํ˜•ํƒœ์˜ ๋ฐ์ดํ„ฐ๋ฅผ ๋งŒ๋“ค์–ด์ฃผ๊ณ  append()ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ์ถ”๊ฐ€
  • ๋ฐ˜๋“œ์‹œ ignore_index=True ์˜ต์…˜์„ ์ถ”๊ฐ€ํ•ด์•ผ ์—๋Ÿฌ๊ฐ€ ๋‚˜์ง€ ์•Š์Œ
df =df.append({'์ด๋ฆ„':'๊น€์‚ฌ๊ณผ', '๊ทธ๋ฃน':'์• ํ”Œ', '์†Œ์†์‚ฌ':'apple', '์„ฑ๋ณ„':'์—ฌ์ž', '์ƒ๋…„์›”์ผ':'2000-01-01','ํ‚ค':160, 'ํ˜ˆ์•กํ˜•':'A', '๋ธŒ๋žœ๋“œํ‰ํŒ์ง€์ˆ˜6':1234567},ignore_index=True)

column ์ถ”๊ฐ€ํ•˜๊ธฐ

df['๊ตญ์ '] ='๋Œ€ํ•œ๋ฏผ๊ทน'

row ์ œ๊ฑฐํ•˜๊ธฐ

df.drop(14, axis=0)
df.drop([1,10,14], axis=0)

column ์ œ๊ฑฐํ•˜๊ธฐ

df.drop('๊ทธ๋ฃน',axis=1)

๊ทธ๋ฃน์œผ๋กœ ๋ฌถ๊ธฐ

  • groupby ๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ๊ทธ๋ฃน์œผ๋กœ ๋ฌถ์–ด ๋ถ„์„ํ•  ๋•Œ ์‚ฌ์šฉ
df.groupby('์†Œ์†์‚ฌ') 
df.groupby('์†Œ์†์‚ฌ').count() # ์†Œ์†์‚ฌ๋ณ„๋กœ ๊ฐœ์ˆ˜๋ฅผ ์„ธ์„œ DataFrame์œผ๋กœ ์ถœ๋ ฅ
df.groupby('์†Œ์†์‚ฌ').mean()
df.groupby('์†Œ์†์‚ฌ').sum()

์ค‘๋ณต๊ฐ’ ์ œ๊ฑฐํ•˜๊ธฐ

# drop_duplicates(): ์ค‘๋ณต๋œ ๋ฐ์ดํ„ฐ๋ฅผ ์ œ๊ฑฐ(keep='first')
df['ํ˜ˆ์•กํ˜•'].drop_duplicates()
df['ํ˜ˆ์•กํ˜•'].drop_duplicates(keep='last') # ๋งˆ์ง€๋ง‰ index๋กœ ๋‚จ๊ฒจ๋‘”๋‹ค!

๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ ๋ณ‘ํ•ฉํ•˜๊ธฐ

concat(): ๋ฐ์ดํ„ฐ๋ฅผ ํ•ฉ์นจ. sort=Falsse ์˜ต์…˜์œผ๋กœ ์ˆœ์„œ๊ฐ€ ์œ ์ง€๋˜๋„๋ก ํ•จ

  • ์ปฌ๋Ÿผ์ด ๊ฐ™์•„ ๊ฒฝ์šฐ ์œ„์•„๋ž˜๋กœ ํ•ฉ์น ๋•Œ
pd.concat([df1, df_copy],sort=False)
# reset_index(): index๋ฅผ ์ƒˆ๋กญ๊ฒŒ ์ ์šฉ, drop=True ๊ธฐ์กด index๋ฅผ ์‚ญ์ œ
df_concat.reset_index(drop=True)
  • ์ƒˆ๋กœ์šด ์ปฌ๋Ÿผ์œผ๋กœ ์ขŒ์šฐ๋กœ ํ•ฉ์น  ๋•Œ
pd.concat([df1,df2], axis=1)
# index๊ธฐ์ค€์œผ๋กœ ํ•ฉ์ณ์ง„๋‹ค! -> ํ•ด๋‹น์ธ๋ฑ์Šค๊ฐ€ ์—†์„ ๋•Œ NaN์œผ๋กœ ๋ณ€ํ™˜๋˜ ํ•ฉ์ณ์ง

merge(): ํŠน์ • ๊ณ ์œ ํ•œ ํ‚ค ๊ฐ’์„ ๊ธฐ์ค€์œผ๋กœ ๋ณ‘ํ•ฉ (sql join๊ณผ ๊ฐ™์Œ)

  • merge(๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„1, ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„2, on='unique id', how='๋ณ‘ํ•ฉ์˜ ๊ธฐ์ค€')
  • ๋ณ‘ํ•ฉ์˜ ๊ธฐ์ค€: left, right, inner, outer
pd.merge(df1, df_right, on='์ด๋ฆ„', how='left') #df1์„ ๊ธฐ์ค€์œผ๋กœ ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ
pd.merge(df1, df_right, on='์ด๋ฆ„', how='inner') # ๋ฐ์ดํ„ฐ๋ฅผ ๋‘˜๋‹ค ๊ฐ–๊ณ ์žˆ์„๋•Œ๋กœ !!     pd.merge(df1, df_right, on='์ด๋ฆ„', how='right')
pd.merge(df1, df_right, on='์ด๋ฆ„', how='outer')

Series์˜ ํƒ€์ž… ๋ณ€ํ™˜ํ•˜๊ธฐ

astype(): ๋ฐ์ดํ„ฐ์˜ ํƒ€์ž…์„ ๋ณ€ํ™˜

df['ํ‚ค'].astype(int)

๋‚ ์งœ ๋ณ€ํ™˜ํ•˜๊ธฐ

to_datetime(): object ํƒ€์ž…์—์„œ datetimeํƒ€์ž…์œผ๋กœ ๋ณ€ํ™˜

df['์ƒ๋…„์›”์ผ'] = pd.to_datetime(df['์ƒ๋…„์›”์ผ'])

๋‹ฌ, ์ผ, ์‹œ๊ฐ„, ์ฃผ, ์š”์ผ

df['์ƒ๋…„์›”์ผ'].dt.month
df['์ƒ๋…„์›”์ผ'].dt.day
df['์ƒ๋…„์›”์ผ'].dt.hour
df['์ƒ๋…„์›”์ผ'].dt.week
df['์ƒ๋…„์›์ผ'].dt.dayofweek #์š”์ผ: 0(์›”)
df['์ƒ๋…„์›”์ผ'].dt.date         # YYYY-MM-DD(๋ฌธ์ž)
df['์ƒ๋…„์›”์ผ'].dt.isocalendar().week # 1๋…„์˜ ๋ช‡๋ฒˆ์งธ ์ฃผ์ฐจ์ธ์ง€

# ํŒŒ์ƒ๋ณ€์ˆ˜ ๋งŒ๋“ค์ž!!
df['์ถœ์ƒ๋…„๋„'] = df['์ƒ๋…„์›์ผ'].dt.year
df['์ถœ์ƒ๋…„๋„'] = df['์ƒ๋…„์›์ผ'].dt.month
df['์ถœ์ƒ๋…„๋„'] = df['์ƒ๋…„์›์ผ'].dt.day

apply

  • Series๋‚˜ DateFrame์— ์ข€ ๋” ๊ตฌ์ฒด์ ์ธ ๋กœ์ง์„ ์ ์šฉํ•˜๊ณ  ์‹ถ์€ ๊ฒฝ์šฐ์— ์‚ฌ์šฉ
  • apply ์ ์šฉํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” ๋ณ„๋„์˜ ํ•จ์ˆ˜๊ฐ€ ๋จผ์ € ์ •์˜๋˜์•ผ ํ•จ
  • ํ•จ์ˆ˜๋ฅผ ๋งค๊ฐœ๋ณ€์ˆ˜๋กœ ๋„˜๊ฒจ์คŒ
df.loc[df['์„ฑ๋ณ„']=='๋‚จ์ž', '์„ฑ๋ณ„'] = 1
df.loc[df['์„ฑ๋ณ„']=='์—ฌ์ž', '์„ฑ๋ณ„'] = 0

# ์ด๊ฒƒ์„ ํ•จ์ˆ˜๋กœ ๋งŒ๋“ค์–ด apply ์ ์šฉ๊ฐ€๋Šฅ
def male_or_female(x):
  if x == '๋‚จ์ž':
    return 1
  elif x == '์—ฌ์ž':
    return 0
df['์„ฑ๋ณ„'].apply(male_or_female)

# ํŒŒ์ด์ฌ์€ ํ•จ์ˆ˜๋ฅผ ์ •์˜ํ•˜๋ฉด ๋ฉ”๋ชจ๋ฆฌ์— ๊ณ„์† ๋‚จ์•„์žˆ๊ธฐ๋•Œ๋ฌธ์— ์ผํšŒ์šฉ์€ ๋žŒ๋‹ค์‹์„ ์จ์ค€๋‹ค!
df['์„ฑ๋ณ„'].apply(lambda x:1 if x=='๋‚จ์ž' else 0)

map

map_gender = {'๋‚จ์ž':1,'์—ฌ์ž':0} # ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ํ†ตํ•ด ์ ์šฉ => map
df['์„ฑ๋ณ„'].map(map_gender) # ๋ฏธ๋ฆฌ๋ณด๊ธฐ
df['NEW์„ฑ๋ณ„']= df['์„ฑ๋ณ„'].map(map_gender) #์ ์šฉ

๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„์˜ ์‚ฐ์ˆ ์—ฐ์‚ฐ

df= pd.DataFrame({
    'ํŒŒ์ด์ฌ':[60, 70, 80, 85, 75],
    '์ž๋ฃŒ๊ตฌ์กฐ':[40, 50, 45, 70, 55],
    '๋ฐ์ดํ„ฐ๋ถ„์„':[80, 75, 100, 90, 95]
})
df['ํŒŒ์ด์ฌ'] + df['์ž๋ฃŒ๊ตฌ์กฐ'] + df['๋ฐ์ดํ„ฐ๋ถ„์„']
df['ํŒŒ์ด์ฌ'] - df['์ž๋ฃŒ๊ตฌ์กฐ']
df['ํŒŒ์ด์ฌ'] * df['์ž๋ฃŒ๊ตฌ์กฐ']
df['ํŒŒ์ด์ฌ'] / df['์ž๋ฃŒ๊ตฌ์กฐ']
df['์ด์ '] = df['ํŒŒ์ด์ฌ'] + df['์ž๋ฃŒ๊ตฌ์กฐ'] + df['๋ฐ์ดํ„ฐ๋ถ„์„']
df['ํ‰๊ท '] = df['์ด์ '] /3
# ํ–‰์˜ ๊ฐœ์ˆ˜๊ฐ€ ๋‹ค๋ฅผ ๊ฒฝ์šฐ ๋น ์ง„ ๋ฐ์ดํ„ฐ๋ฅผ Nan์œผ๋กœ ์ทจ๊ธ‰
# ์ˆซ์ž๋ผ๋ฆฌ๋งŒ ์—ฐ์‚ฐ๊ฐ€๋Šฅ!

์› ํ•ซ ์ธ์ฝ”๋”ฉ(One-Hot-Encoding) โœจโœจโœจโœจ

  • ๋”๋ฏธ ๋ณ€์ˆ˜
  • ์› ํ•ซ ์ธ์ฝ”๋”ฉ์€ ํ•œ๊ฐœ์˜ ์š”์†Œ๋Š” True, ๋‚˜๋จธ์ง€ ์š”์†Œ๋Š” Falses๋กœ ๋งŒ๋“ค์–ด ์นดํ…Œ๊ณ ๋ฆฌํ˜•์œผ๋กœ ํ‘œํ˜„ํ•˜๋Š” ๊ธฐ์ˆ 
  • df['ํ˜ˆ์•กํ˜•_code']๋ฅผ ๋จธ์‹ ๋Ÿฌ๋‹ ์•Œ๊ณ ๋ฆฌ์ฆ˜์— ๋„ฃ์–ด ๋ฐ์ดํ„ฐ๋ฅผ ์˜ˆ์ธกํ•˜๋ ค๊ณ  ์ง€์‹œ๋ฅผ ํ•˜๋ฉด ์ปดํ“จํ„ฐ๋Š” ๊ฐ’๋“ค๊ฐ„์˜ ๊ด€๊ผ๋ฅผ ์Šค์Šค๋กœ ํ˜•์„ฑํ•˜๊ฒŒ ๋จ
  • ๋งŒ์•ฝ Bํ˜•์€ 1, ABํ˜•์€ 2๋ผ๋Š” ๊ฐ’์„ ๊ฐ€์ง€๊ณ  ์žˆ์„ ๋•Œ, ์ปดํ“จํ„ฐ๋Š” 'Bํ˜•' +'ABํ˜•' = 'Oํ˜•' ๋ผ๊ณ  ์ด์ƒํ•œ ๊ด€๊ณ„๋ฅผ ๋งบ์„ ์ˆ˜ ์žˆ๊ฒŒ ๋จ
  • ๋ณ„๋„์˜ column์„ ํ˜•์„ฑํ•ด์ฃผ๊ณ  1๊ฐœ์˜ column์—๋Š” True, ๋‚˜๋จธ์ง€๋Š” False๋ผ๊ณ  ๋„ฃ์–ด์คŒ์œผ๋กœ 'A,B,AB,O'ํ˜•์˜ ๊ด€๊ณ„๋Š” ๋…๋ฆฝ์ ์ด๋‹ค๋ผ๋Š” ์นดํ…Œ๊ณ ๋ฆฌ๋กœ ํ‘œํ˜„ํ•ด์ฃผ๋Š” ๋ฐฉ์‹ -> ์›ํ•ซ ์ธ์ฝ”๋”ฉ
blood_map = {'A':0, 'B':1,'AB':2,'O':3}
df['ํ˜ˆ์•กํ˜•_code'] = df['ํ˜ˆ์•กํ˜•'].map(blood_map)
df['ํ˜ˆ์•กํ˜•_code'].value_counts()
pd.get_dummies(df['ํ˜ˆ์•กํ˜•_code'])

๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์˜ˆ์‹œ

df = pd.read_csv('https://bit.ly/ds-house-price')
df.info()
# ๋ถ„์–‘๊ฐ€๊ฒฉ์˜ dtype์„ int64๋กœ ๋ณ€๊ฒฝํ•˜๊ธฐ
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: '  ' 
# '  '๊ณต๋ฐฑ์ด ์กด์žฌํ•˜๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธ
df.loc[df['๋ถ„์–‘๊ฐ€๊ฒฉ']=='  ']
# stript()๋ฅผ ์‚ฌ์šฉํ•ด์„œ ๊ณต๋ฐฑ์ด ์žˆ๋Š” ๋ฐ์ดํ„ฐ์˜ ํ•ด๋‹น ์•ž๋’ค ๊ณต๋ฐฑ์„ ์‚ญ์ œ
df['๋ถ„์–‘๊ฐ€๊ฒฉ']= df['๋ถ„์–‘๊ฐ€๊ฒฉ'].str.strip()
#int๋กœ ๋ฐ”๊พธ๊ธฐ ๋‹ค์‹œ์‹œ๋„
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: ''
df.loc[df['๋ถ„์–‘๊ฐ€๊ฒฉ']=='']
# ๋ฐ์ดํ„ฐ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ๋ฅผ 0์œผ๋กœ ๋ณ€๊ฒฝ
df.loc[df['๋ถ„์–‘๊ฐ€๊ฒฉ']=='','๋ถ„์–‘๊ฐ€๊ฒฉ'] = 0
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int) # ValueError: cannot convert float NaN to integer
# NaN์„ 0์œผ๋กœ ๋ฐ˜ํ™˜
df['๋ถ„์–‘๊ฐ€๊ฒฉ']=df['๋ถ„์–‘๊ฐ€๊ฒฉ'].fillna(0)
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: '6,657' : 1000์›๋‹จ์œ„
# ์ฝค๋งˆ๋ฅผ ์ œ๊ฑฐ
df['๋ถ„์–‘๊ฐ€๊ฒฉ'] = df['๋ถ„์–‘๊ฐ€๊ฒฉ'].str.replace(',','')
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int) # ValueError: cannot convert float NaN to integer
df['๋ถ„์–‘๊ฐ€๊ฒฉ']=df['๋ถ„์–‘๊ฐ€๊ฒฉ'].fillna(0)
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int)  # ValueError: invalid literal for int() with base 10: '-'
# - ๋ฅผ ์ œ๊ฑฐ
df['๋ถ„์–‘๊ฐ€๊ฒฉ'] = df['๋ถ„์–‘๊ฐ€๊ฒฉ'].str.replace('-','')
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int)  # ValueError: cannot convert float NaN to integer
df['๋ถ„์–‘๊ฐ€๊ฒฉ']=df['๋ถ„์–‘๊ฐ€๊ฒฉ'].fillna(0)
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int)
# ๋ฐ์ดํ„ฐ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ๋ฅผ 0์œผ๋กœ ๋ณ€๊ฒฝ
df.loc[df['๋ถ„์–‘๊ฐ€๊ฒฉ']=='','๋ถ„์–‘๊ฐ€๊ฒฉ'] = 0
df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int)
df.info()
df['๋ถ„์–‘๊ฐ€๊ฒฉ'] = df['๋ถ„์–‘๊ฐ€๊ฒฉ'].astype(int) # ๊ฐ’์„ ์ €์žฅํ•ด์•ผ ๋ฐ”๋€๋‹ค!
# ๊ทœ๋ชจ๋ถ€๋ถ„ column์— ๋ถˆํ•„์š”ํ•œ '์ „์šฉ๋ฉด์ ' ๊ธ€์ž๋ฅผ ์‚ญ์ œ
df['๊ทœ๋ชจ๊ตฌ๋ถ„'] = df['๊ทœ๋ชจ๊ตฌ๋ถ„'].str.replace('์ „์šฉ๋ฉด์ ','')
df['๊ทœ๋ชจ๊ตฌ๋ถ„'].value_counts()
# ์ง€์—ญ๋ช…๋ณ„๋กœ ํ‰๊ท  ๋ถ„์–‘๊ฐ€๊ฒฉ์„ ํ™•์ธ
df.groupby('์ง€์—ญ๋ช…')['๋ถ„์–‘๊ฐ€๊ฒฉ'].mean()
# ์ž‘์—…ํŒŒ์ผ ์ €์žฅ
df.to_csv('ds-house-price-clean.csv', index=False)
profile
๋” ๋ฉ€๋ฆฌ ๋ณด๋„๋ก ํ•ด์š”. ์ง„์ฃผ๋Š” ํ•˜๋ฃจ์•„์นจ์— ์ƒ๊ฒจ๋‚˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.

0๊ฐœ์˜ ๋Œ“๊ธ€