Pandas
- ๋ฐ์ดํฐ ์์
์ ์ฝ๊ณ ์ง๊ด์ ์ผ๋ก ํ ์ ์๋๋ก ์ค๊ณ๋ ๋น ๋ฅด๊ณ ์ ์ฐํ ๋ฐ์ดํฐ ๊ตฌ์กฐ๋ฅผ ์ ๊ณตํ๋ ๋ชจ๋
๊ธฐ๋ณธ๊ธฐ๋ฅ
์ค์น ๋ฐ import
!pip install Pandas
import pandas as pd
Series์ DateFrame
- 1์ฐจ์, 1๊ฐ์ column ์ Series๋ผ๊ณ ๋ถ๋ฆ
- ๋ฐ์ดํฐ ํ๋ ์: ๊ฐ๋ก์ถ, ์ธ๋ก์ถ์ด ์๋ 2์ฐจ์์ ๋ค์ํ ํ์
๋ฐ์ดํฐ๋ฅผ ์ ์ฅํ๋ ์๋ฃ๊ตฌ์กฐ
pd.Series([1, 2, 3, 4])
product1 = [['์ผ์ฑ',900000, '๊ฐค๋ญ์'],['์ ํ',1400000, '์์ดํฐ14'],['์์ง',1500000,'์์ง๊ทธ๋จ']]
pd.DataFrame(product1)
๋ฐ์ดํฐ ๊ฐ์ ธ์ค๊ธฐ
pd.read_csv('korean-idol.csv')
pd.read_csv('https://bit.ly/ds-house-price')
from google.colab import drive
drive.mount('/content/drive')
pd.read_csv('ํ์ผ๊ฒฝ๋ก')
pd.read_excel('ํ์ผ๊ฒฝ๋ก')
๋ฐ์ดํฐ ์ ๋ณด ํ์ธํ๊ธฐ
df.info()
df.columns
df.columns = ๋ฆฌ์คํธ
df.describe()
df.describe(include=object)
df.head()
df.head(n)
df.tail(n)
df['ํค'].sum()
df['ํค'].count()
df['ํค'].mean()
df['ํค'].max()
df['ํค'].var()
df['ํค'].std()
์ ๋ ฌํ๊ธฐ
df.sort_index(ascending=False)
df.sort_values(by='์ปฌ๋ผ๋ช
')
df.sort_values(by='์ปฌ๋ผ๋ช
',ascending=False)
df.sort_values(by='์ปฌ๋ผ๋ช
', na_position='first')
df.sort_values(by=['์ปฌ๋ผ๋ช
1', '์ปฌ๋ผ๋ช
2'], ascending=[False,True], na_position='first')
์ปฌ๋ผ์ผ๋ก ๋ฐ์ดํฐ ๋ค๋ฃจ๊ธฐ
์ปฌ๋ผ์ผ๋ก ํ ๋ฒ์ ์ ํํ๊ธฐ
df.loc[:,์ปฌ๋ผ๋ฆฌ์คํธ]
df.loc[3:8,['name','height']]
df.iloc[1:4,0:2]
Boolean indexing
df['height']>=180
df[df['height']>=180]
df[df['height']>=180][['name','gender','height']]
isin ํ์ฉํ๊ธฐ
- isin์ ์กฐ๊ฑด์ ๊ฑธ๊ณ ์ ํ๋ ๊ฐ์ด ์ ์ํ list์ ์์ ๋๋ง ์์ธํ๋ ค๋ ๊ฒฝ์ฐ์ ์ฌ์ฉ
company = ['ํ๋ ๋์ค','SM']
df['company'].isin(company)
df[df['company'].isin(company)]
df.loc[df['company '].isin(company),['name','company ']]
๊ฒฐ์ธก๊ฐ๊ณผ Null๊ฐ ์ฒ๋ฆฌํ๊ธฐ
- NaN์ ๋น์ด์๋ ๊ฐ ๋๋ ๊ฒฐ์ธก๊ฐ์ด๋ผ๊ณ ๋ถ๋ฆ
- pandas์์๋ null์ NaN(Not a Number)์ผ๋ก ํ๊ธฐ๋จ
df.isna()
df.isnull()
df[df['group'].isnull()]
df[df['group'].isnull()]['name']
df['name'][df['group'].isnull()]
df['name'][df['group'].notnull()]
๊ฒฐ์ธก๊ฐ ์ฒ๋ฆฌํ๊ธฐ
df['ํค'].fillna(-1)
df2['ํค'].fillna(-1, inplace=True )
height = df2['ํค'].mean()
df2['ํค'] = df2['ํค'].fillna(height)
df.dropna()
df.dropna(axis=1)
df.dropna(axis=0,how='all')
๋ฐ์ดํฐ ๋ณต์ฌํ๊ธฐ
copy_df = df.copy()
row, column ์ถ๊ฐ ๋ฐ ์ญ์ ํ๊ธฐ
row ์ถ๊ฐํ๊ธฐ
- dict ํํ์ ๋ฐ์ดํฐ๋ฅผ ๋ง๋ค์ด์ฃผ๊ณ append()ํจ์๋ฅผ ์ฌ์ฉํ์ฌ ๋ฐ์ดํฐ๋ฅผ ์ถ๊ฐ
- ๋ฐ๋์ ignore_index=True ์ต์
์ ์ถ๊ฐํด์ผ ์๋ฌ๊ฐ ๋์ง ์์
df =df.append({'์ด๋ฆ':'๊น์ฌ๊ณผ', '๊ทธ๋ฃน':'์ ํ', '์์์ฌ':'apple', '์ฑ๋ณ':'์ฌ์', '์๋
์์ผ':'2000-01-01','ํค':160, 'ํ์กํ':'A', '๋ธ๋๋ํํ์ง์6':1234567},ignore_index=True)
column ์ถ๊ฐํ๊ธฐ
df['๊ตญ์ '] ='๋ํ๋ฏผ๊ทน'
row ์ ๊ฑฐํ๊ธฐ
df.drop(14, axis=0)
df.drop([1,10,14], axis=0)
column ์ ๊ฑฐํ๊ธฐ
df.drop('๊ทธ๋ฃน',axis=1)
๊ทธ๋ฃน์ผ๋ก ๋ฌถ๊ธฐ
- groupby ๋ ๋ฐ์ดํฐ๋ฅผ ๊ทธ๋ฃน์ผ๋ก ๋ฌถ์ด ๋ถ์ํ ๋ ์ฌ์ฉ
df.groupby('์์์ฌ')
df.groupby('์์์ฌ').count()
df.groupby('์์์ฌ').mean()
df.groupby('์์์ฌ').sum()
์ค๋ณต๊ฐ ์ ๊ฑฐํ๊ธฐ
# drop_duplicates(): ์ค๋ณต๋ ๋ฐ์ดํฐ๋ฅผ ์ ๊ฑฐ(keep='first')
df['ํ์กํ'].drop_duplicates()
df['ํ์กํ'].drop_duplicates(keep='last') # ๋ง์ง๋ง index๋ก ๋จ๊ฒจ๋๋ค!
๋ฐ์ดํฐํ๋ ์ ๋ณํฉํ๊ธฐ
concat(): ๋ฐ์ดํฐ๋ฅผ ํฉ์นจ. sort=Falsse ์ต์
์ผ๋ก ์์๊ฐ ์ ์ง๋๋๋ก ํจ
- ์ปฌ๋ผ์ด ๊ฐ์ ๊ฒฝ์ฐ ์์๋๋ก ํฉ์น ๋
pd.concat([df1, df_copy],sort=False)
df_concat.reset_index(drop=True)
- ์๋ก์ด ์ปฌ๋ผ์ผ๋ก ์ข์ฐ๋ก ํฉ์น ๋
pd.concat([df1,df2], axis=1)
merge(): ํน์ ๊ณ ์ ํ ํค ๊ฐ์ ๊ธฐ์ค์ผ๋ก ๋ณํฉ (sql join๊ณผ ๊ฐ์)
- merge(๋ฐ์ดํฐ ํ๋ ์1, ๋ฐ์ดํฐ ํ๋ ์2, on='unique id', how='๋ณํฉ์ ๊ธฐ์ค')
- ๋ณํฉ์ ๊ธฐ์ค: left, right, inner, outer
pd.merge(df1, df_right, on='์ด๋ฆ', how='left')
pd.merge(df1, df_right, on='์ด๋ฆ', how='inner')
pd.merge(df1, df_right, on='์ด๋ฆ', how='outer')
Series์ ํ์
๋ณํํ๊ธฐ
astype(): ๋ฐ์ดํฐ์ ํ์
์ ๋ณํ
df['ํค'].astype(int)
๋ ์ง ๋ณํํ๊ธฐ
to_datetime(): object ํ์
์์ datetimeํ์
์ผ๋ก ๋ณํ
df['์๋
์์ผ'] = pd.to_datetime(df['์๋
์์ผ'])
๋ฌ, ์ผ, ์๊ฐ, ์ฃผ, ์์ผ
df['์๋
์์ผ'].dt.month
df['์๋
์์ผ'].dt.day
df['์๋
์์ผ'].dt.hour
df['์๋
์์ผ'].dt.week
df['์๋
์์ผ'].dt.dayofweek
df['์๋
์์ผ'].dt.date
df['์๋
์์ผ'].dt.isocalendar().week
df['์ถ์๋
๋'] = df['์๋
์์ผ'].dt.year
df['์ถ์๋
๋'] = df['์๋
์์ผ'].dt.month
df['์ถ์๋
๋'] = df['์๋
์์ผ'].dt.day
apply
- Series๋ DateFrame์ ์ข ๋ ๊ตฌ์ฒด์ ์ธ ๋ก์ง์ ์ ์ฉํ๊ณ ์ถ์ ๊ฒฝ์ฐ์ ์ฌ์ฉ
- apply ์ ์ฉํ๊ธฐ ์ํด์๋ ๋ณ๋์ ํจ์๊ฐ ๋จผ์ ์ ์๋์ผ ํจ
- ํจ์๋ฅผ ๋งค๊ฐ๋ณ์๋ก ๋๊ฒจ์ค
df.loc[df['์ฑ๋ณ']=='๋จ์', '์ฑ๋ณ'] = 1
df.loc[df['์ฑ๋ณ']=='์ฌ์', '์ฑ๋ณ'] = 0
def male_or_female(x):
if x == '๋จ์':
return 1
elif x == '์ฌ์':
return 0
df['์ฑ๋ณ'].apply(male_or_female)
df['์ฑ๋ณ'].apply(lambda x:1 if x=='๋จ์' else 0)
map
map_gender = {'๋จ์':1,'์ฌ์':0}
df['์ฑ๋ณ'].map(map_gender)
df['NEW์ฑ๋ณ']= df['์ฑ๋ณ'].map(map_gender)
๋ฐ์ดํฐ ํ๋ ์์ ์ฐ์ ์ฐ์ฐ
df= pd.DataFrame({
'ํ์ด์ฌ':[60, 70, 80, 85, 75],
'์๋ฃ๊ตฌ์กฐ':[40, 50, 45, 70, 55],
'๋ฐ์ดํฐ๋ถ์':[80, 75, 100, 90, 95]
})
df['ํ์ด์ฌ'] + df['์๋ฃ๊ตฌ์กฐ'] + df['๋ฐ์ดํฐ๋ถ์']
df['ํ์ด์ฌ'] - df['์๋ฃ๊ตฌ์กฐ']
df['ํ์ด์ฌ'] * df['์๋ฃ๊ตฌ์กฐ']
df['ํ์ด์ฌ'] / df['์๋ฃ๊ตฌ์กฐ']
df['์ด์ '] = df['ํ์ด์ฌ'] + df['์๋ฃ๊ตฌ์กฐ'] + df['๋ฐ์ดํฐ๋ถ์']
df['ํ๊ท '] = df['์ด์ '] /3
์ ํซ ์ธ์ฝ๋ฉ(One-Hot-Encoding) โจโจโจโจ
- ๋๋ฏธ ๋ณ์
- ์ ํซ ์ธ์ฝ๋ฉ์ ํ๊ฐ์ ์์๋ True, ๋๋จธ์ง ์์๋ Falses๋ก ๋ง๋ค์ด ์นดํ
๊ณ ๋ฆฌํ์ผ๋ก ํํํ๋ ๊ธฐ์
- df['ํ์กํ_code']๋ฅผ ๋จธ์ ๋ฌ๋ ์๊ณ ๋ฆฌ์ฆ์ ๋ฃ์ด ๋ฐ์ดํฐ๋ฅผ ์์ธกํ๋ ค๊ณ ์ง์๋ฅผ ํ๋ฉด ์ปดํจํฐ๋ ๊ฐ๋ค๊ฐ์ ๊ด๊ผ๋ฅผ ์ค์ค๋ก ํ์ฑํ๊ฒ ๋จ
- ๋ง์ฝ Bํ์ 1, ABํ์ 2๋ผ๋ ๊ฐ์ ๊ฐ์ง๊ณ ์์ ๋, ์ปดํจํฐ๋ 'Bํ' +'ABํ' = 'Oํ' ๋ผ๊ณ ์ด์ํ ๊ด๊ณ๋ฅผ ๋งบ์ ์ ์๊ฒ ๋จ
- ๋ณ๋์ column์ ํ์ฑํด์ฃผ๊ณ 1๊ฐ์ column์๋ True, ๋๋จธ์ง๋ False๋ผ๊ณ ๋ฃ์ด์ค์ผ๋ก 'A,B,AB,O'ํ์ ๊ด๊ณ๋ ๋
๋ฆฝ์ ์ด๋ค๋ผ๋ ์นดํ
๊ณ ๋ฆฌ๋ก ํํํด์ฃผ๋ ๋ฐฉ์ -> ์ํซ ์ธ์ฝ๋ฉ
blood_map = {'A':0, 'B':1,'AB':2,'O':3}
df['ํ์กํ_code'] = df['ํ์กํ'].map(blood_map)
df['ํ์กํ_code'].value_counts()
pd.get_dummies(df['ํ์กํ_code'])
๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์์
df = pd.read_csv('https://bit.ly/ds-house-price')
df.info()
# ๋ถ์๊ฐ๊ฒฉ์ dtype์ int64๋ก ๋ณ๊ฒฝํ๊ธฐ
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: ' '
# ' '๊ณต๋ฐฑ์ด ์กด์ฌํ๋ ๋ฐ์ดํฐ๊ฐ ์๋์ง ํ์ธ
df.loc[df['๋ถ์๊ฐ๊ฒฉ']==' ']
# stript()๋ฅผ ์ฌ์ฉํด์ ๊ณต๋ฐฑ์ด ์๋ ๋ฐ์ดํฐ์ ํด๋น ์๋ค ๊ณต๋ฐฑ์ ์ญ์
df['๋ถ์๊ฐ๊ฒฉ']= df['๋ถ์๊ฐ๊ฒฉ'].str.strip()
#int๋ก ๋ฐ๊พธ๊ธฐ ๋ค์์๋
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: ''
df.loc[df['๋ถ์๊ฐ๊ฒฉ']=='']
# ๋ฐ์ดํฐ๊ฐ ์๋ ๊ฒฝ์ฐ๋ฅผ 0์ผ๋ก ๋ณ๊ฒฝ
df.loc[df['๋ถ์๊ฐ๊ฒฉ']=='','๋ถ์๊ฐ๊ฒฉ'] = 0
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: cannot convert float NaN to integer
# NaN์ 0์ผ๋ก ๋ฐํ
df['๋ถ์๊ฐ๊ฒฉ']=df['๋ถ์๊ฐ๊ฒฉ'].fillna(0)
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: '6,657' : 1000์๋จ์
# ์ฝค๋ง๋ฅผ ์ ๊ฑฐ
df['๋ถ์๊ฐ๊ฒฉ'] = df['๋ถ์๊ฐ๊ฒฉ'].str.replace(',','')
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: cannot convert float NaN to integer
df['๋ถ์๊ฐ๊ฒฉ']=df['๋ถ์๊ฐ๊ฒฉ'].fillna(0)
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: invalid literal for int() with base 10: '-'
# - ๋ฅผ ์ ๊ฑฐ
df['๋ถ์๊ฐ๊ฒฉ'] = df['๋ถ์๊ฐ๊ฒฉ'].str.replace('-','')
df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ValueError: cannot convert float NaN to integer
df['๋ถ์๊ฐ๊ฒฉ']=df['๋ถ์๊ฐ๊ฒฉ'].fillna(0)
df['๋ถ์๊ฐ๊ฒฉ'].astype(int)
# ๋ฐ์ดํฐ๊ฐ ์๋ ๊ฒฝ์ฐ๋ฅผ 0์ผ๋ก ๋ณ๊ฒฝ
df.loc[df['๋ถ์๊ฐ๊ฒฉ']=='','๋ถ์๊ฐ๊ฒฉ'] = 0
df['๋ถ์๊ฐ๊ฒฉ'].astype(int)
df.info()
df['๋ถ์๊ฐ๊ฒฉ'] = df['๋ถ์๊ฐ๊ฒฉ'].astype(int) # ๊ฐ์ ์ ์ฅํด์ผ ๋ฐ๋๋ค!
# ๊ท๋ชจ๋ถ๋ถ column์ ๋ถํ์ํ '์ ์ฉ๋ฉด์ ' ๊ธ์๋ฅผ ์ญ์
df['๊ท๋ชจ๊ตฌ๋ถ'] = df['๊ท๋ชจ๊ตฌ๋ถ'].str.replace('์ ์ฉ๋ฉด์ ','')
df['๊ท๋ชจ๊ตฌ๋ถ'].value_counts()
# ์ง์ญ๋ช
๋ณ๋ก ํ๊ท ๋ถ์๊ฐ๊ฒฉ์ ํ์ธ
df.groupby('์ง์ญ๋ช
')['๋ถ์๊ฐ๊ฒฉ'].mean()
# ์์
ํ์ผ ์ ์ฅ
df.to_csv('ds-house-price-clean.csv', index=False)