# 『Python for Data Analysis』 책의 7장을 참고하여 작성하였습니다.
import pandas as pd
# Toy frame whose (k1, k2) pairs repeat, used to demonstrate de-duplication.
data = pd.DataFrame(
    {
        'k1': ['one'] * 3 + ['two'] * 4,
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data
# With no arguments, rows are compared across every column.
data.drop_duplicates()
# Add a column holding 0..6, so no two complete rows are identical any more.
data['v1'] = range(7)
# Filter out duplicates based on specific columns only.
data.drop_duplicates(subset=['k1'])
# keep='last' retains the final row of each duplicate group.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
import pandas as pd
# Sample frame of food names and their weights in ounces.
data = pd.DataFrame(
    {
        'food': ['bac', 'pulled pork', 'bac', 'pas', 'pas', 'bac', 'pas', 'honey', 'nova'],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data
# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = {
    'bac': 'pig',
    'pulled pork': 'pig',
    'pas': 'cow',
    'honey': 'pig',
    'nova': 'salmon',
}
# Lower-case each food name before the lookup; an unknown name raises KeyError.
data['animal'] = [meat_to_animal[name.lower()] for name in data['food']]
data
# 여러 개의 값을 한 번에 치환하고 싶다면? replace에 리스트나 딕셔너리를 넘기면 된다.
from pandas import Series
import numpy as np
# Series containing the values -999 and -1000 that are replaced below.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data
# List form: each value in the first list is replaced by the value at the
# same position in the second list (-999 -> NaN, -1000 -> 0).
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])
# Dict form: the same replacement expressed as an {old: new} mapping.
data.replace(to_replace={-999: np.nan, -1000: 0})
# 축 색인(라벨)을 바꿀 때는 rename도 활용 가능 (replace와 비슷하게 딕셔너리 매핑을 받음)
# Discretization demo: bucket ages into fixed bins, then bin random data.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# right=True closes each interval on the right, e.g. (18, 25];
# right=False closes the left side instead, e.g. [18, 25).
cats = pd.cut(ages, bins, right=True)
cats
cats.codes        # integer bin index assigned to each age
cats.categories   # the interval labels themselves
# The top-level pd.value_counts() is deprecated; wrap the Categorical in a
# Series and use the method form, which yields the same per-bin counts.
pd.Series(cats).value_counts()
group_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
# Attach custom labels to the bins in place of interval notation.
pd.cut(ages, bins, labels=group_names)
# Passing an integer instead of explicit edges requests that many
# equal-width bins spanning the data's min..max.
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
# qcut bins by sample quantiles, so each of the 4 bins gets about the same
# number of points.
data = np.random.rand(1000)
cats = pd.qcut(data, 4)
cats
# Explicit quantile edges (fractions between 0 and 1) are accepted as well.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
# 더미(표시자) 행렬로 변환하기: DataFrame의 한 컬럼 값이 여러 카테고리에 동시에 속한다면 어떻게 할까?
# 1. 데이터 불러오기
# Build one indicator column per genre for rows whose `genres` field holds
# several '|'-separated categories.
mnames = ['movie_id', 'title', 'genres']
# A multi-character separator is not supported by the default C parser, so
# request the Python engine explicitly instead of relying on the fallback
# (which emits a ParserWarning).
movies = pd.read_table('pydata-book/datasets/movielens/movies.dat',
                       sep='::', header=None, names=mnames, engine='python')
movies[:10]
# Collect the distinct genre names across all rows, sorted.
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
genres
# Start from an all-zero matrix: one row per movie, one column per genre.
dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
dummies
# Flag every genre listed for each movie. The positional counter i lines up
# with the row labels because both frames use the default 0..n-1 index.
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
# Prefix the indicator columns and attach them to the original frame.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic