본 게시물은 『Python for Data Analysis』 책을 참고하여 작성하였다.
import numpy as np
import pandas as pd
# Toy frame: two categorical key columns plus two random numeric columns.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.rand(5),
    }
)
df  # notebook-style display; a no-op when run as a plain script

# Mean of data1 for every (key1, key2) pair. The result is a Series whose
# two-level index is named after the grouping columns.
means = df.groupby(['key1', 'key2'])['data1'].mean()
means

# Pivot the inner index level (key2) into columns.
means.unstack()
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
# The loop bodies must be indented; without indentation Python rejects the
# file with an IndentationError.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

# With several grouping keys the group key is a tuple, unpacked here
# directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
# Two ways to pick columns from a GroupBy, and how they differ:
#   1. indexing with a single column name gives a grouped Series;
#   2. indexing with a list of column names gives a grouped DataFrame.
df.groupby(by='key1')['data1']
df.groupby(by='key1')[['data1']]

# Compute the mean for selected columns only (here just data2).
df.groupby(by=['key1', 'key2'])[['data2']].mean()
import numpy as np
import pandas as pd
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jin', 'Travis'])
# Put NaN at row position 2 ('Wes') in columns 'b' and 'c'.
# The index holds string labels, so an integer slice must go through the
# positional indexer `iloc`; `loc[2:3, ...]` raises TypeError on a string
# index in current pandas.
people.iloc[2:3, [1, 2]] = np.nan
people

# Column label -> group label. Keys that match no column ('f') are ignored.
mapping = {'a': 'Red', 'b': 'Red', 'c': 'Blue',
           'd': 'Blue', 'e': 'Red', 'f': 'Orange'}

# Group the columns by their mapped colour. `groupby(..., axis=1)` is
# deprecated, so the frame is transposed, grouped along the index, and the
# aggregate is transposed back. `by_column` is therefore a GroupBy over the
# transposed frame.
by_column = people.T.groupby(mapping)
by_column.sum().T
# Two-level column index: country ('cty') on the outside, tenor inside.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'],
     [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
columns

hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

# Count the columns per country for every row. `groupby(..., axis=1)` is
# deprecated, so the frame is transposed, grouped on the 'cty' index level,
# and the result is transposed back.
hier_df.T.groupby(level='cty').count().T
# NOTE(review): `tips` is not defined in this part of the file; it is
# presumably a DataFrame with 'day', 'smoker' and 'tip_pct' columns - confirm.
grouped = tips.groupby(['day', 'smoker'])
grouped_pct = grouped['tip_pct']

# A list of (name, function) pairs names the output columns: 'foo' holds the
# mean and 'bar' the standard deviation.
# The string 'std' is passed instead of np.std: pandas currently swaps np.std
# for its own std and warns that a future version will call np.std directly,
# so the string pins the current behaviour.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])
# 1000 rows of random values in two columns.
frame = pd.DataFrame(
    {
        'data1': np.random.randn(1000),
        'data2': np.random.randn(1000),
    }
)
print(frame)

# Bucket data1 into four bins; the result is a categorical Series.
factor = pd.cut(frame['data1'], bins=4)
factor
def get_stats(group):
    """Return summary statistics for one group as a dict.

    Parameters
    ----------
    group
        Any object providing ``min()``, ``max()``, ``count()`` and
        ``mean()`` methods (for example a pandas Series).

    Returns
    -------
    dict
        Keys ``'min'``, ``'max'``, ``'count'`` and ``'mean'`` mapped to the
        results of the corresponding method calls on ``group``.
    """
    # The body must be indented under the def; the unindented original is a
    # syntax error.
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
# Group data2 by the bins in `factor`.
# `observed=False` is explicit because grouping by a categorical without it
# relies on a deprecated default; the explicit value pins the current
# behaviour.
grouped = frame['data2'].groupby(factor, observed=False)

# get_stats returns a dict per group; unstack() turns the statistic names
# into columns, one row per bin.
grouped.apply(get_stats).unstack()