# In[1]
class display(object):
"""Display HTML representation of multiple objects"""
template="""<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}{1}
"""
def __init__(self,*args):
self.args=args
def _repr_html_(self):
return '\n'.join(self.template.format(a,eval(a)._repr_html_())
for a in self.args)
def __repr__(self):
return '\n\n'.join(a+'\n'+repr(eval(a))
for a in self.args)
pd.merge
is a subset of what is known as relational algebra, which is a formal set of rules for manipulating relational data that forms the conceptual foundation of operations available in most databases.pd.merge
function and the related join
method of Series and DataFrame objects.# In[2]
df1=pd.DataFrame({'employee':['Bob','Jake','Lisa','Sue'],
'group':['Accounting','Engineering','Engineering','HR']})
df2=pd.DataFrame({'employee':['Lisa','Bob','Jake','Sue'],
'hire_data':[2004,2008,2012,2014]})
display('df1','df2')
# Out[2]
df1
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
df2
employee hire_data
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
pd.merge
function.# In[3]
df3=pd.merge(df1,df2)
df3
# Out[3]
employee group hire_data
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
pd.merge
function recognizes that each DataFrame has an employee column, and automatically joins using this column as a key.employee
column differs between df1
and df2
, and the pd.merge
function correctly accounts for this.# In[4]
df4=pd.DataFrame({'group':['Accounting','Engineering','HR'],
'supervisor':['Carly','Guido','Steve']})
display('df3','df4','pd.merge(df3,df4)')
# Out[4]
df3
employee group hire_data
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
df4
group supervisor
0 Accounting Carly
1 Engineering Guido
2 HR Steve
pd.merge(df3,df4)
employee group hire_data supervisor
0 Bob Accounting 2008 Carly
1 Jake Engineering 2012 Guido
2 Lisa Engineering 2004 Guido
3 Sue HR 2014 Steve
# In[5]
df5=pd.DataFrame({'group':['Accounting','Accounting','Engineering','Engineering','HR','HR'],
'skills':['math','spreadsheets','software','math','spreadsheets','organization']})
display('df1','df5','pd.merge(df1,df5)')
# Out[5]
df1
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
df5
group skills
0 Accounting math
1 Accounting spreadsheets
2 Engineering software
3 Engineering math
4 HR spreadsheets
5 HR organization
pd.merge(df1,df5)
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering software
3 Jake Engineering math
4 Lisa Engineering software
5 Lisa Engineering math
6 Sue HR spreadsheets
7 Sue HR organization
on
keyword, which takes a column name of a list of column names.# In[6]
display('df1','df2',"pd.merge(df1,df2,on='employee')")
# Out[6]
df1
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
df2
employee hire_data
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
pd.merge(df1,df2,on='employee')
employee group hire_data
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
# In[7]
df3=pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'],
'salary':[70000,80000,120000,90000]})
display('df1','df3',"pd.merge(df1,df3,left_on='employee',right_on='name')")
# Out[7]
df1
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
df3
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
pd.merge(df1,df3,left_on='employee',right_on='name')
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
DataFrame.drop()
method.# In[8]
pd.merge(df1,df3,left_on='employee',right_on='name').drop('name',axis=1)
# Out[8]
employee group salary
0 Bob Accounting 70000
1 Jake Engineering 80000
2 Lisa Engineering 120000
3 Sue HR 90000
left_index
and/or right_index
flags in pd.merge()
# In[9]
df1a=df1.set_index('employee')
df2a=df2.set_index('employee')
display('df1a','df2a')
# Out[9]
df1a
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
df2a
hire_data
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
# In[10]
display('df1a','df2a',"pd.merge(df1a,df2a,left_index=True,right_index=True)")
# Out[10]
df1a
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
df2a
hire_data
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
pd.merge(df1a,df2a,left_index=True,right_index=True)
group hire_data
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
DataFrame.join()
method, which performs an index-based merge without extra keywords.# In[11]
df1a.join(df2a)
# Out[11]
group hire_data
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
left_index
with right_on
or left_on
with right_index
to get the desired behavior.# In[12]
display('df1a','df3',"pd.merge(df1a,df3,left_index=True,right_on='name')")
# Out[12]
df1a
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
df3
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
pd.merge(df1a,df3,left_index=True,right_on='name')
group name salary
0 Accounting Bob 70000
1 Engineering Jake 80000
2 Engineering Lisa 120000
3 HR Sue 90000
# In[13]
df6=pd.DataFrame({'name':['Peter','Paul','Mary'],
'food':['fish','beans','bread']})
df7=pd.DataFrame({'name':['Mary','Joseph'],
'drink':['wine','beer']}, columns=['name','drink'])
display('df6','df7',"pd.merge(df6,df7)")
# Out[13]
df6
name food
0 Peter fish
1 Paul beans
2 Mary bread
df7
name drink
0 Mary wine
1 Joseph beer
pd.merge(df6,df7)
name food drink
0 Mary bread wine
how
keyword, which defaults to inner
# In[14]
pd.merge(df6,df7,how='inner')
# Out[14]
name food drink
0 Mary bread wine
how
keyword are outer
, left
, and right
.outer
join returns a join over the union of the input columns and fills in missing values with NAs.# In[15]
display('df6','df7',"pd.merge(df6,df7,how='outer')")
# Out[15]
df6
name food
0 Peter fish
1 Paul beans
2 Mary bread
df7
name drink
0 Mary wine
1 Joseph beer
pd.merge(df6,df7,how='outer')
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
3 Joseph NaN beer
left
and right
join return joins over the left entries and right entries, respectively.# In[16]
display('df6','df7',"pd.merge(df6,df7,how='left')")
# Out[16]
df6
name food
0 Peter fish
1 Paul beans
2 Mary bread
df7
name drink
0 Mary wine
1 Joseph beer
pd.merge(df6,df7,how='left')
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
# In[17]
display('df6','df7',"pd.merge(df6,df7,how='right')")
# Out[17]
df6
name food
0 Peter fish
1 Paul beans
2 Mary bread
df7
name drink
0 Mary wine
1 Joseph beer
pd.merge(df6,df7,how='right')
name food drink
0 Mary bread wine
1 Joseph NaN beer
# In[18]
df8=pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'],
'rank':[1,2,3,4]})
df9=pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'],
'rank':[3,1,4,2]})
display('df8','df9','pd.merge(df8,df9,on="name")')
# Out[18]
df8
name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
df9
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
pd.merge(df8,df9,on="name")
name rank_x rank_y
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
merge
function automatically appends the suffixes _x
and _y
to make the output columns unique.suffixes
keyword.# In[19]
pd.merge(df8,df9,on="name",suffixes=["_L","_R"])
# Out[19]
name rank_L rank_R
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
Reference these urls if you want to know about join
, merge
and concat
:
1. Combining DataFrame
2. About merge method