データセットの結合とマージ
データフレームをデータベース風に結合する
code: Python
df1 = pd.DataFrame({'key': 'b', 'b', 'a', 'c', 'a', 'a', 'b', 'data1': range(7)})
df1
--------------------------------------------------------------------------
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
--------------------------------------------------------------------------
code: Python
df2 = pd.DataFrame({'key': 'a', 'b', 'd', 'data2': range(3)})
df2
--------------------------------------------------------------------------
data2 key
0 0 a
1 1 b
2 2 d
--------------------------------------------------------------------------
code: Python
# デフォルトで内部結合
pd.merge(df1, df2, on='key')
--------------------------------------------------------------------------
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
--------------------------------------------------------------------------
code: Python
df3 = pd.DataFrame({'lkey': 'b', 'b', 'a', 'c', 'a', 'a', 'b',
'data1': range(7)})
df3
--------------------------------------------------------------------------
data1 lkey
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
--------------------------------------------------------------------------
code: Python
df4 = pd.DataFrame({'rkey': 'a', 'b', 'd',
'data2': range(3)})
df4
--------------------------------------------------------------------------
data2 rkey
0 0 a
1 1 b
2 2 d
--------------------------------------------------------------------------
code: Python
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
--------------------------------------------------------------------------
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
--------------------------------------------------------------------------
code: Python
# 外部結合
pd.merge(df1, df2, how='outer')
--------------------------------------------------------------------------
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
--------------------------------------------------------------------------
code: Python
df1 = pd.DataFrame({'key': 'b', 'b', 'a', 'c', 'a', 'b',
'data1': range(6)})
df2 = pd.DataFrame({'key': 'a', 'b', 'a', 'b', 'd',
'data2': range(5)})
pd.merge(df1, df2, on='key', how='left')
--------------------------------------------------------------------------
data1 key data2
0 0 b 1.0
1 0 b 3.0
2 1 b 1.0
3 1 b 3.0
4 2 a 0.0
5 2 a 2.0
6 3 c NaN
7 4 a 0.0
8 4 a 2.0
9 5 b 1.0
10 5 b 3.0
--------------------------------------------------------------------------
インデックスによるマージ
code: Python
left1 = pd.DataFrame({'key': 'a', 'b', 'a', 'a', 'b', 'c',
'value': range(6)})
left1
--------------------------------------------------------------------------
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
--------------------------------------------------------------------------
code: Python
right1 = pd.DataFrame({'group_val': 3.5, 7}, index='a', 'b')
right1
--------------------------------------------------------------------------
group_val
a 3.5
b 7.0
--------------------------------------------------------------------------
code: Python
# 列ではなくインデックスをマージキーにする場合
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
--------------------------------------------------------------------------
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
--------------------------------------------------------------------------
code: Python
left2 = pd.DataFrame(1., 2.], 3., 4., [5., 6.,
index='a', 'c', 'e',
columns='Ohio', 'Nevada')
left2
--------------------------------------------------------------------------
Ohio Nevada
a 1.0 2.0
c 3.0 4.0
e 5.0 6.0
--------------------------------------------------------------------------
code: Python
right2 = pd.DataFrame(7., 8.], 9., 10., 11., 12., [13, 14,
index='b', 'c', 'd', 'e',
columns='Missouri', 'Alabama')
right2
--------------------------------------------------------------------------
Missouri Alabama
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0
--------------------------------------------------------------------------
code: Python
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
--------------------------------------------------------------------------
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
--------------------------------------------------------------------------
code: Python
left2.join(right2, how='outer')
--------------------------------------------------------------------------
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
--------------------------------------------------------------------------
軸に沿った連結
code: Python
arr = np.arange(12).reshape((3, 4))
arr
--------------------------------------------------------------------------
array([ 0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11])
--------------------------------------------------------------------------
code: Python
np.concatenate(arr, arr, axis=1)
--------------------------------------------------------------------------
array([ 0, 1, 2, 3, 0, 1, 2, 3,
4, 5, 6, 7, 4, 5, 6, 7,
8, 9, 10, 11, 8, 9, 10, 11])
--------------------------------------------------------------------------
code: Python
s1 = pd.Series(0, 1, index='a', 'b')
s2 = pd.Series(2, 3, 4, index='c', 'd', 'e')
s3 = pd.Series(5, 6, index='f', 'g')
# デフォルトはaxis=0 (Seriesで連結)
pd.concat(s1, s2, s3)
--------------------------------------------------------------------------
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
--------------------------------------------------------------------------
code: Python
# axis=1はDataFrameで連結
pd.concat(s1, s2, s3, axis=1)
--------------------------------------------------------------------------
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
--------------------------------------------------------------------------
code: Python
df1 = pd.DataFrame({'a': 1., np.nan, 5., np.nan,
'b': np.nan, 2., np.nan, 6.,
'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': 5., 4., np.nan, 3., 7.,
'b': np.nan, 3., 4., 6., 8.})
# df1の欠損値をdf2で補う
df1.combine_first(df2)
--------------------------------------------------------------------------
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
--------------------------------------------------------------------------