pandas系列9-数据规整

皮大大

发布于 2021-03-02 15:17:35

7940

发布于 2021-03-02 15:17:35

层次化索引hierarchical indexing

数据分散在不同的文件或者数据库中
层次化索引在⼀个轴上拥有多个（两个以上）索引级别
低维度形式处理高维度数据

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# 创建S: 索引index是一个数组组成的列表
data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    1.832067
   2   -0.501033
   3   -0.602755
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
d  2   -0.826364
   3   -1.874992
dtype: float64

data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

data['b']

1   -0.731398
3   -0.707528
dtype: float64

# 部分索引选取数据子集
# 切片形式
data['b':'c']

b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
dtype: float64

# 列表形式
data.loc[['b', 'c']]

b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
dtype: float64

data.loc[['b', 'd']]

b  1   -0.731398
   3   -0.707528
d  2   -0.826364
   3   -1.874992
dtype: float64

data.loc[:, 2]

a   -0.501033
c   -0.177199
d   -0.826364
dtype: float64

# 2表示含有索引是2
data.loc[:, 2]

a   -0.501033
c   -0.177199
d   -0.826364
dtype: float64

data

a  1    1.832067
   2   -0.501033
   3   -0.602755
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
d  2   -0.826364
   3   -1.874992
dtype: float64

data.unstack()    # 将层次化索引的数据变成DF形式

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	1	2	3
a	1.832067	-0.501033	-0.602755
b	-0.731398	NaN	-0.707528
c	-0.382131	-0.177199	NaN
d	NaN	-0.826364	-1.874992

data.unstack().stack()

a  1    1.832067
   2   -0.501033
   3   -0.602755
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
d  2   -0.826364
   3   -1.874992
dtype: float64

# 对于DF类型数据
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } </code></pre>

		Ohio		Colorado
		Green	Red	Green
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

# 索引设置名字
frame.index.names = ['key1', 'key2']
# 属性设置名字
frame.columns.names = ['state', 'color']
frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame['Ohio']

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	color	Green	Red
key1	key2
a	1	0	1
a	2	3	4
b	1	6	7
b	2	9	10

from pandas import MultiIndex
MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
                       names=['state', 'color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           codes=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

重排与分级排序

重新调整某条轴上的各级别的顺序
指定级别上的值对数据进行排序
swaplevel()接受两个级别编号或名称

# 交换位置
frame.swaplevel('key1', 'key2')

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
2	a	3	4	5
1	b	6	7	8
2	b	9	10	11

frame

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

# level=0 通过第一层索引key1排序
frame.sort_index(level=0)

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

# level=1 通过第一层索引key2排序
frame.sort_index(level=1)

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
b	1	6	7	8
a	2	3	4	5
b	2	9	10	11

# swaplevel 不仅可以交换两个索引值，还可以交换它们的索引数值
frame.swaplevel(0, 1)

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
2	a	3	4	5
1	b	6	7	8
2	b	9	10	11

frame.swaplevel(0, 1).sort_index(level=0)

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
1	b	6	7	8
2	a	3	4	5
2	b	9	10	11

根据级别统计求和

通过level指定某条轴
指定行或者列

frame.sum(level='key2')

state	Ohio		Colorado
color	Green	Red	Green
key2
1	6	8	10
2	12	14	16

# axis=1表示列
frame.sum(level='color', axis=1)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	color	Green	Red
key1	key2
a	1	2	1
a	2	8	4
b	1	14	7
b	2	20	10

frame

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

frame.sum(level='key2',axis=0)

state	Ohio		Colorado
color	Green	Red	Green
key2
1	6	8	10
2	12	14	16

# axis=0指定列
frame.sum(level='key1',axis=0)

state	Ohio		Colorado
color	Green	Red	Green
key1
a	3	5	7
b	15	17	19

# 使⽤DataFrame的列进⾏索引
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                    'd': [0, 1, 2, 0, 1, 2, 3]})
frame

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	a	b	c	d
0	0	7	one	0
1	1	6	one	1
2	2	5	one	2
3	3	4	two	0
4	4	3	two	1
5	5	2	two	2
6	6	1	two	3

# set_index函数将列转换成行索引，默认删除
frame2 = frame.set_index(['c', 'd'])
frame2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

		a	b
c	d
one	0	0	7
	1	1	6
	2	2	5
two	0	3	4
	1	4	3
	2	5	2
	3	6	1

# 将原来的索引保留
frame.set_index(['c', 'd'], drop=False)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

		a	b	c	d
c	d
one	0	0	7	one	0
	1	1	6	one	1
	2	2	5	one	2
two	0	3	4	two	0
	1	4	3	two	1
	2	5	2	two	2
	3	6	1	two	3

frame2.reset_index()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	c	d	a	b
0	one	0	0	7
1	one	1	1	6
2	one	2	2	5
3	two	0	3	4
4	two	1	4	3
5	two	2	5	2
6	two	3	6	1

合并数据集

pandas.merge：根据键将不同DF中的行连接起来，类似于数据库的join操作
pandas.concat：沿着轴将对象叠在一起
法combine_first可以将重复数据拼接在⼀起，⽤⼀个对象中的值填充另⼀个的缺失值

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})

df1

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	a	5
6	b	6

df2

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data2
0	a	0
1	b	1
2	d	2

# 默认根据重叠列名key根据进行合并
pd.merge(df1,df2)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data1	data2
0	b	0	1
1	b	1	1
2	b	6	1
3	a	2	0
4	a	4	0
5	a	5	0

pd.merge(df1, df2, on='key')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data1	data2
0	b	0	1
1	b	1	1
2	b	6	1
3	a	2	0
4	a	4	0
5	a	5	0

df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})

merge

默认是内连接
结果中的键是交集：只有a、b在两个DF中同时存在

pd.merge(df3, df4, left_on='lkey', right_on='rkey')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	data1	rkey	data2
0	b	0	b	1
1	b	1	b	1
2	b	6	b	1
3	a	2	a	0
4	a	4	a	0
5	a	5	a	0

df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                     'value': [1, 2, 3, 5]})
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [5, 6, 7, 8]})

df1.merge(df2, left_on='lkey', right_on='rkey')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	value_x	rkey	value_y
0	foo	1	foo	5
1	foo	1	foo	8
2	foo	5	foo	5
3	foo	5	foo	8
4	bar	2	bar	6
5	baz	3	baz	7

# suffixes解决两个DF中重复列名的问题
df1.merge(df2, left_on='lkey', right_on='rkey',
          suffixes=('_left', '_right'))

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	value_left	rkey	value_right
0	foo	1	foo	5
1	foo	1	foo	8
2	foo	5	foo	5
3	foo	5	foo	8
4	bar	2	bar	6
5	baz	3	baz	7

# 相同的values进行合并
df1.merge(df2, how='inner')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	value	rkey
0	foo	5	foo

# 右边的值为标准，左边如果有，直接显示；没有则显示NaN
df1.merge(df2, how='right')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	value	rkey
0	foo	5	foo
1	NaN	6	bar
2	NaN	7	baz
3	NaN	8	foo

pd.merge(df1, df2, how='left')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	value	rkey
0	foo	1	NaN
1	bar	2	NaN
2	baz	3	NaN
3	foo	5	foo

# outer相当于是right和left一起作用
pd.merge(df1, df2, how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	lkey	value	rkey
0	foo	1	NaN
1	bar	2	NaN
2	baz	3	NaN
3	foo	5	foo
4	NaN	6	bar
5	NaN	7	baz
6	NaN	8	foo

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

# 以right为标准：如果右有左无，标记为NaN；左有右无，直接丢弃
pd.merge(df1, df2, on='key', how='right')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data1	data2
0	b	0.0	1
1	b	1.0	1
2	b	5.0	1
3	b	0.0	3
4	b	1.0	3
5	b	5.0	3
6	a	2.0	0
7	a	4.0	0
8	a	2.0	2
9	a	4.0	2
10	d	NaN	4

pd.merge(df1, df2, on='key', how='left')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data1	data2
0	b	0	1.0
1	b	0	3.0
2	b	1	1.0
3	b	1	3.0
4	a	2	0.0
5	a	2	2.0
6	c	3	NaN
7	a	4	0.0
8	a	4	2.0
9	b	5	1.0
10	b	5	3.0

pd.merge(df1, df2, how='inner')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	data1	data2
0	b	0	1
1	b	0	3
2	b	1	1
3	b	1	3
4	b	5	1
5	b	5	3
6	a	2	0
7	a	2	2
8	a	4	0
9	a	4	2

left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
right

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key1	key2	rval
0	foo	one	4
1	foo	one	5
2	bar	one	6
3	bar	two	7

pd.merge(left, right, on=['key1', 'key2'], how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key1	key2	lval	rval
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

pd.merge(left, right, on=['key1', 'key2'], how='inner')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key1	key2	lval	rval
0	foo	one	1	4
1	foo	one	1	5
2	bar	one	3	6

索引行的合并

DF的连接键有时位于索引
传入left_index=True或right_index=True

left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

pd.merge(left1, right1, left_on='key', right_index=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0
5	c	5	NaN

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])

pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	Ohio	Nevada	Missouri	Alabama
a	1.0	2.0	NaN	NaN
b	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	5.0	6.0	13.0	14.0

join()

按照索引合并
合并多个DF对象，要求没有重复的列
默认使用左连接，保留左边的行索引
简单的合并参数可以是一组DF

left2.join(right2, how='outer')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	Ohio	Nevada	Missouri	Alabama
a	1.0	2.0	NaN	NaN
b	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	5.0	6.0	13.0	14.0

left1.join(right1, on='key')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	key	value	group_val
0	a	0	3.5
1	b	1	7.0
2	a	2	3.5
3	a	3	3.5
4	b	4	7.0
5	c	5	NaN

another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])
another

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	New York	Oregon
a	7.0	8.0
c	9.0	10.0
e	11.0	12.0
f	16.0	17.0

left2.join([right2, another])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	Ohio	Nevada	Missouri	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0	9.0	10.0
e	5.0	6.0	13.0	14.0	11.0	12.0

left2.join([right2, another], how='outer', sort='True')

c:\users\admin\venv\lib\site-packages\pandas\core\frame.py:6848: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  verify_integrity=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	Ohio	Nevada	Missouri	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
b	NaN	NaN	7.0	8.0	NaN	NaN
c	3.0	4.0	9.0	10.0	9.0	10.0
d	NaN	NaN	11.0	12.0	NaN	NaN
e	5.0	6.0	13.0	14.0	11.0	12.0
f	NaN	NaN	NaN	NaN	16.0	17.0

轴向索引

连接concatentation、绑定binding、堆叠stacking
Numpy的concatenate()函数实现
pandas的concat()函数实现

s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

# 传入的是列表形式
# concat是在axis=0上⼯作的
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

s4 = pd.concat([s1, s2, s3], axis=1,sort=True)
s4

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

pd.concat([s1, s4])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

pd.concat([s1, s4], axis=1, sort=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	0	0	1	2
a	0.0	0.0	NaN	NaN
b	1.0	1.0	NaN	NaN
c	NaN	NaN	2.0	NaN
d	NaN	NaN	3.0	NaN
e	NaN	NaN	4.0	NaN
f	NaN	NaN	NaN	5.0
g	NaN	NaN	NaN	6.0

# 传入join求交集
# join_axes指定要在其它轴上使⽤的索引
pd.concat([s1, s4], axis=1, join='inner')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	0	0	1	2
a	0	0.0	NaN	NaN
b	1	1.0	NaN	NaN

result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

result.unstack()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	a	b	f	g
one	0.0	1.0	NaN	NaN
two	0.0	1.0	NaN	NaN
three	NaN	NaN	5.0	6.0

pd.concat([s1, s2, s3], axis=1, sort=True, keys=['one', 'two', 'three'])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
df1

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	a	b	c	d
0	-1.017946	-0.127939	0.556561	3.037912
1	-0.282973	-1.015500	-0.846612	-0.393027
2	-0.022687	-1.878993	0.607246	0.365823

pd.concat([df1, df2], ignore_index=True,sort=True)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	a	b	c	d
0	-1.017946	-0.127939	0.556561	3.037912
1	-0.282973	-1.015500	-0.846612	-0.393027
2	-0.022687	-1.878993	0.607246	0.365823
3	-0.212958	0.053333	NaN	-0.371492
4	-0.237029	-0.239806	NaN	0.623274

合并与重叠

索引全部或者部分重叠的两个数据

Numpy的where函数：类似if-else
Series有⼀个combine_first⽅法

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

b[-1] = np.nan
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

np.where(pd.isnull(a), b, a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

pd.isnull(a)

f     True
e    False
d     True
c    False
b    False
a     True
dtype: bool

# Series有⼀个combine_first⽅法
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

重塑和轴向旋转

reshape
pivot：⽤set_index创建层次化索引，再⽤unstack重塑；长格式转化为宽格式
pandas.melt：将宽格式转化为长格式，合并多列
stack：列旋转为行:S------>DF；
- 默认会滤除缺失数据
- 修改：dropna=False，不滤除
unstack：行旋转为列：DF---->S

# 创建一个DF，同时指定行列索引的name
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'],
                    name='number'))
data

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

# 将列转换为行，得到一个Series
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

# 将S变成DF数据
result.unstack()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

#  unstack()：默认操作的是最内层
result.unstack('state')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

df = pd.DataFrame(np.random.rand(6, 4),
                  index=['one', 'two', 'three', 'four', 'five', 'six'],
                  columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
df

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

Genus	A	B	C	D
one	0.862586	0.440328	0.172111	0.852715
two	0.303660	0.322660	0.232518	0.327529
three	0.128695	0.309510	0.224989	0.750412
four	0.658148	0.768613	0.234445	0.184719
five	0.502111	0.406510	0.310178	0.355971
six	0.002387	0.996109	0.207558	0.033299

# Genus自动用作图例
df.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x1ad366bafd0>

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2019-10-21，如有侵权请联系 cloudcommunity@tencent.com 删除

编程算法

本文分享自作者个人站点/博客前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

编程算法