作者:闫钟峰,Datawhale优秀学习者
寄语:本文介绍了创建多级索引、多层索引切片、多层索引中的slice对象、索引层的交换等内容。
tuples = [('A','a'),('A','b'),('B','a'),('B','b')]
mul_index = pd.MultiIndex.from_tuples(tuples, names=('Upper', 'Lower'))
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index
多重索引本质上的结构是一个由元组构成的list
L1 = list('AABB')
L2 = list('abab')
tuples = list(zip(L1,L2))
mul_index = pd.MultiIndex.from_tuples(tuples, names=('Upper', 'Lower'))
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index)
注意,如果用于创建多重索引的由tuple组成的list本身是未排序的, 那么创建的df也未排序。
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=pd.MultiIndex.from_tuples(list(zip(L2,L1)), names=('Lower', 'Upper')))
为了便于使用, 可以使用sort_index()进行排序
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=pd.MultiIndex.from_tuples(list(zip(L2,L1)), names=('Lower', 'Upper'))).sort_index()
arrays = [['A','a'],['A','b'],['B','a'],['B','b']]
mul_index = pd.MultiIndex.from_tuples(arrays, names=('Upper', 'Lower'))
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index)
arrays = [['A','a'],['B','a'],['A','b'],['B','b']]
mul_index = pd.MultiIndex.from_tuples(arrays, names=('Upper', 'Lower'))
pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index)
sorted_multi_index=pd.DataFrame({'Score':['perfect','good','fair','bad']},index=mul_index).sort_index().index
sorted_multi_index==mul_index
[('A', 'a'), ('B', 'a'), ('A', 'b'), ('B', 'b')]==[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]
arr=np.random.randint(1,5,20).reshape(-1,2)
pd.MultiIndex.from_tuples(list(arr),names=('left','right'))
dftemp=pd.DataFrame(np.random.randn(20).reshape(10,2), index=pd.MultiIndex.from_tuples(list(arr),names=('left','right'))).sort_index()
pd.MultiIndex.from_tuples??
# tuples: list / sequence of tuple-likes Each tuple is the index of one row/column.
L1 = ['A','B']
L2 = ['a','b']
pd.MultiIndex.from_product([L1,L2],names=('Upper', 'Lower'))
Make a MultiIndex from the cartesian product of multiple iterables.
# iterables : list / sequence of iterables Each iterable has unique labels for each level of the index.
pd.MultiIndex.from_product??
3. 指定df中的列创建(set_index方法)
df_using_mul = df.set_index(['Class','Address'])
df_using_mul.head()
df1= df.set_index('Class')
df2 = df1.set_index('Address')
df3= df1.set_index('Address',append=True)
df.set_index??
df_using_mul.loc['C_1']
df_using_mul.swaplevel('Class','Address').loc['street_1']
df_using_mul.loc[df_using_mul.index.get_level_values(1) == 'street_1']
df_using_mul.index.names
df_using_mul.index.get_level_values??
df_using_mul.query('Address == "street_1"')
select * from df_using_mul where Address = 'street_1'
df.query('Address=="street_1"')
df_using_mul.loc(axis=0)[pd.IndexSlice[:, 'street_1']]
pd.IndexSlice??
1. 一般切片
df_using_mul.loc['C_2','street_5']
df_using_mul.index.is_lexsorted()
df_using_mul.sort_index().loc['C_2','street_5']
df_using_mul.loc[('C_2','street_5'):] 报错
df_using_mul.sort_index().loc[('C_2','street_6'):('C_3','street_4')]
df_using_mul.sort_index().loc[('C_2','street_7'):'C_3'].head()
df_using_mul.sort_index().loc['C_1':'C_2']#.head(10)
df_using_mul.sort_index().loc[[('C_2','street_7'),('C_3','street_2')]]
df_using_mul.sort_index().loc[['C_2',('C_3','street_2')]]
df_using_mul.sort_index().loc[[('C_2','street_7'),'C_3']]
df_using_mul.sort_index().loc['C_2':('C_3','street_2')]
df_using_mul.sort_index().loc[(['C_2','C_3'],['street_4','street_7']),:]
df_using_mul.sort_index().loc[(['C_2','C_3'],['street_4','street_7'])]
# KeyError: "None of [Index(['street_4', 'street_7'], dtype='object')] are in the [columns]"
df_using_mul.sort_index().loc[zip(['C_2','C_3'],['street_4','street_7']),:]
def list_product(list1,list2):
lst=[]
for a in list1:
for b in list2:
lst.append((a,b))
return lst
df_using_mul.sort_index().loc[list_product(['C_2','C_3'], ['street_4','street_7']),:]
df_using_mul.sort_index().loc[list_product(['C_2','C_3'], ['street_1','street_4','street_7']),:]
df_using_mul.sort_index().loc[(['C_2','C_3'], ['street_1','street_4','street_7']),:]
L1,L2 = ['A','B','C'],['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1,L2],names=('Upper', 'Lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big', 'Small'))
df_s = pd.DataFrame(np.random.rand(9,9),index=mul_index1,columns=mul_index2)
df_s
# Create an object to more easily perform multi-index slicing.
pd.IndexSlice??
df_s.loc[pd.IndexSlice['B':,df_s['D']['d']>0.3],pd.IndexSlice[df_s.sum()>4]]
# 对行的筛选等价于 select * from (select * from df_s where (Upper>'B' or D_d>0.3) )
# 如果不使用连接等手段, sql无法实现类似的对列名的筛选---特别地,sql中没有层级索引
pd.IndexSlice[df_s.sum()>4]
pd.IndexSlice['B':,df_s['D']['d']>0.3]
pd.IndexSlice[df_s['D']['d']>0.3]
df_s.loc[pd.IndexSlice['B':,df_s['D']['d']>0.3],pd.IndexSlice[df_s.sum()>4]]
df_using_mul.head()
df_using_mul.swaplevel(i=1,j=0,axis=0).sort_index().head()
# Swap levels i and j in a MultiIndex on a particular axis.
# 有必要增加一个sort_index=True的参数, 使得可以通过该参数设置交换索引后是否按索引重新排序
df_using_mul.swaplevel??
df_muls = df.set_index(['School','Class','Address'])
df_muls.head(10)
df_muls.reorder_levels([2,0,1],axis=0).sort_index().head()
df_muls.reorder_levels(['Address','School','Class'],axis=0).sort_index().head()
# Rearrange index levels using input order. May not drop or duplicate levels.
# 注意后一句话--这使得该函数和df的类似函数 reindex 相比功能更弱
df_muls.reorder_levels??