作者:闫钟峰,Datawhale优秀学习者
寄语:本文对索引设定、常用索引型函数、重复元素处理、抽样函数等内容做了详细介绍。
# Preview the CSV loaded with a two-level row index built from the
# 'Address' and 'School' columns.
pd.read_csv(
    'data/table.csv',
    index_col=['Address', 'School'],
).head()
# Preview the working frame `df` (it is not created in this excerpt).
df.head()
# Reorder the rows by index label, largest label first.
df.reindex(index=sorted(df.index, reverse=True)).head()

# Keep every fifth row label.
df.reindex(index=list(df.index)[::5])

# Target labels may repeat; per the pandas docs, a label that is missing
# from the original index produces a row of NaN.
df.reindex(index=[1101, 1101, 1203, 1206, 2402])

# The same mechanism works on columns, including a repeated name.
df.reindex(columns=['Height', 'Height', 'Gender', 'Average']).head()

# Rows and columns can be selected in one call.
df.reindex(index=[1101, 1203, 2402], columns=['Height', 'Gender'])

# `method` chooses how labels absent from the index are filled:
# 'bfill' takes the next valid row, 'nearest' the closest label.
df.reindex(index=[1101, 1203, 1206, 2402], method='bfill')
df.reindex(index=[1101, 1203, 1206, 2402, 1205, 1301], method='nearest')
# A small all-zero frame indexed by ID, with the IDs deliberately unsorted.
df_temp = pd.DataFrame(
    {
        'Weight': np.zeros(5),
        'Height': np.zeros(5),
        'ID': [1101, 1104, 1103, 1106, 1102],
    }
).set_index('ID')

# reindex_like() conforms df_temp to the row and column labels of the
# other frame (here the first five rows of df, two columns).
df_temp.reindex_like(df[0:5][['Weight', 'Height']])

# The explicit equivalent: pass the other frame's index and columns to
# reindex().
df_temp.reindex(
    index=df[0:5][['Weight', 'Height']].index,
    columns=df[0:5][['Weight', 'Height']].columns,
)
# Same IDs as before, but with distinguishable values and a sorted index;
# the fill methods used below require a monotonic index.
df_temp = (
    pd.DataFrame(
        {
            'Weight': range(5),
            'Height': range(5),
            'ID': [1101, 1104, 1103, 1106, 1102],
        }
    )
    .set_index('ID')
    .sort_index()
)

# Fill labels that df_temp lacks from the next row (bfill) or the
# previous row (ffill).
df_temp.reindex_like(df[0:5][['Weight', 'Height']], method='bfill')
df_temp.reindex_like(df[0:5][['Weight', 'Height']], method='ffill')

# The explicit reindex() form of the ffill call above.
df_temp.reindex(
    index=df[0:5][['Weight', 'Height']].index,
    columns=df[0:5][['Weight', 'Height']].columns,
    method='ffill',
)
df.head()

# Use the 'Class' column as the row index.
df.set_index('Class').head()

# append=True keeps the current index and adds 'Class' as an extra level.
df.set_index('Class', append=True).head()

# A Series is taken as the new index values, not as a column name.
df.set_index(pd.Series(range(df.shape[0]))).head()

df_ = pd.DataFrame(np.random.randn(24).reshape((3, 8)))
df_.set_index(list(range(df_.shape[0])))
# Passing range(df_.shape[0]) itself raises an error:
# KeyError: 'None of [range(0, 3)] are in the columns'
# When set_index receives a list, the columns whose names match the list
# entries are set as the index.

# A NumPy array is likewise used as the index values.
df.set_index(np.arange(df.shape[0])).head()

# A list of Series builds a multi-level index, one level per Series.
df.set_index(
    [pd.Series(range(df.shape[0])), pd.Series(np.ones(df.shape[0]))]
).head()

# Move the index back into ordinary columns.
df.reset_index().head()
# Build a 9x9 frame with two-level row and column indexes.
L1, L2 = ['A', 'B', 'C'], ['a', 'b', 'c']
mul_index1 = pd.MultiIndex.from_product([L1, L2], names=('Upper', 'Lower'))
L3, L4 = ['D', 'E', 'F'], ['d', 'e', 'f']
mul_index2 = pd.MultiIndex.from_product([L3, L4], names=('Big', 'Small'))
df_temp = pd.DataFrame(
    np.random.rand(9, 9),
    index=mul_index1,
    columns=mul_index2,
)

# Move row level 1 into the columns; col_level=0 places its name in the
# first column level.
df_temp1 = df_temp.reset_index(level=1, col_level=0)
df_temp1.index

# rename_axis() changes level *names* and returns a new frame.
df_temp.rename_axis(
    index={'Lower': 'LowerLower'},
    columns={'Big': 'BigBig'},
)

# Assigning to .names renames the levels in place.
df_temp.index.names = ['UPPER', 'LOWER']

# A frame whose three row levels start out unnamed.
dftemp = pd.DataFrame(
    np.random.randn(20).reshape(10, 2),
    index=pd.MultiIndex.from_tuples(
        list(np.random.randint(1, 5, 30).reshape(-1, 3))
    ),
).sort_index()
# NOTE(review): the levels have no names here, so the integer keys
# presumably match nothing and the names stay unset - confirm.
dftemp.rename_axis(index={0: 'LEFT', 2: 'RIGHT'},)
dftemp.index.names = ['LEFT', 'MIDDLE', 'RIGHT']

dftemp = pd.DataFrame(
    np.random.randn(20).reshape(10, 2),
    index=pd.MultiIndex.from_tuples(
        list(np.random.randint(1, 5, 30).reshape(-1, 3))
    ),
).sort_index()
# Pass a list as long as the number of levels: None for levels that should
# stay unnamed, a string for levels that should get a name.
dftemp.index.names = [None, None, 'RIGHT']

df_temp1.rename_axis(index={'Upper': 'UPPER'})
使用 rename 时,给 index 传入的字典,键是原来的索引值,值是新的索引值。无需指定要修改的索引级别,会自动寻找索引中相应的值——当不同层级的索引有相同的值的时候,这会造成混乱。
# rename() maps index/column labels (not level names): 'A' -> 'T' on the
# rows and 'e' -> 'changed_e' on the columns.
df_temp.rename(
    index={'A': 'T'},
    columns={'e': 'changed_e'},
).head()
# Build a frame whose two row levels share the same labels, so that a
# label-based rename is ambiguous unless a level is given.
L1, L2 = ['A', 'B', 'C'], ['a', 'b', 'c']
mul_index1 = pd.MultiIndex.from_product([L1, L1], names=('Upper1', 'Upper2'))
L3, L4 = ['D', 'E', 'F'], ['d', 'e', 'f']
mul_index2 = pd.MultiIndex.from_product([L3, L4], names=('Big', 'Small'))
df_t = pd.DataFrame(np.random.rand(9, 9), index=mul_index1, columns=mul_index2)

# Without `level`, the mapping is applied to every level, so 'A' is
# renamed in both row levels.
df_t.rename(index={'A': 'T'}, columns={'e': 'changed_e'}).head()

# How can the second row level and the first column level be renamed
# together? rename() has a single `level` argument, and repeating the
# keyword -- rename(index=..., level=1, columns=..., level=0) -- is a
# SyntaxError, so two levels cannot be given in one call.
# Chain one rename() per axis instead.
df_t.rename(index={'A': 'T'}, level=1).rename(
    columns={'E': 'changed_e'}, level=0
).head()

# A single `level` is shared by both axes: here level 1 of the rows and
# level 1 of the columns.
df_t.rename(index={'A': 'T'}, columns={'e': 'changed_e'}, level=1).head()
# where() keeps the shape: rows failing the condition become NaN.
df.where(df['Gender'] == 'M').head()

# query() drops the non-matching rows instead.
df.query('Gender=="M"').head()

# Dropping the NaN rows after where() gives the same row selection as
# boolean indexing.
df.where(df['Gender'] == 'M').dropna().head()
df[df['Gender'] == 'M'].head()

# The second argument supplies replacement values for the rows that fail
# the condition (here random numbers with the same shape as df).
df.where(
    df['Gender'] == 'M',
    np.random.rand(*df.shape),
).head()
2. mask函数
# mask() is the inverse of where(): rows where the condition is True are
# replaced, so the negated condition keeps the 'M' rows.
df.mask(df['Gender'] != 'M').dropna().head()

# As with where(), the second argument supplies the replacement values.
df.mask(
    df['Gender'] != 'M',
    np.random.rand(*df.shape),
).head()
3. query函数
# query() takes the filter as an expression string; conditions can be
# combined with & and may contain arithmetic and `in` tests.
df.query(
    '(Address in ["street_6","street_7"])&(Weight>(70+10))&(ID in [1303,2304,2402])'
)

# duplicated() flags repeated values of 'Class'; `keep` chooses which
# occurrence is left unflagged.
df.duplicated('Class').head()
df.duplicated('Class', keep='last').tail()
# - False : Mark all duplicates as ``True``.
df.duplicated('Class', keep=False).head()

# drop_duplicates() removes the flagged rows; a list of columns is
# compared as a combination.
df.drop_duplicates('Class')
df.drop_duplicates('Class', keep='last')
df.drop_duplicates(['School', 'Class'])

# Draw n random rows.
df.sample(n=5)
# n is the first parameter, so the keyword can be omitted.
df.sample(9)
2. frac为抽样比
# Sample a fraction of the rows (here 5%) instead of a fixed count.
df.sample(frac=0.05)
3. replace为是否放回
# Draw as many rows as df has, with replacement.
df.sample(n=df.shape[0], replace=True).head()

# With replacement (replace=True) more rows can be drawn than df contains.
len(df.sample(n=123, replace=True)), len(df)

# Check whether the 35 rows drawn with replacement have unique labels.
df.sample(n=35, replace=True).index.is_unique
4. axis为抽样维度,默认为0,即抽行
# axis=1 samples columns instead of rows.
df.sample(n=3, axis=1).head()
5. weights为样本权重,自动归一化
# One random weight per row.
w = np.random.rand(df.shape[0])
df.sample(n=3, weights=w).head()

# Weights can also come from a column: a Series is aligned on the index
# (per the pandas docs), while its raw values are used positionally.
df.sample(n=3, weights=df['Math']).head()
df.sample(n=3, weights=df['Math'].values).head()