import numpy as np
import pandas as pd
from pandas import Series, DataFrame
重新索引不会改变原数据
Series.reindex
DF.reindex()
columns
关键字指定obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
# S型数据重新排序索引
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
0 blue
2 purple
4 yellow
dtype: object
# ffill前项填充:填充的是前一个数值
obj3.reindex(range(6), method='ffill')
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
# DF重新索引
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
index=['a', 'c', 'd'],
columns=['Ohio', 'Texas', 'California'])
frame
Ohio | Texas | California | |
---|---|---|---|
a | 0 | 1 | 2 |
c | 3 | 4 | 5 |
d | 6 | 7 | 8 |
# DF重新索引
frame.reindex(["a", "b", "c", "d"])
Ohio | Texas | California | |
---|---|---|---|
a | 0.0 | 1.0 | 2.0 |
b | NaN | NaN | NaN |
c | 3.0 | 4.0 | 5.0 |
d | 6.0 | 7.0 | 8.0 |
# 重新索引列
frame.reindex(columns=["Ohio", "Utah", "California"])
Ohio | Utah | California | |
---|---|---|---|
a | 0 | NaN | 2 |
c | 3 | NaN | 5 |
d | 6 | NaN | 8 |
# drop等函数默认是就地修改,不改变原有数据
# 使用inplace=True改变原有数据
print(obj)
obj.drop('c', inplace=True)
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
d 4.5
b 7.2
a -5.3
dtype: float64
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
# 舍弃一行数据
new_obj = obj.drop('c')
new_obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# 默认是删除行数据
data.drop(['Colorado', 'Ohio'])
one | two | three | four | |
---|---|---|---|---|
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# axis=1:删除列数据
data.drop('two', axis=1)
one | three | four | |
---|---|---|---|
Ohio | 0 | 2 | 3 |
Colorado | 4 | 6 | 7 |
Utah | 8 | 10 | 11 |
New York | 12 | 14 | 15 |
# 删除多列数据
data.drop(['two', 'four'], axis='columns')
one | three | |
---|---|---|
Ohio | 0 | 2 |
Colorado | 4 | 6 |
Utah | 8 | 10 |
New York | 12 | 14 |
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# 标签索引
data.loc['Colorado', ['two', 'three']]
two 5
three 6
Name: Colorado, dtype: int32
# 切片形式:前面表示行所用,后面表示列
data.loc[:'Utah', 'two']
Ohio 1
Colorado 5
Utah 9
Name: two, dtype: int32
# 整数数值索引
data.iloc[2, [3, 0, 1]]
four 11
one 8
two 9
Name: Utah, dtype: int32
data.iloc[[1, 2], [3, 0, 1]]
four | one | two | |
---|---|---|---|
Colorado | 7 | 4 | 5 |
Utah | 11 | 8 | 9 |
ser = pd.Series(np.arange(3.))
ser
0 0.0
1 1.0
2 2.0
dtype: float64
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]
2.0
# 索引不包含末尾
ser[:1]
0 0.0
dtype: float64
ser.loc[:2]
0 0.0
1 1.0
2 2.0
dtype: float64
ser.iloc[:2]
0 0.0
1 1.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data[['three', 'one']]
three | one | |
---|---|---|
Ohio | 2 | 0 |
Colorado | 6 | 4 |
Utah | 10 | 8 |
New York | 14 | 12 |
data[data['three'] > 5]
one | two | three | four | |
---|---|---|---|---|
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data < 5
one | two | three | four | |
---|---|---|---|---|
Ohio | True | True | True | True |
Colorado | True | False | False | False |
Utah | False | False | False | False |
New York | False | False | False | False |
data[data < 5] = 0
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data.loc['Colorado', ['two', 'three']]
two 5
three 6
Name: Colorado, dtype: int32
# 所有行的前三列,再选择大于5的数值
data.iloc[:, :3][data.three > 5]
one | two | three | |
---|---|---|---|
Colorado | 0 | 5 | 6 |
Utah | 8 | 9 | 10 |
New York | 12 | 13 | 14 |
Stay Foolish Stay Hungry