import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# Four integers with no explicit index, so pandas assigns the default 0..3 labels.
obj = pd.Series(data=[4, 7, -5, 3])
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.values
array([ 4, 7, -5, 3], dtype=int64)
obj.index # like range(4)
RangeIndex(start=0, stop=4, step=1)
# Same values with explicit string labels; note that the label 'b' is used twice.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=['b', 'b', 'a', 'c'],
)
obj2
b 4
b 7
a -5
c 3
dtype: int64
obj2.index
Index(['b', 'b', 'a', 'c'], dtype='object')
obj2['a']
-5
obj2[['a', 'b', 'c']]
a -5
b 4
b 7
c 3
dtype: int64
# Neither 'd' nor 'e' is in the index yet, so each assignment appends a new entry.
obj2['d'] = 6
obj2['e'] = 10
obj2
b 4
b 7
a -5
c 3
d 6
e 10
dtype: int64
# 1. Boolean array (mask) indexing: keep only the entries whose value is positive
obj2[obj2 > 0]
b 4
b 7
c 3
d 6
e 10
dtype: int64
# 2. Multiply by a scalar (applied to every element)
obj2 * 2
b 8
b 14
a -10
c 6
d 12
e 20
dtype: int64
# 3. Apply a NumPy function directly to the Series; the index labels are kept
np.exp(obj2)
b 54.598150
b 1096.633158
a 0.006738
c 20.085537
d 403.428793
e 22026.465795
dtype: float64
'b' in obj2
True
'e' in obj2
True
# Population figures keyed by state name.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
# Building a Series from a dict uses the dict keys as the index labels.
obj3 = pd.Series(data=sdata)
obj3
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
# An explicit index selects and orders the dict entries by those labels:
# 'California' has no entry in sdata and becomes NaN, while 'Utah' is left out.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]
obj4 = pd.Series(data=sdata, index=states)
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
# Name the Series itself and its index; both names show up when the Series is displayed.
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
# Column-oriented input: each key becomes a column and each list holds that
# column's values, one per row.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)
frame
| | state | year | pop |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
frame.head(10)
| | state | year | pop |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
frame.tail(3)
| | state | year | pop |
---|---|---|---|
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
pd.DataFrame(data, columns=['year', 'state', 'pop'])
| | year | state | pop |
---|---|---|---|
0 | 2000 | Ohio | 1.5 |
1 | 2001 | Ohio | 1.7 |
2 | 2002 | Ohio | 3.6 |
3 | 2001 | Nevada | 2.4 |
4 | 2002 | Nevada | 2.9 |
5 | 2003 | Nevada | 3.2 |
# 'debt' is not a key of data, so that column is created and filled with NaN.
frame2 = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['year', 'state', 'pop', 'debt'],
)
frame2
| | year | state | pop | debt |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
six | 2003 | Nevada | 3.2 | NaN |
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
frame2.index
Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')
# 1. Select all values of one column, using dict-style key access
frame2['state']
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object
# 2. A column can also be selected like an attribute of the object. This form is
# rarely used and is not recommended here; just be aware that it exists.
# It only works when the column name is a valid Python identifier that does not
# collide with an existing DataFrame attribute or method -- e.g. frame2.pop is
# the DataFrame.pop method, not the 'pop' column.
frame2.state # not recommended
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object
# 3. Select all values of one row when its row label is known
frame2.loc['three']
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
# 4. Select all values of one row when its integer position is known (0-based)
frame2.iloc[4]
year 2002
state Nevada
pop 2.9
debt NaN
Name: five, dtype: object
# Transpose the DataFrame (swap rows and columns), just like a matrix transpose
frame2.T
| | one | two | three | four | five | six |
---|---|---|---|---|---|---|
year | 2000 | 2001 | 2002 | 2001 | 2002 | 2003 |
state | Ohio | Ohio | Ohio | Nevada | Nevada | Nevada |
pop | 1.5 | 1.7 | 3.6 | 2.4 | 2.9 | 3.2 |
debt | NaN | NaN | NaN | NaN | NaN | NaN |