Pandas基础操作学习笔记

python与大数据分析
发布于 2022-03-11 13:41:53
9820
发布于 2022-03-11 13:41:53
pandas是Python的一个数据分析包，最初由AQR Capital Management于2008年4月开发，并于2009年底开源，目前由专注于Python数据包开发的PyData开发团队继续开发和维护，属于PyData项目的一部分。Pandas最初是作为金融数据分析工具而开发的，因此为时间序列分析提供了很好的支持。
Pandas的名称来自于面板数据（panel data）和Python数据分析（data analysis）。panel data是经济学中关于多维数据集的一个术语，早期版本的Pandas也提供了Panel数据类型（该类型已在0.25版本中移除）。
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
# Series: a one-dimensional, array-like object made of a sequence of values
# (any NumPy dtype) together with an associated sequence of labels (the
# index). The values alone are enough to build a simple Series.
# DataFrame: a tabular structure holding an ordered set of columns, each of
# which may have a different value type (numeric, string, boolean, ...).
# It has both a row index and a column index and can be viewed as a dict of
# Series.

# Build a Series from a one-dimensional array.
arr = np.arange(1, 5)
ser = Series(data=arr)
print('arr=', arr)  # [1 2 3 4]
print('ser=', ser)
# 0    1
# 1    2
# 2    3
# 3    4
# dtype: int32   (as recorded by the author; the integer width may differ)
print('ser.index=', ser.index)    # RangeIndex(start=0, stop=4, step=1)
print('ser.values=', ser.values)  # [1 2 3 4]
print('ser.dtype=', ser.dtype)    # int32

# Same values again, this time replacing the default integer index with
# string labels.
ser = Series(data=np.arange(1, 5))
# 0    1
# 1    2
# 2    3
# 3    4
# dtype: int32
ser.index = 'one two three four'.split()
# one      1
# two      2
# three    3
# four     4
# dtype: int32
# Build a Series from a dict: the keys become the index, the values the data.
adict = {'2015': 5000, '2016': 4500, '2017': 5500, '2018': 7000, '2019': 6800}
ser = Series(adict)
# ser.index  -> Index(['2015', '2016', '2017', '2018', '2019'], dtype='object')
# ser.values -> [5000 4500 5500 7000 6800]

# NumPy-style operations on a Series.
print(ser['2015'])      # 5000 (lookup by label)
# Positional access goes through .iloc: with a non-integer index, ser[0]
# relies on a positional fallback that pandas has deprecated.
print(ser.iloc[0])      # 5000
print(ser > 5500)       # boolean Series (element-wise comparison)
print(ser[ser > 5500])  # 2018    7000    2019    6800
print(ser / 100)
ser1 = np.array([1, 2, 3, 4])
print(np.exp(ser1))     # [ 2.71828183  7.3890561  20.08553692 54.59815003]
# Detecting missing values in a Series.
scores = Series({'Tom': 80, 'John': 95, 'Merry': 88, 'Tony': 100})
newindex = ['Tom', 'John', 'Joe', 'Tony', 'Merry']
# Re-building the Series against a new index aligns on labels; 'Joe' has no
# score, so that entry becomes NaN and the dtype becomes float64.
scores = Series(scores, index=newindex)
print(scores)

# pd.isnull / pd.notnull each return a boolean Series that flags the
# missing (or present) entries.
missing = pd.isnull(scores)
present = pd.notnull(scores)
print(missing)
print(present)
print(scores[missing])
print(scores[present])
# Tom       80.0
# John      95.0
# Joe        NaN
# Tony     100.0
# Merry     88.0
# dtype: float64
# Automatic alignment of Series.
# Arithmetic between two Series aligns the operands on their index labels;
# a label present in only one operand yields NaN in the result.
product_num = Series(data=[1, 2, 4, 5], index=['p1', 'p2', 'p4', 'p5'])
product_price = Series(
    data=[10, 20, 30, 40, 50],
    index=['p1', 'p2', 'p3', 'p4', 'p5'],
)
product_sale = product_num * product_price
print(product_sale)
# p1     10.0
# p2     40.0
# p3      NaN
# p4    160.0
# p5    250.0
# dtype: float64

# Both a Series and its index carry a `name` attribute.
product_num.name = 'product name'
product_num.index.name = 'product sale num'
print(product_num)
# product sale num
# p1    1
# p2    2
# p4    4
# p5    5
# Name: product name, dtype: int64
# ------------- DataFrame ----------------
# Build a DataFrame from two-dimensional (nested list / array) data.
# Each inner list becomes one row.
df = DataFrame([['Tom', 'John', 'Tony'], [76, 98, 100]])
#      0     1     2
# 0  Tom  John  Tony
# 1   76    98   100
df = DataFrame([['Tom', 76], ['John', 98], ['Tony', 100]])
#       0    1
# 0   Tom   76
# 1  John   98
# 2  Tony  100
df = DataFrame([
    ['Tom', 'John', 'Tony'],
    [76, 98, 100],
    [55, 65, 76],
])
# 0  Tom  John  Tony
# 1   76    98   100
# 2   55    65    76

# Explicit row and column labels can be supplied alongside the data.
arr = np.array([['Tom', 76], ['John', 98], ['Tony', 100]])
df = DataFrame(arr, index=['one', 'two', 'three'], columns=['name', 'score'])
#        name score
# one     Tom    76
# two    John    98
# three  Tony   100

# Build a DataFrame from a dict: every key becomes a column.
data = {
    '2017': ['01', '02', '03', '04'],
    'profits': [50, 20, 60, 100],
    'value': [1000, 500, 900, 2000],
}
df = DataFrame(data)
#   2017  profits  value
# 0   01       50   1000
# 1   02       20    500
# 2   03       60    900
# 3   04      100   2000
print(df.index)    # RangeIndex(start=0, stop=4, step=1)
print(df.columns)  # Index(['2017', 'profits', 'value'], dtype='object')
print(df.values)
# [['01' 50 1000]
#  ['02' 20 500]
#  ['03' 60 900]
#  ['04' 100 2000]]

# The same dict, now with custom row labels.
df = DataFrame(data, index=['one', 'two', 'three', 'four'])
print(df.index)  # Index(['one', 'two', 'three', 'four'], dtype='object')
# Index objects
# Every Series and DataFrame owns index objects, which manage the axis
# labels and other metadata (such as the axis name).
# The index is what lets you read a value from a Series/DataFrame or assign
# a new value at a given position, and it is also what drives automatic
# alignment.

# Reading from a Series through its index.
ser = Series(
    [20001, 20003, 20002, 20004],
    index=['2001', '2003', '2002', '2004'],
)
print(ser)
print(ser['2003'])  # 20003
# Label slices include both end points and follow the stored order of the
# labels, so with this unsorted index only two rows are returned.
print(ser['2002':'2004'])
# 2002    20002
# 2004    20004

ser = Series(
    [20001, 20002, 20003, 20004],
    index=['2001', '2002', '2003', '2004'],
)
print(ser['2002':'2004'])
# 2002    20002
# 2003    20003
# 2004    20004
print(ser['2001':])
print(ser[:'2003'])

# Assigning through the index: one label, then a label slice.
ser['2001'] = 20000
ser['2002':'2003'] = [20012, 20013]
# 2001    20000
# 2002    20012
# 2003    20013
# 2004    20004
# Reading from a DataFrame through its indexes.
# A column can be fetched directly with its column label.
# A row is fetched with .loc (by label) or .iloc (by position); the .ix
# accessor found in older tutorials no longer exists in pandas.
data = {
    '2017': ['01', '02', '03', '04'],
    'profits': [50, 20, 60, 100],
    'value': [1000, 500, 900, 2000],
}
df = DataFrame(data)
print(df)
print(df['2017'])
print(df['profits'])

data = [[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]]
rows = ['row1', 'row2', 'row3']               # row labels
columns = ['col1', 'col2', 'col3', 'col4']    # column labels
df = pd.DataFrame(data, index=rows, columns=columns)
print(df)
print(df.loc['row2'])   # row by label
print(df.iloc[0])       # row by position
print(df['col1'])       # column by label

# NaN cannot be stored in an integer column. Cast to float explicitly before
# assigning instead of relying on implicit upcasting during .loc assignment,
# which newer pandas versions deprecate.
df = df.astype(float)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df.loc['row2'] = np.nan
df['col1'] = np.nan
print(df)
#       col1  col2  col3  col4
# row1   NaN   2.0   3.0   4.0
# row2   NaN   NaN   NaN   NaN
# row3   NaN   8.0   9.0  10.0
# Summarising and computing descriptive statistics
#   - common mathematical and statistical methods
#   - correlation and covariance
#   - unique values, value counts and membership
#
# count            number of non-NA values
# describe         summary statistics for a Series or for each DataFrame column
# min / max        minimum / maximum
# argmin / argmax  integer position of the minimum / maximum (not on DataFrame)
# idxmin / idxmax  index label of the minimum / maximum
# quantile         sample quantile (between 0 and 1)
# sum              total of the values
# mean             arithmetic mean
# median           median (50% quantile)
# mad              mean absolute deviation around the mean
#                  (DataFrame.mad was removed in pandas 2.0; computed by hand below)
# var              sample variance
# std              sample standard deviation
# cumsum           cumulative sum
# cummin / cummax  cumulative minimum / maximum
# cumprod          cumulative product
# pct_change       percentage change between consecutive rows

# First layout: stack the four 1-D arrays as rows -> shape (4, 20).
a = np.arange(1, 21)
b = np.linspace(0, 100, 20)
c = np.logspace(0, 2, 20)
d = np.random.random((20))
e = np.vstack((a, b, c, d))
print('e1=', e)

# Second layout: turn each array into a column vector and stack them side by
# side -> shape (20, 4). This is the layout used for the DataFrame.
a = np.arange(1, 21).reshape(20, 1)
b = np.linspace(0, 100, 20).reshape(20, 1)
c = np.logspace(0, 2, 20).reshape(20, 1)
d = np.random.random((20)).reshape(20, 1)
e = np.hstack((a, b, c, d))
print('e2=', e)  # distinct label so the two dumps can be told apart

colname = np.array(['line', 'linspace', 'logspace', 'random'])
rowname = np.arange(1, 21)
dp = DataFrame(data=e, index=rowname, columns=colname)
print(dp)

# NOTE: the 'random' column changes on every run, so the sample figures for
# it in the comments below are only illustrative.
print('dp.describe()=', dp.describe())
#        line    linspace    logspace     random
# count  20.00000   20.000000   20.000000  20.000000
# mean   10.50000   50.000000   23.047581   0.490908
# std     5.91608   31.137262   28.475247   0.216418
# min     1.00000    0.000000    1.000000   0.197207
# 25%     5.75000   25.000000    3.179026   0.298534
# 50%    10.50000   50.000000   10.073523   0.481251
# 75%    15.25000   75.000000   31.804361   0.657586
# max    20.00000  100.000000  100.000000   0.935095
print('dp.sum()=', dp.sum())
# line         210.000000
# linspace    1000.000000
# logspace     460.951618
# random         9.818161
print('dp.min()=', dp.min())
# line        1.000000
# linspace    0.000000
# logspace    1.000000
# random      0.197207
print('dp.max()=', dp.max())
# line         20.000000
# linspace    100.000000
# logspace    100.000000
# random        0.935095
print('dp.count()=', dp.count())
# line        20
# linspace    20
# logspace    20
# random      20
print('dp.mean()=', dp.mean())
# line         10.500000
# linspace    50.000000
# logspace    23.047581
# random       0.490908
print('dp.median()=', dp.median())
# line         10.500000
# linspace    50.000000
# logspace    10.073523
# random       0.481251

# DataFrame.mad() no longer exists (removed in pandas 2.0); the mean absolute
# deviation is the mean of the absolute distances from the column mean.
mean_abs_dev = (dp - dp.mean()).abs().mean()
print('dp.mad()=', mean_abs_dev)
# line         5.000000
# linspace    26.315789
# logspace    21.810469
# random       0.186213
print('dp.var()=', dp.var())
# line         35.000000
# linspace    969.529086
# logspace    810.839685
# random        0.046837
print('dp.std()=', dp.std())
# line         5.916080
# linspace    31.137262
# logspace    28.475247
# random       0.216418
print('dp.idxmin()=', dp.idxmin())
# line         1
# linspace     1
# logspace     1
# random      14
print('dp.idxmax()=', dp.idxmax())
# line        20
# linspace    20
# logspace    20
# random      12
print('dp.quantile()=', dp.quantile())  # default q=0.5, i.e. the median
# line        10.500000
# linspace    50.000000
# logspace    10.073523
# random       0.377218
print('dp.cumsum()=', dp.cumsum())    # running sum down each column
print('dp.cummin()=', dp.cummin())    # running minimum down each column
print('dp.cummax()=', dp.cummax())    # running maximum down each column
print('dp.cumprod()=', dp.cumprod())  # running product down each column

# DataFrame has no argmin/argmax (calling them raises AttributeError);
# use idxmin/idxmax as shown above.
# The percentage-change method is spelled pct_change (all lower case);
# 'Pct_change' raises AttributeError.
print('dp.pct_change()=', dp.pct_change())

# axis=1 aggregates across the columns, giving one value per row.
print('dp.sum(axis=1)=', dp.sum(axis=1))
print('dp.min(axis=1)=', dp.min(axis=1))
print('dp.max(axis=1)=', dp.max(axis=1))
print('dp.count(axis=1)=', dp.count(axis=1))
print('dp.mean(axis=1)=', dp.mean(axis=1))
# 1      0.520017
# 2      2.250554
# 3      3.817623
# ...
# 19    48.202577
# 20    55.018834

# Correlation and covariance
# corr(): pairwise correlation coefficients between columns
print('dp.corr()=', dp.corr())
#            line  linspace  logspace    random
# line      1.000000  1.000000  0.865835  0.049066
# linspace  1.000000  1.000000  0.865835  0.049066
# logspace  0.865835  0.865835  1.000000 -0.184504
# random    0.049066  0.049066 -0.184504  1.000000
# cov(): pairwise covariance between columns
print('dp.cov()=', dp.cov())
#            line    linspace    logspace    random
# line       35.000000  184.210526  145.860073  0.072751
# linspace  184.210526  969.529086  767.684593  0.382899
# logspace  145.860073  767.684593  810.839685 -1.316742
# random      0.072751    0.382899   -1.316742  0.062814

# The same statistics for a single pair of columns (Series vs Series).
print('dp["line"].corr(dp["linspace"])=', dp["line"].corr(dp["linspace"]))  # 0.9999999999999998
print('dp["linspace"].corr(dp["logspace"])=', dp["linspace"].corr(dp["logspace"]))  # 0.8658345373066839
print('dp["line"].cov(dp["linspace"])=', dp["line"].cov(dp["linspace"]))  # 184.21052631578945
print('dp["linspace"].cov(dp["logspace"])=', dp["linspace"].cov(dp["logspace"]))  # 767.684593356434
# Unique values, value counts and membership
# unique():       array of the distinct values in a Series
# value_counts(): how often each value occurs in a Series
# isin():         vectorised membership test; handy for selecting a subset
#                 of a Series or of a DataFrame column
a = np.array(['a', 'b', 'b', 'c', 'c', 'c', 'd', 'd'])
ser = Series(a)
print(ser.unique())  # ['a' 'b' 'c' 'd']
print(ser.value_counts())
# c     3
# b     2
# d     2
# a     1

# Keep only the entries whose value is 'b' or 'c'.
mask = ser.isin(['b', 'c'])
print(mask)
print(ser[mask])
# 1    b
# 2    b
# 3    c
# 4    c
# 5    c
print(ser.value_counts(ascending=False))

# unique() works the same way on a single DataFrame column.
a = {
    'orderid': ['1001', '1002', '1003', '1004'],
    'orderAmt': [1, 2, 3, 4],
    'memberid': ['a1001', 'b1002', 'a1001', 'a1001'],
}
df = DataFrame(a)
print(df['memberid'].unique())  # ['a1001' 'b1002']
# Handling missing data
# Detecting missing values
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
df = DataFrame(
    [
        ['Tom', np.nan, 456, 'M'],
        ['Merry', 34, 4567, np.nan],
        ['John', 23, np.nan, 'M'],
        ['Joe', 18, 342, 'F'],
    ],
    columns=['name', 'age', 'salary', 'gender'],
)
print(df)
#     name   age  salary gender
# 0    Tom   NaN   456.0      M
# 1  Merry  34.0  4567.0    NaN
# 2   John  23.0     NaN      M
# 3    Joe  18.0   342.0      F
print(df.isnull())
#     name    age  salary  gender
# 0  False   True   False   False
# 1  False  False   False    True
# 2  False  False    True   False
# 3  False  False   False   False
print(df.notnull())
#    name    age  salary  gender
# 0  True  False    True    True
# 1  True   True    True   False
# 2  True   True   False    True
# 3  True   True    True    True

# Filtering out missing data
ser = Series([1, 2, 3, 4, np.nan, 5])
print(ser.dropna())
print(df.dropna())  # default: drop every row that contains any missing value
#   name   age  salary gender
# 3  Joe  18.0   342.0      F
print(df.dropna(how='all'))  # drop only rows whose values are all missing
#     name   age  salary gender
# 0    Tom   NaN   456.0      M
# 1  Merry  34.0  4567.0    NaN
# 2   John  23.0     NaN      M
# 3    Joe  18.0   342.0      F
df['gender'] = np.nan
print(df.dropna(axis=1, how='all'))  # axis=1: drop columns that are entirely missing
#     name   age  salary
# 0    Tom   NaN   456.0
# 1  Merry  34.0  4567.0
# 2   John  23.0     NaN
# 3    Joe  18.0   342.0

# Filling in missing data
print(df.fillna(0))
#     name   age  salary  gender
# 0    Tom   0.0   456.0     0.0
# 1  Merry  34.0  4567.0     0.0
# 2   John  23.0     0.0     0.0
# 3    Joe  18.0   342.0     0.0

# Summary of the NA-handling methods
# dropna   filter (delete) axis labels depending on whether their values
#          contain missing data; a threshold controls how much missing data
#          is tolerated
# fillna   fill missing data with a given value or by a fill method such as
#          ffill or bfill
# isnull   return a same-shaped boolean object marking which values are NA
# notnull  the negation of isnull
# Hierarchical indexing
# An axis may carry several (two or more) index levels.
# This lets pandas handle higher-dimensional data in a lower-dimensional
# form and aggregate the data level by level.

# Hierarchical index on a Series: a list of label lists builds a MultiIndex.
data = Series(
    [100, 100, 2000, 300, 400],
    index=[
        ['2001', '2001', '2001', '2002', '2002'],
        ['Apple', 'Banana', 'xigua', 'apple', 'xigua'],
    ],
)
print(data)
# 2001  Apple      100
#       Banana     100
#       xigua     2000
# 2002  apple      300
#       xigua      400
# dtype: int64
data.index.names = ['年份', '水果类别']
print(data)
# 年份    水果类别
# 2001  Apple      100
#       Banana     100
#       xigua     2000
# 2002  apple      300
#       xigua      400
# dtype: int64

# Hierarchical index on a DataFrame: promote columns to index levels.
data = {
    'year': [2001, 2001, 2002, 2002, 2003],
    'fruit': ['apple', 'banana', 'apple', 'banana', 'apple'],
    'production': [234, 344, 333, 444, 222],
    'profit': [23, 23, 33, 22, 11],
}
df = DataFrame(data)
print(df)
#    year   fruit  production  profit
# 0  2001   apple         234      23
# 1  2001  banana         344      23
# 2  2002   apple         333      33
# 3  2002  banana         444      22
# 4  2003   apple         222      11
print(df.set_index(['year', 'fruit']))
#              production  profit
# year fruit
# 2001 apple          234      23
#      banana         344      23
# 2002 apple          333      33
#      banana         444      22
# 2003 apple          222      11
ndf = df.set_index(['year', 'fruit'])
print(ndf.index)
# The repr in recent pandas versions looks like:
# MultiIndex([(2001,  'apple'),
#             (2001, 'banana'),
#             (2002,  'apple'),
#             (2002, 'banana'),
#             (2003,  'apple')],
#            names=['year', 'fruit'])

# Aggregate per index level. DataFrame.sum(level=...) was removed in
# pandas 2.0; grouping by the level and summing gives the same result.
yearly = ndf.groupby(level='year').sum()
print(yearly)
#       production  profit
# year
# 2001         578      46
# 2002         777      55
# 2003         222      11
本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。
原始发表：2019-09-03，如有侵权请联系 cloudcommunity@tencent.com 删除
数据分析