Pandas是面板数据(Panel Data)的简写。它是Python最强大的数据分析和探索工具,因金融数据分析工具而开发,支持类似SQL的数据增删改查,支持时间序列分析,灵活处理缺失数据。
# -*- coding: utf-8 -*-
from pandas import Series
print('用数组生成Series')
obj = Series([4, 7, -5, 3])
print(obj)
print(obj.values)
print(obj.index)
print('指定Series的index')
obj2 = Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
print(obj2)
print(obj2.index)
print(obj2['a'])
obj2['d'] = 6
print(obj2[['c', 'a', 'd']])
print(obj2[obj2 > 0]) # 找出大于0的元素
print('b' in obj2) # 判断索引是否存在
print('e' in obj2)
print('使用字典生成Series')
sdata = {'Ohio':45000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}
obj3 = Series(sdata)
print(obj3)
print('使用字典生成Series,并额外指定index,不匹配部分为NaN。')
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index = states)
print(obj4)
print('Series相加,相同索引部分相加。')
print(obj3 + obj4)
print('指定Series及其索引的名字')
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)
print('替换index')
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)
初始化
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame
print('用字典生成DataFrame,key为列的名字。')
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year':[2000, 2001, 2002, 2001, 2002],
'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
print(DataFrame(data))
print(DataFrame(data, columns = ['year', 'state', 'pop'])) # 指定列顺序
print('指定索引,在列中指定不存在的列,默认数据用NaN。')
frame2 = DataFrame(data,
columns = ['year', 'state', 'pop', 'debt'],
index = ['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2['state'])
print(frame2.year)#访问的不同方式
print(frame2.ix['three'])
frame2['debt'] = 16.5 # 修改一整列
print(frame2)
frame2.debt = np.arange(5) # 用numpy数组修改元素
print(frame2)
print('用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN。')
val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print('赋值给新列')
frame2['eastern'] = (frame2.state == 'Ohio') # 如果state等于Ohio为True
print(frame2)
print(frame2.columns)
print
print('DataFrame转置')
pop = {'Nevada':{2001:2.4, 2002:2.9},
'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
print(frame3)
print(frame3.T)
print('指定索引顺序,以及使用切片初始化数据。')
print(DataFrame(pop, index = [2001, 2002, 2003]))
pdata = {'Ohio':frame3['Ohio'][:-1], 'Nevada':frame3['Nevada'][:2]}
print(DataFrame(pdata))
print('指定索引和列的名称')
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)
print(frame3.values)
print(frame2.values)
import numpy as np
import pandas as pd
import sys
from pandas import Series, DataFrame, Index
print('获取index')
obj = Series(range(3), index = ['a', 'b', 'c'])
index = obj.index
print(index[1:])
try:
index[1] = 'd' # index对象read only
except:
print(sys.exc_info()[0])
print('使用Index对象')
index = Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index = index)
print(obj2)
print(obj2.index is index)
print('判断列和索引是否存在')
pop = {'Nevada':{20001:2.4, 2002:2.9},
'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
print(frame3)
print('Ohio' in frame3.columns)
print('2003' in frame3.index)
index
Index的方法和属性
method1
method2
• 创建一个适应新索引的新对象,该Series的reindex将会根据新索引进行重排。如果某个索引值当前不存在,就引入缺失值 • 对于时间序列这样的有序数据,重新索引时可能需要做一些插值处理。method选项即可达到此目的。
reindex参数
# -*- coding: utf-8 -*-
import numpy as np
from pandas import DataFrame, Series
print('重新指定索引及顺序')
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
print(obj)
obj2 = obj.reindex(['a', 'b', 'd', 'c', 'e'])
print(obj2)
print(obj.reindex(['a', 'b', 'd', 'c', 'e'], fill_value = 0)) # 指定不存在元素的默认值
print('重新指定索引并指定填元素充方法')
obj3 = Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
print(obj3)
#ffill用前一行相同列的数值填充
print(obj3.reindex(range(6), method = 'ffill'))
print('对DataFrame重新指定索引')
frame = DataFrame(np.arange(9).reshape(3, 3),
index = ['a', 'c', 'd'],
columns = ['Ohio', 'Texas', 'California'])
print(frame)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
print
print('重新指定column')
states = ['Texas', 'Utah', 'California']
print(frame.reindex(columns = states))
print('对DataFrame重新指定索引并指定填元素充方法')
print(frame.reindex(index = ['a', 'b', 'c', 'd'],
method = 'ffill',
columns = states))
print(frame.ix[['a', 'b', 'd', 'c'], states])
丢弃某些项 丢弃某条轴上的一个或多个项很简单,只要有一个索引数组或列表即可。由于需要执行一些数据整理和集合逻辑,所以drop方法返回的是一个在指定轴上删除了指定值的新对象
import numpy as np
from pandas import Series, DataFrame
print('Series根据索引删除元素')
obj = Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
print(obj.drop(['d', 'c']))
print('DataFrame删除元素,可指定索引或列。')
data = DataFrame(np.arange(16).reshape((4, 4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'three', 'four'])
print(data)
print(data.drop(['Colorado', 'Ohio']))
print(data.drop('two', axis = 1))
print(data.drop(['two', 'four'], axis = 1))
索引、选取、过滤
DataFrame索引
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame
print('Series的索引,默认数字索引可以工作。')
obj = Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
print(obj['b'])
print(obj[3])
print(obj[[1, 3]])
print(obj[obj < 2])
print('Series的数组切片')
print(obj['b':'c'] ) # 闭区间
obj['b':'c'] = 5
print(obj)
print('DataFrame的索引')
data = DataFrame(np.arange(16).reshape((4, 4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'three', 'four'])
print(data)
print(data['two']) # 打印列
print(data[['three', 'one']])
print(data[:2])
print(data.ix['Colorado', ['two', 'three']]) # 指定索引和列
print(data.ix[['Colorado', 'Utah'], [3, 0, 1]])
print(data.ix[2]) # 打印第2行(从0开始)
print(data.ix[:'Utah', 'two']) # 从开始到Utah,第2列。
print('根据条件选择')
print(data[data.three > 5])
print(data < 5) # 打印True或者False
data[data < 5] = 0
print(data)
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame
print('加法')
s1 = Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
print(s1 + s2)
print('DataFrame加法,索引和列都必须匹配。')
df1 = DataFrame(np.arange(9.).reshape((3, 3)),
columns = list('bcd'),
index = ['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4, 3)),
columns = list('bde'),
index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
print(df1 + df2)
print('数据填充')
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns = list('abcde'))
print(df1)
print(df2)
print(df1.add(df2, fill_value = 0))
print(df1.reindex(columns = df2.columns, fill_value = 0))
print('DataFrame与Series之间的操作')
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr - arr[0])
frame = DataFrame(np.arange(12).reshape((4, 3)),
columns = list('bde'),
index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print(frame)
print(series)
print(frame - series)
series2 = Series(range(3), index = list('bef'))
print(frame + series2)
series3 = frame['d']
print(series3)
print(frame.sub(series3, axis = 0)) # 按列减
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame
print('函数')
frame = DataFrame(np.random.randn(4, 3),
columns = list('bde'),
index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))
print('lambda以及应用')
f = lambda x: x.max() - x.min()
#列的最大值减去最小值
print(frame.apply(f))
#行的最大值减去最小值
print(frame.apply(f, axis = 1))
def f(x):
return Series([x.min(), x.max()], index = ['min', 'max'])
print(frame.apply(f))
print('applymap和map')
_format = lambda x: '%.2f' % x
print(frame.applymap(_format))
print(frame['e'].map(_format))
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame
print('根据索引排序,对于DataFrame可以指定轴。')
obj = Series(range(4), index = ['d', 'a', 'b', 'c'])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)),
index = ['three', 'one'],
columns = list('dabc'))
print(frame.sort_index())
print(frame.sort_index(axis = 1))
print(frame.sort_index(axis = 1, ascending = False)) # 降序
print('根据值排序')
obj = Series([4, 7, -3, 2])
print(obj.sort_values()) # order已淘汰
print('DataFrame指定列排序')
frame = DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by = 'b')) # sort_index(by = ...)已淘汰
print(frame.sort_values(by = ['a', 'b']))
print('rank,求排名的平均位置(从1开始)')
obj = Series([7, -5, 7, 4, 2, 0, 4])
# 对应排名:-5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
#rank 7为(6+7)/2
print(obj.rank())
print(obj.rank(method = 'first')) # 去第一次出现,不求平均值。
print(obj.rank(ascending = False, method = 'max')) # 逆序,并取最大值。所以-5的rank是7.
frame = DataFrame({'b':[4.3, 7, -3, 2],
'a':[0, 1, 0, 1],
'c':[-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis = 1))
print('重复的索引')
obj = Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique) # 判断是非有重复索引
print(obj['a'].ix[0])
print(obj.b.ix[1])
df = DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])
print(df)
print(df.ix['b'].ix[0])
print(df.ix['b'].ix[1])
pandas 对象有一些统计方法。它们大部分都属于约简和汇总统计,用于从 Series 中提取单个值,或从 DataFrame 的行或列中提取一个 Series。 比如 DataFrame.mean(axis=0,skipna=True) 方法,当数据集中存在 NA 值时,这些值会被简单跳过,除非整个切片(行或列)全是 NA,如果不想这样,则可以通过 skipna=False 来禁用此功能:
常用方法选项
常用描述和汇总统计函数1
常用描述和汇总统计函数2
import numpy as np
from pandas import Series, DataFrame
print('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
index = ['a', 'b', 'c', 'd'],
columns = ['one', 'two'])
print(df)
print(df.sum()) # 按列求和
print(df.sum(axis = 1)) # 按行求和
print('平均数')
print(df.mean(axis = 1, skipna = False))
print(df.mean(axis = 1))
print('其它')
print(df.idxmax())
print(df.cumsum())
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
相关系数与协方差
Series排序
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False)
Docstring:
Merge DataFrame objects by performing a database-style join operation by
columns or indexes.
If joining columns on columns, the DataFrame indexes *will be
ignored*. Otherwise if joining indexes on indexes or indexes on a column or
columns, the index will be passed on.
Parameters
----------
left : DataFrame
right : DataFrame
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
* left: use only keys from left frame (SQL: left outer join)
* right: use only keys from right frame (SQL: right outer join)
* outer: use union of keys from both frames (SQL: full outer join)
* inner: use intersection of keys from both frames (SQL: inner join)
on : label or list
Field names to join on. Must be found in both DataFrames. If on is
None and not merging on indexes, then it merges on the intersection of
the columns by default.
left_on : label or list, or array-like
Field names to join on in left DataFrame. Can be a vector or list of
vectors of the length of the DataFrame to use a particular vector as
the join key instead of columns
right_on : label or list, or array-like
Field names to join on in right DataFrame or vector/list of vectors per
left_on docs
left_index : boolean, default False
Use the index from the left DataFrame as the join key(s). If it is a
MultiIndex, the number of keys in the other DataFrame (either the index
or a number of columns) must match the number of levels
right_index : boolean, default False
Use the index from the right DataFrame as the join key. Same caveats as
left_index
sort : boolean, default False
Sort the join keys lexicographically in the result DataFrame
suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
side, respectively
copy : boolean, default True
If False, do not copy data unnecessarily
indicator : boolean or string, default False
If True, adds a column to output DataFrame called "_merge" with
information on the source of each row.
If string, column with information on source of each row will be added to
output DataFrame, and column will be named value of string.
Information column is Categorical-type and takes on a value of "left_only"
for observations whose merge key only appears in 'left' DataFrame,
"right_only" for observations whose merge key only appears in 'right'
DataFrame, and "both" if the observation's merge key is found in both.
.. versionadded:: 0.17.0
Examples
--------
>>> A >>> B
lkey value rkey value
0 foo 1 0 foo 5
1 bar 2 1 bar 6
2 baz 3 2 qux 7
3 foo 4 3 bar 8
>>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
lkey value_x rkey value_y
0 foo 1 foo 5
1 foo 4 foo 5
2 bar 2 bar 6
3 bar 2 bar 8
4 baz 3 NaN NaN
5 NaN NaN qux 7
Returns
-------
merged : DataFrame
The output type will the be same as 'left', if it is a subclass
of DataFrame.
np.concatenate(arr1,arr2)#默认是竖着增加,axis=1时横着增加,即增加列 combine_first,它实现既不是行之间的连接,也不是列之间的连接,它在修正数据,用一个DataFrame来填补前面的DataFrame中NAN的数据 Merge, join, and concatenate官方文档说明:http://pandas.pydata.org/pandas-docs/stable/merging.html
df['A'] = df['A'].apply(str.upper)
查看一列唯一值:df['A'].unique() 查看是否有重复:df['A'].duplicated() 删除重复数据:df.drop_duplicated(['A'])
pd.date_range(start=None, end=None, periods=None, freq='D', tz=None, normalize=False, name=None, closed=None, **kwargs)
Docstring:
Return a fixed frequency datetime index, with day (calendar) as the default
frequency
Parameters
----------
start : string or datetime-like, default None
Left bound for generating dates
end : string or datetime-like, default None
Right bound for generating dates
periods : integer or None, default None
If None, must specify start and end
freq : string or DateOffset, default 'D' (calendar daily)
Frequency strings can have multiples, e.g. '5H'
tz : string or None
Time zone name for returning localized DatetimeIndex, for example
Asia/Hong_Kong
normalize : bool, default False
Normalize start/end dates to midnight before generating date range
name : str, default None
Name of the resulting index
closed : string or None, default None
Make the interval closed with respect to the given frequency to
the 'left', 'right', or both sides (None)
Pandas中的resample,重新采样,是对原样本重新处理的一个方法,是一个对常规时间序列数据重新采样和频率转换的便捷的方法。 参照:http://blog.csdn.net/wangshuang1631/article/details/52314944