Python Data Analysis Library 或 pandas 是基于NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。Pandas 纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具。pandas提供了大量能使我们快速便捷地处理数据的函数和方法。你很快就会发现,它是使Python成为强大而高效的数据分析环境的重要因素之一。 本文介绍了Pandas的基础用法
Github: https://github.com/yingzk/pandas_learning 本文PDF下载:https://www.yingjoy.cn/downloads/pandas_pdf.zip
# 导入numpy库并重命名为np
import numpy as np
# 导入pandas 库并重命名为pd
import pandas as pd
pd.read_csv(filename) # 从csv导入
pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据
pd.read_excel(filename) # 从excel导入
pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据
pd.read_json(json_string) # 导入JSON格式的字符,URL地址或者文件中的数据
pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据
pd.read_clipboard() # 导入系统粘贴板里面的数据
pd.DataFrame(dict) # 导入Python字典 (dict) 里面的数据,其中key是数据框的表头,value是数据框的内容。
df.to_csv(filename) # 将数据框 (DataFrame)中的数据导入csv格式的文件中
df.to_excel(filename) # 将数据框 (DataFrame)中的数据导入Excel格式的文件中
df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据导入SQL数据表/数据库中
df.to_json(filename) # 将数据框 (DataFrame)中的数据导入JSON格式的文件中
pd.DataFrame(np.random.rand(5, 10)) # 创建一个5列10行的由随机浮点数组成的数据框 DataFrame
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.016860 | 0.855994 | 0.992872 | 0.652278 | 0.517510 | 0.742986 | 0.452981 | 0.568701 | 0.795436 | 0.622609 |
1 | 0.476801 | 0.190823 | 0.450436 | 0.912401 | 0.335651 | 0.197766 | 0.042523 | 0.580323 | 0.498982 | 0.473128 |
2 | 0.029820 | 0.886500 | 0.902864 | 0.465084 | 0.380933 | 0.033583 | 0.928827 | 0.501687 | 0.857512 | 0.671840 |
3 | 0.897254 | 0.413717 | 0.991061 | 0.393033 | 0.388630 | 0.661025 | 0.635417 | 0.695609 | 0.305378 | 0.147508 |
4 | 0.573882 | 0.786888 | 0.177782 | 0.864474 | 0.594416 | 0.765678 | 0.217279 | 0.446570 | 0.930604 | 0.686823 |
pd.Series(my_list) # 从一个可迭代的对象 my_list 中创建一个数据组
my_list = ['abc',123,'HelloWorld', 5.7]
pd.Series(my_list)
0 abc
1 123
2 HelloWorld
3 5.7
dtype: object
df = pd.DataFrame(np.random.rand(10, 5))
df.index = pd.date_range('2017/1/1', periods=df.shape[0])
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
2017-01-01 | 0.011762 | 0.634116 | 0.045220 | 0.452117 | 0.879969 |
2017-01-02 | 0.802262 | 0.661908 | 0.214822 | 0.444259 | 0.200370 |
2017-01-03 | 0.301050 | 0.004534 | 0.881042 | 0.825632 | 0.331118 |
2017-01-04 | 0.095324 | 0.916430 | 0.177795 | 0.191502 | 0.546973 |
2017-01-05 | 0.482868 | 0.953719 | 0.615461 | 0.868984 | 0.639286 |
2017-01-06 | 0.958404 | 0.155357 | 0.293012 | 0.115218 | 0.177846 |
2017-01-07 | 0.915488 | 0.486922 | 0.440474 | 0.584764 | 0.271243 |
2017-01-08 | 0.480413 | 0.600622 | 0.325212 | 0.532259 | 0.687718 |
2017-01-09 | 0.859887 | 0.236677 | 0.635073 | 0.811840 | 0.497289 |
2017-01-10 | 0.024623 | 0.635122 | 0.346393 | 0.860260 | 0.325502 |
df.head(n) # 查看前n行的数据
df.tail(n) # 查看后n行的数据
df = pd.DataFrame(np.random.rand(10, 5))
df.head(3)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 0.996381 | 0.440502 | 0.583701 | 0.120444 | 0.241775 |
1 | 0.126877 | 0.646841 | 0.740163 | 0.764182 | 0.810129 |
2 | 0.254386 | 0.451341 | 0.288513 | 0.515995 | 0.146529 |
df = pd.DataFrame(np.random.rand(10, 5))
df.tail(3)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
7 | 0.466316 | 0.747013 | 0.568442 | 0.562552 | 0.949529 |
8 | 0.243633 | 0.605133 | 0.114011 | 0.898604 | 0.024648 |
9 | 0.155605 | 0.799580 | 0.160883 | 0.986743 | 0.446114 |
df.shape # 查看数据的形状(行和宽)
df.info() # 查看数据的索引、数据类型及内存信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
0 10 non-null float64
1 10 non-null float64
2 10 non-null float64
3 10 non-null float64
4 10 non-null float64
dtypes: float64(5)
memory usage: 480.0 bytes
df.describe() # 对于数据类型为数值型的列,查询其描述性统计的内容
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 0.432296 | 0.601318 | 0.444123 | 0.543053 | 0.413944 |
std | 0.268632 | 0.216483 | 0.298076 | 0.284759 | 0.345554 |
min | 0.155605 | 0.252712 | 0.114011 | 0.126172 | 0.024648 |
25% | 0.245740 | 0.461876 | 0.191090 | 0.295224 | 0.094655 |
50% | 0.353927 | 0.609304 | 0.401004 | 0.565072 | 0.389053 |
75% | 0.559964 | 0.762463 | 0.556721 | 0.693000 | 0.668621 |
max | 0.985457 | 0.932679 | 0.990827 | 0.986743 | 0.949529 |
s.value_counts(dropna=False) # 查询每个独特数据值出现次数统计
s = pd.Series([1,2,3,3,4,np.nan,5,5,5,6,7])
s.value_counts(dropna=False)
5.0 3
3.0 2
7.0 1
6.0 1
NaN 1
4.0 1
2.0 1
1.0 1
dtype: int64
s = s.apply(lambda x: x+1)
s
0 3.0
1 4.0
2 5.0
3 5.0
4 6.0
5 NaN
6 7.0
7 7.0
8 7.0
9 8.0
10 9.0
dtype: float64
df.apply(pd.Series.value_counts) # 查询数据框 (Data Frame) 中每个列的独特数据值出现次数统计
df[col] # 以数组 Series 的形式返回选取的列
df = pd.DataFrame(np.random.rand(5, 5), columns=list('ABCDE'))
df['C']
0 0.452717
1 0.407755
2 0.549391
3 0.759433
4 0.153871
Name: C, dtype: float64
df[[col1, col2]] # 选择多列
df = pd.DataFrame(np.random.rand(5, 5), columns=list('ABCDE'))
df[['C', 'D']]
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
C | D | |
---|---|---|
0 | 0.431885 | 0.304796 |
1 | 0.028960 | 0.187738 |
2 | 0.176520 | 0.102980 |
3 | 0.370277 | 0.098031 |
4 | 0.247122 | 0.345735 |
s.iloc[0] # 按位置选取
s = pd.Series(np.array(['I', 'Love', 'China']))
s.iloc[0]
'I'
s.loc['index_one'] # 按索引选取
s = pd.Series(np.array(['I', 'Love', 'China']))
s.loc[0]
'I'
df.DataFrame[n, :] #选取第n行
df = pd.DataFrame(np.array([['I', 'Love', 'China'], ['I', 'Love', 'Data']]))
df.iloc[1, :]
0 I
1 Love
2 Data
Name: 1, dtype: object
df.iloc[0, 0] # 选取第一个元素
df = pd.DataFrame(np.random.rand(5, 5))
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 0.786709 | 0.405902 | 0.151383 | 0.384778 | 0.871664 |
1 | 0.491006 | 0.774710 | 0.388011 | 0.758102 | 0.762115 |
2 | 0.085647 | 0.543243 | 0.582565 | 0.664243 | 0.379896 |
3 | 0.806211 | 0.794284 | 0.968755 | 0.883923 | 0.354820 |
4 | 0.463902 | 0.481756 | 0.131181 | 0.590878 | 0.801769 |
df.iloc[0, 0]
0.78670886755075187
df.columns = ['a', 'b'] # 对列名重新命名
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
'C':'foo'})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | |
---|---|---|---|
0 | 1.0 | NaN | foo |
1 | NaN | 4.0 | foo |
2 | 2.0 | NaN | foo |
3 | 3.0 | 5.0 | foo |
4 | 6.0 | 9.0 | foo |
5 | NaN | NaN | foo |
df.columns = ['a', 'b', 'c']
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
a | b | c | |
---|---|---|---|
0 | 1.0 | NaN | foo |
1 | NaN | 4.0 | foo |
2 | 2.0 | NaN | foo |
3 | 3.0 | 5.0 | foo |
4 | 6.0 | 9.0 | foo |
5 | NaN | NaN | foo |
pd.isnull() # 检查数据中出现空值的情况, 返回一个布尔型的列
pd.notnull() #相对应isnull 返回不是空值的情况
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
'C':'foo'})
df.isnull()
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | |
---|---|---|---|
0 | False | True | False |
1 | True | False | False |
2 | False | True | False |
3 | False | False | False |
4 | False | False | False |
5 | True | True | False |
df.isnull().sum() # 对每一列的空值进行统计
A 2
B 3
C 0
dtype: int64
df.dropna(axis = 0, thresh=n) # 删除包含缺失值的行 axis = 1时删除列 # thresh = n移除空值超过(包括等于)n的行
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | |
---|---|---|---|
0 | 1.0 | NaN | foo |
1 | NaN | 4.0 | foo |
2 | 2.0 | NaN | foo |
3 | 3.0 | 5.0 | foo |
4 | 6.0 | 9.0 | foo |
5 | NaN | NaN | foo |
df.dropna(axis = 0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | |
---|---|---|---|
3 | 3.0 | 5.0 | foo |
4 | 6.0 | 9.0 | foo |
df.dropna(axis = 1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
C | |
---|---|
0 | foo |
1 | foo |
2 | foo |
3 | foo |
4 | foo |
5 | foo |
df.dropna(axis = 0, thresh = 2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | |
---|---|---|---|
0 | 1.0 | NaN | foo |
1 | NaN | 4.0 | foo |
2 | 2.0 | NaN | foo |
3 | 3.0 | 5.0 | foo |
4 | 6.0 | 9.0 | foo |
df.fillna(df.mean()) # 用平均值来填充空值
s = pd.Series([1,3,5,np.nan,7,9,9])
s.fillna(s.mean())
0 1.000000
1 3.000000
2 5.000000
3 5.666667
4 7.000000
5 9.000000
6 9.000000
dtype: float64
s.astype(type) # 转换列的类型
s = pd.Series([1,3,5,np.nan,7,9,9])
s.fillna(s.mean()).astype(int)
0 1
1 3
2 5
3 5
4 7
5 9
6 9
dtype: int32
s.replace(1, 'one') # 将Series中的1替换为one
s = pd.Series([1,3,5,np.nan,7,9,9])
s.replace(1,'one')
0 one
1 3
2 5
3 NaN
4 7
5 9
6 9
dtype: object
s.replace([1,3],['one','three']) # 将数组(Series)中所有的1替换为'one', 所有的3替换为'three'
s = pd.Series([1,3,5,np.nan,7,9,9])
s.replace([1,3],['one','three'])
0 one
1 three
2 5
3 NaN
4 7
5 9
6 9
dtype: object
df.rename(columns=lambda x: x + 2) # 将全体列重命名
df = pd.DataFrame(np.random.rand(4,4))
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.669102 | 0.548996 | 0.512802 | 0.449220 |
1 | 0.108840 | 0.974720 | 0.665050 | 0.271009 |
2 | 0.146804 | 0.060744 | 0.637770 | 0.383380 |
3 | 0.108163 | 0.893999 | 0.216907 | 0.730504 |
df.rename(columns=lambda x: x+ 2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
2 | 3 | 4 | 5 | |
---|---|---|---|---|
0 | 0.669102 | 0.548996 | 0.512802 | 0.449220 |
1 | 0.108840 | 0.974720 | 0.665050 | 0.271009 |
2 | 0.146804 | 0.060744 | 0.637770 | 0.383380 |
3 | 0.108163 | 0.893999 | 0.216907 | 0.730504 |
df.rename(columns={'old_name': 'new_ name'}) # 将选择的列重命名
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.rename(columns={'A':'New A', 'B':'New B'})
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
New A | New B | C | D | E | |
---|---|---|---|---|---|
0 | 0.680941 | 0.766561 | 0.486226 | 0.301537 | 0.289970 |
1 | 0.917036 | 0.100054 | 0.464342 | 0.181454 | 0.933591 |
2 | 0.253549 | 0.766181 | 0.085607 | 0.969627 | 0.630674 |
3 | 0.377840 | 0.909920 | 0.214338 | 0.011844 | 0.392257 |
4 | 0.608564 | 0.587614 | 0.039867 | 0.630492 | 0.402101 |
5 | 0.361074 | 0.937618 | 0.787055 | 0.054157 | 0.300325 |
6 | 0.605472 | 0.608429 | 0.052152 | 0.669343 | 0.745648 |
7 | 0.660738 | 0.158713 | 0.352756 | 0.028325 | 0.195899 |
8 | 0.855695 | 0.578177 | 0.447043 | 0.093923 | 0.316234 |
9 | 0.337392 | 0.645260 | 0.140221 | 0.616652 | 0.727144 |
df.set_index('column_one') # 改变索引
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.set_index('B')
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | C | D | E | |
---|---|---|---|---|
B | ||||
0.824403 | 0.013092 | 0.055626 | 0.895268 | 0.837350 |
0.310559 | 0.689064 | 0.541375 | 0.275461 | 0.808554 |
0.533495 | 0.072835 | 0.563758 | 0.695029 | 0.524957 |
0.541211 | 0.820817 | 0.591130 | 0.268978 | 0.546996 |
0.286587 | 0.936692 | 0.343227 | 0.383610 | 0.811302 |
0.878391 | 0.938883 | 0.636148 | 0.776493 | 0.025840 |
0.156482 | 0.918591 | 0.030869 | 0.235020 | 0.096212 |
0.857049 | 0.613991 | 0.810541 | 0.917927 | 0.921329 |
0.713271 | 0.949683 | 0.811386 | 0.920452 | 0.213173 |
0.686945 | 0.522276 | 0.881299 | 0.936260 | 0.030993 |
df.rename(index = lambda x: x+ 1) # 改变全体索引
df = pd.DataFrame(np.random.rand(10,5))
df.rename(index = lambda x: x+ 1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
1 | 0.382337 | 0.185501 | 0.457958 | 0.009713 | 0.628963 |
2 | 0.024175 | 0.223274 | 0.698171 | 0.071715 | 0.063272 |
3 | 0.913995 | 0.713092 | 0.269621 | 0.575365 | 0.805266 |
4 | 0.612708 | 0.220953 | 0.090858 | 0.425472 | 0.018996 |
5 | 0.045363 | 0.153343 | 0.730828 | 0.323554 | 0.364821 |
6 | 0.462096 | 0.614072 | 0.993130 | 0.988894 | 0.788648 |
7 | 0.887381 | 0.802119 | 0.191248 | 0.980064 | 0.628450 |
8 | 0.138270 | 0.922870 | 0.250827 | 0.297472 | 0.289915 |
9 | 0.258687 | 0.807993 | 0.930009 | 0.811335 | 0.609763 |
10 | 0.588020 | 0.392127 | 0.590799 | 0.923180 | 0.722801 |
df[df[col] > 0.5] # 选取数据df中对应行的数值大于0.5的全部列 支持逻辑运算
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 0.051900 | 0.548808 | 0.744936 | 0.848002 | 0.299505 |
1 | 0.979053 | 0.216078 | 0.394286 | 0.520654 | 0.584194 |
2 | 0.185679 | 0.453151 | 0.839947 | 0.730177 | 0.392377 |
3 | 0.161267 | 0.981833 | 0.890858 | 0.613972 | 0.467528 |
4 | 0.091140 | 0.369805 | 0.600035 | 0.372857 | 0.897063 |
5 | 0.612195 | 0.981150 | 0.578304 | 0.220064 | 0.488182 |
6 | 0.898736 | 0.626289 | 0.788306 | 0.747086 | 0.386097 |
7 | 0.568531 | 0.362593 | 0.644950 | 0.510410 | 0.092556 |
8 | 0.872898 | 0.771917 | 0.853365 | 0.227531 | 0.045184 |
9 | 0.898296 | 0.683850 | 0.138142 | 0.956854 | 0.335476 |
df[df['A'] > 0.5]
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
1 | 0.979053 | 0.216078 | 0.394286 | 0.520654 | 0.584194 |
5 | 0.612195 | 0.981150 | 0.578304 | 0.220064 | 0.488182 |
6 | 0.898736 | 0.626289 | 0.788306 | 0.747086 | 0.386097 |
7 | 0.568531 | 0.362593 | 0.644950 | 0.510410 | 0.092556 |
8 | 0.872898 | 0.771917 | 0.853365 | 0.227531 | 0.045184 |
9 | 0.898296 | 0.683850 | 0.138142 | 0.956854 | 0.335476 |
df[(df['A'] > 0.5) & (df['B'] < 0.7)]
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
1 | 0.979053 | 0.216078 | 0.394286 | 0.520654 | 0.584194 |
6 | 0.898736 | 0.626289 | 0.788306 | 0.747086 | 0.386097 |
7 | 0.568531 | 0.362593 | 0.644950 | 0.510410 | 0.092556 |
9 | 0.898296 | 0.683850 | 0.138142 | 0.956854 | 0.335476 |
df.sort_values(col, ascending=True) #按照列进行排序 # ascending: True 升序 False 降序
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values('A', ascending=True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
3 | 0.015834 | 0.758417 | 0.123415 | 0.802403 | 0.782450 |
1 | 0.068046 | 0.373240 | 0.414358 | 0.105285 | 0.759001 |
7 | 0.134238 | 0.104416 | 0.551595 | 0.472277 | 0.015997 |
4 | 0.236628 | 0.391852 | 0.390275 | 0.904988 | 0.650108 |
8 | 0.469382 | 0.426359 | 0.137109 | 0.253183 | 0.894667 |
2 | 0.508937 | 0.443894 | 0.147076 | 0.149885 | 0.434802 |
0 | 0.572640 | 0.369032 | 0.412343 | 0.402019 | 0.445365 |
9 | 0.663964 | 0.533604 | 0.217605 | 0.602667 | 0.637232 |
6 | 0.765109 | 0.646277 | 0.885381 | 0.743307 | 0.649711 |
5 | 0.962494 | 0.650830 | 0.754514 | 0.578115 | 0.659846 |
df.sort_values([col1,col2],ascending=[True,False]) # 按照数据框的列col1升序,col2降序的方式对数据框df做排序
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values(['A','E'],ascending=[True,False])
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
1 | 0.083526 | 0.969267 | 0.012550 | 0.672851 | 0.866501 |
7 | 0.107839 | 0.383900 | 0.982337 | 0.390914 | 0.308559 |
0 | 0.148742 | 0.306888 | 0.853949 | 0.144650 | 0.414474 |
4 | 0.190011 | 0.794060 | 0.514756 | 0.272207 | 0.086894 |
6 | 0.423982 | 0.229873 | 0.992318 | 0.495706 | 0.971735 |
9 | 0.532263 | 0.106900 | 0.528114 | 0.456583 | 0.362642 |
2 | 0.619678 | 0.800373 | 0.927766 | 0.742667 | 0.645809 |
3 | 0.815312 | 0.920682 | 0.833351 | 0.266840 | 0.132698 |
5 | 0.842293 | 0.049499 | 0.780198 | 0.343752 | 0.341800 |
8 | 0.932379 | 0.398721 | 0.080358 | 0.200681 | 0.549237 |
df.groupby(col) # 按照某列对数据框df做分组 # 常与count进行连用,统计出各词的个数
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | small | 1 |
1 | foo | one | medium | 2 |
2 | foo | two | large | 2 |
3 | foo | two | large | 3 |
4 | bar | three | small | 3 |
5 | bar | three | small | 5 |
df.groupby('B').count()
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | C | D | |
---|---|---|---|
B | |||
one | 2 | 2 | 2 |
three | 2 | 2 | 2 |
two | 2 | 2 | 2 |
df.groupby([col1,col2]) # 按照列col1和col2对数据框df做分组
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.groupby(['A', 'B']).sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
D | ||
---|---|---|
A | B | |
bar | three | 8 |
foo | one | 3 |
two | 5 |
df.groupby(col1)[col2].mean() # 按照列col1对数据框df做分组处理后,返回对应的col2的平均值
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.groupby('A')['D'].mean()
A
bar 4
foo 2
Name: D, dtype: int32
df.pivot_table(index=col1,values=[col2,col3],aggfunc=mean) # 做透视表,索引为col1,针对的数值列为col2和col3,分组函数为平均值
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.pivot_table(df, index=['A', 'B'], columns=['C'], aggfunc=np.sum)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead tr th { text-align: left; }</p> <p> .dataframe thead tr:last-of-type th { text-align: right; }
D | ||||
---|---|---|---|---|
C | large | medium | small | |
A | B | |||
bar | three | NaN | NaN | 8.0 |
foo | one | NaN | 2.0 | 1.0 |
two | 5.0 | NaN | NaN |
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
'B':np.array(['one','one','two','two','three','three']),
'C':np.array(['small','medium','large','large','small','small']),
'D':np.array([1,2,2,3,3,5])})
df.groupby('A').agg(np.mean)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
D | |
---|---|
A | |
bar | 4 |
foo | 2 |
df.apply(np.mean, axis=0) # 对数据框df的每一列求平均值 axis: 0对列名(横着的)进行处理 1对索引(竖着的)进行处理
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 0.800902 | 0.933676 | 0.461338 | 0.353398 | 0.057885 |
1 | 0.988580 | 0.681318 | 0.533361 | 0.486016 | 0.220004 |
2 | 0.695034 | 0.643920 | 0.694040 | 0.280063 | 0.641867 |
3 | 0.925290 | 0.084906 | 0.120247 | 0.880991 | 0.399596 |
4 | 0.697742 | 0.372860 | 0.881456 | 0.565627 | 0.272549 |
5 | 0.614245 | 0.658123 | 0.797487 | 0.609511 | 0.544633 |
6 | 0.153517 | 0.354870 | 0.910838 | 0.416895 | 0.098821 |
7 | 0.088223 | 0.501401 | 0.702754 | 0.334938 | 0.182708 |
8 | 0.737348 | 0.569340 | 0.291342 | 0.847058 | 0.193331 |
9 | 0.083915 | 0.396210 | 0.589415 | 0.806525 | 0.598841 |
df.apply(np.mean, axis=0)
A 0.578480
B 0.519662
C 0.598228
D 0.558102
E 0.321024
dtype: float64
df1.append(df2) # 在数据框df2的末尾添加数据框df1,其中df1和df2的列数应该相等 列合并
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df1.append(df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
pd.concat([df1, df2], axis=1) # 在数据框df1的列最后添加数据框df2,其中df1和df2的行数应该相等 # 中括号可以换成圆括号 # axis: 0进行行合并 1进行列合并
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
pd.concat([df1, df2], axis=0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
'F': ['B4', 'B5', 'B6', 'B7'],
'G': ['C4', 'C5', 'C6', 'C7'],
'H': ['D4', 'D5', 'D6', 'D7']},
index=[0, 1, 2, 3])
pd.concat((df1, df2), axis=1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | F | G | H | |
---|---|---|---|---|---|---|---|---|
0 | A0 | B0 | C0 | D0 | A4 | B4 | C4 | D4 |
1 | A1 | B1 | C1 | D1 | A5 | B5 | C5 | D5 |
2 | A2 | B2 | C2 | D2 | A6 | B6 | C6 | D6 |
3 | A3 | B3 | C3 | D3 | A7 | B7 | C7 | D7 |
df1.join(df2,on=col1,how='inner') # 对数据框df1和df2做内连接,其中连接的列为col1
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
df2 = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
df1.join(df2, on='key', how='inner')
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | key | C | D | |
---|---|---|---|---|---|
0 | A0 | B0 | K0 | C0 | D0 |
2 | A2 | B2 | K0 | C0 | D0 |
1 | A1 | B1 | K1 | C1 | D1 |
3 | A3 | B3 | K1 | C1 | D1 |
df.mean() # 得到数据框df中每一列的平均值
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.mean()
A 0.362158
B 0.432248
C 0.554478
D 0.331155
E 0.438283
dtype: float64
df.corr() # 得到数据框df中每一列与其他列的相关系数
df.corr()
.dataframe tbody tr th:only-of-type { vertical-align: middle; }</p> <p> .dataframe tbody tr th { vertical-align: top; }</p> <p> .dataframe thead th { text-align: right; }
A | B | C | D | E | |
---|---|---|---|---|---|
A | 1.000000 | -0.167715 | 0.198216 | 0.036939 | 0.113714 |
B | -0.167715 | 1.000000 | 0.449789 | 0.015883 | -0.236658 |
C | 0.198216 | 0.449789 | 1.000000 | -0.296943 | 0.386206 |
D | 0.036939 | 0.015883 | -0.296943 | 1.000000 | -0.777327 |
E | 0.113714 | -0.236658 | 0.386206 | -0.777327 | 1.000000 |
df.count() # 得到数据框df中每一列的非空值个数
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
# df.loc[0][0] = np.nan
df.iloc[0, 0] = np.nan
df.count()
A 9
B 10
C 10
D 10
E 10
dtype: int64
df.max() # 得到数据框df中每一列的最大值
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.max()
A 0.812708
B 0.886171
C 0.987035
D 0.977146
E 0.959625
dtype: float64
df.min() # 得到数据框df中每一列的最小值
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.min()
A 0.128560
B 0.135905
C 0.167476
D 0.137062
E 0.050306
dtype: float64
df.median() # 得到数据框df中每一列的中位数
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.median()
A 0.409373
B 0.597418
C 0.678203
D 0.705762
E 0.519713
dtype: float64
df.std() # 得到数据框df中每一列的标准差
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.std()
A 0.272847
B 0.254870
C 0.258956
D 0.301168
E 0.295753
dtype: float64