# Python pandas tutorial notes (pandas 教程)

#coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot



#s=pd.Series([7,'Heisenberg',3.14,-1789710578,'Happy Eating!'])
#print s

#Series可以转换字典
# A Series can be built straight from a dict: the keys become the index,
# and the single None value forces the whole Series to float64 (NaN).
_names = ['Chicago', 'New York', 'Portland', 'San Francisco', 'Austin', 'Boston']
_pops = [1000, 1300, 900, 1100, 450, None]
d = dict(zip(_names, _pops))
cities = pd.Series(d)
#---------------------------------------------
#print cities
# Austin            450.0
# Boston              NaN
# Chicago          1000.0
# New York         1300.0
# Portland          900.0
# San Francisco    1100.0
# dtype: float64
#---------------------------------------------
#可通过index在Series中选择特定项
#print cities['Chicago']
#print cities[['Chicago', 'Portland', 'San Francisco']]
# Chicago          1000.0
# Portland          900.0
# San Francisco    1100.0
# dtype: float64
#---------------------------------------------
#可以使用boolean  index 选择
# print cities[cities < 1000]
# Austin      450.0
# Portland    900.0
# dtype: float64

#---------------------------------------------

# less_than_1000 = cities < 1000
# print(less_than_1000)
# print('\n')
# print(cities[less_than_1000])

# Austin            True
# Boston           False
# Chicago          False
# New York         False
# Portland          True
# San Francisco    False
# dtype: bool


# Austin      450.0
# Portland    900.0
# dtype: float64
#---------------------------------------------
#你可以改变Series的值
# print('Old value:', cities['Chicago'])
# cities['Chicago'] = 1400
# print('New value:', cities['Chicago'])
# ('Old value:', 1000.0)
# ('New value:', 1400.0)

# changing values using boolean logic
# print(cities[cities < 1000])
# print('\n')
# cities[cities < 1000] = 750

# print cities[cities < 1000]
# Austin      450.0
# Portland    900.0
# dtype: float64


# Austin      750.0
# Portland    750.0
# dtype: float64
#---------------------------------------------
# divide city values by 3
# print (cities / 3)
# # square city values
# print np.square(cities)
# Austin           150.000000
# Boston                  NaN
# Chicago          333.333333
# New York         433.333333
# Portland         300.000000
# San Francisco    366.666667
# dtype: float64
# Austin            202500.0
# Boston                 NaN
# Chicago          1000000.0
# New York         1690000.0
# Portland          810000.0
# San Francisco    1210000.0
# dtype: float64

#---------------------------------------------
#你可以将两个Series相加,index相同都有值的话数值相加,没有的话NaN
# print(cities[['Chicago', 'New York', 'Portland']])
# print('\n')
# print(cities[['Austin', 'New York']])
# print('\n')
# print(cities[['Chicago', 'New York', 'Portland']] + cities[['Austin', 'New York']])
# Chicago     1000.0
# New York    1300.0
# Portland     900.0
# dtype: float64


# Austin       450.0
# New York    1300.0
# dtype: float64


# Austin         NaN
# Chicago        NaN
# New York    2600.0
# Portland       NaN
# dtype: float64

#---------------------------------------------
#NULL checking can be performed with isnull and notnull.
# print cities
# print cities.notnull()

# Austin            450.0
# Boston              NaN
# Chicago          1000.0
# New York         1300.0
# Portland          900.0
# San Francisco    1100.0
# dtype: float64
# Austin            True
# Boston           False
# Chicago           True
# New York          True
# Portland          True
# San Francisco     True
# dtype: bool

# use boolean logic to grab the NULL cities
# print(cities.isnull())
# print('\n')
# print(cities[cities.isnull()])

# Austin           False
# Boston            True
# Chicago          False
# New York         False
# Portland         False
# San Francisco    False
# dtype: bool


# Boston   NaN
# dtype: float64




#---------------------------------------------
#DataFrame
#创建Dataframe可以通过字典,使用列作为因素可以选择列排列顺序,默认按字母顺序排
# data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
#         'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
#         'wins': [11, 8, 10, 15, 11, 6, 10, 4],
#         'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
# football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])


# print football
#---------------------------------------------
#读入csv文件
# from_csv = pd.read_csv('mariano-rivera.csv')
# print from_csv.head()#默认显示前五行
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#创建时间索引
#dates=pd.date_range('20140729',periods=6)
# print dates
# DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
#                '2014-08-02', '2014-08-03'],
#               dtype='datetime64[ns]', freq='D')

#创建6*4的数据
#df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
#print df
#                    A         B         C         D
# 2014-07-29 -0.516528 -0.501800  0.775083 -1.185990
# 2014-07-30 -0.209529  0.625856 -0.597548  0.168520
# 2014-07-31  0.080277 -1.202214 -0.235769 -1.473808
# 2014-08-01 -0.836040  2.282775  0.012648  0.685435
# 2014-08-02  0.016593  0.470927 -0.964949  0.296643
# 2014-08-03 -0.375680  0.879676  1.594905 -1.510553


#字典创建时间索引
# df2=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1)})#Timestamp创建时间开始的index,
# print df2
#            A  B
# 0 2014-07-29  1

# df3=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1,index=list(range(4)))})#字典长度不同,以长数据为准
# print df3
#            A  B
# 0 2014-07-29  1
# 1 2014-07-29  1
# 2 2014-07-29  1
# 3 2014-07-29  1
# print football
#---------------------------------------------
# df.head()#查看前几行数据,默认5
# df.tail()#查看后几行数据
# df.dtypes#查看各列的数据格式
# df[20:22]#20到22行
# df.index#查看数据框索引
# df.columns#查看列名
# df.values#查看数据值
# df.describe()#查看DataFrame的描述性统计(count,mean等)
# type(df.describe())#查看输出描述性统计特性是啥类型(DataFrame或者Series)
# df.T#DataFrame转置
# df.sort(columns='C')#指定哪一列对数据排序
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

# 1,1,2012-09-09,DEN,,PIT,W 31-19,3,71,Demaryius Thomas,Trail 7-13,Lead 14-13*
# 2,1,2012-09-09,DEN,,PIT,W 31-19,4,1,Jacob Tamme,Trail 14-19,Lead 22-19*
# 3,2,2012-09-17,DEN,@,ATL,L 21-27,2,17,Demaryius Thomas,Trail 0-20,Trail 7-20
# 4,3,2012-09-23,DEN,,HOU,L 25-31,4,38,Brandon Stokley,Trail 11-31,Trail 18-31
# 5,3,2012-09-23,DEN,,HOU,L 25-31,4,6,Joel Dreessen,Trail 18-31,Trail 25-31

#如果DataFrame没有表头,可以head=None ,然后用(names=列名)的列表来标上表头
# cols = ['num', 'game', 'date', 'team', 'home_away', 'opponent',
#         'result', 'quarter', 'distance', 'receiver', 'score_before',
#         'score_after']
# no_headers = pd.read_csv('peyton-passing-TDs-2012.csv', sep=',', header=None,
#                          names=cols)
# print no_headers.head()
#    num  game        date team home_away opponent   result  quarter  distance  \....
# 0    1     1  2012-09-09  DEN       NaN      PIT  W 31-19        3        71
# 1    2     1  2012-09-09  DEN       NaN      PIT  W 31-19        4         1
# 2    3     2  2012-09-17  DEN         @      ATL  L 21-27        2        17
# 3    4     3  2012-09-23  DEN       NaN      HOU  L 25-31        4        38
# 4    5     3  2012-09-23  DEN       NaN      HOU  L 25-31        4         6

#---------------------------------------------
#pandas有各种各样的reader函数来让你选择是否跳过行,分析日期,处理NULL点等
#pandas也有各种写函数是让数据保存为(CSV,HTML table,JION),典型的如下
#my_dataframe.to_csv('path_to_file.csv')

# #excel 
# football.to_excel('football.xlsx', index=False)
# # delete the DataFrame
# del football
# football = pd.read_excel('football.xlsx', 'Sheet1')

#---------------------------------------------
#pandas支持DataFrame直接读入或写入数据库
#注意:pandas直接to_sql速度很慢,如果写入大数据量DataFrame,可以先将DataFrame转换为csv文件,然后直接导入

# from pandas.io import sql
# import sqlite3

# conn = sqlite3.connect('/Users/gjreda/Dropbox/gregreda.com/_code/towed')
# query = "SELECT * FROM towed WHERE make = 'FORD';"

# results = sql.read_sql(query, con=conn)
# results.head()
# #因为query结果可以直接读入到DataFrame中,我更倾向于从clipboard中读取数据
# hank = pd.read_clipboard()
# hank.head()

#我们也可以从URL中读数据
# url = 'https://raw.github.com/gjreda/best-sandwiches/master/data/best-sandwiches-geocode.tsv'

# # fetch the text from the URL and read it into a DataFrame
# from_url = pd.read_table(url, sep='\t')
# print from_url.head(3)

#-----------------------------------------------------------------------------------------------------------------------------------------------------------

#Working with DataFrames

# pass in column names for each CSV
# MovieLens 100k "u.user" file: pipe-separated with no header row, so the
# column names are supplied explicitly via `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

# Ratings file "u.data" is tab-separated, again with no header row.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')#usecols selects a subset of the file's columns

#输出DataFrame的简要介绍
# print movies.info()

# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1682 entries, 0 to 1681
# Data columns (total 5 columns):
# movie_id              1682 non-null int64
# title                 1682 non-null object
# release_date          1681 non-null object
# video_release_date    0 non-null float64
# imdb_url              1679 non-null object
# dtypes: float64(1), int64(1), object(3)
# memory usage: 65.8+ KB
# None

#输出每一列的类型
#print movies.dtypes


#分块读取文件,针对大数据,可以读取前几行看看
# reader=pd.read_csv('ml-100k/u.item', sep='|',iterator=True)
# print reader.get_chunk(5)

# loop=True
# while loop:
#     try:
#         chunk = reader.get_chunk(5)
#         chunk.to_csv('output.csv', mode='a', header=False) # 和上面的代码作用一样只是通过iterator来实现
#     except StopIteration:
#         break

#------------------------------------------------------------------------------------------
#选择

#选择一列
#print users['occupation'].head()
#选择多列
#print users[['age', 'zip_code']].head()

#多行选择
# users older than 25
# print(users[users.age > 25].head(3))
# print('\n')

# # users aged 40 AND male
# print(users[(users.age == 40) & (users.sex == 'M')].head(3))
# print('\n')

# # users younger than 30 OR female
# print(users[(users.sex == 'F') | (users.age < 30)].head(3))

#因为现在我们的索引没有意义,可以通过set_index设定,默认情况下set_index返回新的DataFrame,必须确认是否想要这种变化
# print(users.set_index('user_id').head())
# print('\n')

# print(users.head())
# print("\n^^^ I didn't actually change the DataFrame. ^^^\n")

# with_new_index = users.set_index('user_id')
# print(with_new_index.head())
# print("\n^^^ set_index actually returns a new DataFrame. ^^^\n")

#users.set_index('user_id', inplace=True)#inplace说明要修改DataFrame
#print users.head()#已经将user_id改为索引,删除了由0开始的默认索引

# #可以通过iloc选择多行数据,iloc是位置索引(从0开始,第一行数据是0)
# print(users.iloc[99])
# print('\n')
# print(users.iloc[[1, 50, 300]])


# #也可以通过loc的label方法选择,loc标签索引,位置的话从1开始(第一行数据是1)

# print(users.loc[100])
# print('\n')
# print(users.loc[[2, 51, 301]])

# #如果我们想要回到原来的0开始的默认索引,采用reset_index
# users.reset_index(inplace=True)
# print users.head()

# The simplified rules of indexing are

# Use loc for label-based indexing
# Use iloc for positional indexing


#-----------------------------------------------------------------------------------------------------------------------------------------------------------

#联合joining

# Two small frames that share a 'key' column (overlapping on 2..4), used
# below to demonstrate the various merge/concat behaviors.
left_frame = pd.DataFrame(
    {'key': list(range(5)),
     'left_value': list('abcde')})
right_frame = pd.DataFrame(
    {'key': list(range(2, 7)),
     'right_value': list('fghij')})
# print(left_frame)
# print('\n')
# print(right_frame)

#------------------------------------------------------------------------------------------
#inner join (default)只保留key相同的dataframe
# a=pd.merge(left_frame, right_frame, on='key', how='inner')
# print a.info()

#    key left_value right_value
# 0    2          c           f
# 1    3          d           g
# 2    4          e           h



# a=pd.merge(left_frame, right_frame, left_on='key', right_on='key')
# print a

# a=pd.merge(left_frame, right_frame, left_on='key', right_index=True)
# print a 
#    key  key_x left_value  key_y right_value
# 0    0      0          a      2           f
# 1    1      1          b      3           g
# 2    2      2          c      4           h
# 3    3      3          d      5           i
# 4    4      4          e      6           j
#------------------------------------------------------------------------------------------


#left outer join

# a=pd.merge(left_frame, right_frame, on='key', how='left')
# print a

#    key left_value right_value
# 0    0          a         NaN
# 1    1          b         NaN
# 2    2          c           f
# 3    3          d           g
# 4    4          e           h

# b=pd.merge(left_frame, right_frame, on='key', how='right')
# print b

#    key left_value right_value
# 0  2.0          c           f
# 1  3.0          d           g
# 2  4.0          e           h
# 3  5.0        NaN           i
# 4  6.0        NaN           j


#------------------------------------------------------------------------------------------
# full outer join
# print pd.merge(left_frame, right_frame, on='key', how='outer')

#    key left_value right_value
# 0  0.0          a         NaN
# 1  1.0          b         NaN
# 2  2.0          c           f
# 3  3.0          d           g
# 4  4.0          e           h
# 5  5.0        NaN           i
# 6  6.0        NaN           j

#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#combining

#print pd.concat([left_frame, right_frame])

#    key left_value right_value
# 0    0          a         NaN
# 1    1          b         NaN
# 2    2          c         NaN
# 3    3          d         NaN
# 4    4          e         NaN
# 0    2        NaN           f
# 1    3        NaN           g
# 2    4        NaN           h
# 3    5        NaN           i
# 4    6        NaN           j

# print pd.concat([left_frame, right_frame], axis=1)
#    key left_value  key right_value
# 0    0          a    2           f
# 1    1          b    3           g
# 2    2          c    4           h
# 3    3          d    5           i
# 4    4          e    6           j
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

# Grouping
# Load the Chicago payroll CSV. The file's own header row is replaced by
# our `headers` names (header=0 consumes it), and the salary converter
# strips the leading '$' so the column parses as float.
# NOTE(review): assumes salary values contain no thousands separators
# (commas) — confirm against the actual CSV before relying on this.
headers = ['name', 'title', 'department', 'salary']
chicago = pd.read_csv('city-of-chicago-salaries.csv', 
                      header=0,
                      names=headers,
                      converters={'salary': lambda x: float(x.replace('$', ''))})
# print chicago.head()

#                     name                     title        department   salary
# 0        AARON,  ELVIA J          WATER RATE TAKER       WATER MGMNT  85512.0
# 1      AARON,  JEFFERY M            POLICE OFFICER            POLICE  75372.0
# 2    AARON,  KIMBERLEI R  CHIEF CONTRACT EXPEDITER  GENERAL SERVICES  80916.0
# 3    ABAD JR,  VICENTE M         CIVIL ENGINEER IV       WATER MGMNT  99648.0
# 4  ABBATACOLA,  ROBERT J       ELECTRICAL MECHANIC          AVIATION  89440.0

# pandas' groupby returns a DataFrameGroupBy object, not a DataFrame
# (see the repr printed below); aggregation methods are called on it later.
by_dept = chicago.groupby('department')
# print by_dept
# <pandas.core.groupby.DataFrameGroupBy object at 0x0000000007E022E8>

# print(by_dept.count().head()) # NOT NULL records within each column
# print('\n')
# print(by_dept.size().tail()) # total records for each department

#                    name  title  salary
# department
# ADMIN HEARNG         42     42      42
# ANIMAL CONTRL        61     61      61
# AVIATION           1218   1218    1218
# BOARD OF ELECTION   110    110     110
# BOARD OF ETHICS       9      9       9


# department
# PUBLIC LIBRARY     926
# STREETS & SAN     2070
# TRANSPORTN        1168
# TREASURER           25
# WATER MGMNT       1857
# dtype: int64


# print(by_dept.sum()[20:25]) # total salaries of each department
# print('\n')
# print(by_dept.mean()[20:25]) # average salary of each department
# print('\n')
# print(by_dept.median()[20:25]) # take that, RDBMS!

#                        salary
# department
# HUMAN RESOURCES     4850928.0
# INSPECTOR GEN       4035150.0
# IPRA                7006128.0
# LAW                31883920.2
# LICENSE APPL COMM     65436.0


#                          salary
# department
# HUMAN RESOURCES    71337.176471
# INSPECTOR GEN      80703.000000
# IPRA               82425.035294
# LAW                70853.156000
# LICENSE APPL COMM  65436.000000


#                     salary
# department
# HUMAN RESOURCES    68496.0
# INSPECTOR GEN      76116.0
# IPRA               82524.0
# LAW                66492.0
# LICENSE APPL COMM  65436.0

#print by_dept.title.nunique().sort_values(ascending=False)[:5]

# department
# WATER MGMNT    153
# TRANSPORTN     150
# POLICE         130
# AVIATION       125
# HEALTH         118
# Name: title, dtype: int64
#------------------------------------------------------------------------------------------
def ranker(df):
    """Attach a 1-based salary rank to every row of *df*.

    Intended for use with groupby(...).apply(): each department's frame
    arrives here already sorted by salary in descending order, so row
    position maps directly to rank (1 = highest paid). Mutates *df* in
    place by adding a 'dept_rank' column and returns the same frame.
    """
    df['dept_rank'] = np.arange(1, len(df) + 1)
    return df

# Sort the entire frame by salary, highest first — ranker() relies on this
# DESC order — then compute a per-department 'dept_rank' column by applying
# ranker to each department group and reassembling the result.
chicago.sort_values('salary', ascending=False, inplace=True)
chicago = chicago.groupby('department').apply(ranker)
# print(chicago[chicago.dept_rank == 1].head(7))

#                          name                     title      department  \
# 18039     MC CARTHY,  GARRY F  SUPERINTENDENT OF POLICE          POLICE
# 8004           EMANUEL,  RAHM                     MAYOR  MAYOR'S OFFICE
# 25588       SANTIAGO,  JOSE A         FIRE COMMISSIONER            FIRE
# 763    ANDOLINO,  ROSEMARIE S  COMMISSIONER OF AVIATION        AVIATION
# 4697     CHOUCAIR,  BECHARA N    COMMISSIONER OF HEALTH          HEALTH
# 21971      PATTON,  STEPHEN R       CORPORATION COUNSEL             LAW
# 12635      HOLT,  ALEXANDRA D                BUDGET DIR   BUDGET & MGMT

#          salary  dept_rank
# 18039  260004.0          1
# 8004   216210.0          1
# 25588  202728.0          1
# 763    186576.0          1
# 4697   177156.0          1
# 21971  173664.0          1
# 12635  169992.0          1

#------------------------------------------------------------------------------------------

# Show the five highest-paid LAW department rows (the frame was sorted by
# salary descending above, so slicing the head gives the top earners).
# Fix: the original used a Python-2-only print *statement*; the
# parenthesized call form below runs on both Python 2 and Python 3.
print(chicago[chicago.department == "LAW"][:5])

# ---------------------------------------------------------------------------
# NOTE: the remainder of the original scraped page was hosting-platform
# boilerplate (comment widget, "related articles" links, QR-code footer)
# and was not part of the tutorial code; it has been reduced to this note.
# ---------------------------------------------------------------------------