#coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
#s=pd.Series([7,'Heisenberg',3.14,-1789710578,'Happy Eating!'])
#print s
#Series可以转换字典
# City -> value mapping; Boston's None becomes NaN when fed to pd.Series.
d = dict(zip(
    ['Chicago', 'New York', 'Portland', 'San Francisco', 'Austin', 'Boston'],
    [1000, 1300, 900, 1100, 450, None],
))
cities = pd.Series(d)
#---------------------------------------------
#print cities
# F:\桌面>python pandaslearn.py
# 0 7
# 1 Heisenberg
# 2 3.14
# 3 -1789710578
# 4 Happy Eating!
# dtype: object
#---------------------------------------------
#可通过index在Series中选择特定项
#print cities['Chicago']
#print cities[['Chicago', 'Portland', 'San Francisco']]
# Chicago 1000.0
# Portland 900.0
# San Francisco 1100.0
# dtype: float64
#---------------------------------------------
#可以使用boolean index 选择
# print cities[cities < 1000]
# Austin 450.0
# Portland 900.0
# dtype: float64
#---------------------------------------------
# less_than_1000 = cities < 1000
# print(less_than_1000)
# print('\n')
# print(cities[less_than_1000])
# Austin True
# Boston False
# Chicago False
# New York False
# Portland True
# San Francisco False
# dtype: bool
# Austin 450.0
# Portland 900.0
# dtype: float64
#---------------------------------------------
#你可以改变Series的值
# print('Old value:', cities['Chicago'])
# cities['Chicago'] = 1400
# print('New value:', cities['Chicago'])
# ('Old value:', 1000.0)
# ('New value:', 1400.0)
# changing values using boolean logic
# print(cities[cities < 1000])
# print('\n')
# cities[cities < 1000] = 750
# print cities[cities < 1000]
# Austin 450.0
# Portland 900.0
# dtype: float64
# Austin 750.0
# Portland 750.0
# dtype: float64
#---------------------------------------------
# divide city values by 3
# print (cities / 3)
# # square city values
# print np.square(cities)
# Austin 150.000000
# Boston NaN
# Chicago 333.333333
# New York 433.333333
# Portland 300.000000
# San Francisco 366.666667
# dtype: float64
# Austin 202500.0
# Boston NaN
# Chicago 1000000.0
# New York 1690000.0
# Portland 810000.0
# San Francisco 1210000.0
# dtype: float64
#---------------------------------------------
#你可以将两个Series相加,index相同都有值的话数值相加,没有的话NaN
# print(cities[['Chicago', 'New York', 'Portland']])
# print('\n')
# print(cities[['Austin', 'New York']])
# print('\n')
# print(cities[['Chicago', 'New York', 'Portland']] + cities[['Austin', 'New York']])
# Chicago 1000.0
# New York 1300.0
# Portland 900.0
# dtype: float64
# Austin 450.0
# New York 1300.0
# dtype: float64
# Austin NaN
# Chicago NaN
# New York 2600.0
# Portland NaN
# dtype: float64
#---------------------------------------------
#NULL checking can be performed with isnull and notnull.
# print cities
# print cities.notnull()
# Austin 450.0
# Boston NaN
# Chicago 1000.0
# New York 1300.0
# Portland 900.0
# San Francisco 1100.0
# dtype: float64
# Austin True
# Boston False
# Chicago True
# New York True
# Portland True
# San Francisco True
# dtype: bool
# use boolean logic to grab the NULL cities
# print(cities.isnull())
# print('\n')
# print(cities[cities.isnull()])
# Austin False
# Boston True
# Chicago False
# New York False
# Portland False
# San Francisco False
# dtype: bool
# Boston NaN
# dtype: float64
#---------------------------------------------
#DataFrame
#创建Dataframe可以通过字典,使用列作为因素可以选择列排列顺序,默认按字母顺序排
# data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
# 'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
# 'wins': [11, 8, 10, 15, 11, 6, 10, 4],
# 'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
# football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
# print football
#---------------------------------------------
#读入csv文件
# from_csv = pd.read_csv('mariano-rivera.csv')
# print from_csv.head()#默认显示前五行
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#创建时间索引
#dates=pd.date_range('20140729',periods=6)
# print dates
# DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
#                '2014-08-02', '2014-08-03'],
#               dtype='datetime64[ns]', freq='D')
#创建6*4的数据
#df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
#print df
# A B C D
# 2014-07-29 -0.516528 -0.501800 0.775083 -1.185990
# 2014-07-30 -0.209529 0.625856 -0.597548 0.168520
# 2014-07-31 0.080277 -1.202214 -0.235769 -1.473808
# 2014-08-01 -0.836040 2.282775 0.012648 0.685435
# 2014-08-02 0.016593 0.470927 -0.964949 0.296643
# 2014-08-03 -0.375680 0.879676 1.594905 -1.510553
#字典创建时间索引
# df2=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1)})#Timestamp创建时间开始的index,
# print df2
# A B
# 0 2014-07-29 1
# df3=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1,index=list(range(4)))})#字典长度不同,以长数据为准
# print df3
# A B
# 0 2014-07-29 1
# 1 2014-07-29 1
# 2 2014-07-29 1
# 3 2014-07-29 1
# print football
#---------------------------------------------
# df.head()#查看前几行数据,默认5
# df.tail()#查看后几行数据
# df.dtypes#查看各列的数据格式
# df[20:22]#20到22行
# df.index#查看数据框索引
# df.columns#查看列名
# df.values#查看数据值
# df.describe()#查看DataFrame的描述性统计(count,mean等)
# type(df.describe())#查看输出描述性统计特性是啥类型(DataFrame或者Series)
# df.T#DataFrame转置
# df.sort(columns='C')#指定哪一列对数据排序
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
# 1,1,2012-09-09,DEN,,PIT,W 31-19,3,71,Demaryius Thomas,Trail 7-13,Lead 14-13*
# 2,1,2012-09-09,DEN,,PIT,W 31-19,4,1,Jacob Tamme,Trail 14-19,Lead 22-19*
# 3,2,2012-09-17,DEN,@,ATL,L 21-27,2,17,Demaryius Thomas,Trail 0-20,Trail 7-20
# 4,3,2012-09-23,DEN,,HOU,L 25-31,4,38,Brandon Stokley,Trail 11-31,Trail 18-31
# 5,3,2012-09-23,DEN,,HOU,L 25-31,4,6,Joel Dreessen,Trail 18-31,Trail 25-31
#如果DataFrame没有表头,可以header=None,然后用(names=列名)的列表来标上表头
# cols = ['num', 'game', 'date', 'team', 'home_away', 'opponent',
# 'result', 'quarter', 'distance', 'receiver', 'score_before',
# 'score_after']
# no_headers = pd.read_csv('peyton-passing-TDs-2012.csv', sep=',', header=None,
# names=cols)
# print no_headers.head()
# num game date team home_away opponent result quarter distance \....
# 0 1 1 2012-09-09 DEN NaN PIT W 31-19 3 71
# 1 2 1 2012-09-09 DEN NaN PIT W 31-19 4 1
# 2 3 2 2012-09-17 DEN @ ATL L 21-27 2 17
# 3 4 3 2012-09-23 DEN NaN HOU L 25-31 4 38
# 4 5 3 2012-09-23 DEN NaN HOU L 25-31 4 6
#---------------------------------------------
#pandas有各种各样的reader函数来让你选择是否跳过行,分析日期,处理NULL点等
#pandas也有各种写函数是让数据保存为(CSV,HTML table,JION),典型的如下
#my_dataframe.to_csv('path_to_file.csv')
# #excel
# football.to_excel('football.xlsx', index=False)
# # delete the DataFrame
# del football
# football = pd.read_excel('football.xlsx', 'Sheet1')
#---------------------------------------------
#pandas支持DataFrame直接读入或写入数据库
#注意:pandas直接to_sql速度很慢,如果写入大数据量DataFrame,可以先将DataFrame转换为csv文件,然后直接导入
# from pandas.io import sql
# import sqlite3
# conn = sqlite3.connect('/Users/gjreda/Dropbox/gregreda.com/_code/towed')
# query = "SELECT * FROM towed WHERE make = 'FORD';"
# results = sql.read_sql(query, con=conn)
# results.head()
# #因为query结果可以直接读入到DataFrame中,我更倾向于从clipboard中读取数据
# hank = pd.read_clipboard()
# hank.head()
#我们也可以从URL中读数据
# url = 'https://raw.github.com/gjreda/best-sandwiches/master/data/best-sandwiches-geocode.tsv'
# # fetch the text from the URL and read it into a DataFrame
# from_url = pd.read_table(url, sep='\t')
# print from_url.head(3)
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#Working with DataFrames
# pass in column names for each CSV
# Load the MovieLens 100k tables; each CSV lacks a header row, so column
# names are supplied explicitly via `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

# The movies file additionally carries genre-indicator columns; usecols
# restricts the read to a subset of columns — here just the first five.
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols,
                     usecols=[0, 1, 2, 3, 4], encoding='latin-1')
#输出DataFrame的简要介绍
# print movies.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1682 entries, 0 to 1681
# Data columns (total 5 columns):
# movie_id 1682 non-null int64
# title 1682 non-null object
# release_date 1681 non-null object
# video_release_date 0 non-null float64
# imdb_url 1679 non-null object
# dtypes: float64(1), int64(1), object(3)
# memory usage: 65.8+ KB
# None
#输出每一列的类型
#print movies.dtypes
#分块读取文件,针对大数据,可以读取前几行看看
# reader=pd.read_csv('ml-100k/u.item', sep='|',iterator=True)
# print reader.get_chunk(5)
# loop=True
# while loop:
# try:
# chunk = reader.get_chunk(5)
# chunk.to_csv('output.csv', mode='a', header=False) # 和上面的代码作用一样只是通过iterator来实现
# except StopIteration:
# break
#------------------------------------------------------------------------------------------
#选择
#选择一列
#print users['occupation'].head()
#选择多列
#print users[['age', 'zip_code']].head()
#多行选择
# users older than 25
# print(users[users.age > 25].head(3))
# print('\n')
# # users aged 40 AND male
# print(users[(users.age == 40) & (users.sex == 'M')].head(3))
# print('\n')
# # users younger than 30 OR female
# print(users[(users.sex == 'F') | (users.age < 30)].head(3))
#因为现在我们的索引没有意义,可以通过set_index设定,默认情况下set_index返回新的DataFrame,必须确认是否想要这种变化
# print(users.set_index('user_id').head())
# print('\n')
# print(users.head())
# print("\n^^^ I didn't actually change the DataFrame. ^^^\n")
# with_new_index = users.set_index('user_id')
# print(with_new_index.head())
# print("\n^^^ set_index actually returns a new DataFrame. ^^^\n")
#users.set_index('user_id', inplace=True)#inplace说明要修改DataFrame
#print users.head()#已经将user_id改为索引,删除了由0开始的默认索引
# #可以通过iloc选择多行数据,iloc是位置索引(从0开始,第一行数据是0)
# print(users.iloc[99])
# print('\n')
# print(users.iloc[[1, 50, 300]])
# #也可以通过loc的label方法选择,loc标签索引,位置的话从1开始(第一行数据是1)
# print(users.loc[100])
# print('\n')
# print(users.loc[[2, 51, 301]])
# #如果我们想要回到原来的0开始的默认索引,采用reset_index
# users.reset_index(inplace=True)
# print users.head()
# The simplified rules of indexing are
# Use loc for label-based indexing
# Use iloc for positional indexing
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#联合joining
# Two small frames whose 'key' columns overlap on 2..4, used below to
# demonstrate the different merge/join strategies.
left_frame = pd.DataFrame({
    'key': list(range(5)),
    'left_value': list('abcde'),
})
right_frame = pd.DataFrame({
    'key': list(range(2, 7)),
    'right_value': list('fghij'),
})
# print(left_frame)
# print('\n')
# print(right_frame)
#------------------------------------------------------------------------------------------
#inner join (default)只保留key相同的dataframe
# a=pd.merge(left_frame, right_frame, on='key', how='inner')
# print a.info()
# key left_value right_value
# 0 2 c f
# 1 3 d g
# 2 4 e h
# a=pd.merge(left_frame, right_frame, left_on='key', right_on='key')
# print a
# a=pd.merge(left_frame, right_frame, left_on='key', right_index=True)
# print a
# key key_x left_value key_y right_value
# 0 0 0 a 2 f
# 1 1 1 b 3 g
# 2 2 2 c 4 h
# 3 3 3 d 5 i
# 4 4 4 e 6 j
#------------------------------------------------------------------------------------------
#left outer join
# a=pd.merge(left_frame, right_frame, on='key', how='left')
# print a
# key left_value right_value
# 0 0 a NaN
# 1 1 b NaN
# 2 2 c f
# 3 3 d g
# 4 4 e h
# b=pd.merge(left_frame, right_frame, on='key', how='right')
# print b
# key left_value right_value
# 0 2.0 c f
# 1 3.0 d g
# 2 4.0 e h
# 3 5.0 NaN i
# 4 6.0 NaN j
#------------------------------------------------------------------------------------------
# full outer join
# print pd.merge(left_frame, right_frame, on='key', how='outer')
# key left_value right_value
# 0 0.0 a NaN
# 1 1.0 b NaN
# 2 2.0 c f
# 3 3.0 d g
# 4 4.0 e h
# 5 5.0 NaN i
# 6 6.0 NaN j
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#combining
#print pd.concat([left_frame, right_frame])
# key left_value right_value
# 0 0 a NaN
# 1 1 b NaN
# 2 2 c NaN
# 3 3 d NaN
# 4 4 e NaN
# 0 2 NaN f
# 1 3 NaN g
# 2 4 NaN h
# 3 5 NaN i
# 4 6 NaN j
# print pd.concat([left_frame, right_frame], axis=1)
# key left_value key right_value
# 0 0 a 2 f
# 1 1 b 3 g
# 2 2 c 4 h
# 3 3 d 5 i
# 4 4 e 6 j
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
# Grouping
def _parse_salary(raw):
    """Convert a salary string such as '$85512.00' to a float."""
    return float(raw.replace('$', ''))

# Chicago city payroll data: the converter strips the leading '$' from the
# salary column at read time so it lands as float64.
headers = ['name', 'title', 'department', 'salary']
chicago = pd.read_csv('city-of-chicago-salaries.csv',
                      header=0,
                      names=headers,
                      converters={'salary': _parse_salary})
# print chicago.head()
#                        name                      title        department    salary
# 0           AARON,  ELVIA J           WATER RATE TAKER       WATER MGMNT   85512.0
# 1         AARON,  JEFFERY M             POLICE OFFICER            POLICE   75372.0
# 2        AARON,  KIMBERLEI R   CHIEF CONTRACT EXPEDITER  GENERAL SERVICES   80916.0
# 3        ABAD JR,  VICENTE M          CIVIL ENGINEER IV       WATER MGMNT   99648.0
# 4      ABBATACOLA,  ROBERT J        ELECTRICAL MECHANIC          AVIATION   89440.0
# groupby returns a DataFrameGroupBy object, not a DataFrame:
by_dept = chicago.groupby('department')
# print by_dept
# <pandas.core.groupby.DataFrameGroupBy object at 0x0000000007E022E8>
# print(by_dept.count().head()) # NOT NULL records within each column
# print('\n')
# print(by_dept.size().tail()) # total records for each department
# name title salary
# department
# ADMIN HEARNG 42 42 42
# ANIMAL CONTRL 61 61 61
# AVIATION 1218 1218 1218
# BOARD OF ELECTION 110 110 110
# BOARD OF ETHICS 9 9 9
# department
# PUBLIC LIBRARY 926
# STREETS & SAN 2070
# TRANSPORTN 1168
# TREASURER 25
# WATER MGMNT 1857
# dtype: int64
# print(by_dept.sum()[20:25]) # total salaries of each department
# print('\n')
# print(by_dept.mean()[20:25]) # average salary of each department
# print('\n')
# print(by_dept.median()[20:25]) # take that, RDBMS!
# salary
# department
# HUMAN RESOURCES 4850928.0
# INSPECTOR GEN 4035150.0
# IPRA 7006128.0
# LAW 31883920.2
# LICENSE APPL COMM 65436.0
# salary
# department
# HUMAN RESOURCES 71337.176471
# INSPECTOR GEN 80703.000000
# IPRA 82425.035294
# LAW 70853.156000
# LICENSE APPL COMM 65436.000000
# salary
# department
# HUMAN RESOURCES 68496.0
# INSPECTOR GEN 76116.0
# IPRA 82524.0
# LAW 66492.0
# LICENSE APPL COMM 65436.0
#print by_dept.title.nunique().sort_values(ascending=False)[:5]
# department
# WATER MGMNT 153
# TRANSPORTN 150
# POLICE 130
# AVIATION 125
# HEALTH 118
# Name: title, dtype: int64
#------------------------------------------------------------------------------------------
def ranker(df):
    """Attach a 1-based 'dept_rank' column, ranking rows by salary.

    Rank 1 is the highest paid; assumes *df* is already sorted by salary
    in descending order. Mutates *df* in place and returns it.
    """
    df['dept_rank'] = np.arange(1, len(df) + 1)
    return df
# Sort the whole table by salary (highest first), then number each
# department's rows 1..n via ranker applied per group.
chicago = chicago.sort_values('salary', ascending=False)
chicago = chicago.groupby('department').apply(ranker)
# print(chicago[chicago.dept_rank == 1].head(7))
# name title department \
# 18039 MC CARTHY, GARRY F SUPERINTENDENT OF POLICE POLICE
# 8004 EMANUEL, RAHM MAYOR MAYOR'S OFFICE
# 25588 SANTIAGO, JOSE A FIRE COMMISSIONER FIRE
# 763 ANDOLINO, ROSEMARIE S COMMISSIONER OF AVIATION AVIATION
# 4697 CHOUCAIR, BECHARA N COMMISSIONER OF HEALTH HEALTH
# 21971 PATTON, STEPHEN R CORPORATION COUNSEL LAW
# 12635 HOLT, ALEXANDRA D BUDGET DIR BUDGET & MGMT
# salary dept_rank
# 18039 260004.0 1
# 8004 216210.0 1
# 25588 202728.0 1
# 763 186576.0 1
# 4697 177156.0 1
# 21971 173664.0 1
# 12635 169992.0 1
#------------------------------------------------------------------------------------------
print chicago[chicago.department == "LAW"][:5]