# Python for Data Analysis — Case study 1: USA.gov data from Bitly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json
path = '/Users/piqianchao/data-visualization/pydata-book/datasets/bitly_usagov/example.txt'
open(path).readline() # 读取一行数据
# 结果
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
records = [json.loads(line) for line in open(path)] # json---> Python字典形式
records[0] # 第一个字典形式数据
# 结果转成了Python能够处理的字典形式
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'c': 'US',
'nk': 1,
'tz': 'America/New_York',
'gr': 'MA',
'g': 'A6qOVH',
'h': 'wfLQtf',
'l': 'orofrog',
'al': 'en-US,en;q=0.8',
'hh': '1.usa.gov',
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
't': 1331923247,
'hc': 1331822918,
'cy': 'Danvers',
'll': [42.576698, -70.954903]}
# Which time zones (tz field) occur most often?
time_zones = [rec['tz'] for rec in records if 'tz' in rec] # 不是所有的tz字段都有时区记录,加上if判断
time_zones[:10]
# 结果
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
# 1. Counting occurrences by hand
def get_counts(sequence):
counts = {} # 将计数值保存在字典中
for x in sequence:
if x in counts:
counts[x] += 1 # 存在则计数加1
else:
counts[x] = 1 # 不存在则定为1
return counts
# Same idea, using the Python standard library's defaultdict
from collections import defaultdict
def get_counts2(sequence):
counts = defaultdict(int) # 初始化为整数
for x in sequence:
counts[x] += 1
return counts
counts = get_counts(time_zones) # 直接将times_zones列表传进来,进行统计个数
counts['America/New_York'] # 1251
# How to get the top-10 time zones and their counts
def top_counts(count_dict, n=10):
# 从传进来的字典中取出键值,分别赋给(count, tz)构成列表中包含的是集合形式
value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
value_key_pairs.sort() # 将列表按照 count 的大小排序
# print(value_key_pairs)
return value_key_pairs[-n:] # 取出后10个
top_counts(counts)
# 结果
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
# Method 2: use the collections.Counter class
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
# 结果
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
frame = pd.DataFrame(records) # records 是个字典形式
frame.info()
# 结果
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
a 3440 non-null object
c 2919 non-null object
nk 3440 non-null float64
tz 3440 non-null object
gr 2919 non-null object
g 3440 non-null object
h 3440 non-null object
l 3440 non-null object
al 3094 non-null object
hh 3440 non-null object
r 3440 non-null object
u 3440 non-null object
t 3440 non-null float64
hc 3440 non-null float64
cy 2919 non-null object
ll 2919 non-null object
_heartbeat_ 120 non-null float64
kw 93 non-null object
dtypes: float64(4), object(14)
memory usage: 500.8+ KB
tz_counts = frame['tz'].value_counts() # 统计每个数目并且输出
tz_counts[:10]
# 结果
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Name: tz, dtype: int64