# 用 Python 分析 YouTube 百万条数据

video ID

age

category

length

views

rate

ratings

related IDs

## 计算评分前 100 的视频

```text
CRVmdd_bpA 4.87
BFBixmV-pc 4.81
BArP_-_vXI 4.56
6M41YqM_xk 4.52
CYhSNHbeC8 4.3
8BwggJEPZQ 4.29
FDJ8x3ZKQE 3.75
BbtF1Ysel0 3.4
AdZ6wRHsrE 2.9
AKiJZEOqzQ 1.33
.....
```

### mapper

1234567891011121314

#!/usr/bin/env python
#coding=utf-8
"""Hadoop Streaming mapper: emit "<video id><TAB><rating>" per input record.

Reads tab-separated records from standard input and writes one output line
for every record that has enough fields. Works on both Python 2 and 3.
"""
import sys

# A record is processed only when it has MORE than this many fields.
MIN_FIELDS = 7
# Zero-based positions of the fields this mapper reads.
ID_FIELD = 0
RATING_FIELD = 6


def rating_record(line):
    """Return the mapper output for one input line, or None to drop it.

    The line is stripped of surrounding whitespace and split on tabs.
    Records with 7 or fewer fields are treated as invalid and dropped.
    A rating that cannot be parsed as a float is dirty data and is
    replaced by 0.0 instead of discarding the record.
    """
    fields = line.strip().split("\t")
    if len(fields) <= MIN_FIELDS:
        return None
    try:
        rating = float(fields[RATING_FIELD])
    except ValueError:
        # Dirty data: reset an unparsable rating to zero.
        rating = 0.0
    return "%s\t%s" % (fields[ID_FIELD], rating)


def main():
    """Map every line of standard input and print the surviving records."""
    for line in sys.stdin:
        record = rating_record(line)
        if record is not None:
            # Single-argument print(...) is valid on Python 2 and Python 3.
            print(record)


if __name__ == "__main__":
    main()

### reducer

1234567891011121314151617

#!/usr/bin/env python
#coding=utf-8
"""Hadoop Streaming reducer: print the highest-rated videos.

Reads "<video id><TAB><rating>" lines from standard input, keeps the last
rating seen for each id, and prints the top entries by rating in
descending order. Works on both Python 2 and 3.
"""
import heapq
import sys
from operator import itemgetter

# Number of top-rated videos to output.
TOP_N = 100


def top_rated(lines, limit=TOP_N):
    """Return up to ``limit`` (video id, rating) pairs, best rating first.

    ``lines`` is any iterable of "<video id><TAB><rating>" strings. When an
    id occurs more than once, the rating from its last occurrence is used.
    Blank lines are skipped. A non-blank line without a rating field raises
    IndexError, and a rating that is not a number raises ValueError.
    """
    ratings = {}
    for line in lines:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing the whole job.
            continue
        fields = line.split("\t")
        ratings[fields[0]] = float(fields[1])
    # nlargest is equivalent to sorted(..., reverse=True)[:limit] but does
    # not need to fully sort all entries just to keep the first few.
    return heapq.nlargest(limit, ratings.items(), key=itemgetter(1))


def main():
    """Reduce standard input and print one "<video id><TAB><rating>" per line."""
    for video_id, rating in top_rated(sys.stdin):
        print("%s\t%s" % (video_id, rating))


if __name__ == "__main__":
    main()

## 计算每个视频类型下的视频数量

```text
Science & Technology 45151
Sports 23548
Travel & Events 11544
Nonprofits & Activism 15644
People & Blogs 35165
...
```

### mapper

12345678

#!/usr/bin/env python
#coding=utf-8
"""Hadoop Streaming mapper: emit "<category><TAB>1" per input record.

Reads tab-separated records from standard input and writes one counting
record for every line that has enough fields. Works on both Python 2 and 3.
"""
import sys

# A record is processed only when it has MORE than this many fields.
MIN_FIELDS = 4
# Zero-based position of the field emitted as the key.
CATEGORY_FIELD = 3


def category_record(line):
    """Return "<category><TAB>1" for one input line, or None to drop it.

    The line is stripped of surrounding whitespace and split on tabs.
    Records with 4 or fewer fields are dropped.
    """
    fields = line.strip().split("\t")
    if len(fields) <= MIN_FIELDS:
        return None
    return "%s\t%s" % (fields[CATEGORY_FIELD], 1)


def main():
    """Map every line of standard input and print the surviving records."""
    for line in sys.stdin:
        record = category_record(line)
        if record is not None:
            # Single-argument print(...) is valid on Python 2 and Python 3.
            print(record)


if __name__ == "__main__":
    main()

### reducer

123456789101112

#!/usr/bin/env python
#coding=utf-8
"""Hadoop Streaming reducer: sum the counts emitted for each key.

Reads "<key><TAB><count>" lines from standard input and prints one
"<key><TAB><total>" line per distinct key. Works on both Python 2 and 3.
"""
import sys
from collections import Counter


def count_categories(lines):
    """Return a Counter mapping each key to the sum of its counts.

    ``lines`` is any iterable of "<key><TAB><count>" strings. Only the line
    terminator is removed before splitting, so a record with an empty key
    ("<TAB>1") is counted under "" instead of losing its leading tab and
    failing to unpack. Blank lines are skipped. Any other line that does
    not have exactly two tab-separated fields, or whose count is not an
    integer, raises ValueError.
    """
    totals = Counter()
    for line in lines:
        line = line.rstrip("\r\n")
        if not line:
            # Tolerate blank lines instead of failing the whole job.
            continue
        key, count = line.split("\t")
        totals[key] += int(count)
    return totals


def main():
    """Reduce standard input and print one "<key><TAB><total>" per line."""
    for key, total in count_categories(sys.stdin).items():
        print("%s\t%s" % (key, total))


if __name__ == "__main__":
    main()

```text
News & Politics 153677
Education 15033
Nonprofits & Activism 5879
Entertainment 726567
Travel & Events 25934
Howto & DIY 36750
Gadgets & Games 129481
People & Blogs 275704
Science & Technology 15856
Howto & Style 45422
Pets & Animals 54339
UNA 33252
Music 825158
Travel & Places 34962
Comedy 364937
Sports 286697
Autos & Vehicles 94111
Film & Animation 296458
```

