今天在知乎看到一个问题:25岁以下前五得分后卫你觉得是谁,怎么排?
意识到样本排序问题
主成分分析可以为每个样本打分,根据分值大小排序
R语言里 ballr
包里有一个函数 NBAPerGameAdvStatistics()
可以抓取NBA指定赛季所有球员的统计数据;数据来源是https://www.basketball-reference.com。
> library(ballr)
> players<-NBAPerGameAdvStatistics("2019")
> dim(players)
[1] 708 30
2018-2019赛季NBA总共有708位球员,统计指标中包括30各变量。可以通过 colnames(players)
命令查看所有的变量名称。通过 head(players)
命令查看前六行数据。
> colnames(players)
[1] "rk" "player" "pos" "age" "tm" "g"
[7] "mp" "per" "tspercent" "x3par" "ftr" "orbpercent"
[13] "drbpercent" "trbpercent" "astpercent" "stlpercent" "blkpercent" "tovpercent"
[19] "usgpercent" "x" "ows" "dws" "ws" "ws_48"
[25] "x_2" "obpm" "dbpm" "bpm" "vorp" "link"
> head(players)
rk player pos age tm g mp per tspercent x3par ftr orbpercent drbpercent trbpercent
1 1 Alex Abrines SG 25 OKC 31 588 6.3 0.507 0.809 0.083 0.9 7.8 4.2
2 2 Quincy Acy PF 28 PHO 10 123 2.9 0.379 0.833 0.556 2.7 20.1 11.3
3 3 Jaylen Adams PG 22 ATL 34 428 7.6 0.474 0.673 0.082 2.6 12.3 7.4
4 4 Steven Adams C 25 OKC 80 2669 18.5 0.591 0.002 0.361 14.7 14.8 14.7
5 5 Bam Adebayo C 21 MIA 82 1913 17.9 0.623 0.031 0.465 9.2 24.0 16.6
6 6 Deng Adel SF 21 CLE 19 194 2.7 0.424 0.639 0.111 1.6 9.6 5.4
astpercent stlpercent blkpercent tovpercent usgpercent x ows dws ws ws_48 x_2 obpm dbpm bpm
1 4.3 1.3 0.9 7.9 12.2 NA 0.1 0.6 0.6 0.053 NA -2.4 -0.9 -3.4
2 8.2 0.4 2.7 15.2 9.2 NA -0.1 0.0 -0.1 -0.022 NA -5.7 -0.3 -5.9
3 19.8 1.5 1.0 19.7 13.5 NA -0.1 0.2 0.1 0.011 NA -3.1 -1.3 -4.4
4 6.6 2.0 2.4 12.6 16.4 NA 5.1 4.0 9.1 0.163 NA 0.6 2.1 2.7
5 14.2 1.8 3.0 17.1 15.8 NA 3.4 3.4 6.8 0.171 NA -0.6 3.6 3.0
6 3.4 0.3 1.8 13.7 9.9 NA -0.2 0.0 -0.2 -0.054 NA -5.3 -2.0 -7.3
vorp link
1 -0.2 /players/a/abrinal01.html
2 -0.1 /players/a/acyqu01.html
3 -0.3 /players/a/adamsja01.html
4 3.2 /players/a/adamsst01.html
5 2.4 /players/a/adebaba01.html
6 -0.3 /players/a/adelde01.html
rank和link变量没有用,在后续分析中可以去掉;还有两列x和x_2都是缺失值也可以在后续的分析中删除。
> df<-players%>%
filter(pos == "PG")%>%
filter(age<=25)%>%
select(-rk,-link,-x,-x_2)
> dim(df)
[1] 62 26
可以看到25岁以下的得分后卫总共有62位。
#构造数据集
df1<-data.frame(A=sort(c(rep(1:8,7),rep(9,6))),
B=c(rep(1:7,8),1:6),
Name=gsub(" ","\n",df$player))
head(df1)
#绘图
library(ggplot2)
ggplot(df1,aes(x=A,y=B))+
geom_text(aes(label=Name))+
theme_bw()+xlim(0.5,9.5)+ylim(0,8)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank())
放眼望去好像没有太熟悉的名字。
(数据中竟然没有得分篮板助攻等数据,回头检查才发现 NBAPerGameAdvStatistics()
拿到的是高阶统计数据,还有一个函数是 NBAPerGameStatisticsPer100Poss(season=2018)
可以拿到常规数据。数据还得整合。)
df2<-df%>%
filter(g >= sort(df$g,decreasing = T)[10])
df2$rank<-1:dim(df2)[1]
df2$Name<-gsub(" ","\n",df2$player)
ggplot(df2,aes(x=reorder(rank,g),y=g,fill=Name))+
geom_bar(stat="identity")+ylim(0,100)+
theme_bw()+geom_text(aes(label=Name),hjust=0)+
geom_text(aes(label=g),hjust=2)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
legend.position = "none")+
coord_flip()+scale_fill_brewer(palette = "Spectral")+
labs(title="比赛场次",caption="Author: MingYan")
自己能认出来的吹杨和西蒙斯
df3<-df%>%
filter(mp >= sort(df$mp,decreasing = T)[10])
df3$rank<-1:dim(df3)[1]
df3$Name<-gsub(" ","\n",df3$player)
df3
ggplot(df3,aes(x=reorder(rank,mp),y=mp,fill=Name))+
geom_bar(stat="identity")+ylim(0,2900)+
theme_bw()+geom_text(aes(label=Name),hjust=0)+
geom_text(aes(label=mp),hjust=2)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
legend.position = "none")+
coord_flip()+scale_fill_brewer(palette = "Spectral")+
labs(title="出场时间(分钟)",caption="Author: MingYan")
df4<-df%>%
filter(per >= sort(df$per,decreasing = T)[10])
df4$rank<-1:dim(df4)[1]
df4$Name<-gsub(" ","\n",df4$player)
df3
ggplot(df4,aes(x=reorder(rank,per),y=per,fill=Name))+
geom_bar(stat="identity")+ylim(0,40)+
theme_bw()+geom_text(aes(label=Name),hjust=0)+
geom_text(aes(label=per),hjust=2)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
legend.position = "none")+
coord_flip()+scale_fill_brewer(palette = "Spectral")+
labs(title="PER",caption="Author: MingYan")
library(ballr)
help(package="ballr")
players_a<-NBAPerGameStatistics("2019")
players_b<-NBAPerGameAdvStatistics("2019")
dim(players_a)
dim(players_b)
df<-data.frame(player=players_a$player,
pos=players_a$pos,
age=players_a$age,
tm=players_a$tm,
g=players_a$g,
gs=players_a$gs,
mp=players_a$mp,
fga=players_a$fga,
fgpercent=players_a$fgpercent,
three_papercent=players_a$x3ppercent,
three_pa=players_a$x3pa,
fta=players_a$fta,
ftpercent=players_a$ftpercent,
trb=players_a$trb,
ast=players_a$ast,
stl=players_a$stl,
blk=players_a$blk,
tov=players_a$tov,
pts=players_a$pts,
per=players_b$per,
ts=players_b$tspercent,
ws=players_b$ws)
dim(df)
#选择25岁以下的得分后卫
library(dplyr)
df1<-df%>%
filter(pos == "PG")%>%
filter(age <= 25)
dim(df1)
head(df)
[1] 62 22
player pos age tm g gs mp fga fgpercent three_papercent three_pa fta ftpercent trb
1 Alex Abrines SG 25 OKC 31 2 19.0 5.1 0.357 0.323 4.1 0.4 0.923 1.5
2 Quincy Acy PF 28 PHO 10 0 12.3 1.8 0.222 0.133 1.5 1.0 0.700 2.5
3 Jaylen Adams PG 22 ATL 34 1 12.6 3.2 0.345 0.338 2.2 0.3 0.778 1.8
4 Steven Adams C 25 OKC 80 80 33.4 10.1 0.595 0.000 0.0 3.7 0.500 9.5
5 Bam Adebayo C 21 MIA 82 28 23.3 5.9 0.576 0.200 0.2 2.8 0.735 7.3
6 Deng Adel SF 21 CLE 19 3 10.2 1.9 0.306 0.261 1.2 0.2 1.000 1.0
ast stl blk tov pts per ts ws
1 0.6 0.5 0.2 0.5 5.3 6.3 0.507 0.6
2 0.8 0.1 0.4 0.4 1.7 2.9 0.379 -0.1
3 1.9 0.4 0.1 0.8 3.2 7.6 0.474 0.1
4 1.6 1.5 1.0 1.7 13.9 18.5 0.591 9.1
5 2.2 0.9 0.8 1.5 8.9 17.9 0.623 6.8
6 0.3 0.1 0.2 0.3 1.7 2.7 0.424 -0.2
df_points<-df1%>%
filter(pts >= sort(df1$pts,decreasing = T)[10])
df_points
df_points$rank<-1:dim(df_points)[1]
df_points$Name<-gsub(" ","\n",df_points$player)
library(ggplot2)
ggplot(df_points,aes(x=reorder(rank,pts),y=pts,fill=Name))+
geom_bar(stat="identity")+ylim(0,23)+
theme_bw()+geom_text(aes(label=Name),hjust=0)+
geom_text(aes(label=pts),hjust=2)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
legend.position = "none")+
coord_flip()+scale_fill_manual(values=rainbow(10))+
labs(title="Points Per Game",caption="Author: MingYan")
25岁以下得分后卫场均得分前十名:1、篮网队的拉塞尔;2、老鹰队的特雷杨;3、掘金队的贾马尔穆雷;4、国王队的福克斯(不确定);5、76人的西蒙斯;6-9光看英文名还真想不起来是谁;10、是原来小牛队的丹尼尔史密斯吗?
df_rebound<-df1%>%
filter(trb >= sort(df1$trb,decreasing = T)[10])
df_rebound
df_rebound$rank<-1:dim(df_rebound)[1]
df_rebound$Name<-gsub(" ","\n",df_rebound$player)
ggplot(df_rebound,aes(x=reorder(rank,trb),y=trb,fill=Name))+
geom_bar(stat="identity")+ylim(0,10)+
theme_bw()+geom_text(aes(label=Name),hjust=0)+
geom_text(aes(label=trb),hjust=2)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
legend.position = "none")+
coord_flip()+scale_fill_manual(values=rainbow(10))+
labs(title="Total Rebounds Per Game",
caption="Source: https://www.basketball-reference.com\nAuthor: MingYan")
场均篮板前两名:1、76人的西蒙斯;2、湖人队的朗佐鲍尔;
library(ggsci)
df_assists<-df1%>%
filter(ast >= sort(df1$ast,decreasing = T)[10])
df_assists
df_assists$rank<-1:dim(df_assists)[1]
df_assists$Name<-gsub(" ","\n",df_assists$player)
df_assists<-df_assists[-4,]
ggplot(df_assists,aes(x=reorder(rank,ast),y=ast,fill=Name))+
geom_bar(stat="identity")+ylim(0,10)+
theme_bw()+geom_text(aes(label=Name),hjust=0)+
geom_text(aes(label=ast),hjust=2)+
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
legend.position = "none")+
coord_flip()+
labs(title="",
caption="Source: https://www.basketball-reference.com\nAuthor: MingYan")+
scale_fill_ucscgb()
第一名:老鹰队的特雷杨
糟糕,怎么根据主成分来打分想不起来了!抽时间仔细看主成分的分析方法!