# Python代码和贝叶斯理论告诉你，谁是最好的棒球选手

Rasmus Bååth 的视频链接：（原文中的链接在转载时丢失）

“不论你了解与否，但棒球的魅力就在于精确度。没有其他运动像棒球这样完全依赖于运动数据的连续性，统计性和有序性。棒球球迷比注册会计师还要关注数字。”

——体育记者Jim Murray

1.数据

2.生成模型

3.先验概率

Fox Sports链接：

https://www.foxsports.com/mlb/stats

# --- Imports and plotting setup -------------------------------------------
# NOTE(review): the reposted article lost whitespace inside its import lines
# and dropped the numpy / matplotlib imports that the later code uses as
# `np.*` and `plt.*`; both are restored here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup

plt.style.use('fivethirtyeight')

# The next two lines are Jupyter-notebook magics; they are not valid in a
# plain .py script, so they are kept as comments for notebook users.
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

def batting_stats(url, season):
    """Scrape a Fox Sports MLB batting-stats page into a DataFrame.

    Parameters
    ----------
    url : str
        Fox Sports player-stats page URL.
    season : str
        'spring' for spring-training pages (one trailing summary row to
        drop); anything else for regular-season pages (two trailing rows).

    Returns
    -------
    pandas.DataFrame
        One row per season line, columns taken from the table header.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    # First stats table on the page (the scrape dropped the [0] index).
    table = soup.find_all("table", {"class": "wisbb_standardTable tablesorter"})[0]
    # NOTE(review): the line defining table_head was lost in the repost;
    # reconstructed from the page structure — confirm the class name.
    table_head = soup.find_all("thead", {"class": "wisbb_tableHeader"})[0]
    if season == 'spring':
        # Spring-training pages end with one summary row.
        row_height = len(table.find_all('tr')[:-1])
    else:
        # Regular-season pages end with two summary rows.
        row_height = len(table.find_all('tr')[:-2])
    result_df = pd.DataFrame(
        columns=[row.text.strip() for row in table_head.find_all('th')],
        index=range(0, row_height))

    # NOTE(review): the row-population loop and the return statement were
    # lost in the repost; reconstructed so the function actually yields the
    # DataFrame that every caller below expects.
    row_marker = 0
    for row in table.find_all('tr')[:row_height]:
        columns = row.find_all('td')
        for column_marker, column in enumerate(columns):
            result_df.iat[row_marker, column_marker] = column.text.strip()
        row_marker += 1
    return result_df

# Dominic Smith's spring-training stats page (seasonType=3 → spring training).
ds_url_st ="https://www.foxsports.com/mlb/dominic-smith-player-stats?seasonType=3"

dominic_smith_spring = batting_stats(ds_url_st,'spring')

# Most recent spring-training line (last row of the scraped table).
dominic_smith_spring.iloc[-1]

def posterior(n_try, k_success, prior, plot=True):
    """Approximate Bayesian computation (rejection sampling) for an AVG.

    For each candidate batting average p drawn from ``prior``, simulate
    ``n_try`` at-bats as Binomial(n_try, p) and keep p only when the
    simulated number of hits equals the observed ``k_success``.

    Parameters
    ----------
    n_try : int
        Number of at-bats observed.
    k_success : int
        Number of hits observed.
    prior : pandas.Series
        Draws from the prior over AVG (values in [0, 1]).
    plot : bool, optional
        When True (the default, matching the original behavior) draw a
        histogram of the accepted draws.

    Returns
    -------
    pandas.Series
        The prior draws accepted by the rejection step (the posterior
        sample).  The original code computed this but never returned it.
    """
    # Simulate one season's worth of at-bats for every candidate AVG.
    hits = [np.random.binomial(n_try, p) for p in prior]
    # Rejection step: keep only draws whose simulation matches the data.
    # (The original bound this to the name `posterior`, shadowing the
    # function itself; renamed for clarity.)
    accepted = prior[[h == k_success for h in hits]]
    if plot:
        plt.figure(figsize=(8, 5))
        plt.hist(accepted)
        plt.title('Posterior distribution')
        plt.xlabel('Posterior on AVG')
        plt.ylabel('Frequency')
    print('Number of draws left: %d, Posterior mean: %.3f, Posterior median: %.3f, Posterior 95%% quantile interval: %.3f-%.3f' %
          (len(accepted), accepted.mean(), accepted.median(),
           accepted.quantile(.025), accepted.quantile(.975)))
    return accepted

# Observed spring-training data for Dominic Smith: at-bats (AB, column 0)
# and hits (H, column 1) from the last row of the scraped table.
# (The repost dropped the [0] index on the first line.)
ds_n_trials = int(dominic_smith_spring[['AB', 'H']].iloc[-1][0])
ds_k_success = int(dominic_smith_spring[['AB', 'H']].iloc[-1][1])

# NOTE(review): the repost dropped the definition of the non-informative
# prior `prior_ni` used below; reconstructed as a flat Uniform(0, 1) prior
# over AVG — confirm against the original article.
n_draw = 20000
prior_ni = pd.Series(np.random.uniform(0, 1, size=n_draw))

posterior(ds_n_trials, ds_k_success, prior_ni)

# Last two rows of the spring-training table for inspection.
dominic_smith_spring.iloc[-2:]

Beta分布是一个连续概率分布，它有两个参数，alpha和beta。Beta分布最常见的用途之一是对一个实验的成功概率的不确定性进行建模。

Beta分布相关内容：

https://www.statlect.com/probability-distributions/beta-distribution

# Number of Monte Carlo draws from the prior.
n_draw =20000

# Build an informative prior from one earlier season line (table row 3):
# its at-bats and hits.
prior_trials =int(dominic_smith_spring.iloc[3].AB)
prior_success =int(dominic_smith_spring.iloc[3].H)

# Beta(successes+1, failures+1) — the conjugate prior for a Bernoulli AVG,
# centered on that season's observed batting average.
prior_i = pd.Series(np.random.beta(prior_success+1, prior_trials-prior_success+1, size = n_draw))

# Visualize the prior draws.
plt.figure(figsize=(8,5))
plt.hist(prior_i)
plt.title('Beta distribution(a=%d, b=%d)'% (prior_success+1,prior_trials-prior_success+1))
plt.xlabel('Prior on AVG')
plt.ylabel('Frequency')

# Rerun the ABC rejection step with the informative prior.
posterior(ds_n_trials, ds_k_success, prior_i)

# Dominic Smith's regular-season stats page (seasonType=1 → regular season).
ds_url = "https://www.foxsports.com/mlb/dominic-smith-player-stats?seasonType=1"

dominic_smith_reg = batting_stats(ds_url, 'regular')

# Stack the regular-season rows with the row-3 spring-training line.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (iloc[[3]] keeps it a DataFrame).
dominic_smith = pd.concat(
    [dominic_smith_reg, dominic_smith_spring.iloc[[3]]],
    ignore_index=True)

dominic_smith

# Pool at-bats and hits across all listed seasons for the career prior.
ds_prior_trials = pd.to_numeric(dominic_smith.AB).sum()
ds_prior_success = pd.to_numeric(dominic_smith.H).sum()

# Number of Monte Carlo draws from the pooled-career prior.
n_draw =20000

# Beta(successes+1, failures+1) prior built from the pooled career totals.
prior_i_02 = pd.Series(np.random.beta(ds_prior_success+1, ds_prior_trials-ds_prior_success+1, size = n_draw))

# Visualize the pooled prior draws.
plt.figure(figsize=(8,5))
plt.hist(prior_i_02)
plt.title('Beta distribution(a=%d, b=%d)'% (ds_prior_success+1,ds_prior_trials-ds_prior_success+1))
plt.xlabel('Prior on AVG')
plt.ylabel('Frequency')

# Rerun the ABC rejection step with the pooled-career prior.
posterior(ds_n_trials, ds_k_success, prior_i_02)

Pymc3链接：

https://github.com/pymc-devs/pymc3

HMC-NUTS链接：

http://blog.fastforwardlabs.com/2017/01/30/the-algorithms-behind-probabilistic-programming.html

# Gavin Cecchini's spring-training and regular-season stats pages.
gc_url_st = "https://www.foxsports.com/mlb/gavin-cecchini-player-stats?seasonType=3"
gc_url_reg = "https://www.foxsports.com/mlb/gavin-cecchini-player-stats?seasonType=1"

gavin_cecchini_spring = batting_stats(gc_url_st, 'spring')
gavin_cecchini_reg = batting_stats(gc_url_reg, 'regular')

# Observed spring-training data (table row 1): at-bats and hits.
gc_n_trials = int(gavin_cecchini_spring.iloc[1].AB)
gc_k_success = int(gavin_cecchini_spring.iloc[1].H)

# Prior data: one regular-season row plus one spring-training row.
# NOTE(review): the repost dropped the iloc index on the spring row; [1]
# matches the row used for the observed data above — confirm against the
# original article.  DataFrame.append was removed in pandas 2.0, so the
# two single-row frames are stacked with pd.concat instead.
gc_prior = pd.concat([
    pd.DataFrame(gavin_cecchini_reg.iloc[1]).transpose(),
    pd.DataFrame(gavin_cecchini_spring.iloc[1]).transpose(),
])

gc_prior

# Pool at-bats and hits across both rows for the prior.
gc_prior_trials = pd.to_numeric(gc_prior.AB).sum()
gc_prior_success = pd.to_numeric(gc_prior.H).sum()

def observed_data_generator(n_try, observed_data):
    """Expand (trials, successes) counts into a Bernoulli outcome vector.

    Parameters
    ----------
    n_try : int
        Total number of trials (at-bats).
    observed_data : int
        Number of successes (hits); must be <= n_try.

    Returns
    -------
    numpy.ndarray
        Length-``n_try`` array: ``observed_data`` ones followed by zeros,
        suitable as the ``observed`` argument of a Bernoulli likelihood.
    """
    result = np.ones(observed_data)
    fails = n_try - observed_data
    result = np.append(result, np.zeros(fails))
    return result

# Per-at-bat outcome vectors for each player, used as observed data below.
ds_observed = observed_data_generator(ds_n_trials,ds_k_success)

gc_observed = observed_data_generator(gc_n_trials,gc_k_success)

import pymc3 as pm

# Bayesian comparison of the two players' true batting averages.
with pm.Model() as model_a:
    # Beta priors from each player's pooled counts: Beta(hits+1, outs+1).
    D_p = pm.Beta('DS_AVG', ds_prior_success+1, ds_prior_trials-ds_prior_success+1)
    G_p = pm.Beta('GC_AVG', gc_prior_success+1, gc_prior_trials-gc_prior_success+1)
    # Likelihood: each at-bat is a Bernoulli trial with the player's AVG.
    DS = pm.Bernoulli('DS', p=D_p, observed=ds_observed)
    GC = pm.Bernoulli('GC', p=G_p, observed=gc_observed)
    # Difference of the two averages; > 0 means Smith is the better hitter.
    DvG = pm.Deterministic('DvG', D_p - G_p)
    start = pm.find_MAP()
    trace = pm.sample(10000, start=start)

# NOTE(review): the repost dropped the ref_val value; 0 is the natural
# reference line for a difference of averages — confirm against the
# original article.
pm.plot_posterior(trace, varnames=['DS_AVG', 'GC_AVG', 'DvG'], ref_val=0)

可信区间（credible interval）相关内容：

http://www.sumsar.net/blog/2014/10/probable-points-and-credible-intervals-part-one/

# Tabular summary (mean, sd, credible interval, ...) for each sampled variable.
pm.summary(trace)

https://github.com/tthustla/Bayesball/blob/master/Bayesball.ipynb

https://towardsdatascience.com/bayesball-bayesian-analysis-of-batting-average-102e0390c0e4

【今日机器学习概念】

Have a Great Definition

• 发表于:
• 原文链接：http://kuaibao.qq.com/s/20180418A0LBT300?refer=cp_1026
• 腾讯「云+社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 yunjia_community@tencent.com 删除。

2022-01-18

2018-04-13

2018-04-12

2022-01-18

2022-01-18

2022-01-18

2018-06-14

2022-01-18

2022-01-18

2022-01-18

2022-01-18

2022-01-18