TVP

# 基于Markov Chain Monte Carlo的智能手表睡眠数据分析

9.基于Markov Chain Monte Carlo的智能手表睡眠数据分析

8.按主题抓取多个网页及建立自己的数据集

7.Jupyter Notebook/Lab中添加R Kernel的详细步骤

6.Web信息爬取 | 详解 + Reddit等2个案例实践

5.基于MovieLens的影评趋势分析|详解

4.Windows和PC机上搭建Spark+Python开发环境的详细步骤

3.Jupyter Notebook/Lab中添加R Kernel的详细步骤

2.盘点数据科学领域常用的Python库

1.如何用Python学习数据科学

1

1.1 数据准备

import pandas as pd

import numpy as np

import scipy

from scipy import stats

import pymc3 as pm

import theano.tensor as tt

import scipy

from scipy import optimize

import matplotlib.pyplot as plt

%matplotlib inline

from IPython.core.pylabtools import figsize

import matplotlib

import json

matplotlib.rcParams.update(s)

matplotlib.rcParams['figure.figsize'] = (10, 3)

matplotlib.rcParams['font.size'] = 14

matplotlib.rcParams['ytick.major.size'] = 20

# Markov Chain Monte Carlo设置初始样本数量

N_SAMPLES = 5000

# 读入数据

# 设置绘图坐标信息

sleep_labels = ['9:00', '9:30', '10:00', '10:30', '11:00', '11:30', '12:00']

wake_labels = ['5:00', '5:30', '6:00', '6:30', '7:00', '7:30', '8:00']

1.2入睡数据

print('Number of sleep observations %d' % len(sleep_data))

Number of sleep observations 11340

figsize(16, 6)

# 入睡数据

plt.scatter(sleep_data['time_offset'], sleep_data['indicator'],

s= 60, alpha=0.01, facecolor = 'b', edgecolors='b')

plt.yticks([0, 1], ['Awake', 'Asleep']); plt.xlabel('PM Time');

plt.title('Falling Asleep Data', size = 18)

plt.xticks([-60, -30, 0, 30, 60, 90, 120], sleep_labels);

1.3 活动数据

# Wake data

plt.scatter(wake_data['time_offset'], wake_data['indicator'],s= 50, alpha = 0.01, facecolor='r', edgecolors = 'r');

plt.yticks([0, 1], ['Awake', 'Asleep']);

plt.xlabel('AM Time');

plt.title('Waking Up Data')

plt.xticks([-60, -30, 0, 30, 60, 90, 120], wake_labels);

1.4 选择概率分布

figsize(16, 6)

def logistic(x, beta):

return 1. / (1. + np.exp(beta * x))

x = np.linspace(-5, 5, 1000)

for beta in [-5, -1, 0.5, 1, 5]:

plt.plot(x, logistic(x, beta), label = r"$\beta$ = %.1f" % beta)

plt.legend();

plt.title(r'Logistic Function with Different $\beta$ values');

figsize(20, 8)

def logistic(x, beta, alpha=0):

return 1.0 / (1.0 + np.exp(np.dot(beta, x) + alpha))

x = np.linspace(-4, 6, 1000)

plt.plot(x, logistic(x, beta=1), label=r"$\beta = 1, \alpha = 0$", ls="--", lw=2)

plt.plot(x, logistic(x, beta=-1), label=r"$\beta = -1 \alpha = 0$", ls="--", lw=2)

plt.plot(x, logistic(x, -1, 1),

label=r"$\beta = -1, \alpha = 1$", color="darkblue")

plt.plot(x, logistic(x, -1, -1),

label=r"$\beta = -1, \alpha = -1$",color="skyblue")

plt.plot(x, logistic(x, -2, 5),

label=r"$\beta = -2, \alpha = 5$", color="orangered")

plt.plot(x, logistic(x, -2, -5),

label=r"$\beta = -2, \alpha = -5$", color="darkred")

plt.legend(); plt.ylabel('Probability'); plt.xlabel('t')

plt.title(r'Logistic Function with Varying $\beta$ and $\alpha$');

2

Markov Chain Monte Carlo（MCMC）

2.1 参数β和α的先验分布

figsize(20, 8)

nor = stats.norm

x= np.linspace(-10, 10, 1000)

mu = (-5, 0, 4)

tau = (0.5, 1, 2.5)

colors = ("forestgreen", "navy", "darkred")

params = zip(mu, tau, colors)

for param in params:

y = nor.pdf(x, loc = param[0], scale = 1 / param[1])

plt.plot(x, y,

label="$\mu = %d,\;\\tau = %.1f$" % (param[0], param[1]),

color = param[2])

plt.fill_between(x, y, color = param[2], alpha = 0.3)

plt.legend(prop={'size':18});

plt.xlabel("$x$")

plt.ylabel("Probability Density", size = 18)

plt.title("Normal Distributions", size = 20);

2.2 参数搜索空间

%matplotlib inline

import scipy.stats as stats

from IPython.core.pylabtools import figsize

import numpy as np

figsize(16, 8)

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

jet = plt.cm.jet

fig = plt.figure()

x = y = np.linspace(0, 5, 100)

X, Y = np.meshgrid(x, y)

plt.subplot(121)

norm_x = stats.norm.pdf(x, loc=0, scale=1)

norm_y = stats.norm.pdf(y, loc=0, scale=1)

M = np.dot(norm_x[:, None], norm_y[None, :])

im = plt.imshow(M, interpolation='none', origin='lower',

cmap=jet)

plt.xlim(0, 40)

plt.ylim(0, 40)

plt.title("Parameter Search Space for Normal Priors.")

ax.plot_surface(X, Y, M, cmap=plt.cm.jet)

ax.view_init(azim=390)

plt.title("Parameter Search Space for Normal Priors");

2.3 具体算法

3

p(ti)是具有时间自变量的逻辑回归函数，因此以上公式变为：

MCMC的目标就是在使用数据集、假定正态先验分布的基础上找到参数alpha和beta的值。

3.1 睡眠模型

# 根据时间偏移进行排序

sleep_data.sort_values('time_offset', inplace=True)

# 设置时间

time = np.array(sleep_data.loc[:, 'time_offset'])

# 设置时间观测点

sleep_obs = np.array(sleep_data.loc[:, 'indicator'])

with pm.Model() as sleep_model:

# 创建参数alpha和beta

alpha = pm.Normal('alpha', mu=0.0, tau=0.01, testval=0.0)

beta = pm.Normal('beta', mu=0.0, tau=0.01, testval=0.0)

# 根据逻辑函数创建概率

p = pm.Deterministic('p', 1. / (1. + tt.exp(beta * time + alpha)))

# 通过观测数据创建伯努利参数

observed = pm.Bernoulli('obs', p, observed=sleep_obs)

# 通过最大化后验估计设置初始值

# start = pm.find_MAP()

# 设置采样算法

step = pm.Metropolis()

# 进行采样

sleep_trace = pm.sample(N_SAMPLES, step=step, njobs=2);

# 提取样本

alpha_samples = sleep_trace["alpha"][5000:, None]

beta_samples = sleep_trace["beta"][5000:, None]

figsize(16, 10)

plt.subplot(211)

plt.title(r"""Distribution of $\alpha$ with %d samples""" % N_SAMPLES)

plt.hist(alpha_samples, histtype='stepfilled', color = 'darkred', bins=30, alpha=0.8);

plt.ylabel('Probability Density')

plt.subplot(212)

plt.title(r"""Distribution of $\beta$ with %d samples""" % N_SAMPLES)

plt.hist(beta_samples, histtype='stepfilled', color = 'darkblue', bins=30, alpha=0.8)

plt.ylabel('Probability Density');

# 设置概率预测的时间值

time_est = np.linspace(time.min()- 15, time.max() + 15, 1e3)[:, None]

# 取样本平均值

alpha_est = alpha_samples.mean()

beta_est = beta_samples.mean()

# 计算概率

sleep_est = logistic(time_est, beta=beta_est, alpha=alpha_est)

figsize(16, 6)

plt.plot(time_est, sleep_est, color = 'navy',

lw=3, label="Most Likely Logistic Model")

plt.scatter(time, sleep_obs, edgecolor = 'slateblue',

s=50, alpha=0.2, label='obs')

plt.title('Probability Distribution for Sleep with %d Samples' % N_SAMPLES);

plt.legend(prop={'size':18})

plt.ylabel('Probability')

plt.xlabel('PM Time');

plt.xticks([-60, -30, 0, 30, 60, 90, 120], sleep_labels);

print('The probability of sleep increases to above 50% at 10:{} PM.'.format(int(time_est[np.where(sleep_est > 0.5)[0][0]][0])))

The probability of sleep increases to above 50% at 10:14 PM.

colors = ["#348ABD", "#A60628", "#7A68A6"]

cmap = matplotlib.colors.LinearSegmentedColormap.from_list("BMH", colors)

figsize(12, 6)

probs = sleep_trace['p']

plt.scatter(time, probs.mean(axis=0), cmap = cmap,

c = probs.mean(axis=0), s = 50);

plt.title('Probability of Sleep as Function of Time')

plt.xlabel('PM Time');

plt.ylabel('Probability');

plt.xticks([-60, -30, 0, 30, 60, 90, 120], sleep_labels);

print('10:00 PM probability of being asleep: {:.2f}%.'.

format(100 * logistic(0, beta_est, alpha_est)))

print('9:30 PM probability of being asleep: {:.2f}%.'.

format(100 * logistic(-30, beta_est, alpha_est)))

print('10:30 PM probability of being asleep: {:.2f}%.'.

format(100 * logistic(30, beta_est, alpha_est)))

10:00 PM probability of being asleep: 27.41%.

9:30 PM probability of being asleep: 4.80%.

10:30 PM probability of being asleep: 73.90%.

plt.fill_between(time_est[:, 0], *quantiles, alpha=0.6,

color='slateblue', label = '95% CI')

plt.plot(time_est, sleep_est, lw=2, ls='--',

color='black', label="average posterior \nprobability of sleep")

plt.xticks([-60, -30, 0, 30, 60, 90, 120], sleep_labels);

plt.scatter(time, sleep_obs, edgecolor = 'skyblue', s=50, alpha=0.1);

plt.legend(prop={'size':14})

plt.xlabel('PM Time'); plt.ylabel('Probability');

plt.title('Posterior Probabilty with 95% CI');

def sleep_posterior(time_offset, time):

figsize(16, 8)

prob = logistic(time_offset, beta_samples, alpha_samples)

plt.hist(prob, bins=100, histtype='step', lw=4)

plt.title('Probability Distribution for Sleep at %s PM' % time)

plt.xlabel('Probability of Sleep'); plt.ylabel('Samples')

plt.show();

sleep_posterior(0, '10:00')

sleep_posterior(-30, '9:30')

sleep_posterior(30, '10:30')

print('Most likely alpha parameter: {:.6f}.'.format(alpha_est))

print('Most likely beta parameter: {:.6f}.'.format(beta_est))

Most likely alpha parameter: 0.973708.

Most likely beta parameter: -0.067142.

figsize(12, 6)

# alpha的轨迹

plt.subplot(211)

plt.title(r'Trace of $\alpha$')

plt.plot(alpha_samples, color = 'darkred')

plt.xlabel('Samples'); plt.ylabel('Parameter');

# beta的轨迹

plt.subplot(212)

plt.title(r'Trace of $\beta$')

plt.plot(beta_samples, color='b')

plt.xlabel('Samples'); plt.ylabel('Parameter');

figsize(20, 12)

pm.traceplot(sleep_trace, ['alpha', 'beta']);

pm.autocorrplot(sleep_trace, ['alpha', 'beta']);

3.2 活动模型

wake_data.sort_values('time_offset', inplace=True)

time = np.array(wake_data.loc[:, 'time_offset'])

wake_obs = np.array(wake_data.loc[:, 'indicator'])

with pm.Model() as wake_model:

alpha = pm.Normal('alpha', mu=0.0, tau=0.01, testval=0.0)

beta = pm.Normal('beta', mu=0.0, tau=0.01, testval=0.0)

p = pm.Deterministic('p', 1. / (1. + tt.exp(beta * time + alpha)))

observed = pm.Bernoulli('obs', p, observed=wake_obs)

step = pm.Metropolis()

wake_trace = pm.sample(N_SAMPLES, step=step, njobs=2);

alpha_samples = wake_trace["alpha"][5000:, None]

beta_samples = wake_trace["beta"][5000:, None]

time_est = np.linspace(time.min()- 15, time.max() + 15, 1e3)[:, None]

alpha_est = alpha_samples.mean()

beta_est = beta_samples.mean()

wake_est = logistic(time_est, beta=beta_est, alpha=alpha_est)

figsize(16, 6)

plt.plot(time_est, wake_est, color = 'darkred', lw=3, label="average posterior \nprobability of wake")

plt.scatter(time, wake_obs, edgecolor = 'r', facecolor = 'r', s=50, alpha=0.05, label='obs')

plt.title('Posterior Probability of Wake with %d Samples' % N_SAMPLES);

plt.legend(prop={'size':14})

plt.ylabel('Probability')

plt.xlabel('AM Time');

plt.xticks([-60, -30, 0, 30, 60, 90, 120], wake_labels);

print('The probability of being awake passes 50% at 6:{} AM.'.format(int(time_est[np.where(wake_est

The probability of being awake passes 50% at 6:11 AM.

colors = ["#348ABD", "#A60628", "#7A68A6"]

cmap = matplotlib.colors.LinearSegmentedColormap.from_list("BMH", colors)

figsize(12, 6)

probs = wake_trace['p']

plt.scatter(time, probs.mean(axis=0), cmap = cmap, c = probs.mean(axis=0), s = 50);

plt.title('Probability of Sleep as Function of Time')

plt.xlabel('AM Time');

plt.ylabel('Probability');

plt.xticks([-60, -30, 0, 30, 60, 90, 120], wake_labels);

print('Probability of being awake at 5:30 AM: {:.2f}%.'.

format(100 - (100 * logistic(-30, beta=beta_est, alpha=alpha_est))))

print('Probability of being awake at 6:00 AM: {:.2f}%.'.

format(100 - (100 * logistic(0, beta=beta_est, alpha=alpha_est))))

print('Probability of being awake at 6:30 AM: {:.2f}%.'.

format(100 - (100 * logistic(30, beta=beta_est, alpha=alpha_est))))

Probability of being awake at 5:30 AM: 14.07%.

Probability of being awake at 6:00 AM: 37.89%.

Probability of being awake at 6:30 AM: 69.46%.

3.3 睡眠时长模型

raw_data['length'] = 8 - (raw_data['Sleep'] / 60) + (raw_data['Wake'] / 60)

duration = raw_data['length']

figsize(10, 8)

plt.hist(duration, bins = 20, color = 'darkred')

plt.xlabel('Hours'); plt.title('Length of Sleep Distribution');

plt.ylabel('Observations');

with pm.Model() as duration_model:

# 三个参数进行采样

alpha_skew = pm.Normal('alpha_skew', mu=0, tau=0.5, testval=3.0)

mu_ = pm.Normal('mu', mu=0, tau=0.5, testval=7.4)

tau_ = pm.Normal('tau', mu=0, tau=0.5, testval=1.0)

# 时长是一个确定性变量

duration_ = pm.SkewNormal('duration', alpha = alpha_skew, mu = mu_,sd = 1/tau_, observed = duration)

# 进行采样

step = pm.Metropolis()

duration_trace = pm.sample(N_SAMPLES, step=step)

# 从采样中提取最大可能性估计

alpha_skew_samples = duration_trace['alpha_skew'][5000:]

mu_samples = duration_trace['mu'][5000:]

tau_samples = duration_trace['tau'][5000:]

alpha_skew_est = alpha_skew_samples.mean()

mu_est = mu_samples.mean()

tau_est = tau_samples.mean()

x = np.linspace(6, 12, 1000)

y = stats.skewnorm.pdf(x, a = alpha_skew_est, loc=mu_est, scale=1/tau_est)

plt.plot(x, y, color = 'forestgreen')

plt.fill_between(x, y, color = 'forestgreen', alpha = 0.2);

plt.xlabel('Hours'); plt.ylabel('Probability');

plt.title('Posterior Distribution for Duration of Sleep');

plt.vlines(x = x[np.argmax(y)], ymin=0, ymax=y.max(),

linestyles='--', linewidth=2, color='red',

label = 'Most Likely Duration');

print('The most likely duration of sleep is {:.2f} hours.'.format(x[np.argmax(y)]))

print('Probability of at least 6.5 hours of sleep = {:.2f}%.'.format(100 * (1 - stats.skewnorm

.cdf(6.5, a = alpha_skew_est, loc = mu_est, scale = 1/tau_est))))

print('Probability of at least 8.0 hours of sleep = {:.2f}%.'.format(100 * (1 - stats.skewnorm

.cdf(8.0, a = alpha_skew_est, loc = mu_est, scale = 1/tau_est))))

print('Probability of at least 9.0 hours of sleep = {:.2f}%.'.format(100 * (1 - stats.skewnorm

.cdf(9.0, a = alpha_skew_est, loc = mu_est, scale = 1/tau_est))))

Probability of at least 6.5 hours of sleep = 99.07%.

Probability of at least 8.0 hours of sleep = 44.12%.

Probability of at least 9.0 hours of sleep = 11.03%.

x = np.linspace(6, 12, 1000)

y = stats.skewnorm.pdf(x, a = alpha_skew_est, loc=mu_est, scale=1/tau_est)

figsize(10, 8)

# Plot the posterior distribution

plt.plot(x, y, color = 'forestgreen',

label = 'Model', lw = 3)

plt.fill_between(x, y, color = 'forestgreen', alpha = 0.2);

# Plot the observed values

plt.hist(duration, bins=10, color = 'red', alpha=0.8,

label='Observed', normed=True)

plt.xlabel('Hours'); plt.ylabel('Probability');

plt.title('Duration Model');

plt.vlines(x = x[np.argmax(y)], ymin=0, ymax=y.max(),

linestyles='--', linewidth=2, color='k',

label = 'Most Likely Duration');

plt.legend(prop={'size':12});

4

END

• 发表于:
• 原文链接http://kuaibao.qq.com/s/20180416G08KF900?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2018-03-26

2022-11-30

2022-11-30

2022-11-30

2022-11-30

2022-11-30

2022-11-30

2022-11-30

2022-11-30

2022-11-30

2022-11-30