# 【年度系列】监督学习标签在股市中的应用（代码+书籍）

（书籍在文末免费下载）

Pn和P0代表未来和当前的股票价格：

import numpy as np

# Simulate n_samples independent Gaussian random walks of length N,
# each step drawn with drift `mu` and volatility `sig`.
mu = 0.1
sig = 1.
N = 400
n_samples = 1000
x = np.arange(N)
# Draw all increments in one call and cumulate along the time axis —
# equivalent to appending per-sample cumsums in a Python loop, but vectorized.
steps = np.random.normal(mu, sig, size=(n_samples, N))
samples = np.cumsum(steps, axis=1)
# Cross-sectional mean / std of the walks at each time step.
r_mu = np.mean(samples, axis=0)
r_sig = np.std(samples, axis=0)

def get_t1(close, timestamps, num_days):
    """Map each event timestamp to its vertical barrier.

    The barrier is the first bar of `close` at or after `num_days`
    calendar days past the timestamp. Events whose barrier would fall
    beyond the end of `close` are dropped.

    Returns a Series: index = event timestamps, values = barrier times.
    """
    horizon = timestamps + pd.Timedelta(days=num_days)
    positions = close.index.searchsorted(horizon)
    valid = positions[positions < close.shape[0]]
    return pd.Series(close.index[valid], index=timestamps[:valid.shape[0]])

# Build 10-day vertical barriers for every bar of `close`
# (`close` is loaded earlier in the article's data-preparation step).
timestamps = close.index
num_days = 10
t1 = get_t1(close, timestamps, num_days)
Output:```
```Date
2000-01-03   2000-01-13
2000-01-04   2000-01-14
2000-01-05   2000-01-18
2000-01-06   2000-01-18
2000-01-07   2000-01-18
Name: Date, dtype: datetime64[ns]```

```def get_touch_idx(close, events, sltp, molecule=None):
# Sample a subset with specific indices
if molecule is not None:
_events = events.loc[molecule]
else:
_events = events
touch_idx = pd.DataFrame(index=_events.index)
# Set Stop Loss and Take Profoit
if sltp[0] > 0:
sls = -sltp[0] * _events["trgt"]
else:
# Switch off stop loss
sls = pd.Series(index=_events.index)
if sltp[1] > 0:
tps = sltp[1] * _events["trgt"]
else:
# Switch off profit taking
tps = pd.Series(index=_events.index)
# Replace undefined value with the last time index
vertical_lines = _events["t1"].fillna(close.index[-1])
for loc, t1 in vertical_lines.iteritems():
df = close[loc:t1]
# Change the direction depending on the side
df = (df / close[loc] - 1) * _events.at[loc, 'side']
touch_idx.at[loc, 'sl'] = df[df < sls[loc]].index.min()
touch_idx.at[loc, 'tp'] = df[df > tps[loc]].index.min()
touch_idx['t1'] = _events['t1'].copy(deep=True)

get_touch_idx 用于获取未来价格在何时触及以及触及哪一种 barrier（止损、止盈或时间边界）。

```import pandas as pd
from finance_ml.multiprocessing import mp_pandas_obj

def get_events(close, timestamps, sltp, trgt, min_ret=0, t1=None, side=None):
    """Build the triple-barrier event table.

    NOTE(review): the published snippet's signature was cut off after
    `min_ret=0,`; `t1` and `side` are restored as keyword arguments —
    both are used in the body, and the `is None` checks below imply a
    default of None for each.

    Parameters
    ----------
    close : pd.Series of prices indexed by time.
    timestamps : index of event start times.
    sltp : stop-loss / take-profit multipliers (scalar or 2-sequence).
    trgt : pd.Series of target widths for the horizontal barriers.
    min_ret : minimum target width required to keep an event.
    t1 : pd.Series of vertical barriers, optional (NaT = no barrier).
    side : pd.Series of position sides, optional (default LONG).

    Returns
    -------
    pd.DataFrame with 't1' (first touch time), 'trgt', 'type' (which
    barrier was touched) and, when `side` was given, 'side'.
    """
    # Get sampled target values, dropping events below the minimum width
    trgt = trgt.loc[timestamps]
    trgt = trgt[trgt > min_ret]
    if len(trgt) == 0:
        # No event survives the filter
        return pd.DataFrame(columns=['t1', 'trgt', 'side'])
    # Get time boundary t1 (NaT means no vertical barrier)
    if t1 is None:
        t1 = pd.Series(pd.NaT, index=timestamps)
    # sltp has to be either a number, a list or a tuple
    if isinstance(sltp, (list, tuple)):
        _sltp = sltp[:2]
    else:
        _sltp = [sltp, sltp]
    # Define the side; default is LONG
    if side is None:
        _side = pd.Series(1, index=trgt.index)
    else:
        _side = side.loc[trgt.index]
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1)
    events = events.dropna(subset=['trgt'])
    time_idx = mp_pandas_obj(func=get_touch_idx,
                             pd_obj=('molecule', events.index),
                             close=close, events=events, sltp=_sltp)
    # Skip events where no barrier is touched at all
    time_idx = time_idx.dropna(how='all')
    # The earliest touch determines the label type and the event end
    events['type'] = time_idx.idxmin(axis=1)
    events['t1'] = time_idx.min(axis=1)
    if side is None:
        events = events.drop('side', axis=1)
    return events
from finance_ml.stats import get_daily_vol

# Daily volatility defines the horizontal-barrier width (trgt).
vol = get_daily_vol(close)
print('volatility')

# NOTE(review): this call was truncated in the published article;
# completed minimally with the closing parenthesis.
events = get_events(close, timestamps, [2, 2], vol, min_ret=0)
print('events')
```volatility
Date
2000-01-04         NaN
2000-01-05    0.031374
2000-01-06    0.025522
2000-01-10    0.024588
2000-01-11    0.022054
Name: Close, dtype: float64

events
t1      trgt type
Date
2000-01-05 2000-01-12  0.031374   sl
2000-01-06 2000-01-18  0.025522   t1
2000-01-10 2000-01-12  0.024588   sl
2000-01-11 2000-01-18  0.022054   tp
2000-01-12 2000-01-14  0.020946   tp```

get_events 在内部使用 get_touch_idx 并获取标签。

Output, events, contains the following: t1, when the barrier is touched; trgt, the scale used to define the horizontal barriers; type, which barrier is touched.

def get_sizes(close, events, sign_label=True):
    """Compute event returns and label sizes from triple-barrier events.

    Parameters
    ----------
    close : pd.Series of prices indexed by time.
    events : pd.DataFrame with 't1' (event end) and 'type' (touched
        barrier) columns; optionally 'side' for meta-labelling.
    sign_label : bool
        If True, the label is the sign of the return (zero returns are
        labelled 1). If False, events ending on the vertical barrier
        ('t1') are labelled 0 instead.

    Returns
    -------
    pd.DataFrame with 'ret' and 'size' columns ('side' too when present).
    """
    # Prices aligned with events
    events = events.dropna(subset=['t1'])
    # All indices used by either event starts or event ends
    time_idx = events.index.union(events['t1'].values).drop_duplicates()
    close = close.reindex(time_idx, method='bfill')
    # Create the output frame
    out = pd.DataFrame(index=events.index)
    out['ret'] = close.loc[events['t1'].values].values / close.loc[
        events.index].values - 1.
    if 'side' in events:
        # Meta-labelling: evaluate the return of the suggested side
        out['ret'] *= events['side']
        out['side'] = events['side']
    # Assign the sign label once (the original did this twice)
    out['size'] = np.sign(out['ret'])
    if sign_label:
        # Zero returns count as the positive class
        out.loc[out['ret'] == 0, 'size'] = 1.
    else:
        # 0 when touching the vertical barrier; index the frame with .loc
        # instead of chained assignment (SettingWithCopy-safe)
        out.loc[events['type'] == 't1', 'size'] = 0
    if 'side' in events:
        # A non-positive return means the suggested bet should be skipped
        out.loc[out['ret'] <= 0, 'size'] = 0
    return out

# Generate sign labels from the triple-barrier events built above.
labels = get_sizes(close, events, sign_label=True)
```ret  size
Date
2000-01-05 -0.070293  -1.0
2000-01-06  0.048273   1.0
2000-01-10 -0.057372  -1.0
2000-01-11  0.054311   1.0
2000-01-12  0.060864   1.0```

# Separate data time stamps
def get_partial_index(df, start=None, end=None):
    """Return the index of `df` restricted to the inclusive window [start, end].

    Either bound may be None, meaning that side is unbounded.
    """
    idx = df.index
    if start is not None:
        idx = idx[idx >= start]
    if end is not None:
        idx = idx[idx <= end]
    return idx

# Chronological train/test split (no shuffling — avoids look-ahead leakage).
# `df` is the data frame defined earlier in the article.
train_end = '2017-08-31'
test_start = '2017-09-01'
train_idx = get_partial_index(df, end=train_end)
test_idx = get_partial_index(df, start=test_start)

def generate_features(close, volume, label, timestamps, timelag):
    """Build lagged, scale-free price/volume features aligned with labels.

    Each feature is a past value divided by the current value, so the
    model sees ratios rather than raw levels. Rows with NaN (initial
    lags) and timestamps without a label are dropped.

    Returns
    -------
    (X, y) : feature matrix and int labels with -1 mapped to 0, 1 to 1.
    """
    index = close.index
    data = []
    for i in range(1, timelag):
        # Normalize by the current value
        data.append(close.shift(i).values / close.values)
        data.append(volume.shift(i).values / volume.values)

    features = pd.DataFrame(np.stack(data, axis=1), index=index)
    features = features.loc[timestamps].dropna()
    label = label.dropna()
    # Index.__and__ is deprecated/removed in modern pandas; use intersection
    time_idx = features.index.intersection(label.index)
    y = label.loc[time_idx].values
    # NOTE(review): a label of 0 (possible when sign_label=False or with
    # meta-labelling) would raise KeyError here; only -1/+1 are expected.
    label_map = {-1: 0, 1: 1}
    y = np.array([label_map[y_i] for y_i in y]).astype(int)
    X = features.loc[time_idx].values
    return X, y

# Use a 30-step lookback window of lagged close/volume ratios as features.
timelag = 30
train_X, train_y = generate_features(close, volume, labels['size'], train_idx, timelag=timelag)
test_X, test_y = generate_features(close, volume, labels['size'], test_idx, timelag=timelag)```

Note that close and volume features are normalized with the current value. Intuitively, the scales of close and volume themselves do not have any meanings. The value in comparison to the current close and value are rather essential information. This normalization allows you to build models irrelevant to the scales.

```import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tdata
import torch.optim as optim
from sklearn.metrics import accuracy_score

from torch_utils.datasets import NumpyDataset
from torch_utils.training import train_step, test_step

# Network dimensions: one feature column per lag, one logit output
# for binary classification (paired with BCE-with-logits below).
input_dim = train_X.shape[1]
output_dim = 1

class Net(nn.Module):
    """Small batch-normalized MLP emitting one logit for binary classification.

    Reads module-level `input_dim` / `output_dim` (defined from the
    training data earlier in the script) when instantiated.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dim, 16)
        self.bn1 = nn.BatchNorm1d(16)
        self.fc2 = nn.Linear(16, 8)
        self.bn2 = nn.BatchNorm1d(8)
        self.fc3 = nn.Linear(8, output_dim)

    def forward(self, x):
        """Return raw logits of shape (batch, output_dim)."""
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x

    def predict(self, x, threshold=.5):
        """Return boolean class predictions at the given probability threshold."""
        x = self.forward(x)
        # F.sigmoid is deprecated; torch.sigmoid is the supported spelling
        x = torch.sigmoid(x)
        return x > threshold

batch_size = 32
# NOTE(review): the DataLoader construction statements were destroyed by
# the article scrape; only their trailing keyword arguments survive below.
batch_size=batch_size, shuffle=True)
batch_size=batch_size)

n_epochs = 1000
model = Net()
# BCE-with-logits pairs with Net's raw single-logit output
loss_func = F.binary_cross_entropy_with_logits
score_func = accuracy_score
for i in range(n_epochs):
# NOTE(review): the train_step(...) call was truncated here; only its
# trailing keyword arguments remain.
loss_func=loss_func, score_func=score_func,
epoch=i, log_interval=0, silent=True)
if i % 100 == 0:
# NOTE(review): the periodic logging body was lost in the scrape.

# eval() switches BatchNorm to running statistics before inference
model.eval()
output = model.predict(torch.tensor(test_X).float())
accuracy = accuracy_score(test_y, output)
print(f'Test Accuracy: {accuracy:.4g}')

Test Accuracy: 0.5229```

测试准确率只有 0.52 左右，可能的原因如下：1、数据量少。

2、需要选择合适的模型。

Tomoaki

295 篇文章139 人订阅

0 条评论

## 相关文章

982

### 这是一份开光的课程 |《神经网络》中文字幕版（1.3 & 1.4）

《Neural Networks for Machine Learning》（机器学习中的神经网络）系列课程，是深度学习大神 Geoffrey Hinton 毕...

2897

2605

5141

5959

6944

1113

1355

### 用keras对国产剧评论文本的情感进行预测

RNN即循环神经网络，其主要用途是处理和预测序列数据。在CNN中，神经网络层间采用全连接的方式连接，但层内节点之间却无连接。RNN为了处理序列数据，层内节点的输...

4265

992