paddlepaddle如何预加载embedding向量

Gxjun

发布于 2019-12-10 16:33:11

9910

发布于 2019-12-10 16:33:11

文章被收录于专栏：ml

使用小批量数据时，模型容易过拟合，所以需要对全量数据进行处理。我使用的是 word2vec 训练的词向量，那么训练好的词向量如何加载到 paddlepaddle 的 embedding 层中呢？

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   Version     :       None
   File Name   :       paddle_load_w2v
   Description :       None
   Author      :       gongxijun
   Email       :      
   date        :       2019-12-04
-------------------------------------------------
   Change Activity:
                   2019-12-04:
-------------------------------------------------
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

__author__ = 'gongxijun'
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.nets as nets
import numpy as np
import math
import codecs
from huangdao.dataset import data_feeder


def load_parameter(file_name):
    """Load word vectors from a word2vec text-format file.

    The first line of the file must hold two integers,
    ``vocab_size vector_size``. Each of the following ``vocab_size`` lines
    holds a token followed by ``vector_size`` space-separated numbers.

    :param file_name: path of the UTF-8 encoded vector file.
    :return: ``(words, embeddings)``, two lists of equal length;
        ``embeddings[i]`` is a 1-D ``float32`` array holding the vector of
        ``words[i]``. A row whose token is empty is stored as ``"unk"``.
    :raises AssertionError: if a row does not hold exactly ``vector_size``
        values.
    :raises ValueError: if the header or a vector component is not numeric.
    """
    words = []
    embeddings = []
    with codecs.open(file_name, 'r', encoding="utf8") as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        for _ in range(vocab_size):
            # Trim the right-hand side only: stripping leading spaces would
            # shift the first number into the token slot when the token is
            # empty, which made the "unk" fallback below unreachable.
            fields = f.readline().rstrip("\r\n").rstrip(" ").split(' ')
            word = fields[0]
            vector = fields[1:]
            words.append(word if word else "unk")
            assert len(vector) == vector_size, "{} {}".format(len(vector), vector_size)
            # Parse the text tokens into numbers; an array of strings cannot
            # be used as embedding weights.
            embeddings.append(
                np.array([float(value) for value in vector], dtype=np.float32))
    assert len(words) == len(embeddings)
    return words, embeddings


# Shape of the embedding table: these two values are passed together as
# shape=(word_dict_len, word_dim) to the get_embedding call further down.
# NOTE(review): presumably they must equal the "vocab_size vector_size" header
# of the vector file that is loaded afterwards -- confirm against that file.
word_dict_len = 74378
word_dim = 128


def get_embedding(name, shape, is_sparse=True, dtype='int64'):
    """
    Build an id input layer, look it up in an embedding table and feed the
    result through a fully connected layer.

    :param name: name of the input data layer; the embedding parameter is
        named "embedding_<name>" so it can be found in the scope later.
    :param shape: must be (a,b); forwarded unchanged as the ``size`` of
        ``layers.embedding``, and ``b`` is also the output size of the fc layer.
    :param is_sparse: bool, forwarded to ``layers.embedding``.
    :param dtype: dtype of the id input layer.
    :return: output of the fc layer applied to the embedding lookup.
    :raises AssertionError: if ``shape`` does not have exactly two entries.
    """
    # Validate before touching the program: checking after layers.data would
    # leave a stray input variable registered when the shape is rejected.
    assert len(shape) == 2, '{} must equal 2'.format(len(shape))
    alias_id = layers.data(name=name, shape=[1], dtype=dtype)
    alias_emb = layers.embedding(input=alias_id, size=shape,
                                 param_attr=fluid.param_attr.ParamAttr(name="embedding_{}".format(name)),
                                 is_sparse=is_sparse)
    alias_fc = layers.fc(input=alias_emb, size=shape[1])
    return alias_fc


words_emb = get_embedding("words", shape=(word_dict_len, word_dim))

place = fluid.CPUPlace()
exe = fluid.Executor(place)

# Run the startup program before looking up "embedding_words" in the global
# scope; the tensor fetched below is then overwritten with the loaded vectors.
exe.run(fluid.default_startup_program())
embedding_param = fluid.global_scope().find_var(
    "embedding_words").get_tensor()
words, embeddings = load_parameter("/Users/gongxijun/data/item2vec.txt")
# load_parameter returns a list of per-word vectors; stack them into a single
# float32 matrix before handing them to the tensor.
embedding_matrix = np.asarray(embeddings).astype(np.float32)
# Refuse to overwrite the parameter with a matrix whose shape differs from the
# one the embedding layer was declared with.
if embedding_matrix.shape != (word_dict_len, word_dim):
    raise ValueError(
        "pretrained embedding shape {} does not match the declared shape {}".format(
            embedding_matrix.shape, (word_dict_len, word_dim)))
embedding_param.set(embedding_matrix, place)

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2019-12-05 ，如有侵权请联系 cloudcommunity@tencent.com 删除

word2vec