import gensim
import numpy as np
import jieba
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
# stop_text = open('stop_list.txt', 'r')
# stop_word = []
# for line in stop_text:
# stop_word.append(line.strip())
# Module-level alias for gensim's TaggedDocument class; get_corpus() uses it
# to wrap each corpus line. The spelling "TaggededDocument" is kept as-is
# because other code in this file refers to the alias by this exact name.
TaggededDocument = gensim.models.doc2vec.TaggedDocument
def get_corpus(path="corpus_seg.txt"):
    """Load a whitespace-segmented corpus as a list of tagged documents.

    Each line of the file is treated as one document whose tokens are
    separated by whitespace. The document is tagged with its zero-based
    line index.

    Args:
        path: Location of the segmented corpus file. Defaults to
            "corpus_seg.txt" in the current working directory.

    Returns:
        A list of TaggededDocument objects, one per line of the file, in
        file order. A blank line produces a document with an empty word
        list so that tag ``i`` always corresponds to line ``i``.
    """
    with open(path, 'r') as corpus_file:
        # str.split() with no argument splits on any run of whitespace and
        # drops empty strings, so repeated spaces, leading/trailing spaces
        # and the trailing newline never yield '' tokens (splitting on a
        # single ' ' did, which put an empty "word" into the token lists).
        # Iterating the file object avoids holding every raw line in memory
        # alongside the token lists.
        return [
            TaggededDocument(line.split(), tags=[index])
            for index, line in enumerate(corpus_file)
        ]
def train(x_train, size=200, epoch_num=1):
model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
model_dm.