前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >[TextMatch框架] tests

[TextMatch框架] tests

作者头像
MachineLP
发布2020-10-29 10:18:53
3770
发布2020-10-29 10:18:53
举报
文章被收录于专栏:小鹏的专栏

TextMatch

TextMatch is a semantic matching model library for QA and text-search scenarios. It makes it easy to train matching models and to export representation vectors.

TextMatch/tests模块包含 :

(1)core_test

qa_match_test.py

代码语言:javascript
复制
import sys
from textmatch.core.qa_match import QMatch, AMatch, SemanticMatch

# Sample corpus shared by all matcher tests below: maps an id string to a
# Chinese sentence. Kept small so every matcher can be exercised quickly.
test_dict = {"id0": "其实事物发展有自己的潮流和规律",
   "id1": "当你身处潮流之中的时候,要紧紧抓住潮流的机会",
   "id2": "想办法脱颖而出,即使没有成功,也会更加洞悉时代的脉搏",
   "id3": "收获珍贵的知识和经验。而如果潮流已经退去",
   "id4": "这个时候再去往这个方向上努力,只会收获迷茫与压抑",
   "id5": "对时代、对自己都没有什么帮助",
   "id6": "但是时代的浪潮犹如海滩上的浪花,总是一浪接着一浪,只要你站在海边,身处这个行业之中,下一个浪潮很快又会到来。你需要敏感而又深刻地去观察,略去那些浮躁的泡沫,抓住真正潮流的机会,奋力一搏,不管成败,都不会遗憾。"}


def test_q_match(testword):
    """Score *testword* against every question in ``test_dict`` via QMatch.

    Returns the matcher's score dict (id -> similarity).
    """
    weights = {'bow': 1, 'tfidf': 1, 'ngram_tfidf': 1}
    matcher = QMatch(q_dict=test_dict, match_models=['bow', 'tfidf', 'ngram_tfidf'])
    prediction = matcher.predict(
        testword,
        match_strategy='score',
        vote_threshold=0.5,
        key_weight=weights,
    )
    print('q_match_pre>>>>>', prediction)
    return prediction

def test_a_match(testword):
    """Score *testword* against answers 'id0' and 'id1' of ``test_dict`` via AMatch.

    Returns the matcher's score dict, e.g. ``{'id0': 1.0, 'id1': 0.0}``.
    """
    weights = {'bow': 1, 'tfidf': 1, 'ngram_tfidf': 1}
    matcher = AMatch(a_dict=test_dict, match_models=['bow', 'tfidf', 'ngram_tfidf'])
    prediction = matcher.predict(
        testword,
        ['id0', 'id1'],
        match_strategy='score',
        vote_threshold=0.5,
        key_weight=weights,
    )
    print('a_match_pre>>>>>', prediction)
    return prediction


def test_semantic_match(testword, words_dict=None):
    """Score *testword* against entries 'id0', 'id1' and 'id5' via SemanticMatch.

    Args:
        testword: query sentence to match.
        words_dict: id -> sentence mapping; defaults to the module-level
            ``test_dict``. A ``None`` sentinel is used instead of the dict
            itself so the mutable default is not bound at definition time
            (same effective default as before, so callers are unaffected).

    Returns:
        The matcher's score dict, e.g. ``{'id0': 1.0, 'id1': 0.0, ...}``.
    """
    if words_dict is None:
        words_dict = test_dict
    s_match = SemanticMatch(words_dict=words_dict, match_models=['bow', 'tfidf', 'ngram_tfidf'])
    s_match_pre = s_match.predict(
        testword,
        ['id0', 'id1', 'id5'],
        match_strategy='score',
        vote_threshold=0.5,
        key_weight={'bow': 1, 'tfidf': 1, 'ngram_tfidf': 1},
    )
    print('s_match_pre>>>>>', s_match_pre)
    return s_match_pre




if __name__ == '__main__':
    # Smoke-test all three matchers with the same query sentence,
    # in the same order as before.
    query = "其实事物发展有自己的潮流和规律"
    for case in (test_q_match, test_a_match, test_semantic_match):
        case(query)

运行结果:

q_match_pre>>>>> {'id0': 0.9999993948153113} a_match_pre>>>>> {'id0': 0.9999993948153113, 'id1': 0.22259270511979246} s_match_pre>>>>> {'id0': 0.9999993948153113, 'id1': 0.22259270511979246, 'id5': 0.09423726836364266}

text_embedding_test.py

代码语言:javascript
复制
import sys
import json 
from textmatch.config.constant import Constant as const
from textmatch.core.text_embedding import TextEmbedding

# Sample corpus for the embedding test: maps an id string to a Chinese
# sentence; passed to TextEmbedding as its words_dict.
test_dict = {"id0": "其实事物发展有自己的潮流和规律",
   "id1": "当你身处潮流之中的时候,要紧紧抓住潮流的机会",
   "id2": "想办法脱颖而出,即使没有成功,也会更加洞悉时代的脉搏",
   "id3": "收获珍贵的知识和经验。而如果潮流已经退去",
   "id4": "这个时候再去往这个方向上努力,只会收获迷茫与压抑",
   "id5": "对时代、对自己都没有什么帮助",
   "id6": "但是时代的浪潮犹如海滩上的浪花,总是一浪接着一浪,只要你站在海边,身处这个行业之中,下一个浪潮很快又会到来。你需要敏感而又深刻地去观察,略去那些浮躁的泡沫,抓住真正潮流的机会,奋力一搏,不管成败,都不会遗憾。"}


if __name__ == '__main__':
    # Other model lists seen in the project include 'bert' and 'w2v' variants:
    #   ['bow', 'tfidf', 'ngram_tfidf', 'bert']
    #   ['bow', 'tfidf', 'ngram_tfidf', 'bert', 'w2v']
    embedder = TextEmbedding(
        match_models=['bow', 'tfidf', 'ngram_tfidf', 'w2v'],
        words_dict=test_dict,
    )

    # Embed a raw sentence.
    vec = embedder.predict("其实事物发展有自己的潮流和规律")
    print('text_embedding>>>>>', vec)

    # Embed the same sentence with an explicit entry id as second argument.
    vec = embedder.predict("其实事物发展有自己的潮流和规律", "id1")
    print('text_embedding>>>>>', vec)

(2)models_test

bm25_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_search.bm25 import BM25
from textmatch.config.constant import Constant as const



if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    searcher = BM25()
    searcher.init(corpus, update=True)

    query = "我在九寨沟,很喜欢"
    scores = searcher.predict(query)
    print('pre>>>>>', scores)

pre>>>>> [-1.27683889 -1.41282764 0.83974856]

edit_sim_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_search.edit_sim import EditDistance
from textmatch.config.constant import Constant as const



if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    matcher = EditDistance()
    matcher.init(corpus)

    query = "我在九寨沟,很喜欢"
    print('pre>>>>>', matcher.predict(query))

pre>>>>> [0.25, 0.368421052631579, 0.5]

jaccard_sim_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_search.jaccard_sim import Jaccard
from textmatch.config.constant import Constant as const



if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    matcher = Jaccard()
    matcher.init(corpus)

    query = "我在九寨沟,很喜欢"
    print('pre>>>>>', matcher.predict(query))

pre>>>>> [0.23529411764705882, 0.3125, 0.6]

bow_sklearn_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_embedding.bow_sklearn import Bow
from textmatch.config.constant import Constant as const



if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    model = Bow(dic_path=const.BOW_DIC_PATH, bow_index_path=const.BOW_INDEX_PARH)
    model.init(corpus, update=True)

    query = "我在九寨沟,很喜欢"

    # Similarity scores of the query against each corpus sentence.
    print('pre>>>>>', model.predict(query))

    # Raw bag-of-words vector of the query itself.
    print('pre>>>>>', model._predict(query)[0])

pre>>>>> [0.27735009 0.53033008 0.86602539] pre>>>>> [1. 0. 1. 1. 0. 1. 0.]

tf_idf_sklearn_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_embedding.tf_idf_sklearn import TfIdf
from textmatch.config.constant import Constant as const


if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    model = TfIdf(
        dic_path=const.TFIDF_DIC_PATH,
        tfidf_model_path=const.TFIDF_MODEL_PATH,
        tfidf_index_path=const.TFIDF_INDEX_PATH,
    )
    model.init(corpus, update=True)

    query = "我在九寨沟,很喜欢"

    # Similarity scores of the query against each corpus sentence.
    print('pre>>>>>', model.predict(query))

    # TF-IDF vector of the query itself.
    print('pre>>>>>', model._predict(query)[0])

pre>>>>> [0.21094354 0.45357592 0.87701746] pre>>>>> [0.63174505 0. 0.4804584 0.4804584 0. 0.37311881 0. ]

ngram_tf_idf_sklearn_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_embedding.ngram_tf_idf_sklearn import NgramTfIdf
from textmatch.config.constant import Constant as const


if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    model = NgramTfIdf(
        dic_path=const.NGRAM_TFIDF_DIC_PATH,
        tfidf_model_path=const.NGRAM_TFIDF_MODEL_PATH,
        tfidf_index_path=const.NGRAM_TFIDF_INDEX_PATH,
    )
    model.init(corpus, update=True)

    query = "我在九寨沟,很喜欢"

    # Similarity scores of the query against each corpus sentence.
    print('pre>>>>>', model.predict(query))

    # N-gram TF-IDF vector of the query itself.
    print('pre>>>>>', model._predict(query)[0])

pre>>>>> [0. 0.14160782 0.99999983]

pre>>>>> [0. 0. 0. 0. 0.62276601 0. 0. 0. 0. 0. 0. 0.4736296 0.62276601 0. 0. 0. 0. ]

w2v_test.py

代码语言:javascript
复制
import sys
from textmatch.models.text_embedding.w2v import Word2Vec
from textmatch.models.text_embedding.stop_words import StopWords
from textmatch.config.constant import Constant as const



if __name__ == '__main__':
    # Candidate sentences acting as the searchable corpus.
    corpus = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山", "我在玉龙雪山并且喜欢玉龙雪山", "我在九寨沟"]

    model = Word2Vec(
        w2v_model_file=const.W2V_MODEL_FILE,
        stop_word=StopWords(stopwords_file=const.STOPWORDS_FILE),
    )
    model.init(corpus, update=True)

    query = "我在九寨沟,很喜欢"
    print('pre>>>>>', model.predict(query))

pre>>>>> [0.17818374 0.27095952 0.70004393]

wmd_test.py

代码语言:javascript
复制
import time
import jieba
import gensim
import threading
import numpy as np
from textmatch.config.constant import Constant as const
# Coarse ranking: word mover's distance (WMD) is used as an initial screen.
# Empirically, pairs scoring 0-0.15 were near-duplicates and 0.45-1 mostly
# unrelated, so ~10% of the 0.15-0.45 band was sampled for manual labeling.


# Alternative: load pretrained Google News vectors instead of the project model:
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
w2v_model_file = const.W2V_MODEL_FILE
w2v_model = gensim.models.Word2Vec.load(w2v_model_file)

w2v_model.init_sims(replace=True) # normalizes vectors in place; NOTE(review): init_sims is deprecated in gensim >= 4.0 — verify the pinned gensim version
distance = w2v_model.wmdistance("你们是你们哪,你们哪里的。", "你们是哪里,你们是谁?")  
print ('distance>>>>', distance) 



'''
"你有什么事你说。", "我是他家人/朋友,你有什么事可以给我说?"            0.6694891459671026
"呃,我想提前结清我名下那个款项。", "我需要提前结清"                    0.6992085239002946
"你们是你们哪,你们哪里的。", "你们是哪里,你们是谁?"                  0.27438064142232443   
"嗯,好。", "你们催收人员说要对我上门催收,是不是真的?"                 0.948713353219643
"嗯。就是您就是就是。就是您就是您拨打的这个电话。", "你们催收人员说要对我上门催收,是不是真的?"                 0.8855274054486878
"提前结清。", "我需要提前结清"                 0.5150805852253076
'''
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2020/06/12 ,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档