TVP

# 自然语言处理中句子相似度计算的几种方法

TF 计算

TFIDF 计算

Word2Vec 计算

import distance


def edit_distance(s1, s2):
    """Return the Levenshtein edit distance between strings s1 and s2."""
    return distance.levenshtein(s1, s2)

s1 = 'string'
s2 = 'setting'
# 'string' -> 'setting' takes 2 edits (insert 'e', substitute 'r' -> 't'),
# so this prints 2.
print(edit_distance(s1, s2))

2

pip3 install distance

import distance


def edit_distance(s1, s2):
    """Return the Levenshtein edit distance between strings s1 and s2."""
    return distance.levenshtein(s1, s2)

strings = [
    '你在干什么',
    '你在干啥子',
    '你在做什么',
    '你好啊',
    '我喜欢吃香蕉'
]

target = '你在干啥'

# Keep only candidates within edit distance 2 of the target.
# NOTE(review): the comparison was truncated during extraction; a threshold
# of 2 is consistent with the printed result shown below the snippet.
results = list(filter(lambda x: edit_distance(x, target) <= 2, strings))
print(results)

['你在干什么','你在干啥子']

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


def jaccard_similarity(s1, s2):
    """Return the Jaccard coefficient of the character sets of s1 and s2.

    Jaccard = |intersection| / |union|, computed on per-character term
    frequencies of the two sentences.
    """
    def add_space(s):
        # Put a space between every character so CountVectorizer can
        # tokenize Chinese text simply by splitting on whitespace.
        return ' '.join(list(s))

    # Insert spaces between characters.
    s1, s2 = add_space(s1), add_space(s2)
    # Convert to a term-frequency matrix (one row per sentence).
    cv = CountVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Intersection size: element-wise minimum across the two rows.
    numerator = np.sum(np.min(vectors, axis=0))
    # Union size: element-wise maximum across the two rows.
    denominator = np.sum(np.max(vectors, axis=0))
    # Jaccard coefficient.
    return 1.0 * numerator / denominator

s1 = '你在干嘛呢'
s2 = '你在干什么呢'
print(jaccard_similarity(s1, s2))

`cv.get_feature_names()` 的结果为：

['么', '什', '你', '呢', '嘛', '在', '干']

两句话对应的 TF 矩阵为：

[[0 0 1 1 1 1 1]
 [1 1 1 1 0 1 1]]

TF计算

$$\cos\theta = \frac{a \cdot b}{|a|\,|b|}$$

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.linalg import norm


def tf_similarity(s1, s2):
    """Return the cosine similarity of the TF (term-frequency) vectors
    of sentences s1 and s2."""
    def add_space(s):
        # Put a space between every character so CountVectorizer can
        # tokenize Chinese text simply by splitting on whitespace.
        return ' '.join(list(s))

    # Insert spaces between characters.
    s1, s2 = add_space(s1), add_space(s2)
    # Convert to a term-frequency matrix (one row per sentence).
    cv = CountVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Cosine similarity between the two TF vectors.
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

s1 = '你在干嘛呢'
s2 = '你在干什么呢'
print(tf_similarity(s1, s2))

TFIDF计算

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.linalg import norm


def tfidf_similarity(s1, s2):
    """Return the cosine similarity of the TF-IDF vectors of sentences
    s1 and s2."""
    def add_space(s):
        # Put a space between every character so TfidfVectorizer can
        # tokenize Chinese text simply by splitting on whitespace.
        return ' '.join(list(s))

    # Insert spaces between characters.
    s1, s2 = add_space(s1), add_space(s2)
    # Convert to a TF-IDF matrix (one row per sentence).
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Cosine similarity between the two TF-IDF vectors.
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

s1 = '你在干嘛呢'
s2 = '你在干什么呢'
print(tfidf_similarity(s1, s2))

Word2Vec计算

Word2Vec，顾名思义，其实就是将每一个词转换为向量的过程。如果不了解的话可以参考：https://blog.csdn.net/itplus/article/details/37969519。

import gensim
import jieba
import numpy as np
from scipy.linalg import norm

model_file = './word2vec/news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
# NOTE(review): the model-loading statement was lost during extraction;
# restored using gensim's standard loader for binary word2vec files, which
# provides the `model` object used inside vector_similarity below — confirm
# against the original article.
model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)


def vector_similarity(s1, s2):
    """Return the cosine similarity between the averaged word2vec vectors
    of sentences s1 and s2."""
    def sentence_vector(s):
        # Segment the sentence with jieba and average the 64-dimensional
        # word vectors of its words (64 matches the embedding file's name).
        words = jieba.lcut(s)
        v = np.zeros(64)
        for word in words:
            v += model[word]
        v /= len(words)
        return v

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    return np.dot(v1, v2) / (norm(v1) * norm(v2))

s1 = '你在干嘛'
s2 = '你正做什么'
vector_similarity(s1, s2)

strings = [
    '你在干什么',
    '你在干啥子',
    '你在做什么',
    '你好啊',
    '我喜欢吃香蕉'
]

target = '你在干啥'

# Print each candidate next to its word2vec-based similarity to the target.
for string in strings:
    print(string, vector_similarity(string, target))

• 发表于:
• 原文链接https://kuaibao.qq.com/s/20180615G101H900?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2022-12-01

2022-12-01

2022-12-01

2018-06-04

2018-06-13

2018-06-13

2022-12-01

2018-05-22

2018-06-14

2022-12-01