# 使用word2vec和xgboost寻找Quora上的相似问题

Changing the world, one article at a time. Sr. Data Scientist, Toronto Canada. Opinion=my own.

# Drop any row with a missing question, then preview the first ten pairs.
df = df.dropna(how="any").reset_index(drop=True)

a = 0
for i in range(a, a + 10):
    # Print both questions of the pair, then a blank separator line.
    print(df.question1[i])
    print(df.question2[i])
    print()

# Example pair 1: two semantically similar questions phrased differently.
question1 = ('What would a Trump presidency mean for '
             'current international master’s students on an F1 visa?')
question2 = ('How will a Trump presidency affect the '
             'students presently in US or planning to study in US?')

# Lowercase, split on whitespace, and drop stopwords before computing WMD.
question1 = question1.lower().split()
question2 = question2.lower().split()
question1 = [w for w in question1 if w not in stop_words]
question2 = [w for w in question2 if w not in stop_words]

import gensim
from gensim.models import Word2Vec

# NOTE(review): the original listing was truncated here — the model is
# presumably the pretrained GoogleNews vectors; confirm the file path.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

# Word Mover's Distance between the two tokenized questions.
distance = model.wmdistance(question1, question2)
print('distance = %.4f' % distance)

# L2-normalize the word vectors in place, then recompute WMD.
model.init_sims(replace=True)
distance = model.wmdistance(question1, question2)
print('normalized distance = %.4f' % distance)
# observed output: normalized distance = 0.7589

# Example pair 2: two unrelated questions — WMD should be larger.
question3 = 'Why am I mentally very lonely? How can I solve it?'
# NOTE(review): the exponent was lost in extraction; the Quora original
# question is 23^{24} — confirm against the dataset.
question4 = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'

question3 = question3.lower().split()
question4 = question4.lower().split()
question3 = [w for w in question3 if w not in stop_words]
question4 = [w for w in question4 if w not in stop_words]

distance = model.wmdistance(question3, question4)
print('distance = %.4f' % distance)
# observed output: distance = 1.2637

model.init_sims(replace=True)
distance = model.wmdistance(question3, question4)
print('normalized distance = %.4f' % distance)
# observed output: normalized distance = 1.2637

FuzzyWuzzy

from fuzzywuzzy import fuzz

# Same two question pairs, scored with simple fuzzy string-matching ratios.
question1 = ('What would a Trump presidency mean for '
             'current international master’s students on an F1 visa?')
question2 = ('How will a Trump presidency affect the '
             'students presently in US or planning to study in US?')

fuzz.ratio(question1, question2)                    # observed: 53
fuzz.partial_token_set_ratio(question1, question2)  # observed: 100

question3 = 'Why am I mentally very lonely? How can I solve it?'
# NOTE(review): exponent lost in extraction; presumably 23^{24} — confirm.
question4 = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'

fuzz.ratio(question3, question4)                    # observed: 28
fuzz.partial_token_set_ratio(question3, question4)  # observed: 37

def wmd(q1, q2):
    """Word Mover's Distance between two questions using the raw model.

    Both inputs are coerced to str, lowercased, whitespace-tokenized,
    and stripped of English stopwords before scoring.
    """
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    # set() makes the per-word membership test O(1) instead of O(len(list)).
    stop_words = set(stopwords.words('english'))
    q1 = [w for w in q1 if w not in stop_words]
    q2 = [w for w in q2 if w not in stop_words]
    return model.wmdistance(q1, q2)

def norm_wmd(q1, q2):
    """Word Mover's Distance using the L2-normalized model (norm_model).

    Same preprocessing as wmd(); only the underlying vectors differ.
    """
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    # set() makes the per-word membership test O(1) instead of O(len(list)).
    stop_words = set(stopwords.words('english'))
    q1 = [w for w in q1 if w not in stop_words]
    q2 = [w for w in q2 if w not in stop_words]
    return norm_model.wmdistance(q1, q2)

def sent2vec(s):
    """Return the L2-normalized sum of word vectors for sentence s.

    Tokenizes with NLTK, drops stopwords and out-of-vocabulary words.
    NOTE(review): if no word is in the vocabulary the sum is 0 and the
    result is NaN from 0/0 — callers below pass it through np.nan_to_num.
    """
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:  # out-of-vocabulary word: skip it
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

1.单词个数

2.字符个数

3.问题1和问题2中相同单词的个数

4.问题1和问题2中不同单词的个数

5.问题1和问题2的向量余弦距离

6.问题1和问题2的向量曼哈顿距离

7.--杰卡德距离

8.--兰氏距离

9.--欧氏距离

10.--闵可夫斯基距离

11.--布雷柯蒂斯距离

12.峰度和偏度

13.词移距离

14.标准化词移距离

# Basic string-level features for each question pair.
df['len_q1'] = df.question1.apply(lambda x: len(str(x)))
df['len_q2'] = df.question2.apply(lambda x: len(str(x)))
df['diff_len'] = df.len_q1 - df.len_q2
# Number of distinct non-space characters in each question.
df['len_char_q1'] = df.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
df['len_char_q2'] = df.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
df['len_word_q1'] = df.question1.apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df.question2.apply(lambda x: len(str(x).split()))
# Count of lowercase words shared by both questions.
df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# FuzzyWuzzy similarity scores (0-100) under several matching strategies.
df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

（这块原文pdf不全，由于是算法包的内容，大家自己补齐吧）

word2vec模型

# NOTE(review): both assignments were truncated in the original listing —
# presumably the pretrained GoogleNews vectors; confirm the file path.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

df['wmd'] = df.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# A second copy of the model, L2-normalized in place, for normalized WMD.
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)

df['norm_wmd'] = df.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

# Sentence embeddings, one 300-d row per question.
# NOTE(review): the loop headers were lost in extraction (i and q were
# otherwise undefined); reconstructed as a straightforward enumerate.
question1_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(df.question1.values):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(df.question2.values):
    question2_vectors[i, :] = sent2vec(q)

# Pairwise vector-distance features; nan_to_num maps NaN sentence vectors
# (all-OOV questions from sent2vec) to zeros before scoring.
df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]

# Per-vector distribution shape features.
df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

# The raw question text is no longer needed once features are built.
df.drop(['question1', 'question2'], axis=1, inplace=True)

# Keep only rows whose distance features computed successfully.
df = df[pd.notnull(df['cosine_distance'])]
df = df[pd.notnull(df['jaccard_distance'])]

# Split features from the target column, then hold out 30% for testing.
feature_mask = df.columns != 'is_duplicate'
X = df.loc[:, feature_mask]
y = df.loc[:, ~feature_mask]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

import xgboost as xgb

# NOTE(review): the original listing omits the model construction and
# training step; reconstructed minimally — confirm hyperparameters.
model = xgb.XGBClassifier().fit(X_train, y_train)

prediction = model.predict(X_test)
cm = confusion_matrix(y_test, prediction)
print(cm)
print('Accuracy', accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))

Jupyter notebook can be found on Github. 周末加油鸭！

Reference:

• 发表于:
• 原文链接https://kuaibao.qq.com/s/20181213G0TF0Y00?refer=cp_1026
• 腾讯「云+社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。

2018-07-02

2018-05-11

2018-04-17

2018-04-10

2019-12-11

2019-12-11