关于自然语言处理系列-文本摘要提取进阶

python与大数据分析

发布于 2022-03-11 14:42:01

5870

发布于 2022-03-11 14:42:01

文本摘要是自然语言处理的一个重要部分。文本摘要的提取涉及分词、断句和文本权重问题：分词前文已述，断句通过正则表达式完成；文本权重又包括句子的tfidf权重、文本相似度权重和句子的位置权重；权重的计算还涉及归一化处理以及各项权重的权值设定等。总的来说，这种方式提取的摘要质量比之前直接使用snownlp、sumy、goose的效果要好一些。

相关代码来自互联网，不过自己做了一些优化和完善。

代码示例

# coding:utf-8
import jieba
import numpy as np
import collections
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import math
import re

# 分割语句，生成语句列表和语句顺序字典
def split_sentence(text, punctuation_list=r'([\s\.\!\?\。\！\？]+)'):
    """Split ``text`` into sentences and index them by position.

    The text is split with ``re.split`` on ``punctuation_list``. With the
    default pattern (one capturing group) the delimiter runs are kept in the
    split result, and each run is glued back onto the fragment preceding it.

    Args:
        text: The text to split.
        punctuation_list: Regular expression used as the sentence delimiter.

    Returns:
        A tuple ``(sentences, index_to_sentence)``: the sentences in document
        order, and a dict mapping each 0-based position to its sentence.
        When ``text`` ends with a delimiter the last sentence is ``""``.
    """
    fragments = re.split(punctuation_list, text)
    # Pad with an empty string so the final fragment also has a partner
    # when fragments and delimiters are paired up below.
    fragments.append("")
    bodies = fragments[::2]
    delimiters = fragments[1::2]
    sentences = [body + tail for body, tail in zip(bodies, delimiters)]
    index_to_sentence = dict(enumerate(sentences))
    return sentences, index_to_sentence

# 计算语句列表中每个词的tfidf值
def get_tfidf_matrix(sentence_set, stop_word):
    """Build the sentence-by-term TF-IDF matrix.

    Each sentence is tokenized with ``jieba.cut``, tokens found in
    ``stop_word`` are dropped, and the remaining tokens are joined with
    spaces so ``CountVectorizer`` can count them. The counts are then
    converted to TF-IDF values with ``TfidfTransformer``.

    Args:
        sentence_set: Iterable of sentence strings.
        stop_word: Iterable of stop-word strings to exclude.

    Returns:
        A 2-D numpy array with one row per sentence and one column per
        vocabulary term.
    """
    # A frozenset gives O(1) membership tests instead of scanning a list
    # once per token.
    stop_words = frozenset(stop_word)
    corpus = [
        ' '.join(word for word in jieba.cut(sent) if word not in stop_words)
        for sent in sentence_set
    ]

    # NOTE(review): CountVectorizer's documented default token_pattern only
    # keeps tokens of two or more characters, so single-character Chinese
    # words are ignored here - confirm this is intended.
    counts = CountVectorizer().fit_transform(corpus)
    tfidf = TfidfTransformer().fit_transform(counts)
    # The former `vectorizer.get_feature_names()` call was dropped: its
    # result was never used and the method no longer exists in current
    # scikit-learn releases.
    return tfidf.toarray()

# 基于tfidf对各行语句求权重
def get_sentence_with_words_weight(tfidf_matrix):
    """Compute a min-max normalized TF-IDF weight for every sentence.

    Args:
        tfidf_matrix: 2-D array with one row per sentence.

    Returns:
        A dict mapping each row index to the row's TF-IDF sum, rescaled
        with ``MinMaxScaler``.
    """
    # MinMaxScaler works column-wise, so the per-row sums are arranged as a
    # single column before scaling.
    row_totals = tfidf_matrix.sum(1).reshape(-1, 1)
    scaled = preprocessing.MinMaxScaler().fit_transform(row_totals)
    # Back to a flat Python list, then key each weight by its row index.
    weights = scaled.ravel().tolist()
    return dict(enumerate(weights))

# 计算各语句的位置权重
def get_sentence_with_position_weight(sentence_set):
    """Assign each sentence a weight based on its position in the text.

    The weight of sentence ``i`` is the base-10 logarithm of the number of
    sentences from ``i`` to the end, so earlier sentences weigh more and the
    last sentence gets ``log(1) == 0``. The logarithm replaces an earlier
    linear scheme, ``(total - i) / total``, which neglected later sentences.

    Args:
        sentence_set: Sequence of sentences; only its length is used.

    Returns:
        A dict mapping each 0-based sentence index to its position weight.
    """
    count = len(sentence_set)
    return {pos: math.log(count - pos, 10) for pos in range(count)}

# Compute the cosine similarity between two sentence vectors
def similarity(sent1, sent2):
    """Return the cosine similarity of two sentence vectors.

    Args:
        sent1: 1-D numpy array (e.g. one TF-IDF row).
        sent2: 1-D numpy array of the same length as ``sent1``.

    Returns:
        ``dot(sent1, sent2) / (1e-6 + |sent1| * |sent2|)``. The small
        epsilon keeps the division defined when either vector is all
        zeros, in which case the result is 0.
    """
    dot_product = np.sum(sent1 * sent2)
    norm_product = np.sqrt(np.sum(sent1 * sent1)) * np.sqrt(np.sum(sent2 * sent2))
    # The epsilon belongs inside the denominator; dividing by 1e-6 and then
    # adding the norms does not yield a cosine similarity.
    return dot_product / (1e-6 + norm_product)

# 计算相似度权重
def get_similarity_weight(tfidf_matrix):
    """Score each sentence by its total similarity to all sentences.

    For every row of ``tfidf_matrix`` the ``similarity`` to every row
    (including itself) is summed, and the sums are min-max normalized.

    Args:
        tfidf_matrix: 2-D array with one row per sentence.

    Returns:
        A ``defaultdict`` mapping each row index to its normalized score.
        It is empty when the matrix has no rows, and every score is 0.0
        when all raw scores are equal (e.g. a single sentence).
    """
    sentence_score = collections.defaultdict(lambda: 0.)
    for i, row_i in enumerate(tfidf_matrix):
        sentence_score[i] = sum(
            (similarity(row_i, row_j) for row_j in tfidf_matrix), 0.
        )

    # Nothing to normalize; max()/min() would raise on an empty sequence.
    if not sentence_score:
        return sentence_score

    max_score = max(sentence_score.values())
    min_score = min(sentence_score.values())
    score_range = max_score - min_score
    for key in list(sentence_score):
        if score_range > 0:
            sentence_score[key] = (sentence_score[key] - min_score) / score_range
        else:
            # All scores are identical: avoid dividing by zero and treat
            # every sentence as equally (un)important.
            sentence_score[key] = 0.

    return sentence_score

# 基于权重和得分生成总权重值
def ranking_base_on_weigth(sentence_with_words_weight,
                           sentence_with_position_weight,
                           sentence_score, feature_weight=[1, 1, 1]):
    """Combine the three per-sentence weights and rank the sentences.

    Args:
        sentence_with_words_weight: Mapping of sentence index to TF-IDF
            weight.
        sentence_with_position_weight: Mapping of sentence index to
            position weight.
        sentence_score: Mapping of sentence index to similarity weight; its
            keys decide which sentences are ranked.
        feature_weight: Multipliers applied to the TF-IDF, position and
            similarity weights, in that order.

    Returns:
        A list of ``(sentence_index, total_weight)`` tuples sorted by total
        weight, highest first.
    """
    ranked = [
        (
            idx,
            feature_weight[0] * sentence_with_words_weight[idx]
            + feature_weight[1] * sentence_with_position_weight[idx]
            + feature_weight[2] * sentence_score[idx],
        )
        for idx in sentence_score
    ]
    # Stable sort: sentences with equal totals keep their original order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# 基于各语句总权重值和摘要比例，从语句中挑选相关摘要
def get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.3):
    """Assemble the summary from the highest-ranked sentences.

    Args:
        sentence_with_index: Mapping of sentence index to sentence text.
        sort_sent_weight: Sequence of ``(sentence_index, weight)`` entries,
            already sorted from most to least important.
        topK_ratio: Fraction of the ranked sentences to keep.

    Returns:
        The selected sentences joined in their original document order.
        At least one sentence is kept whenever there are ranked sentences
        and ``topK_ratio`` is positive; otherwise short texts would
        truncate to an empty summary.
    """
    total = len(sort_sent_weight)
    topK = int(total * topK_ratio)
    if topK == 0 and total > 0 and topK_ratio > 0:
        # int() truncation would otherwise drop every sentence of a short text.
        topK = 1
    # Sort the chosen indices so the summary follows the document order
    # rather than the ranking order.
    chosen = sorted(entry[0] for entry in sort_sent_weight[:topK])
    return ''.join(sentence_with_index[i] for i in chosen)

if __name__ == '__main__':
    # Raw strings keep the Windows backslashes literal; in a normal string
    # sequences such as "\P" or "\l" are invalid escapes that newer Python
    # versions warn about. The resulting paths are unchanged.
    stopwordfile = r'C:\Python\Pycharm\langprocess\stopwords.txt'
    # Alternative sample inputs:
    # test_text = r'C:\Python\Pycharm\langprocess\train\C4-Literature\C4-Literature02.txt'
    # test_text = r'C:\Python\Pycharm\langprocess\第一章.txt'
    test_text = r'C:\Python\Pycharm\langprocess\背影.txt'
    # Read the article to summarize; undecodable bytes are skipped.
    with open(test_text, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    # Load the stop-word dictionary, one word per line.
    with open(stopwordfile, 'r', encoding='utf-8') as f:
        stop_word = [line.strip() for line in f]
    # Split the text into sentences and build the index -> sentence dict.
    sentence_set, sentence_with_index = split_sentence(text, punctuation_list=r'([\s\.\!\?\。\！\？]+)')
    # TF-IDF matrix: one row per sentence.
    tfidf_matrix = get_tfidf_matrix(sentence_set, stop_word)
    # Per-sentence TF-IDF weight derived from the matrix.
    sentence_with_words_weight = get_sentence_with_words_weight(tfidf_matrix)
    # Per-sentence position weight.
    sentence_with_position_weight = get_sentence_with_position_weight(sentence_set)
    # Per-sentence similarity weight derived from the matrix.
    sentence_score = get_similarity_weight(tfidf_matrix)
    # Combine the three weights; position is scaled down to 0.01 here.
    sort_sent_weight = ranking_base_on_weigth(sentence_with_words_weight,
                                              sentence_with_position_weight,
                                              sentence_score, feature_weight=[1, 0.01, 1])
    # Keep the top 20% of sentences as the summary.
    summarization = get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.2)
    print('摘要:\n', summarization)

摘要内容：

我与父亲不相见已二年余了，我最不能忘记的是他的背影。
那年冬天，祖母死了，父亲的差使也交卸了，正是祸不单行的日子，我从北京到徐州，打算跟着父亲奔丧回家。
到徐州见着父亲，看见满院狼藉的东西，又想起祖母，不禁簌簌地流下眼泪。
这些日子，家中光景很是惨淡，一半为了丧事，一半为了父亲赋闲。
丧事完毕，父亲要到南京谋事，我也要回北京念书，我们便同行。
父亲因为事忙，本已说定不送我，叫旅馆里一个熟识的茶房陪我同去。
但他终于不放心，怕茶房不妥帖；颇踌躇了一会。父亲是一个胖子，走过去自然要费事些。
我看见他戴着黑布小帽，穿着黑布大马褂，深青布棉袍，蹒跚地走到铁道边，慢慢探身下去，尚不大难。
过铁道时，他先将橘子散放在地上，自己慢慢爬下，再抱起橘子走。
我北来后，他写了一信给我，信中说道，“我身体平安，惟膀子疼痛利害，举箸提笔，诸多不便，大约大去之期不远矣。”
我读到此处，在晶莹的泪光中，又看见那肥胖的，青布棉袍，黑布马褂的背影。

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2020-03-04，如有侵权请联系 cloudcommunity@tencent.com 删除

NLP 服务