Python jieba TF-IDF article text summarization code

Code language: python

Category: Other

Code description: Python code that uses jieba and TF-IDF to extract a summary from an article, built on the TfidfTransformer from sklearn.

Code tags: python, jieba, Tfidf, extract, article, text, summary, code

Below is a partial preview of the code; for the full code, click Download or open it in the bfwstudio webide.

#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
import jieba
import numpy as np
import collections
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


def split_sentence(text, punctuation_list='!?。!?'):
    """
    Split the text into sentences at the punctuation marks in punctuation_list
    and collect all sentences in a list.
    """
    sentence_set = []
    inx_position = 0      # index just past the last sentence-ending punctuation mark
    char_position = 0     # moving character pointer
    for char in text:
        char_position += 1
        if char in punctuation_list:
            next_char = list(text[inx_position:char_position + 1]).pop()
            if next_char not in punctuation_list:
                sentence_set.append(text[inx_position:char_position])
                inx_position = char_position
    if inx_position < len(text):
        sentence_set.append(text[inx_position:])

    sentence_with_index = {i: sent for i, sent in enumerate(sentence_set)}
    return sentence_set, sentence_with_index

def get_tfidf_matrix(sentence_set, stop_word):
    """Tokenize each sentence with jieba, drop stop words, and build the sentence TF-IDF matrix."""
    corpus = []
    for sent in sentence_set:
        sent_cut = jieba.cut(sent)
        sent_list = [word for word in sent_cut if word not in stop_word]
        sent_str = ' '.join(sent_list)
        corpus.append(sent_str)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # word = vectorizer.get_feature_names()
    tfidf_matrix = tfidf.toarray()
    return np.array(tfidf_matrix)

def get_sentence_with_words_weight(tfidf_matrix):
    """Score each sentence by the sum of its TF-IDF weights, then min-max normalize the scores."""
    sentence_with_words_weight = {}
    for i in range(len(tfidf_matrix)):
        sentence_with_words_weight[i] = np.sum(tfidf_matrix[i])

    max_weight = max(sentence_with_words_weight.values())  # for min-max normalization
    min_weight = min(sentence_with_words_weight.values())
    for key in sentence_with_words_weight.keys():
        x = sentence_with_words_weight[key]
        sentence_with_words_weight[key] = (x - min_weight) / (max_weight - min_weight)

    return sentence_with_words_weight

def get_sentence_with_position_weight(sentence_set):
    """Weight sentences by position: earlier sentences receive weights closer to 1."""
    sentence_with_position_weight = {}
    total_sent = len(sentence_set)
    for i in range(total_sent):
        sentence_with_position_weight[i] = (total_sent - i) / total_sent
    return sentence_with_position_weight

def similarity(sent1, sent2):
    """
    Compute the cosine similarity of two sentence vectors.
    """
    # A small epsilon in the denominator guards against division by zero for all-zero vectors.
    return np.sum(sent1 * sent2) / (1e-6 + np.sqrt(np.sum(sent1 ** 2)) * np.sqrt(np.sum(sent2 ** 2)))
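
The preview ends before the step that actually ranks sentences, so the snippet below is only a rough sketch of how the functions shown above could be wired together, not part of the downloadable code. The driver function rank_sentences, its parameters top_n and alpha, and the weighted-sum combination of the word weights and position weights are all assumptions made for this illustration.

def rank_sentences(text, stop_word=None, top_n=3, alpha=0.7):
    # Hypothetical driver (not from the original download): score each sentence
    # as alpha * TF-IDF weight + (1 - alpha) * position weight and return the
    # top_n highest-scoring sentences in their original order.
    stop_word = stop_word or set()
    sentence_set, sentence_with_index = split_sentence(text)
    tfidf_matrix = get_tfidf_matrix(sentence_set, stop_word)
    words_weight = get_sentence_with_words_weight(tfidf_matrix)
    position_weight = get_sentence_with_position_weight(sentence_set)
    scores = {i: alpha * words_weight[i] + (1 - alpha) * position_weight[i]
              for i in range(len(sentence_set))}
    top_index = sorted(scores, key=scores.get, reverse=True)[:top_n]
    return [sentence_with_index[i] for i in sorted(top_index)]


if __name__ == '__main__':
    sample_text = ('自然语言处理是人工智能的一个重要方向。'
                   '它研究人与计算机之间用自然语言进行有效通信的理论和方法。'
                   '自动文本摘要是其中的一个典型应用。')
    print(rank_sentences(sample_text, top_n=2))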
