python jieba Tfidf提取文章文本摘要代码
代码语言:python
所属分类:其他
代码描述:python jieba Tfidf提取文章文本摘要代码,用到了sklearn中的TfidfTransformer转换器。
代码标签: python jieba Tfidf 提取 文章 文本 摘要 代码
下面为部分代码预览,完整代码请点击下载或在bfwstudio webide中打开
#!/usr/local/python3/bin/python3 # -*- coding: utf-8 -* # coding:utf-8 import jieba import numpy as np import collections from sklearn import feature_extraction from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer def split_sentence(text, punctuation_list='!?。!?'): """ Split the text into sentences at the characters listed in punctuation_list and collect all sentences in a list. Returns the sentence list together with a dict mapping each sentence's position in that list to the sentence. """ sentence_set = [] inx_position = 0 # start index of the sentence currently being collected char_position = 0 # moving character pointer: number of characters scanned so far for char in text: char_position += 1 if char in punctuation_list: next_char = list(text[inx_position:char_position+1]).pop() if next_char not in punctuation_list: sentence_set.append(text[inx_position:char_position]) inx_position = char_position if inx_position < len(text): sentence_set.append(text[inx_position:]) sentence_with_index = {i:sent for i,sent in enumerate(sentence_set)} #dict(zip(sentence_set, range(len(sentences)))) return sentence_set,sentence_with_index def get_tfidf_matrix(sentence_set,stop_word): corpus = [] for sent in sentence_set: sent_cut = jieba.cut(sent) sent_list = [word for word in sent_cut if .........完整代码请登录后点击上方下载按钮下载查看
网友评论0