Python code to check whether two articles' texts are similar
Language: Python
Category: Other
Description: Python code to check whether two articles' texts are similar. Algorithm steps: first, split the text into sentences, treating the period, exclamation mark, question mark and newline as sentence terminators; a single regular expression is enough for the splitting. Next, pick the n longest sentences and hash each one. The hash function can be md5, sha, etc. from Python's built-in hashlib module, or farmhash, which I have mentioned several times in the crawler tutorials. Finally, the text is assigned a similar_id based on these n hash values, which is easy to implement with either of the two HashDB classes below. The principle: similar_id starts from 0, and each of the n hashes is looked up in the HashDB; whether those hashes are already stored there decides if the document reuses an existing similar_id or is given a new one. A minimal sketch of these steps follows below.
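As a rough illustration of the first two steps (the helper name top_n_hashes and the exact punctuation set are assumptions for this sketch, not the downloadable code), the sentence splitting and hashing could look like this:

import re
import hashlib

# one regular expression is enough to split on period, exclamation mark,
# question mark and newline (Chinese and ASCII forms included here)
SENTENCE_END = re.compile(r'[。！？!?\n]+')

def top_n_hashes(text, n=5):
    # step 1: split the text into sentences
    sentences = [s.strip() for s in SENTENCE_END.split(text) if s.strip()]
    # step 2: keep the n longest sentences and md5-hash each of them
    longest = sorted(sentences, key=len, reverse=True)[:n]
    return [hashlib.md5(s.encode('utf8')).hexdigest() for s in longest]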
Tags: python, check, two, articles, text, similar, code
Below is a partial preview of the code; for the complete code, click Download or open it in the bfwstudio webide.
#!/usr/local/python3/bin/python3
# coding:utf-8
# Author: veelion@ebuinfo.com

import pickle
import os
import re
import hashlib
import traceback


class HashDBLeveldb:
    def __init__(self, name):
        import leveldb
        db_name = name + '.hashdb'
        self.db = leveldb.LevelDB(db_name)

    def get(self, key):
        if isinstance(key, str):
            key = key.encode('utf8')
        elif isinstance(key, int):
            key = str(key).encode()
        try:
            value = self.db.Get(key).decode()
        except:
            value = None
        return value

    def put(self, key, value):
        if isinstance(key, str):
            key = key.encode('utf8')
        elif isinstance(key, int):
            key = str(key).encode('utf8')
        if isinstance(value, str):
            value = value.encode('utf8')
        elif isinstance(value, int):
            value = str(value).encode('utf8')
        self.db.Put(key, value)


class HashDBMemory:
    def __init__(self, name):
        self.name = name
        self.db_name = name + '.hashdb.pkl'
        self.db = {}
        if os.path.isfile(self.db_name):
            with open(self.db_name, 'rb') as f:
                try:
                    self.db = pickle.load(f)
                except:
                    traceback.print_exc()
                    self.db = {}

    #def __del__(self):
        #with open(self.db_name, 'wb') as f:
            #pickle.dump(self.db, f)

    def get(self, key):
        return self.db.get(key)

    def put(self, key, value):
        self.db[key] = value


class NSHash:
    '''using top-n longest sentences' hashes to identify similar documents'''
    def __init__(self, name, hashfunc='md5', hashdb='memory'):
        '''
        hashfunc: md5, farmhash or others in module: hashlib
        '''
        if hashfunc == 'farmhash':
            import farmhash
            self.hashfunc = farmhash.hash64
        elif hashfunc == 'md5':
            def md5hash(s):
                if isinstance(s, str):
                    s = s.encode('utf8')
                return hashlib.md5(s).hexdigest()
            self.hashfunc = md5hash

......... For the complete code, please log in and click the Download button above.
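The preview is cut off before the similar_id assignment described above. A minimal sketch of that last step, reusing the top_n_hashes helper sketched earlier and the HashDBMemory class from the preview (the function name assign_similar_id and the counter convention are assumptions, not the author's code):

def assign_similar_id(text, hashdb, counter):
    # counter is a one-element list holding the next unused similar_id,
    # starting from 0 as the description says
    hashes = top_n_hashes(text)
    known = [sid for sid in (hashdb.get(h) for h in hashes) if sid is not None]
    if known:
        # at least one of the n sentence hashes was seen before:
        # treat the document as similar and reuse that similar_id
        similar_id, is_similar = known[0], True
    else:
        # all n hashes are new: hand out a fresh similar_id
        similar_id, is_similar = counter[0], False
        counter[0] += 1
    for h in hashes:
        hashdb.put(h, similar_id)
    return similar_id, is_similar

# usage sketch:
# db = HashDBMemory('articles')
# counter = [0]
# sid, dup = assign_similar_id(article_text, db, counter)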