Python中怎么实现结巴分词并去除符号
Admin 2022-07-29 群英技术资讯 728 次浏览
import re
import jieba.analyse
import codecs
import pandas as pd

# Default file locations, reproduced from the original hard-coded script.
# NOTE(review): simplification_text() writes under ...\yiwoqu\code\ by default,
# while jieba_text() reads from ...\untitled29\ by default. The two paths
# differ, so chaining the steps with the defaults relies on the file being
# moved in between -- confirm which location is intended.
_SIMPLIFIED_TEXT_PATH = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt'
_SEGMENT_INPUT_PATH = r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt"
_WORD_LIST_PATH = 'word.txt'

# Lazily matches the text sitting between a literal "<b>" and a literal "<e>".
# Written as a raw string so the backslashes reach the regex engine without
# triggering "invalid escape sequence" warnings.
_TAGGED_SPAN_RE = re.compile(r'(?<=\<b\>).*?(?=\<e\>)')

# Matches, anchored at the start of the line: a leading decimal such as
# "12.5", a fully alphanumeric line, a fully numeric (or empty) line, a signed
# integer/decimal line, or a leading run of 4-40 alphanumeric characters.
_NOISE_TOKEN_RE = re.compile(
    r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?"
)


def simplification_text(xianbingshi, output_path=_SIMPLIFIED_TEXT_PATH):
    """Extract every ``<b>...<e>`` span from a file, one span per output line.

    Args:
        xianbingshi: Path of the UTF-8 input file; each line is stripped of
            surrounding whitespace before being searched.
        output_path: Path of the UTF-8 file to (over)write. Defaults to the
            location that was hard-coded in the original script.
    """
    spans = []
    with codecs.open(xianbingshi, 'r', 'utf8') as src:
        for raw_line in src:
            spans.extend(_TAGGED_SPAN_RE.findall(raw_line.strip()))

    with codecs.open(output_path, 'w', 'utf8') as dst:
        for span in spans:
            dst.write(span + '\n')


def jieba_text(input_path=_SEGMENT_INPUT_PATH, output_path=_WORD_LIST_PATH):
    """Segment a text file with jieba and write the de-duplicated tokens.

    Tokens are stripped of surrounding whitespace; only the first occurrence
    of each token is kept, in order of appearance. A whitespace-only token
    therefore becomes an empty string and is written as one blank line.

    Args:
        input_path: Path of the UTF-8 text to segment. Defaults to the
            location that was hard-coded in the original script.
        output_path: Path of the UTF-8 word list to (over)write.
    """
    # Use a context manager so the input handle is closed deterministically.
    with open(input_path, encoding='utf-8') as src:
        data = src.read()

    # cut_all=False selects jieba's accurate (non-exhaustive) mode.
    tokens = [token.strip() for token in jieba.cut(data, cut_all=False)]

    frame = pd.DataFrame({'a': tokens})
    unique_tokens = frame.drop_duplicates(subset=['a'], keep='first')['a'].tolist()

    with codecs.open(output_path, 'w', 'utf8') as dst:
        for token in unique_tokens:
            dst.write(token + '\n')


def word_messy(word, output_path=_WORD_LIST_PATH):
    """Blank out numeric/alphanumeric noise in a word list and sort it.

    Each input line has the part matched by ``_NOISE_TOKEN_RE`` removed. Lines
    that end up empty are still kept, so they appear as blank lines in the
    output. The cleaned lines are sorted and written one per line.

    Args:
        word: Path of the UTF-8 word list to clean.
        output_path: Path of the UTF-8 file to (over)write.
    """
    # The input is read completely and closed before the output is opened,
    # because ``word`` may name the same file as ``output_path``.
    with codecs.open(word, 'r', 'utf8') as src:
        cleaned = sorted(_NOISE_TOKEN_RE.sub('', raw_line) for raw_line in src)

    with codecs.open(output_path, 'w', 'utf8') as dst:
        for entry in cleaned:
            dst.write(entry.strip("\n") + '\n')


if __name__ == '__main__':
    xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
    # Only the extraction step runs here. The original script kept the word
    # list path for a later word_messy() call commented out:
    # word = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\word.txt'
    simplification_text(xianbingshi)
补充:python 进行结巴分词 并且用re去掉符号
# Segment all.txt with jieba after replacing digits/punctuation with spaces,
# drop stop words, and write the space-separated tokens to allutf11.txt.
# The snippet uses ``re`` and ``jieba``, so import them here to keep it
# runnable on its own.
import re

import jieba

# Characters to blank out before segmentation (pattern kept exactly as in the
# original script).
# NOTE(review): inside the first character class, ":-【" is parsed as a RANGE
# from ':' to '【', which also covers ASCII letters and many other symbols.
# If letters are meant to be kept, escape the hyphen ("\-") -- confirm intent.
_PUNCT_RE = re.compile(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+――!,;:。?、~@#¥%……&*()]+")

# Load the stop words (one per line) into a set for O(1) membership tests.
# errors='ignore' skips undecodable bytes; the original spelling 'ingnore' is
# not a registered error handler and raises LookupError on the first bad byte.
with open('stop_words.txt', 'r', encoding='utf-8', errors='ignore') as fstop:
    stopwords = {eachWord.strip() for eachWord in fstop}

with open('all.txt', 'r', encoding='utf-8', errors='ignore') as f1, \
        open('allutf11.txt', 'w', encoding='utf-8') as f2:
    for line in f1:
        # Trim surrounding whitespace, then replace punctuation with a space.
        line = _PUNCT_RE.sub(" ", line.strip())
        seg_list = jieba.cut(line, cut_all=False)  # accurate-mode segmentation
        # Every kept token is followed by a single space; no newline is
        # written, so the whole output stays on one line as in the original.
        outStr = "".join(word + " " for word in seg_list if word not in stopwords)
        f2.write(outStr)
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:mmqy2019@163.com进行举报,并提供相关证据,查实之后,将立刻删除涉嫌侵权内容。
猜你喜欢
Python全局样式和文本样式的定义是什么,有不少朋友对此感兴趣,下面小编给大家整理和分享了相关知识和资料,易于大家学习和理解,有需要的朋友可以借鉴参考,下面我们一起来了解一下吧。
这篇文章主要为大家介绍了GCN图卷积神经网络原理及代码解析,有需要的朋友可以借鉴参考下,希望能够有所帮助,祝大家多多进步,早日升职加薪
这篇文章主要为大家介绍了Python简单实现gif动图倒放的示例过程,有需要的朋友可以借鉴参考下,希望能够有所帮助,祝大家多多进步,早日升职加薪
Python集合分为可变集合和不可变集合两种,本文就详细地介绍一下这两种集合的使用,文中通过示例代码介绍得非常详细,具有一定的参考价值,感兴趣的小伙伴们可以参考一下
这篇文章主要介绍了pytorch 两个GPU同时训练的解决方案,具有很好的参考价值,希望对大家有所帮助。如有错误或未考虑完全的地方,望不吝赐教
成为群英会员,开启智能安全云计算之旅
立即注册Copyright © QY Network Company Ltd. All Rights Reserved. 2003-2020 群英 版权所有
增值电信经营许可证 : B1.B2-20140078 粤ICP备09006778号 域名注册商资质 粤 D3.1-20240008