Mine product review comments to obtain comment tags (a product attribute paired with a review opinion) as well as user grouping information. For example, from a review such as "手机屏幕很清晰，给爸妈买的" we would like to extract a tag like "屏幕清晰" and a user group like "爸妈".
Step 1: preprocess the text, with word segmentation and semantic role labeling.
# -*- coding:utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import heapq
import re
import emoji


class Sentence_Parser:
    def __init__(self):
        # LTP_DIR = 'F:\project support\ltp_data_v3.4.0'
        LTP_DIR = './ltp_data_v3.4.0'
        # word segmentation
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, 'cws.model'))
        # part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, 'pos.model'))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, 'parser.model'))
        # named entity recognition (person, place, organization, ...)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, 'ner.model'))
        # semantic role labeling (agent, patient, time, location)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    def format_labelrole(self, words, postags):
        """Semantic role labeling."""
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name: [arg.name, arg.range.start, arg.range.end]
                                      for arg in role.arguments}
        return roles_dict

    def bulid_parser_child_dict(self, words, postags, arcs):
        """Dependency parsing: keep, for every word, a dict of its dependent (child) nodes."""
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                # arc heads are 1-based; 0 is the virtual root
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation not in child_dict:
                        child_dict[arcs[arc_index].relation] = []
                    child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]
        for i in range(len(words)):
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Main parsing entry point."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.bulid_parser_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def select(self, words, postags):
        """Pick out the nouns and adjectives."""
        co_model = Word2Vec.load('coseg_text.model')
        n_list0 = []
        a_list = []
        for i in range(len(postags)):
            if postags[i] == 'n':
                if len(words[i]) >= 2:
                    n_list0.append(words[i])
            if postags[i] == 'a':
                # if len(words[i]) >= 2:
                a_list.append(words[i])
        n_list0 = list(set(n_list0))
        a_list = list(set(a_list))
        si_p = []
        for n in n_list0:
            try:
                s = co_model.similarity(n, '手机')
                si_p.append(s)
            except Exception as e:
                si_p.append(0)
        # keep the 80% of nouns most related to '手机'
        index_list = list(map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)), si_p)))
        n_list = []
        for index in index_list:
            n_list.append(n_list0[index])
        return n_list, a_list

    def simlarity(self, n_list0, a_list):
        """Compute similarities and match nouns and adjectives in both directions to find the best pairs."""
        co_model = Word2Vec.load('coseg_text.model')
        si_p = []
        for n in n_list0:
            try:
                s = co_model.similarity(n, '手机')
                si_p.append(s)
            except Exception as e:
                si_p.append(0)
        # keep the 80% of nouns most related to '手机'
        index_list = list(map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)), si_p)))
        n_list = []
        for index in index_list:
            n_list.append(n_list0[index])
        # forward matching: for every noun, pick the most similar adjective
        comment1_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=np.arange(100))
        index = 0
        for i in range(len(n_list)):
            f_si = 0
            comment_tag = np.nan  # stays NaN (and is dropped below) if no adjective matches
            for j in range(len(a_list)):
                try:
                    si = co_model.similarity(n_list[i], a_list[j])
                    if si > f_si:
                        f_si = si
                        comment_tag = n_list[i] + a_list[j]
                except Exception as e:
                    print('语料库中缺少该词', e)
            comment1_df.loc[index] = [comment_tag, f_si]
            index += 1
        comment1_df = comment1_df.sort_values(by='similarity', ascending=False, ignore_index=True)
        comment1_df.dropna(subset=['comment_tag'], inplace=True)
        # comment1_df = comment1_df.iloc[0: int(0.2 * len(comment_df)), :]
        # backward matching: for every adjective, pick the most similar noun
        comment2_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=np.arange(100))
        index = 0
        for i in range(len(a_list)):
            f_si = 0
            comment_tag = np.nan
            for j in range(len(n_list)):
                try:
                    si = co_model.similarity(n_list[j], a_list[i])
                    if si > f_si:
                        f_si = si
                        comment_tag = n_list[j] + a_list[i]
                except Exception as e:
                    print('语料库中缺少该词', e)
            comment2_df.loc[index] = [comment_tag, f_si]
            index += 1
        comment2_df = comment2_df.sort_values(by='similarity', ascending=False, ignore_index=True)
        comment2_df.dropna(subset=['comment_tag'], inplace=True)
        # keep only the pairs produced by both matching directions
        comment_df = pd.merge(comment1_df, comment2_df, on='comment_tag', how='inner')
        comment_df.dropna(subset=['comment_tag'], inplace=True)
        return comment_df

    def cleandata(self, x):
        """Clean a comment: normalize irregular punctuation."""
        pat = re.compile('[^\u4e00-\u9fa5^.^a-z^A-Z^0-9]')  # keep only Chinese characters, letters, digits and '.'
        x = x.replace(' ', ',')
        x = emoji.demojize(x)  # turn emoji into text so the regex below strips them
        x = re.sub(pat, ',', x)
        return x
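For a quick sanity check of the parser, a minimal usage sketch is shown below. The test sentence is invented, and it assumes the LTP models sit in ./ltp_data_v3.4.0 and that coseg_text.model is a Word2Vec model trained beforehand on the segmented review corpus (that training step is not shown in this post):

if __name__ == '__main__':
    sp = Sentence_Parser()
    # clean and parse one made-up review sentence
    sentence = sp.cleandata('手机屏幕很清晰，物流也很快')
    words, postags, child_dict_list, roles_dict, format_parse_list = sp.parser_main(sentence)
    print(words)    # segmented tokens
    print(postags)  # POS tags: 'n' marks nouns, 'a' marks adjectives
    n_list, a_list = sp.select(words, postags)
    print(sp.simlarity(n_list, a_list))  # candidate noun+adjective comment tags with similarity scores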
Step 2: extract entities and related entity information.
# -*- coding:utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec
from cixing import Sentence_Parser
import pandas as pd
import numpy as np
import heapq
import re
import emoji


class Extractor:
    def __init__(self):
        self.co_model = Word2Vec.load('coseg_text.model')
        self.parser = Sentence_Parser()

    def get_seginfo(self, comment_list):
        for c in range(len(comment_list)):
            # truncate overly long comments
            if len(comment_list[c]) <= 200:
                sentence = comment_list[c]
            else:
                sentence = comment_list[c][0: 201]
            if sentence != '':
                sentence = self.parser.cleandata(sentence)
                words, postags, child_dict_list, roles_dict, format_parse_list = self.parser.parser_main(sentence)
                n_list, a_list = self.parser.select(words, postags)
                tags = []
                for j in range(len(a_list)):
                    p = words.index(a_list[j])
                    if child_dict_list[p]:
                        # subject-verb (SBV) relation: the adjective has a subject noun
                        if 'SBV' in child_dict_list[p]:
                            si_p = []
                            for po in child_dict_list[p]['SBV']:
                                try:
                                    si = self.co_model.similarity(words[po], '手机')
                                    si_p.append(si)
                                except Exception as e:
                                    si_p.append(0)
                            # the subject noun most related to the phone domain
                            id = list(map(si_p.index, heapq.nlargest(1, si_p)))
                            s = child_dict_list[p]['SBV'][id[0]]
                            w1 = words[s] + a_list[j]
                            if child_dict_list[s]:
                                if 'ATT' in child_dict_list[s]:
                                    # prepend an attributive noun if the subject has one
                                    if postags[child_dict_list[s]['ATT'][0]] == 'n':
                                        w2 = words[child_dict_list[s]['ATT'][0]] + w1
                                        tags.append(w2)
                                    else:
                                        tags.append(w1)
                                else:
                                    tags.append(w1)
                        # attributive (ATT) relation: the adjective modifies a noun directly
                        if 'ATT' in child_dict_list[p]:
                            s = child_dict_list[p]['ATT'][0]
                            if 'SBV' in child_dict_list[s]:
                                w3 = words[child_dict_list[s]['SBV'][0]]
                                w4 = w3 + a_list[j]
                                id1 = words.index(w3)
                                if child_dict_list[id1]:
                                    if 'ATT' in child_dict_list[id1]:
                                        if postags[child_dict_list[id1]['ATT'][0]] == 'n':
                                            w5 = words[child_dict_list[id1]['ATT'][0]] + w4
                                            tags.append(w5)
                                        else:
                                            tags.append(w4)
                # save the comment tags for this comment
                with open('F:\pycharm project data\\taobao\phone\\tags.txt', 'a') as t:
                    t.writelines(' '.join(tags))
                    t.writelines('\n')
                print(tags)
                # collect the related nouns and infer the user group
                n_list = list(set(n_list))
                if n_list:
                    with open('F:\pycharm project data\\taobao\phone\\noun.txt', 'a') as f:
                        f.writelines(' '.join(n_list))
                        f.writelines('\n')
                    si_p = []
                    u_list = ['小孩子', '作业', '高中', '初中', '儿童', '学校', '小孩', '老师', '网瘾', '中学生', '小学', '女儿', '小学生', '孩子', '闺女', '儿子', '学生', '网课', '小朋友',
                              '同事', '表弟', '亲戚', '姐妹', '表哥', '邻居', '同学', '朋友', '盆友', '链接',
                              '姥姥', '老太太', '老人', '岳母', '父亲', '老娘', '小姨', '老丈人', '舅舅', '岳父', '亲人', '老妈子', '老头儿', '婆婆', '老太', '老头子', '父母', '家婆', '老父亲', '老爹', '长辈', '大人', '外爷', '爷爷', '我爸', '老头', '老妈', '老爷子', '爸妈', '奶奶', '老伴', '老爸', '母亲', '老人家', '妈妈', '公公', '爸爸', '丈母娘', '姥爷', '家里人', '家人',
                              '老奶奶', '小伙子', '阿姨', '娘娘', '小姑子', '姐姐', '老妹', '婶婶', '大姐', '外孙', '小屁孩', '孙子', '姨妈', '棉袄', '伯母', '孝心',
                              '媳妇', '妹妹', '男朋友', '对象', '生日', '女朋友', '男票', '老婆', '弟弟', '情人节', '爹妈', '麻麻', '老公', '外甥', '老弟']
                    for n in range(len(n_list)):
                        for u in range(len(u_list)):
                            try:
                                s = self.co_model.similarity(n_list[n], u_list[u])
                                si_p.append(s)
                            except Exception as e:
                                si_p.append(0)
                    # the noun that is most similar to any of the user-group words
                    index_list = list(map(si_p.index, heapq.nlargest(1, si_p)))
                    user_list = []
                    for index in index_list:
                        index = int(index / len(u_list))
                        user_list.append(n_list[index])
                    with open('F:\pycharm project data\\taobao\phone\\user.txt', 'a') as u:
                        u.writelines(user_list)
                        u.writelines('\n')
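A small smoke test of the extractor might look like the sketch below. The two sample comments are invented; output is appended to tags.txt, noun.txt and user.txt under the F:\pycharm project data\taobao\phone directory used above, which must already exist:

if __name__ == '__main__':
    sample_comments = ['手机屏幕很清晰，运行速度也快', '给爸妈买的，老人用起来很方便']
    extractor = Extractor()
    extractor.get_seginfo(sample_comments)  # writes the extracted tags and user-group nouns to disk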
Step 3: test the data and the model.
# -*- coding:utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import heapq
import re
import emoji
from extractor import Extractor

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 5000)
pd.set_option('max_colwidth', 30)
pd.set_option('display.width', 1000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

# 1. Data processing
# load the raw comment data
df = pd.read_csv('F:\pycharm project data\\taobao\phone\\comment1.csv', encoding='utf-8-sig')
# keep only the comment text and drop the auto-generated placeholder reviews
co_df = df[['content']]
co_df = co_df.loc[co_df['content'] != '15天内买家未作出评价', ['content']]
co_df = co_df.loc[co_df['content'] != '评价方未及时做出评价,系统默认好评!', ['content']]
comment_list = co_df['content'].tolist()

if __name__ == '__main__':
    myextractor = Extractor()
    myextractor.get_seginfo(comment_list)
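One possible way to eyeball the results after running the extractor (not part of the original pipeline) is to count how often each tag and user-group word was produced; the sketch below assumes the files were written by the extractor above with the platform's default encoding:

from collections import Counter

# frequency of the extracted comment tags (one space-separated line per comment)
with open('F:\pycharm project data\\taobao\phone\\tags.txt') as f:
    tag_counts = Counter(tag for line in f for tag in line.split())
print(tag_counts.most_common(20))   # the 20 most frequent comment tags

# frequency of the inferred user-group words (one word per comment)
with open('F:\pycharm project data\\taobao\phone\\user.txt') as f:
    user_counts = Counter(line.strip() for line in f if line.strip())
print(user_counts.most_common(10))  # the most frequent user groups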