当前位置：首页 > news >正文

三明市住房与建设局网站网站建设所需

news 2025/12/27 16:40:27

三明市住房与建设局网站,网站建设所需,做seo时网站发文目的,wordpress采集vip视频

摘要：首先读入的文件有movie和book和news三种，肯定会选择一种进行解析！下面我们以movies作为样例分析：本论文中使用的数据集是movie-1m。ratings.dat：分别是用户：：电影：：评分…

首先读入的文件有movie、book和news三种，肯定会选择一种进行解析！下面我们以movies作为样例分析。

本论文中使用的数据集是movie-1m。

ratings.dat：每行分别是 用户::电影::评分::电影编号

item_index2entity_id_rehashed.txt 文件内容：

```python
import argparse
import numpy as np

RATING_FILE_NAME = dict({'movie': 'ratings.dat', 'book': 'BX-Book-Ratings.csv', 'news': 'ratings.txt'})  # 定义字典，字典中保存的都是原始文件
SEP = dict({'movie': '::', 'book': ';', 'news': '\t'})  # 定义的分隔符
THRESHOLD = dict({'movie': 4, 'book': 0, 'news': 0})  # 定义电影喜好的阈值吧


def read_item_index_to_entity_id_file():  # 看名字：读item的索引，转化为实体的id
    file = '../data/' + DATASET + '/item_index2entity_id_rehashed.txt'
    # ../data/movie/item_index2entity_id_rehashed.txt
    print('reading item index to entity id file: ' + file + ' ...')
    i = 0
    for line in open(file, encoding='utf-8').readlines():
        item_index = line.strip().split('\t')[0]
        satori_id = line.strip().split('\t')[1]
        # 返回字符列表，并获取第一个和第二个元素：第一个元素是原item的索引，
        # 第二个元素是satori中实体的索引。satori是微软的大型知识图谱。具体看后面解析
        item_index_old2new[item_index] = i  # item 的旧的index转换为新的index
        entity_id2index[satori_id] = i  # 实体id转换为index
        i += 1


def convert_rating():
    file = '../data/' + DATASET + '/' + RATING_FILE_NAME[DATASET]
    # ../data/movie/ratings.dat
    print('reading rating file ...')
    item_set = set(item_index_old2new.values())  # 将item新的index转化为集合
    user_pos_ratings = dict()  # 用户正样本的评分
    user_neg_ratings = dict()  # 用户负样本的评分

    for line in open(file, encoding='utf-8').readlines()[1:]:
        array = line.strip().split(SEP[DATASET])  # 看上面，我们经过分割后得到四个元素

        # remove prefix and suffix quotation marks for BX dataset
        if DATASET == 'book':
            array = list(map(lambda x: x[1:-1], array))

        item_index_old = array[1]  # 取的是第二个元素：item的旧index
        if item_index_old not in item_index_old2new:  # the item is not in the final item set
            # 比较的是keys，不是values；item_index_old也是字符。
            # 查看评价的items是不是在我们记录的item_index中，如果不在，直接跳过这条记录
            continue
        item_index = item_index_old2new[item_index_old]  # 如果在，那么我们就赋值新的item_index

        user_index_old = int(array[0])  # 获得user旧的id的index

        rating = float(array[2])  # 获得用户的电影评分
        if rating >= THRESHOLD[DATASET]:
            # 我们选取列表中所有大于等于阈值的评分
            if user_index_old not in user_pos_ratings:  # 注意这里比较的是keys值
                user_pos_ratings[user_index_old] = set()  # 积极评分的设置为set集合
            user_pos_ratings[user_index_old].add(item_index)  # 以用户旧的index为键，向其集合中添加item新的index
        else:
            if user_index_old not in user_neg_ratings:  # 同样的道理，这里存储列表中小于阈值的评分
                user_neg_ratings[user_index_old] = set()
            user_neg_ratings[user_index_old].add(item_index)

    print('converting rating file ...')  # 将用户的index转为新的
    writer = open('../data/' + DATASET + '/ratings_final.txt', 'w', encoding='utf-8')
    user_cnt = 0
    user_index_old2new = dict()
    for user_index_old, pos_item_set in user_pos_ratings.items():
        if user_index_old not in user_index_old2new:
            user_index_old2new[user_index_old] = user_cnt  # 记录user的总数
            user_cnt += 1
        user_index = user_index_old2new[user_index_old]

        for item in pos_item_set:
            writer.write('%d\t%d\t1\n' % (user_index, item))
        unwatched_set = item_set - pos_item_set
        if user_index_old in user_neg_ratings:
            unwatched_set -= user_neg_ratings[user_index_old]
        for item in np.random.choice(list(unwatched_set), size=len(pos_item_set), replace=False):
            writer.write('%d\t%d\t0\n' % (user_index, item))
    writer.close()
    print('number of users: %d' % user_cnt)
    print('number of items: %d' % len(item_set))


def convert_kg():  # 基本都是转变id的事
    print('converting kg file ...')
    entity_cnt = len(entity_id2index)
    relation_cnt = 0

    writer = open('../data/' + DATASET + '/kg_final.txt', 'w', encoding='utf-8')

    files = []
    if DATASET == 'movie':
        files.append(open('../data/' + DATASET + '/kg_part1_rehashed.txt', encoding='utf-8'))
        files.append(open('../data/' + DATASET + '/kg_part2_rehashed.txt', encoding='utf-8'))
    else:
        files.append(open('../data/' + DATASET + '/kg_rehashed.txt', encoding='utf-8'))

    for file in files:
        for line in file:
            array = line.strip().split('\t')
            head_old = array[0]
            relation_old = array[1]
            tail_old = array[2]

            if head_old not in entity_id2index:
                entity_id2index[head_old] = entity_cnt
                entity_cnt += 1
            head = entity_id2index[head_old]

            if tail_old not in entity_id2index:
                entity_id2index[tail_old] = entity_cnt
                entity_cnt += 1
            tail = entity_id2index[tail_old]

            if relation_old not in relation_id2index:
                relation_id2index[relation_old] = relation_cnt
                relation_cnt += 1
            relation = relation_id2index[relation_old]

            writer.write('%d\t%d\t%d\n' % (head, relation, tail))

    writer.close()
    print('number of entities (containing items): %d' % entity_cnt)
    print('number of relations: %d' % relation_cnt)


if __name__ == '__main__':
    np.random.seed(555)

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str, default='movie', help='which dataset to preprocess')
    args = parser.parse_args()
    DATASET = args.dataset

    entity_id2index = dict()
    relation_id2index = dict()
    item_index_old2new = dict()

    read_item_index_to_entity_id_file()
    convert_rating()
    convert_kg()

    print('done')
```

补充

1. line.strip().split('\t')

描述：Python strip() 方法用于移除字符串头尾指定的字符（默认为空格）或字符序列。注意：该方法只能删除开头或是结尾的字符，不能删除中间部分的字符。

语法：strip() 方法语法：str.strip([chars]);

参数：chars – 移除字符串头尾指定的字符序列。

返回值：返回移除字符串头尾指定的字符序列生成的新字符串。

2. split('\t')

已经在上个代码分析中讨论过了，这里只是简单说一下：它会返回字符列表。

源代码分析：

```python
def read_item_index_to_entity_id_file():
    file = '../data/movie/item_index2entity_id_rehashed.txt'
    print('reading item index to entity id file: ' + file + ' ...')
    i = 0
    for line in open(file, encoding='utf-8').readlines():
        i = i + 1
        if i < 10:
            print(line)
            print(len(line))
            print(line.strip())
            print(len(line.strip()))
            print(line.strip().split('\t'))
            print(line.strip().split('\t')[0])


read_item_index_to_entity_id_file()
```

可以看出：其一，如果只是输出一行的数据，长度为4，该字符串是“1 \t 0 空格”（多个空格为一个），所以我们在获取一行数据的时候要特别注意这些：空格符在首尾、分隔符在中间；最后，split返回的是字符列表。

3. set() 集合

集合是为了啥？关系运算啊：并、交、差集。

定义：set() 函数创建一个无序不重复元素集，可进行关系测试，删除重复数据，还可以计算交集、差集、并集等。注意：是没有顺序而且是不重复的集合。

返回值：返回新的集合对象。

实例：

```python
>>> x = set('runoob')
>>> y = set('google')
>>> x, y
(set(['b', 'r', 'u', 'o', 'n']), set(['e', 'o', 'g', 'l']))   # 重复的被删除
>>> x & y         # 交集
set(['o'])
>>> x | y         # 并集
set(['b', 'e', 'g', 'l', 'o', 'n', 'r', 'u'])
>>> x - y         # 差集
set(['r', 'b', 'u', 'n'])
```

4. "XXX" not in dict

比较的是keys，不是values。如果字典中没有该key，那么 not in 返回True，否则返回False。配合的操作就是：如果没有，那么就添加该key值。

源码举例：

```python
if item_index_old not in item_index_old2new:  # the item is not in the final item set
    # 比较的是keys，不是values；item_index_old也是字符。
    # 查看评价的items是不是在我们记录的item_index中，如果不在，直接跳过这条记录
    continue
item_index = item_index_old2new[item_index_old]  # 如果在，那么我们就赋值新的item_index
```

查看全文

http://wiki.neutronadmin.com/news/278822/