这个项目是要从存储了许多菜谱的json文件中依据关键词快速找到相应菜谱。

使用时，直接使用search函数即可找相关菜谱，支持多个关键词搜索，支持高评分词条优先显示。搜索结果为前十个相关菜谱，显示时有三种排序方式：
ordering='normal'：默认排序，依照相关性。
ordering='health'：依据卡路里，蛋白质，脂肪数值计算菜谱是否健康，将更健康的排在前面。
ordering='simple'：依据原材料数量和烧菜步骤数量，将更简单的菜谱排在前面。

import json
import re
import sys
# sys.maxsize (the largest native int) is used below as a sort sentinel so
# that recipes with missing data are ranked last.

# Text fields that are tokenized and searched.
_TEXT_FIELDS = ('title', 'categories', 'ingredients', 'directions')
# Text fields whose item count is also stored, and the key it is stored under.
_COUNTED_FIELDS = {'ingredients': 'number', 'directions': 'step'}
# Raw values copied into the database unchanged.
_VALUE_FIELDS = ('rating', 'calories', 'protein', 'fat')
# Relevance multiplier applied to the match count of each text field.
_FIELD_WEIGHTS = (('title', 8), ('categories', 4),
                  ('ingredients', 2), ('directions', 1))
_ORDERINGS = ('normal', 'simple', 'health')


def _tokenize(value):
    """Split a recipe field into word tokens.

    Runs of non-word characters and underscores act as separators. A list or
    tuple is tokenized item by item; ``None`` yields no tokens.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        # Join the items instead of tokenizing repr(list): the repr would leak
        # escape sequences such as "\n" into the word list as bogus tokens.
        value = ' '.join(str(item) for item in value)
    return re.sub(r'[\W_]+', ' ', str(value)).split()


# Parse and tokenize (split into words) all recipes.
def process_recipes(filename, limit=20):
    """Load recipes from a JSON file and build a per-recipe term database.

    Args:
        filename: Path to a JSON file holding a list of recipe objects.
        limit: Maximum number of recipes to index, counted from the start of
            the list. ``None`` indexes every recipe. Files holding fewer
            recipes than ``limit`` are indexed completely.

    Returns:
        A dict mapping the 0-based recipe position to a dict that contains,
        for every field present in that recipe: the token lists under
        'title', 'categories', 'ingredients' and 'directions'; the number of
        ingredients under 'number'; the number of directions under 'step';
        and the raw 'rating', 'calories', 'protein' and 'fat' values.
    """
    with open(filename, encoding='utf-8') as f:
        recipes = json.load(f)
    # Diagnostic output kept from the original script.
    print(len(recipes))

    recipe_to_terms = {}
    for recipe_number, recipe in enumerate(recipes[:limit]):
        terms = {}
        for field in _TEXT_FIELDS:
            if field not in recipe:
                continue
            raw = recipe[field]
            if field in _COUNTED_FIELDS:
                terms[_COUNTED_FIELDS[field]] = 0 if raw is None else len(raw)
            terms[field] = _tokenize(raw)
        for field in _VALUE_FIELDS:
            if field in recipe:
                terms[field] = recipe[field]
        recipe_to_terms[recipe_number] = terms
    return recipe_to_terms


def word_count(word, term_dictionary):
    """Count query-word occurrences in every recipe, field by field.

    Args:
        word: Sequence of query words. Matching is exact and case-sensitive;
            a word repeated in the query is counted once per repetition.
        term_dictionary: Database as returned by ``process_recipes``.

    Returns:
        A dict mapping each recipe key to ``{'title': n, 'categories': n,
        'ingredients': n, 'directions': n, 'rating': r}``, where ``r`` is the
        recipe's rating, or 0 when the rating is missing or ``None``.
    """
    words = list(word)
    count = {}
    for recipe_number, terms in term_dictionary.items():
        line_count = {}
        for field in _TEXT_FIELDS:
            tokens = terms.get(field, [])
            line_count[field] = sum(tokens.count(w) for w in words)

        # The rating is carried along as one of the relevance inputs.
        if 'rating' in terms:
            rating = terms['rating']
            # Diagnostic output kept from the original script.
            print(rating)
            if rating is None:
                rating = 0
        else:
            rating = 0
        line_count['rating'] = rating

        count[recipe_number] = line_count
    return count


def calculate_weight(count):
    """Turn per-field match counts into one relevance score per recipe.

    The score is 8*title + 4*categories + 2*ingredients + 1*directions
    + rating.

    Args:
        count: Dict as returned by ``word_count``; its keys must be the
            consecutive integers ``0 .. len(count) - 1``.

    Returns:
        A list of scores in which position ``i`` belongs to recipe ``i``.
    """
    weight = []
    for recipe_number in range(len(count)):
        stats = count[recipe_number]
        relevance = sum(factor * stats[field]
                        for field, factor in _FIELD_WEIGHTS)
        weight.append(relevance + stats['rating'])
    return weight


def sort(weight, top_n=10):
    """Return the indices of the ``top_n`` largest weights, best first.

    Equal weights keep their original order (lowest index first). The input
    list is not modified, and at most ``len(weight)`` indices are returned,
    so a short list never yields duplicate indices.
    """
    # sorted() is stable even with reverse=True, so ties stay in index order.
    order = sorted(range(len(weight)), key=lambda i: weight[i], reverse=True)
    return order[:top_n]


def _simple_degree(terms):
    """Return ingredient count times step count; lower means simpler.

    A recipe with no ingredients or no steps recorded gets ``sys.maxsize``
    so that it is ranked last.
    """
    effort = terms.get('number', 0) * terms.get('step', 0)
    return effort if effort else sys.maxsize


def _health_degree(terms):
    """Return the weighted distance from the reference nutrition values.

    The references are 510 (calories), 18 (protein) and 150 (fat), with the
    relative deviations weighted 1, 2 and 4; lower is ranked first. A recipe
    lacking any of the three values gets ``sys.maxsize``.
    """
    calories = terms.get('calories')
    protein = terms.get('protein')
    fat = terms.get('fat')
    if calories is None or protein is None or fat is None:
        return sys.maxsize
    n = 1  # multiplier applied to every reference value
    return ((abs(calories - (510 * n)) / 510)
            + (2 * (abs(protein - (18 * n)) / 18))
            + (4 * (abs(fat - (150 * n)) / 150)))


def search(query, ordering='normal'):
    """Search 'recipes.json' and return the titles of the best matches.

    Args:
        query: Keywords separated by whitespace; commas and periods are
            ignored.
        ordering: 'normal' keeps the relevance order, 'simple' puts recipes
            with fewer ingredients and steps first, 'health' puts recipes
            closest to the reference nutrition values first. For 'simple'
            and 'health', recipes with equal scores keep their relevance
            order.

    Returns:
        A list of at most ten recipe titles, each rebuilt by joining the
        title tokens with single spaces (an empty string when the recipe has
        no title).

    Raises:
        ValueError: If ``ordering`` is not one of the supported values.
    """
    if ordering not in _ORDERINGS:
        raise ValueError(
            'ordering must be one of {}, got {!r}'.format(_ORDERINGS, ordering))

    # Build the full term database from the data file.
    termlist = process_recipes('recipes.json')
    # The print() calls below are diagnostic output kept from the original
    # script; this first one is skipped when fewer than four recipes exist.
    if 3 in termlist:
        print(termlist[3])

    # Drop ',' and '.' from the query, then split it into words.
    query = query.replace(',', '').replace('.', '').split()
    print('query:')
    print(query)

    count = word_count(query, termlist)
    print(count)

    weight = calculate_weight(count)
    print('weight: ')
    print(weight)

    top_index = sort(weight)
    print('Top ten index: ')
    print(top_index)

    if ordering == 'normal':
        result_index = top_index
    else:
        scorer = _simple_degree if ordering == 'simple' else _health_degree
        degrees = [scorer(termlist[index]) for index in top_index]
        print(degrees)
        # Ascending by degree; the stable sort keeps relevance order on ties.
        ranked = sorted(zip(degrees, top_index), key=lambda pair: pair[0])
        result_index = [index for _, index in ranked]
        print(result_index)

    # Look up the title of every ranked recipe in the database.
    return [' '.join(termlist[index].get('title', []))
            for index in result_index]


# Enter the keywords to search for and the ordering method.
result = search('oil, Mahi', ordering='normal')
print('search results: ')
# Print each matching recipe title on its own line.
for title in result:
    print(title)

菜谱的json数据文件

recipes.json

python实现关键词搜索相关推荐

python搜索关键词自动提交_简单爬虫：调用百度接口,实现关键词搜索（python_003)...
需求: 如何用python代码实现百度搜索关键词的功能? 比如输入关键词:"python爬虫",得到一个搜索结果的页面,并保存到本地. 这是经典的python爬虫教学案例之一,也是 ...
更新！Python文献超级搜索工具，可关键词搜索并批量下载！
文献搜索对于广大学子来说真的是个麻烦事,如果你的学校购买的论文下载权限不够多,或者不在校园内,那就很头痛了.幸好,我们有Python制作的这个论文搜索工具,简化了我们学习的复杂性 2020-05-28 ...
Python脚本刷网页访问量或关键词搜索频率
生活中经常会有一些刷票.刷热搜.刷访问量的情况,其原理是怎么实现的呢,本篇研究了利用脚本刷取网页访问量或关键词搜索频率,声明如下:本篇仅供学习交流,作者水平有限如有出入请纠正,请勿恶意使用封号后果自负 ...
python爬虫京东关键词搜索商品及具体参数和评论
文章目录爬取京东关键词搜索商品及具体参数和评论查看京东关键词搜索,分析其网址各部分代表的意义,选取特定分类和时间区间进行爬取解析网页内容,查看商品参数的位置: 筛选所需数据,输出并保存,尝试输出 ...
淘宝/天猫关键词搜索采集接口分析商品价格走势（商品列表，商品销量，商品价格，分类ID采集精准商品数据）接口代码对接流程
淘宝/天猫关键词搜索采集接口分析商品价格走势(商品列表,商品销量,商品价格)接口代码对接流程如下: 1.公共参数名称类型必须描述(接口代码教程wx19970108018) key String ...
淘宝关键词搜索商品接口分析商品价格走势（商品列表接口，商品销量接口，商品价格接口，分类ID采集商品数据接口）接口代码对接教程
淘宝关键词搜索商品接口分析商品价格走势(商品列表接口,商品销量接口,商品价格接口,分类ID采集商品数据接口)接口代码对接教程如下: 1.公共参数名称类型必须描述(接口教程wx199701080 ...
淘宝关键词搜索采集商品价格销量接口分析商品价格走势（商品列表接口，商品销量接口，商品价格接口，分类ID采集精准商品数据接口）接口代码对接流程
淘宝关键词搜索采集商品价格销量接口分析商品价格走势(商品列表接口,商品销量接口,商品价格接口,分类ID采集精准商品数据接口)接口代码对接流程如下: 1.公共参数名称类型必须描述(接口代码教程w ...
白杨SEO：如何快速收集百度、抖音、知乎、小红书等关键词搜索下拉词及挖掘更精准长尾关键词？
前言:这是白杨SEO公众号原创第377篇.为什么想到写这个,这来自一个步入中年健忘症老SEOer的真实故事引发的,哈哈哈. 真实故事起因是这样的:白杨SEO在给一个企业做SEO顾问,看到对方最近写的 ...
利用八爪鱼爬取关键词搜索的微博数据
写在开头今天是我在CSDN上的两周年创作纪念日,两年里我写了不少有意思的文章,虽然没收获什么粉丝,但看到自己的博客阅读量越来越大,能帮助到更多人解决实际问题还是很开心的.今天就分享一下我最近做的一个 ...

python实现关键词搜索

菜谱的json数据文件

python实现关键词搜索相关推荐

最新文章

热门文章