赞
踩
所用技术:Scrapy、Django、协同过滤
使用 Scrapy 爬取 https://www.douguo.com 网站的食谱数据
主要代码:
- # 在 parse 方法中获取下一页链接并发送请求
def parse(self, response):
    """Handle a recipe listing page.

    Yields one request per recipe detail link found on the page (handled
    by ``parse_detail``), followed by a request for the next listing page
    when a "next" anchor is present.

    Each detail request carries ``tag_id`` and ``tag_name`` in its meta;
    both are looked up in ``self.url_tag`` using the unquoted listing URL
    with everything from the first ``/0`` onwards removed.

    Raises:
        KeyError: if the page has detail links but its normalised URL is
            not a key of ``self.url_tag``.
    """
    detail_links = response.xpath(
        '//ul[@class="cook-list"]//a[@class="cook-img"]/@href'
    ).getall()

    # The tag depends only on the listing URL, so resolve it once per page
    # (and only when there is at least one link, so a page without links
    # never triggers the lookup).
    if detail_links:
        listing_key = unquote(re.sub(r"/0.*", "", response.url))
        tag_entry = self.url_tag[listing_key]
        tag_id, tag_name = tag_entry[0], tag_entry[1]
        for href in detail_links:
            yield scrapy.Request(
                url=self.url_root + href,
                callback=self.parse_detail,
                meta={"tag_id": tag_id, "tag_name": tag_name},
            )

    # Follow pagination.
    next_url = response.xpath('//a[@class="anext"]/@href').get()
    if next_url:
        # Upgrade only a leading "http://" scheme. A blanket
        # replace("http", "https") would turn an already-secure link into
        # "httpss://..." and would also rewrite "http" elsewhere in the URL.
        insecure_scheme = "http://"
        if next_url.startswith(insecure_scheme):
            next_url = "https://" + next_url[len(insecure_scheme):]
        yield scrapy.Request(url=next_url, callback=self.parse)
-
def md5_encrypt(self, s):
    """Return the hexadecimal MD5 digest of ``s``.

    Despite the name this is a one-way hash, not encryption.

    Args:
        s: Text to hash (encoded as UTF-8 first), or ``bytes`` /
            ``bytearray`` hashed as-is.

    Returns:
        The 32-character lowercase hex digest.
    """
    payload = s if isinstance(s, (bytes, bytearray)) else s.encode("utf-8")
    return hashlib.md5(payload).hexdigest()
-
def parse_detail(self, response, **kwargs):
    """Parse a recipe detail page and yield a single ``FoodInfoItem``.

    ``tag_id`` and ``tag_name`` are read from ``response.meta``. The item's
    ``food_id`` is ``self.md5_encrypt`` applied to the page URL. Steps and
    ingredients are stored as the ``str()`` of nested lists.

    Args:
        response: The detail-page response.
        **kwargs: Accepted and ignored.
    """
    tag_id = response.meta["tag_id"]
    tag_name = response.meta["tag_name"]
    title = response.xpath("//h1/text()").get()

    # Steps: [image src, step label, step text] per step block.
    step_list = [
        [
            step.xpath("a/img//@src").get(),
            step.xpath('div[@class="stepinfo"]/p/text()').get(),
            "\n".join(step.xpath('div[@class="stepinfo"]/text()').getall()).strip(),
        ]
        for step in response.xpath('//div[@class="step"]/div')
    ]

    # Ingredients: [name, amount] per table cell.
    mix_list = [
        [
            cell.xpath('span[@class="scname"]//text()').get(),
            cell.xpath('span[@class="right scnum"]//text()').get(),
        ]
        for cell in response.xpath("//table/tr/td")
    ]
    mix_repr = str(mix_list)

    yield FoodInfoItem(
        tag_id=tag_id,
        tag_name=tag_name,
        food_id=self.md5_encrypt(response.url),
        food_url=response.url,
        title=title,
        step_list=str(step_list),
        img=response.xpath('//*[@id="banner"]/a/img/@src').get(),
        desc1="\n".join(response.xpath('//p[@class="intro"]/text()').getall()).strip(),
        mix_list=mix_repr,
        # ``title`` is None when the page has no <h1>; fall back to "" here
        # so the concatenation cannot raise TypeError. The ``title`` field
        # itself is stored exactly as scraped.
        all_key=tag_name + (title or "") + mix_repr,
    )
基于用户的协同过滤推荐算法
def recommend_by_user_cf(user_id, similarity_matrix, N=10, recent_limit=10):
    """Recommend foods with user-based collaborative filtering.

    Every other user whose similarity to the target user is positive
    contributes that similarity to the score of each of their most recent
    eating records, excluding foods the target user has already eaten.
    A food that appears several times in one neighbour's recent records is
    counted once per record.

    Args:
        user_id: Target user id. Row/column ``user_id - 1`` of
            ``similarity_matrix`` is used for this user.
        similarity_matrix: Square user-user similarity matrix exposing
            ``.shape`` and ``[row, col]`` indexing.
        N: Maximum number of recommendations to return.
        recent_limit: How many of each neighbour's latest records (ordered
            by ``eat_date`` descending) are considered.

    Returns:
        Up to ``N`` ``(food_id, score)`` tuples sorted by score, highest
        first. Empty when ``user_id`` falls outside the matrix or no other
        user has positive similarity.
    """
    user_count = similarity_matrix.shape[0]
    target_idx = user_id - 1
    # Without this guard user_id=0 would silently wrap to the last row and
    # an id beyond the matrix would raise IndexError.
    if not 0 <= target_idx < user_count:
        return []

    # (neighbour user id, similarity) for every other user with positive
    # similarity; done before any database access so that a user without
    # neighbours costs no queries.
    neighbours = [
        (idx + 1, similarity_matrix[target_idx, idx])
        for idx in range(user_count)
        if idx != target_idx and similarity_matrix[target_idx, idx] > 0
    ]
    if not neighbours:
        return []

    eaten_food_ids = list(
        EatingRecord.objects.filter(user_id=user_id).values_list("food_id", flat=True)
    )

    # Accumulate similarity-weighted votes per candidate food. The query
    # already excludes foods the target user has eaten, so no second
    # membership check is needed here.
    scores = {}
    for neighbour_id, similarity in neighbours:
        recent_food_ids = (
            EatingRecord.objects.filter(user_id=neighbour_id)
            .exclude(food_id__in=eaten_food_ids)
            .order_by("-eat_date")
            .values_list("food_id", flat=True)[:recent_limit]
        )
        for food_id in recent_food_ids:
            scores[food_id] = scores.get(food_id, 0) + similarity

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]
python 毕设帮助,指导,源码分享,调试部署:worthy_life_
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。