
Crawling JD.com Product Information and Comments into MySQL with Python


Contents

Building the MySQL tables

Version 1

Version 2

Version 3

Summary


Building the MySQL tables

Problem: with SQLAlchemy, a column that is not a primary key cannot be set to auto-increment. I only want this column to serve as an index, but autoincrement=True has no effect. How can I make it auto-increment?

from sqlalchemy import String,Integer,Text,Column
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from sqlalchemy.ext.declarative import declarative_base

engine=create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/jdcrawl?charset=utf8",
    pool_size=200,
    max_overflow=300,
    echo=False
)

BASE=declarative_base()  # create the declarative base class

class Goods(BASE):
    __tablename__='goods'
    id=Column(Integer(),primary_key=True,autoincrement=True)
    sku_id=Column(String(200),primary_key=True,autoincrement=False)
    name=Column(String(200))
    price=Column(String(200))
    comments_num=Column(Integer)
    shop=Column(String(200))
    link=Column(String(200))

class Comments(BASE):
    __tablename__='comments'
    id=Column(Integer(),primary_key=True,autoincrement=True,nullable=False)
    sku_id=Column(String(200),primary_key=True,autoincrement=False)
    comments=Column(Text())

BASE.metadata.create_all(engine)
Session=sessionmaker(engine)
sess_db=scoped_session(Session)
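
One possible answer to the question above (a sketch, not the only approach): MySQL requires an AUTO_INCREMENT column to be part of a key, and SQLAlchemy's autoincrement flag only takes effect on an integer primary-key column, so the simplest workaround is to keep id as the lone auto-increment primary key and give sku_id its own unique index instead of making it part of the primary key. The model below is hypothetical and only illustrates the idea:

from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

BASE = declarative_base()

class GoodsAlt(BASE):
    # Hypothetical variant of the Goods model above, for illustration only
    __tablename__ = 'goods_alt'
    id = Column(Integer, primary_key=True, autoincrement=True)  # sole auto-increment primary key
    sku_id = Column(String(200), unique=True, index=True)       # lookup index, no longer part of the PK
    name = Column(String(200))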

Version 1:

Problem: after crawling a few pages of comments, the crawler starts receiving blank pages; adding a Referer header does not help.

Attempted fix: switch the comment-fetching thread pool to a single thread and add a 1-second delay after each page of comments.
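
The heart of that fix is small: instead of handing the comment-page URLs to a thread pool, request them one after another and pause between requests. A minimal, self-contained sketch of the idea (the helper name is made up for illustration; the 1-second delay matches what version 2 below uses):

import time
import requests

def fetch_pages_slowly(urls, delay=1.0):
    # Sequential stand-in for the pool2/putRequest block in version 1:
    # fetch each comment-page URL in order and sleep between requests.
    session = requests.Session()
    pages = []
    for url in urls:
        res = session.get(url)
        res.encoding = res.apparent_encoding
        pages.append(res.text)  # raw JSON text; parse with json.loads as needed
        time.sleep(delay)       # wait before requesting the next page
    return pages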

# Do not crawl too fast!!! Otherwise the comment pages come back empty
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv,json,re
import threadpool
import time
from jd_mysqldb import Goods,Comments,sess_db

headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num=0  # number of products scraped
comments_num=0  # number of comments scraped

# Get product information and the SkuId from a search results page
def getIndex(url):
    session=requests.Session()
    session.headers=headers
    global num
    res=session.get(url,headers=headers)
    print(res.status_code)
    res.encoding=res.apparent_encoding
    soup=BeautifulSoup(res.text,'lxml')
    items=soup.select('li.gl-item')
    for item in items[:3]:  # only crawl 3 products for testing
        title=item.select_one('.p-name a em').text.strip().replace(' ','')
        price=item.select_one('.p-price strong').text.strip().replace('¥','')
        try:
            shop=item.select_one('.p-shopnum a').text.strip()  # shop selector for book listings
        except:
            shop=item.select_one('.p-shop a').text.strip()  # shop selector for other product categories
        link=parse.urljoin('https://',item.select_one('.p-img a').get('href'))
        SkuId=re.search(r'\d+',link).group()
        comments_num=getCommentsNum(SkuId,session)
        print(SkuId,title, price, shop, link, comments_num)
        print("Saving product to the database...")
        try:
            IntoGoods(SkuId,title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
        print("Fetching comments...")
        # Get the total number of comment pages
        url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection']='keep-alive'
        res2 = session.get(url1,headers=headers)
        res2.encoding = res2.apparent_encoding
        json_data = json.loads(res2.text)
        max_page = json_data['maxPage']  # in testing, at most 100 pages of comments are returned, 10 per page
        args = []
        for i in range(0, max_page):
            # This URL returns the comments as plain JSON
            url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            # This URL returns JSONP instead of plain JSON and needs extra extraction
            # url2_2=f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            args.append(([session,SkuId,url2], None))
        pool2 = threadpool.ThreadPool(2)  # 2 worker threads
        reque2 = threadpool.makeRequests(getComments,args)  # create the tasks
        for r in reque2:
            pool2.putRequest(r)  # submit the tasks to the pool
        pool2.wait()

# Get the total number of comments for a SKU
def getCommentsNum(SkuId,sess):
    headers['Referer']=f'https://item.jd.com/{SkuId}.html'
    url=f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res=sess.get(url,headers=headers)
    try:
        res.encoding=res.apparent_encoding
        json_data=json.loads(res.text)  # parse the JSON response into a dict
        num=json_data['CommentsCount'][0]['CommentCount']
        return num
    except:
        return 'Error'

# Fetch one page of comments
def getComments(sess,SkuId,url2):
    global comments_num
    print(url2)
    headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
    res2 = sess.get(url2,headers=headers)
    res2.encoding='gbk'
    json_data=res2.text
    '''
    # If url2_2 is used, extract the JSON from the JSONP wrapper like this
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data=res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    try:
        comments=dict_data['comments']
        for item in comments:
            comment=item['content'].replace('\n','')
            # print(comment)
            comments_num+=1
            try:
                IntoComments(SkuId,comment)
            except Exception as e:
                print(e)
                sess_db.rollback()
    except:
        pass

# Insert product information into the database
def IntoGoods(SkuId,title, price, shop, link, comments_num):
    goods_data=Goods(
        sku_id=SkuId,
        name=title,
        price=price,
        comments_num=comments_num,
        shop=shop,
        link=link
    )
    sess_db.add(goods_data)
    sess_db.commit()

# Insert a comment into the database
def IntoComments(SkuId,comment):
    comments_data=Comments(
        sku_id=SkuId,
        comments=comment
    )
    sess_db.add(comments_data)
    sess_db.commit()

if __name__ == '__main__':
    start_time=time.time()
    urls=[]
    KEYWORD=parse.quote(input("Enter the keyword to search for: "))
    for i in range(1,2):  # only crawl one search results page for testing
        url=f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url,],None))  # the threadpool library requires arguments in this form
    pool=threadpool.ThreadPool(2)  # thread pool with 2 threads
    reque=threadpool.makeRequests(getIndex,urls)  # create the tasks
    for r in reque:
        pool.putRequest(r)  # submit the tasks to the pool
    pool.wait()  # wait for all tasks to finish
    print("Scraped {} products and {} comments in {} seconds".format(num,comments_num,time.time()-start_time))

Version 2:

Tested: the blank pages indeed no longer appear.

Further optimization: fetch the comments for two or more products at the same time.

# Do not crawl too fast!!! Otherwise the comment pages come back empty
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv,json,re
import threadpool
import time
from jd_mysqldb import Goods,Comments,sess_db

headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num=0  # number of products scraped
comments_num=0  # number of comments scraped

# Get product information and the SkuId from a search results page
def getIndex(url):
    session=requests.Session()
    session.headers=headers
    global num
    res=session.get(url,headers=headers)
    print(res.status_code)
    res.encoding=res.apparent_encoding
    soup=BeautifulSoup(res.text,'lxml')
    items=soup.select('li.gl-item')
    for item in items[:2]:  # only crawl 2 products for testing
        title=item.select_one('.p-name a em').text.strip().replace(' ','')
        price=item.select_one('.p-price strong').text.strip().replace('¥','')
        try:
            shop=item.select_one('.p-shopnum a').text.strip()  # shop selector for book listings
        except:
            shop=item.select_one('.p-shop a').text.strip()  # shop selector for other product categories
        link=parse.urljoin('https://',item.select_one('.p-img a').get('href'))
        SkuId=re.search(r'\d+',link).group()
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        comments_num=getCommentsNum(SkuId,session)
        print(SkuId,title, price, shop, link, comments_num)
        print("Saving product to the database...")
        try:
            IntoGoods(SkuId,title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
        print("Fetching comments...")
        # Get the total number of comment pages
        url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
        res2 = session.get(url1,headers=headers)
        res2.encoding = res2.apparent_encoding
        json_data = json.loads(res2.text)
        max_page = json_data['maxPage']  # in testing, at most 100 pages of comments are returned, 10 per page
        print("{} has {} pages of comments".format(SkuId,max_page))
        if max_page==0:
            IntoComments(SkuId,'0')
        else:
            for i in range(0, max_page):
                # This URL returns the comments as plain JSON
                url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
                # This URL returns JSONP instead of plain JSON and needs extra extraction
                # url2_2=f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
                print("Fetching comment page {}: {}".format(i+1,url2))
                getComments(session,SkuId,url2)
                time.sleep(1)

# Get the total number of comments for a SKU
def getCommentsNum(SkuId,sess):
    url=f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res=sess.get(url)
    try:
        res.encoding=res.apparent_encoding
        json_data=json.loads(res.text)  # parse the JSON response into a dict
        num=json_data['CommentsCount'][0]['CommentCount']
        return num
    except:
        return 'Error'

# Fetch one page of comments
def getComments(sess,SkuId,url2):
    global comments_num
    res2 = sess.get(url2)
    res2.encoding=res2.apparent_encoding
    json_data=res2.text
    '''
    # If url2_2 is used, extract the JSON from the JSONP wrapper like this
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data=res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    comments=dict_data['comments']
    for item in comments:
        comment=item['content'].replace('\n','')
        # print(comment)
        comments_num+=1
        try:
            IntoComments(SkuId,comment)
        except Exception as e:
            print(e)
            sess_db.rollback()

# Insert product information into the database
def IntoGoods(SkuId,title, price, shop, link, comments_num):
    goods_data=Goods(
        sku_id=SkuId,
        name=title,
        price=price,
        comments_num=comments_num,
        shop=shop,
        link=link
    )
    sess_db.add(goods_data)
    sess_db.commit()

# Insert a comment into the database
def IntoComments(SkuId,comment):
    comments_data=Comments(
        sku_id=SkuId,
        comments=comment
    )
    sess_db.add(comments_data)
    sess_db.commit()

if __name__ == '__main__':
    start_time=time.time()
    urls=[]
    KEYWORD=parse.quote(input("Enter the keyword to search for: "))
    for i in range(1,2):  # only crawl one search results page for testing
        url=f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url,],None))  # the threadpool library requires arguments in this form
    pool=threadpool.ThreadPool(2)  # thread pool with 2 threads
    reque=threadpool.makeRequests(getIndex,urls)  # create the tasks
    for r in reque:
        pool.putRequest(r)  # submit the tasks to the pool
    pool.wait()  # wait for all tasks to finish
    print("Scraped {} products and {} comments in {} seconds".format(num,comments_num,time.time()-start_time))

Version 3:

...No good: the blank pages are back.

# Do not crawl too fast!!! Otherwise the comment pages come back empty
from bs4 import BeautifulSoup
import requests
from urllib import parse
import csv,json,re
import threadpool
import time
from jd_mysqldb import Goods,Comments,sess_db

headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '__jdv=76161171|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1613711947911; __jdu=16137119479101182770449; areaId=7; ipLoc-djd=7-458-466-0; PCSYCityID=CN_410000_0_0; shshshfpa=07383463-032f-3f99-9d40-639cb57c6e28-1613711950; shshshfpb=u8S9UvxK66gfIbM1mUNrIOg%3D%3D; user-key=153f6b4d-0704-4e56-82b6-8646f3f0dad4; cn=0; shshshfp=9a88944b34cb0ff3631a0a95907b75eb; __jdc=122270672; 3AB9D23F7A4B3C9B=SEELVNXBPU7OAA3UX5JTKR5LQADM5YFJRKY23Z6HDBU4OT2NWYGX525CKFFVHTRDJ7Q5DJRMRZQIQJOW5GVBY43XVI; jwotest_product=99; __jda=122270672.16137119479101182770449.1613711948.1613738165.1613748918.4; JSESSIONID=C06EC8D2E9384D2628AE22B1A6F9F8FC.s1; shshshsID=ab2ca3143928b1b01f6c5b71a15fcebe_5_1613750374847; __jdb=122270672.5.16137119479101182770449|4.1613748918',
    'Referer': 'https://www.jd.com/'
}

num=0  # number of products scraped
comments_num=0  # number of comments scraped

# Get product information and the SkuId from a search results page
def getIndex(url):
    global num
    skuids=[]
    session=requests.Session()
    session.headers=headers
    res=session.get(url,headers=headers)
    print(res.status_code)
    res.encoding=res.apparent_encoding
    soup=BeautifulSoup(res.text,'lxml')
    items=soup.select('li.gl-item')
    for item in items[:3]:  # only crawl 3 products for testing
        title=item.select_one('.p-name a em').text.strip().replace(' ','')
        price=item.select_one('.p-price strong').text.strip().replace('¥','')
        try:
            shop=item.select_one('.p-shopnum a').text.strip()  # shop selector for book listings
        except:
            shop=item.select_one('.p-shop a').text.strip()  # shop selector for other product categories
        link=parse.urljoin('https://',item.select_one('.p-img a').get('href'))
        SkuId=re.search(r'\d+',link).group()
        skuids.append(([SkuId,session],None))
        headers['Referer'] = f'https://item.jd.com/{SkuId}.html'
        headers['Connection'] = 'keep-alive'
        comments_num=getCommentsNum(SkuId,session)  # total number of comments
        print(SkuId,title, price, shop, link, comments_num)
        print("Saving product to the database...")
        try:
            IntoGoods(SkuId,title, price, shop, link, comments_num)
        except Exception as e:
            print(e)
            sess_db.rollback()
        num += 1
    print("Fetching comments and saving them to the database...")
    pool2=threadpool.ThreadPool(3)  # fetch comments for 3 products concurrently
    task=threadpool.makeRequests(getComments,skuids)
    for r in task:
        pool2.putRequest(r)
    pool2.wait()

# Fetch all comment pages for one SKU
def getComments(SkuId,sess):
    # Get the total number of comment pages
    url1 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page=0&pageSize=10'
    res2 = sess.get(url1, headers=headers)
    res2.encoding = res2.apparent_encoding
    json_data = json.loads(res2.text)
    max_page = json_data['maxPage']  # in testing, at most 100 pages of comments are returned, 10 per page
    print("{} has {} pages of comments".format(SkuId, max_page))
    if max_page == 0:
        IntoComments(SkuId, '0')
    else:
        for i in range(0, max_page):
            # This URL returns the comments as plain JSON
            url2 = f'https://club.jd.com/comment/productPageComments.action?productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            # This URL returns JSONP instead of plain JSON and needs extra extraction
            # url2_2=f'https://club.jd.com/comment/productPageComments.action?callback=jQuery9287224&productId={SkuId}&score=0&sortType=5&page={i}&pageSize=10'
            print("Fetching comment page {}: {}".format(i + 1, url2))
            getComments_one(sess, SkuId, url2)
            time.sleep(1)

# Get the total number of comments for a SKU
def getCommentsNum(SkuId,sess):
    url=f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={SkuId}'
    res=sess.get(url)
    try:
        res.encoding=res.apparent_encoding
        json_data=json.loads(res.text)  # parse the JSON response into a dict
        num=json_data['CommentsCount'][0]['CommentCount']
        return num
    except:
        return 'Error'

# Fetch a single page of comments
def getComments_one(sess,SkuId,url2):
    global comments_num
    res2 = sess.get(url2)
    res2.encoding=res2.apparent_encoding
    json_data=res2.text
    '''
    # If url2_2 is used, extract the JSON from the JSONP wrapper like this
    start = res2.text.find('jQuery9287224(') + len('jQuery9287224(')
    end = res2.text.find(');')
    json_data=res2.text[start:end]
    '''
    dict_data = json.loads(json_data)
    comments=dict_data['comments']
    for item in comments:
        comment=item['content'].replace('\n','')
        # print(comment)
        comments_num+=1
        try:
            IntoComments(SkuId,comment)
        except Exception as e:
            print(e)
            print("rollback!")
            sess_db.rollback()

# Insert product information into the database
def IntoGoods(SkuId,title, price, shop, link, comments_num):
    goods_data=Goods(
        sku_id=SkuId,
        name=title,
        price=price,
        comments_num=comments_num,
        shop=shop,
        link=link
    )
    sess_db.add(goods_data)
    sess_db.commit()

# Insert a comment into the database
def IntoComments(SkuId,comment):
    comments_data=Comments(
        sku_id=SkuId,
        comments=comment
    )
    sess_db.add(comments_data)
    sess_db.commit()

if __name__ == '__main__':
    start_time=time.time()
    urls=[]
    KEYWORD=parse.quote(input("Enter the keyword to search for: "))
    for i in range(1,2):  # only crawl one search results page for testing
        url=f'https://search.jd.com/Search?keyword={KEYWORD}&wq={KEYWORD}&page={i}'
        urls.append(([url,],None))  # the threadpool library requires arguments in this form
    pool=threadpool.ThreadPool(2)  # thread pool with 2 threads
    reque=threadpool.makeRequests(getIndex,urls)  # create the tasks
    for r in reque:
        pool.putRequest(r)  # submit the tasks to the pool
    pool.wait()  # wait for all tasks to finish
    print("Scraped {} products and {} comments in {} seconds".format(num,comments_num,time.time()-start_time))

Summary:

JD's anti-crawling measures are fairly strong. To avoid being served blank pages, the only workable approach is to crawl single-threaded, one request at a time, with a delay between requests.
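
For reference, here is a minimal single-threaded sketch of that approach, using the same comment endpoint as the code above; the page limit, delay, and jitter values are assumptions rather than tested numbers:

import json
import random
import time
import requests

def crawl_comments_slowly(sku_id, max_pages=100, delay=1.0):
    # Fetch comment pages for one SKU sequentially, pausing between pages.
    sess = requests.Session()
    comments = []
    for page in range(max_pages):
        url = (f'https://club.jd.com/comment/productPageComments.action'
               f'?productId={sku_id}&score=0&sortType=5&page={page}&pageSize=10')
        res = sess.get(url, headers={'Referer': f'https://item.jd.com/{sku_id}.html'})
        res.encoding = res.apparent_encoding
        try:
            data = json.loads(res.text)
        except json.JSONDecodeError:
            break  # blank or non-JSON page: stop instead of hammering the server
        page_comments = data.get('comments') or []
        if not page_comments:
            break
        comments.extend(c['content'].replace('\n', '') for c in page_comments)
        time.sleep(delay + random.uniform(0, 0.5))  # pause (with jitter) between pages
    return comments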

 
