当前位置:   article > 正文

Python爬虫爬取热门电影及其购票链接和简介_爬虫抓电影网址

爬虫抓电影网址

安装BeautifulSoup以及requests

打开 Windows 的 cmd 窗口,输入命令 pip install requests 执行安装,等待它安装完成就可以了

BeautifulSoup库也是同样的方法

BeautifulSoup库的具体使用方法:https://cuiqingcai.com/1319.html

requests库的具体使用方法:https://blog.csdn.net/weixin_36279318/article/details/79442629

其他工具: Chrome浏览器

Python版本: Python3.6

运行平台: Windows

1、首先我们搜索猫眼电影打开热门电影:https://maoyan.com/films

获取网页的代码:

  1. def getHTMLText(url,k):
  2. try:
  3. if(k==0):
  4. a={}
  5. else:
  6. a={'offset':k}
  7. r = requests.get(url,params=a,headers={'User-Agent': 'Mozilla/4.0'})
  8. r.raise_for_status()
  9. r.encoding = r.apparent_encoding
  10. return r.text
  11. except:
  12. print("Failed!")

经过观察其中因为每一页的网址其offset都不相同,故只要改变offset=k便可获取每一页的信息

获取网页中每一部电影的海报:我采用了正则匹配获取每一张图片的链接,通过 urlretrieve 函数把图片下载到指定目录并编号:

  1. def getImg(html):
  2. reg = r'data-src="(.*)"'
  3. imgre = re.compile(reg)
  4. imglist2 = re.findall(imgre,html)
  5. #print(imglist2)
  6. global x
  7. imgurl=[]
  8. namelist=[]
  9. path = 'E:\\test'
  10. if not os.path.isdir(path):
  11. os.makedirs(path)
  12. paths = path+'\\'
  13. for imgurls in imglist2:
  14. urllib.request.urlretrieve(imgurls,'{0}{1}.jpg'.format(paths,x))
  15. x=x+1
  16. return imgurl

用 BeautifulSoup 获取标签,把特惠选座以及电影简介的链接获取后,通过字符串拼接成完整的链接,存入字典中

  1. def getname(html):
  2. global page #通过global把字典定义成全局
  3. global jianjie
  4. soup = BeautifulSoup(html, "html.parser")
  5. movname = soup.find_all(class_='channel-detail movie-item-title')
  6. movitem = soup.find_all(class_='movie-item')
  7. #print(type(movitem))
  8. #print(movitem)
  9. for i in movitem:
  10. for i1 in i.find_all('a'):
  11. try:
  12. ac.append('http://maoyan.com'+i1['href'])
  13. except Exception as e:
  14. continue
  15. lenth = len(movname)
  16. for j in movitem:
  17. for i2 in j.find_all('a'):
  18. try:
  19. c=i2['data-val'].replace('{','').replace('}','').replace('movieid:','movieId=')
  20. ai.append('http://maoyan.com/cinemas?'+c)
  21. except Exception as e:
  22. continue
  23. for i in range(lenth):
  24. mov.append(movname[i].attrs['title'])
  25. file= open('F:/Python/dianying/dianying.doc', 'a')
  26. for i,j in zip(mov,ac):
  27. file.writelines(mode.format(i,j,chr(12288)))
  28. file.writelines('\n')
  29. file.close
  30. jianjie=dict(zip(mov,ac))
  31. page=dict(zip(mov,ai))
  32. for key,v in page.items():
  33. print(mode.format(key,v,chr(12288)))

最后采用input,可以通过输入想看的电影名称来获取特惠选座的链接。

若需要查看简介,即会输出电影简介的链接并自动打开该网页

  1. def main():
  2. basicurl='http://maoyan.com/films'
  3. k=0
  4. print(mode.format('电影名称','特惠选座链接'))
  5. while k<=100:
  6. html=getHTMLText(basicurl,k)
  7. #getImg(html)
  8. getname(html)
  9. k+=30
  10. n1=input('请输入电影名称:')
  11. for key,v in page.items():
  12. if(n1==key):
  13. print(v)
  14. n2=input('是否查看简介:')
  15. if(n2=='是'):
  16. print(jianjie[n1])
  17. webbrowser.open(jianjie[n1])
  18. main()

代码运行的结果如下图:

因输出的内容较多,所以截取一部分(可改变main()中的k值来改变输出电影的部数)

输入电影名称:

输入“是”查看简介:

以下为完整代码:

  1. #coding:utf-8
  2. import requests
  3. from bs4 import BeautifulSoup
  4. import re
  5. import urllib.request
  6. import os
  7. import webbrowser
  8. def getHTMLText(url,k):
  9. try:
  10. if(k==0):
  11. a={}
  12. else:
  13. a={'offset':k}
  14. r = requests.get(url,params=a,headers={'User-Agent': 'Mozilla/4.0'})
  15. r.encoding = r.apparent_encoding
  16. return r.text
  17. except:
  18. print("Failed!")
# Module-level state shared by getname()/getImg() across successive pages.
ac=[]   # detail-page ("简介") links, in page order
mov=[]  # movie titles, in page order
ai=[]   # discounted-seat ("特惠选座") links, in page order
x=0     # running counter used to number downloaded poster images
mode= "{0:<30}\t{1:^30}"  # two-column format template: title, link
  24. def getname(html):
  25. global page
  26. global jianjie
  27. soup = BeautifulSoup(html, "html.parser")
  28. movname = soup.find_all(class_='channel-detail movie-item-title')
  29. movitem = soup.find_all(class_='movie-item')
  30. #print(type(movitem))
  31. #print(movitem)
  32. for i in movitem:
  33. for i1 in i.find_all('a'):
  34. try:
  35. ac.append('http://maoyan.com'+i1['href'])
  36. except Exception as e:
  37. continue
  38. lenth = len(movname)
  39. for j in movitem:
  40. for i2 in j.find_all('a'):
  41. try:
  42. c=i2['data-val'].replace('{','').replace('}','').replace('movieid:','movieId=')
  43. ai.append('http://maoyan.com/cinemas?'+c)
  44. except Exception as e:
  45. continue
  46. for i in range(lenth):
  47. mov.append(movname[i].attrs['title'])
  48. file= open('F:/Python/dianying/dianying.doc', 'a')
  49. for i,j in zip(mov,ac):
  50. file.writelines(mode.format(i,j,chr(12288)))
  51. file.writelines('\n')
  52. file.close
  53. jianjie=dict(zip(mov,ac))
  54. page=dict(zip(mov,ai))
  55. for key,v in page.items():
  56. print(mode.format(key,v,chr(12288)))
  57. def getImg(html):
  58. reg = r'data-src="(.*)"'
  59. imgre = re.compile(reg)
  60. imglist2 = re.findall(imgre,html)
  61. #print(imglist2)
  62. global x
  63. imgurl=[]
  64. namelist=[]
  65. path = 'E:\\test'
  66. if not os.path.isdir(path):
  67. os.makedirs(path)
  68. paths = path+'\\'
  69. for imgurls in imglist2:
  70. urllib.request.urlretrieve(imgurls,'{0}{1}.jpg'.format(paths,x))
  71. x=x+1
  72. return imgurl
  73. def main():
  74. basicurl='http://maoyan.com/films'
  75. k=0
  76. print(mode.format('电影名称','特惠选座链接'))
  77. while k<=100:
  78. html=getHTMLText(basicurl,k)
  79. #getImg(html)
  80. getname(html)
  81. k+=30
  82. n1=input('请输入电影名称:')
  83. for key,v in page.items():
  84. if(n1==key):
  85. print(v)
  86. n2=input('是否查看简介:')
  87. if(n2=='是'):
  88. print(jianjie[n1])
  89. webbrowser.open(jianjie[n1])
  90. main()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/393332
推荐阅读
相关标签
  

闽ICP备14008679号