- 1 import requests
- 2 import json
- 3 from bs4 import BeautifulSoup #网页解析获取数据
- 4 import sys
- 5 import re
- 6 import urllib.request,urllib.error #制定url,获取网页数据
- 7 import sqlite3
- 8 import xlwt #excel操作
- 9
def get_ten():
    """Scrape the QQ Video movie channel page for movie links and titles.

    Requests the listing page with a desktop-browser User-Agent (the site
    rejects the default urllib agent), parses the HTML, and collects the
    ``href`` and ``title`` attributes of every ``<a class="figure">``
    element. Each pair is printed as it is collected.

    NOTE(review): despite the name, this scrapes every movie card on the
    page, not only ten — name kept for caller compatibility.

    Returns:
        list[tuple[str, str]]: one ``(link, title)`` pair per movie found.
    """
    url = "https://v.qq.com/channel/movie?_all=1&channel=movie&listpage=1&sort=18"
    headers = {
        # Pretend to be Chrome; the site blocks the default Python-urllib agent.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '+
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)  # build the request
    response = urllib.request.urlopen(req).read().decode()  # fetch and decode the page
    html = BeautifulSoup(response, "html.parser")  # parse the HTML

    # Each movie card is an <a class="figure" href=... title=...> element.
    # Read the attributes directly from the parsed tags instead of running
    # regexes over str(tag), and keep link and title paired per movie (the
    # original appended them as two separate lists, losing the association).
    data_res = []
    for anchor in html.find_all("a", class_="figure"):
        link = anchor.get("href", "")   # movie detail-page URL
        title = anchor.get("title", "") # movie display name
        data_res.append((link, title))

    for item in data_res:
        print(item)

    # Bug fix: the original returned `res` (the Request object), which is
    # useless to callers; return the scraped data instead.
    return data_res
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    get_ten()