赞
踩
# Print the link text and absolute URL of every "/wiki/..." anchor found on
# the English Wikipedia main page, skipping links whose href ends in .jpg/.JPG.
import re
from urllib.request import urlopen

# Import the BeautifulSoup package.
from bs4 import BeautifulSoup as bs

WIKI_BASE_URL = "https://en.wikipedia.org"
START_URL = WIKI_BASE_URL + "/wiki/Main_page"

# Compiled once, outside the loop. Raw strings keep "\." a regex escape
# rather than an (invalid) string escape sequence.
WIKI_HREF_PATTERN = re.compile(r"^/wiki/")
JPG_HREF_PATTERN = re.compile(r"\.(jpg|JPG)$")


def main():
    """Fetch START_URL and print one "<text> <----> <absolute url>" line per link."""
    # Request the URL and decode the response body as UTF-8; the context
    # manager closes the connection even if reading or decoding fails.
    with urlopen(START_URL) as resp:
        html = resp.read().decode("utf-8")

    # Parse the page with BeautifulSoup.
    soup = bs(html, "html.parser")

    # Collect every <a> tag whose href attribute starts with "/wiki/".
    for link in soup.find_all("a", href=WIKI_HREF_PATTERN):
        href = link["href"]
        # The matched anchors include links to .jpg images; exclude those.
        if JPG_HREF_PATTERN.search(href):
            continue
        # Print the link text followed by the site root joined with the href.
        print(link.get_text(), "<---->", WIKI_BASE_URL + href)


if __name__ == "__main__":
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。