pip install requests
pip install bs4
pip install lxml
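After installing, a quick sanity check confirms that all three packages import correctly (a minimal sketch; the printed version numbers will of course vary by environment):

import requests
import bs4
from lxml import etree

# All three imports should succeed without errors after the pip installs above
print(requests.__version__)
print(bs4.__version__)
print(etree.__version__)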
import requests
import json

if __name__ == "__main__":
    # Send the same POST request that the page's AJAX call makes to the translation API
    post_url = "https://fanyi.baidu.com/sug"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    key = input("Input word:")
    params = {
        'kw': key
    }
    response = requests.post(url=post_url, data=params, headers=headers)
    dic_obj = response.json()
    # Persist the JSON data to disk
    with open('./translation.json', 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    # Print the JSON data we received
    print(dic_obj)
The output is as follows:
Input word:python
{'errno': 0, 'data': [{'k': 'Python', 'v': '蛇属,蟒蛇属'}, {'k': 'python', 'v': 'n. 巨蛇,大蟒'}, {'k': 'pythons', 'v': 'n. 巨蛇,大蟒( python的名词复数 )'}]}
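If you only want the suggestions themselves rather than the raw response, the returned dictionary can be unpacked directly (a small sketch based on the sample output above; the 'data', 'k' and 'v' fields follow that sample):

# Walk the 'data' list of the response shown above and print each suggestion
for item in dic_obj.get('data', []):
    print(item['k'], '->', item['v'])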
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
cname = input("请输入城市:")
# Query parameters for the store-list interface: city name, page index and page size
params = {
    'cname': cname,
    'pid': '',
    'keyword': '',
    'pageIndex': '1',
    'pageSize': '10'
}
response = requests.post(url=post_url, params=params, headers=headers)
page_txt = response.text
filename = cname + '.html'
with open(filename, 'w', encoding='utf-8') as fp:
    fp.write(page_txt)
print(page_txt, 'OVER!')
Just enter a city name to complete the nearby-store lookup:
请输入城市:北京
{"Table":[{"rowcount":443}],"Table1":[{"rownum":1,"storeName":"前门","addressDetail":"西城区前门西大街正阳市场1号楼中部","pro":"Wi-Fi,礼品卡","provinceName":"北京市","cityName":"北京市"},{"rownum":2,"storeName":"京源","addressDetail":"左家庄新源街24号","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"},{"rownum":3,"storeName":"东大桥","addressDetail":"朝外大街东大桥路1号楼","pro":"Wi-Fi,店内参观,礼品卡","provinceName":"北京市","cityName":"北京市"},{"rownum":4,"storeName":"方庄","addressDetail":"蒲芳路26号","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"},{"rownum":5,"storeName":"安定门","addressDetail":"安定门外大街西河沿13号楼","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"},{"rownum":6,"storeName":"展览路(德宝)","addressDetail":"西外大街德宝新园14号","pro":"Wi-Fi,店内参观,礼品卡","provinceName":"北京市","cityName":"北京市"},{"rownum":7,"storeName":"劲松","addressDetail":"劲松4区401楼","pro":"24小时,Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"},{"rownum":8,"storeName":"西罗园","addressDetail":"西罗园4区南二段","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"},{"rownum":9,"storeName":"蓝桥","addressDetail":"蓝桥餐厅工体北路11-1号","pro":"24小时,Wi-Fi,点唱机,礼品卡","provinceName":"北京市","cityName":"北京市"},{"rownum":10,"storeName":"万惠","addressDetail":"金台里甲15号","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"}]} OVER!
import os
import re
import requests

if not os.path.exists('./img_url'):
    os.mkdir('./img_url')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
# Regular expression that captures the src attribute of every <img> tag
reg = '<img src="(.*?)".*?/>'
home_page_url = "https://www.biedoul.com/t/5pCe56yR5Zu%2B5paH_{}.html"
for index in range(1, 14):
    home_page_url_num = home_page_url.format(str(index))
    print(home_page_url_num)
    home_text = requests.get(url=home_page_url_num, headers=headers).text
    # print(home_text)
    # ex = '<div class="nr"><dl class="xhlist" id="xh.*?"><dd>.*?<img src="(.*?)">.*?</dd></dl></div>'
    img_src = re.findall(reg, home_text, re.S)
    for i, src in enumerate(img_src):
        # Skip the first three image matches
        if i < 3:
            continue
        img_cont = requests.get(url=src, headers=headers).content
        img_path = './img_url/' + src.split('/')[-1]
        with open(img_path, 'wb') as fp:
            fp.write(img_cont)
print("爬取完成")
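Note that re.findall returns whatever is in the src attribute verbatim; if a page uses relative or protocol-relative paths, they need to be joined with the page URL before requesting. A hedged sketch using urllib.parse.urljoin (the example URL and helper name are my own, not taken from the site):

from urllib.parse import urljoin

# Normalize a possibly-relative image path against the page it came from
def absolute_src(page_url, src):
    return urljoin(page_url, src)

# urljoin handles absolute URLs, "//host/x.jpg" and "/x.jpg" alike
print(absolute_src('https://www.biedoul.com/t/page_1.html', '/uploads/a.jpg'))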
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    book_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    }
    # Fetch the table-of-contents page; decode manually to avoid garbled characters
    page_text = requests.get(url=book_url, headers=headers).content.decode('utf-8')
    soup = BeautifulSoup(page_text, 'lxml')
    list_data = soup.select('.book-mulu ul li')
    with open('sanguo.txt', 'w', encoding='utf-8') as fp:
        for src in list_data:
            title = src.a.string
            detail_url = 'https://www.shicimingju.com' + src.a['href']
            # Fetch the chapter detail page; decode manually to avoid garbled characters
            page_texts = requests.get(url=detail_url, headers=headers).content.decode('utf-8')
            # Parse the chapter page and extract the text content
            detail_soup = BeautifulSoup(page_texts, 'lxml')
            page_content = detail_soup.find('div', class_='chapter_content')
            content = page_content.text
            fp.write(title + ":" + content + '\n')
            print(title + "爬取成功")
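One small refinement worth considering: the loop above fires one request per chapter back-to-back. A helper like the following adds a short pause before each fetch (a sketch of my own; the polite_get name and the 0.5-second delay are arbitrary choices, not part of the original code):

import time
import requests

def polite_get(url, headers, delay=0.5):
    # Wait briefly before each request so we don't hammer the server,
    # then fetch and decode the page the same way as above.
    time.sleep(delay)
    return requests.get(url=url, headers=headers).content.decode('utf-8')

Inside the loop, page_texts = polite_get(detail_url, headers) could then replace the direct requests.get call.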
import requests
import ddddocr
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
login_url = 'http://www.chaojiying.com/user/login/'
page_text = requests.get(url=login_url, headers=headers).text
# Parse the login page with etree
page_tree = etree.HTML(page_text)
# Use XPath to locate the captcha image
img_url = page_tree.xpath('/html/body/div[3]/div/div[3]/div[1]/form/div/img/@src')[0]
img_data = requests.get(url='http://www.chaojiying.com' + img_url, headers=headers).content
with open('./img_code.jpg', 'wb') as fp:
    fp.write(img_data)
# ddddocr: a very handy open-source OCR project on GitHub, used here to recognize the captcha
ocr = ddddocr.DdddOcr()
with open('./img_code.jpg', 'rb') as f:
    img_bytes = f.read()
res = ocr.classification(img_bytes)
user_login_url = 'http://www.chaojiying.com/user/login/'
data = {
    'user': 'username',
    'pass': 'password',
    'imgtxt': res,  # captcha text recognized from the image
    'act': '1'
}
# Submit the login form (simulating a successful login) and keep the status code
user_page_text = requests.post(url=user_login_url, headers=headers, data=data).status_code
1.2: You can also log in using a cookie
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
    'cookie': 'cookie value'
}
if user_page_text == 200:
    succ_url = 'http://www.chaojiying.com/user/'
    success_page_text = requests.get(url=succ_url, headers=headers).text
    print(success_page_text)
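An alternative to pasting the cookie string by hand is to let requests manage it: a requests.Session object stores the cookies set by the login response and sends them on every later request automatically. A sketch reusing the user_login_url, headers and data defined above (whether the site accepts a session-based login like this is not something the original post verifies, so treat it as illustrative):

import requests

session = requests.Session()
# The session records any cookies returned by the login POST...
session.post(url=user_login_url, headers=headers, data=data)
# ...and sends them automatically on subsequent requests
success_page_text = session.get(url='http://www.chaojiying.com/user/', headers=headers).text
print(success_page_text)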
import os
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
if not os.path.exists('./pptPic'):
    os.mkdir('./pptPic')
if not os.path.exists('./ppt'):
    os.mkdir('./ppt')
# Crawl the PPT preview images and collect the detail-page links
home_url = 'http://www.51pptmoban.com/ppt/'
home_page = requests.get(url=home_url, headers=headers).text
home_tree = etree.HTML(home_page)
# Grab every PPT preview image on the current page
img_list = home_tree.xpath('//div[@class="pdiv"]//img/@src')
for img_path in img_list:
    img_path = 'http://www.51pptmoban.com' + img_path
    img_name = img_path.split('/')[-1]
    img_store = 'pptPic/' + img_name
    img_binary = requests.get(url=img_path, headers=headers).content
    with open(img_store, 'wb') as fp:
        fp.write(img_binary)
# Collect the detail-page URL of each PPT template
ppt_list = home_tree.xpath('//div[@class="pdiv"]/a/@href')
for ppt_url in ppt_list:
    ppt_url = 'http://www.51pptmoban.com' + ppt_url
    middle_page = requests.get(url=ppt_url, headers=headers).text
    middle_tree = etree.HTML(middle_page)
    down_url = 'http://www.51pptmoban.com' + middle_tree.xpath('//div[@class="ppt_xz"]/a/@href')[0]
    down_page = requests.get(url=down_url, headers=headers).text
    down_tree = etree.HTML(down_page)
    load_url = down_tree.xpath('//div[@class="down"]/a/@href')[0]
    # The link is relative ("../..."); rebuild the absolute download URL
    load_url = load_url.split('..')[1]
    load_url = 'http://www.51pptmoban.com/e/DownSys' + load_url
    # Download the zip file and write it to the folder as a byte stream
    f = requests.get(url=load_url, headers=headers).content
    down_name = down_tree.xpath('//div[@class="wz"]/a/text()')[1]
    with open('ppt/' + down_name + '.zip', 'wb') as output:
        output.write(f)
    print(down_name + ": 下载完成")
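Since each template arrives as a zip archive, the standard-library zipfile module can unpack everything in ./ppt afterwards (a sketch I'm adding here; the ./ppt_unzipped target folder is my own choice, not part of the original script):

import os
import zipfile

# Extract every downloaded template archive into its own folder
for name in os.listdir('./ppt'):
    if name.endswith('.zip'):
        target = os.path.join('./ppt_unzipped', name[:-4])
        os.makedirs(target, exist_ok=True)
        with zipfile.ZipFile(os.path.join('./ppt', name)) as zf:
            zf.extractall(target)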
Since this part is straightforward, I won't go into more detail.