当前位置:   article > 正文

Python爬虫的9个实例

python爬虫实例

1、爬取百度

  1. import requests
  2. keyword = "Python"
  3. try:
  4. kv = {'wd':keyword}
  5. r = requests.get("http://www.baidu.com/s",params=kv)
  6. print(r.request.url)
  7. r.raise_for_status()
  8. print(r.text)
  9. except:
  10. print("爬取失败")

2、查询IP地址

  1. import requests
  2. url = "https://ip138.com/iplookup.asp?ip="
  3. kv={'user-agent':'Mozilla/5.0'}
  4. try:
  5. r= requests.get(url+'ip'+'&action=2','202.204.80.112',headers=kv)
  6. r.raise_for_status()
  7. r.encoding = r.apparent_encoding
  8. print(r.text[-500:])
  9. except:
  10. print("失败")

3、股票爬取

  1. import requests
  2. from bs4 import BeautifulSoup
  3. import traceback
  4. import re
  5. def getHTMLText(url):
  6. try:
  7. r=requests.get(url)
  8. r.raise_for_ststus()
  9. r.encoding=r.apparent_encoding
  10. return r.text
  11. except:
  12. return""
  13. def getStockList(lst,stockURL):
  14. html=getHTMLText(stockURL)
  15. soup=BeautifulSoup(html,'html.parser')
  16. a=soup.find_all('a')
  17. for i in a:
  18. try:
  19. href=i.attrs['href']
  20. lst.append(re.finall(r"[s][hz]\d{6}",href)[0])
  21. except:
  22. continue
  23. return ""
  24. def getStockInfo(lst,stockURL,fpath):
  25. count=0
  26. timel=time.time()
  27. for stock in lst:
  28. url = stockURL + stock
  29. html = getHTMLText(url)
  30. try:
  31. if html == "":
  32. continue
  33. infoDict = {}
  34. soup = BeautifulSoup(html,'html.parser')
  35. stockInfo = soup.find('header',attrs = {'class':'stock_title'})
  36. name = stockInfo.find_all(attrs = {'h1':''})[0]
  37. infoDict.update({'股票名称':name.text.split('</h1>')[0]})
  38. stockInfo = soup.find('div',attrs = {'class':'stock_top clearfix'})
  39. keyList = stockInfo.find_all('dt')
  40. valueList = stockInfo.find_all('dd')
  41. for i in range(len(keyList)):
  42. key = keyList[i].text
  43. val = valueList[i].text
  44. infoDict[key] = val
  45. with open(fpath,'a',encoding = 'utf-8') as f:
  46. f.write(str(infoDict) + '\n')
  47. count = count + 1
  48. time2 = time.time() - time1
  49. print('\r当前速度:{:.2f}%\t用时:{:.2f}秒'.format((count * 100 /len(lst)),time2),end ='')
  50. except:
  51. count = count + 1
  52. time2 = time.time() - time1
  53. print('\r当前速度:{:.2f}%\t用时:{:.2f}秒'.format((count * 100 /len(lst)),time2),end ='')
  54. traceback.print_exc()
  55. continue
  56. return ""
  57. def main():
  58. stock_list_url = 'https://hq.gucheng.com/gpdmylb.html'
  59. stock_info_url = 'https://hq.gucheng.com/'
  60. output_file = 'C:\\StockInfo.txt'
  61. slist = []
  62. getStockList(slist,stock_list_url)
  63. getStockInfo(slist,stock_info_url,output_file)
  64. main()

4、爬取京东商品

  1. import requests
  2. url="http://item.jd.com/2967929.html"
  3. try:
  4. r=requests.get(url)
  5. r.raise_for_status()
  6. r.encoding=r.apparent_encoding
  7. print(r.text[:1000])
  8. except:
  9. print("爬取失败")

5、爬取图片

  1. import requests
  2. import os
  3. url= "http://pic.87g.com/upload/2020/0102/20200102093322295.jpg"
  4. root= "C://Users//15133//Pictures//Camera Roll"
  5. path=root+url.split('/')[-1]
  6. try:
  7. if not os.path.exists(root):
  8. os.mkdir(root)
  9. if not os.path.exists(path):
  10. r=requests.get(url)
  11. with open(path,'wb') as f:
  12. f.write(r.content)
  13. f.close()
  14. print("成功")
  15. else:
  16. print("存在")
  17. except:
  18. print("失败")

6、图表库练习

  1. import matplotlib.pyplot as plt
  2. labels= 'Frogs', 'Hogs', 'Dogs', 'Logs'
  3. sizes = [15, 30,45, 10]
  4. explode = (0, 0.1,0, 0)
  5. plt.pie(sizes, explode=explode, labels=labels, autopct= '%1.1f%%',shadow=False, startangle=90)
  6. plt. show()

7、图片转水墨画

  1. from PIL import Image
  2. import numpy as np
  3. a = np.asarray(Image.open('C:\Users\15133\Pictures\Saved Pictures\QQ图片20191029201945.jpg').convert('L')).astype('float')
  4. depth = 10. # (0-100)
  5. grad = np.gradient(a) #取图像灰度的梯度值
  6. grad_x, grad_y = grad #分别取横纵图像梯度值
  7. grad_x = grad_x*depth/100.
  8. grad_y = grad_y*depth/100.
  9. A = np.sqrt(grad_x**2 + grad_y**2 + 1.)
  10. uni_x = grad_x/A
  11. uni_y = grad_y/A
  12. uni_z = 1./A
  13. vec_el = np.pi/2.2 # 光源的俯视角度,弧度值
  14. vec_az = np.pi/4. # 光源的方位角度,弧度值
  15. dx = np.cos(vec_el)*np.cos(vec_az) #光源对x 轴的影响
  16. dy = np.cos(vec_el)*np.sin(vec_az) #光源对y 轴的影响
  17. dz = np.sin(vec_el) #光源对z 轴的影响
  18. b = 255*(dx*uni_x + dy*uni_y + dz*uni_z) #光源归一化
  19. b = b.clip(0,255)
  20. im = Image.fromarray(b.astype('uint8')) #重构图像
  21. im.save('C:\Users\15133\Pictures\Saved Pictures\QQ图片20191029201945.jpg')

8、爬取高校排名

  1. import requests
  2. from bs4 import BeautifulSoup
  3. import bs4
  4. def getHTMLText(url) :
  5. try:
  6. r= requests.get(url,timeout=30)
  7. r.raise_for_status()
  8. r.encoding=r.apparent_encoding
  9. return r.text
  10. except:
  11. return ""
  12. def fillUnivList(ulist,html):
  13. soup=BeautifulSoup(html,"html.parser")
  14. for tr in soup.find('tbody').children:
  15. if isinstance(tr,bs4.element.Tag):
  16. tds=tr('td')
  17. ulist.append([tds[0].string,tds[1].string,tds[3].string])
  18. def printUnivList(ulist,num):
  19. print("{:^10}\t{:^6}\t{:^10}".format("排名","学校","总分"))
  20. for i in range(num):
  21. u=ulist[i]
  22. print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))
  23. def main():
  24. uinfo=[]
  25. url='http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
  26. html=getHTMLText(url)
  27. fillUnivList(uinfo,html)
  28. printUnivList(uinfo,20)# 20 unives
  29. main()

9、亚马逊商品爬取

  1. import requests
  2. url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
  3. try:
  4. kv = {'user-agent':'Mozilla/5.0'}
  5. r = requests.get(url,headers=kv)
  6. r.raise_for_status()
  7. r.encoding = r.apparent_encoding
  8. print(r.text[1000:2000])
  9. except:
  10. print("爬取失败")

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/weixin_40725706/article/detail/738256
推荐阅读
相关标签
  

闽ICP备14008679号