当前位置:   article > 正文

爬虫---获取网页数据的几种方式_爬虫时获取网站数据的方法有哪几种

爬虫时获取网站数据的方法有哪几种

1.post方式(有道词典)

# -*- coding:utf-8 -*-

import urllib
import urllib2

# Youdao web-translation endpoint. The form below is sent as the request body.
url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
headers = {
    'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Text to translate, typed by the user at the prompt.
key = raw_input('请输入翻译的英语:')

formdata = {
    # Send what the user actually typed. This field used to be hard-coded
    # to 'hello', which silently ignored the input read above.
    'i': key,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '1526387112925',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
    'typoResult': 'false',
}

# Form-encode the fields; supplying `data` makes urllib2 issue a POST.
data = urllib.urlencode(formdata)

request = urllib2.Request(url, data=data, headers=headers)

# Print the raw response body returned by the server.
print(urllib2.urlopen(request).read())
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32

2.ajax方式(豆瓣电影网)

# -*- coding:utf-8 -*-

import urllib
import urllib2

# Douban movie search endpoint used by this example.
search_url = 'https://movie.douban.com/j/new_search_subjects?'

# Present a desktop-browser User-agent string to the server.
browser_headers = {
    'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Search parameters, form-encoded below.
search_params = {
    'sort': 'T',
    'range': '0,10',
    'tags': '',
    'start': '10',
    'genres': '科幻',
}
encoded_params = urllib.urlencode(search_params)

# NOTE(review): because `data` is supplied, urllib2 sends the parameters as a
# POST body rather than appending them after the trailing '?' of the URL --
# confirm the endpoint accepts them this way.
req = urllib2.Request(search_url, data=encoded_params, headers=browser_headers)

resp = urllib2.urlopen(req)
body = resp.read()
print(body)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

3.cookie登录方式(CSDN)

# -*- coding:utf-8 -*-

import urllib2

# Article page to fetch while presenting a logged-in session cookie.
url = 'http://blog.csdn.net/jian15093532273/article/details/80313793'

# Request headers copied from a browser session.
#
# Fixes relative to the captured header dump:
#   * 'Origin' and 'Referer' had been split at the wrong colon
#     ('Origin:https' -> '//www.csdn.net'), which produced bogus header names.
#   * 'Content-Length' is not set: this request carries no body, so declaring
#     286 bytes would describe a body that is never sent.
#   * 'Host' is not set: urllib2 fills it in from the URL, whereas the captured
#     value ('re.csdn.net') did not match the host being requested.
header = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'text/plain;charset=UTF-8',
    # Captured session cookie; replace it with the cookie of your own
    # logged-in session before running.
    'Cookie': 'uuid_tt_dd=5624585880944396552_20180509; kd_user_id=7059fcfd-6b66-454b-bb99-f87b70676dde; UN=jian15093532273; dc_session_id=10_1526392746369.315469; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1526284068,1526381959,1526388494,1526392747; smidV2=20180515215924c4d5f3ee6537ace030bddb96f86f76020096344313ac34450; kd_0e1a1f29-37da-4c44-8a33-b4735dc85f10_kuickDeal_pageIndex=0; kd_0e1a1f29-37da-4c44-8a33-b4735dc85f10_kuickDeal_leaveTime=1526392783862; UserName=jian15093532273; UserInfo=OYXG1lKtKnuQ5RuGXlV0ddyWw4m%2FHJnX%2B%2Ftpib66T3S%2FjZ7%2Bv%2BuBm6yb90pmiyaRMXpMvJrbQ31cvjx%2Fu1Lp1VRW3NTSkn53a5C4ZF7aPED%2Fm%2F0dM%2BCekDkMqEF2rgU2a5R5Annds%2B5S4sEnaKk%2FCA%3D%3D; UserNick=OnlyLove_%E7%90%B3; AU=315; BT=1526392781724; dc_tos=p8rw7l; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1526392786',
    'Origin': 'https://www.csdn.net',
    'Referer': 'https://www.csdn.net/',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# No `data` argument, so urllib2 issues a GET.
request = urllib2.Request(url, headers=header)

# Print the raw response body returned by the server.
print(urllib2.urlopen(request).read())
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

4.关闭ssl安全证书验证(12306)

# -*- coding:utf-8 -*-

import urllib2
import ssl

# Page to download over HTTPS.
url = 'https://www.12306.cn/mormhweb/'

# Build an SSL context that skips certificate verification, so the request
# is not rejected because of the site's certificate.
unverified_ctx = ssl._create_unverified_context()

# Open the URL with the unverified context instead of the default one.
page = urllib2.urlopen(urllib2.Request(url), context=unverified_ctx)

print(page.read())
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/笔触狂放9/article/detail/845320
推荐阅读
相关标签
  

闽ICP备14008679号