# (Residue from the source web page: "like" / "dislike" button labels.)
#coding:utf-8
import base64
import contextlib
import hashlib
import json
import logging.handlers
import pickle
import random
import re
import sqlite3
import sys
import threading
import time
import urllib2
import uuid
from datetime import datetime

import redis
import requests
import urllib3

# Python 2 only: make UTF-8 the implicit str/unicode codec.
reload(sys)
sys.setdefaultencoding('utf-8')

# One shared HTTP session for every request made by this script.
session = requests.session()
# Default SQLite database file (the location the script was written for).
_DEFAULT_DB_PATH = "C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3"

_FEED_URL = "http://www.toutiao.com/api/pc/feed/"
_SITE_ROOT = "http://www.toutiao.com"

# NOTE(review): this pattern matches the empty string at position 0, so the
# extracted content is always "".  The delimiters around the capture group
# were most likely lost when the code was pasted -- restore them.
_CONTENT_PATTERN = '(.*?)'

# Request headers are loop-invariant, so they are built once here.
_FEED_HEADERS = {
    "Host": "www.toutiao.com",
    "Connection": "keep-alive",
    "Accept": "text/javascript, text/html, application/xml, text/xml, */*",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "http://www.toutiao.com/ch/news_hot/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

_ARTICLE_HEADERS = {
    "Host": "www.toutiao.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}


def md5(str):
    """Return the hexadecimal MD5 digest of *str*.

    Used to turn an article link into a unique primary key.  Text input is
    encoded as UTF-8 before hashing; byte strings are hashed as-is.

    The parameter name shadows the builtin ``str``; it is kept so existing
    keyword callers keep working.
    """
    import hashlib

    data = str
    if not isinstance(data, bytes):
        # hashlib only accepts bytes; encode explicitly instead of relying
        # on the interpreter's default encoding.
        data = data.encode("utf-8")
    return hashlib.md5(data).hexdigest()


def _fetch_feed_page(page):
    """Request one page of the feed; return the decoded JSON, or None on failure."""
    params = {
        "category": "news_game",
        "utm_source": "toutiao",
        "widen": "%s" % page,
        "max_behot_time": "0",
        "max_behot_time_tmp": "0",
        "tadrequire": "true",
        "as": "479BB4B7254C150",
        "cp": "7E0AC8874BB0985",
    }
    try:
        body = session.get(url=_FEED_URL, params=params,
                           headers=_FEED_HEADERS).text
        return json.loads(body)
    except (requests.RequestException, ValueError) as exc:
        # Network error or a non-JSON body: report it and let the caller
        # move on to the next page.
        print(exc)
        return None


def _feed_succeeded(feed):
    """Return True when the decoded feed response reports success."""
    if not isinstance(feed, dict):
        return False
    # NOTE(review): the original code read only "message1", which looks like
    # a typo for "message".  Both keys are accepted until this is confirmed
    # against a live response.
    status = feed.get("message1", feed.get("message"))
    return status == "success"


def _fetch_article(url):
    """Return the raw body of *url*, or an empty string when the request fails."""
    try:
        return session.get(url=url, headers=_ARTICLE_HEADERS).content
    except requests.RequestException as exc:
        # Previously a bare ``except: pass`` left the body undefined (first
        # item) or stale (later items); an explicit empty body avoids
        # storing another article's content.
        print(exc)
        return ""


def _extract_content(page):
    """Return the first match of the content pattern in *page*, or ""."""
    try:
        return re.findall(_CONTENT_PATTERN, page)[0]
    except (IndexError, TypeError):
        return ""


def _store_item(cx, item):
    """Download one feed item's article page and insert the item into ``toutiao``."""
    try:
        source_url = item["source_url"]
        url1 = _SITE_ROOT + str(source_url)
        contentData = _extract_content(_fetch_article(url1))

        # Report the article link being stored (not the feed endpoint).
        print("正在插入链接 %s 数据" % url1)
        chinese_ta = item["chinese_tag"]
        media_avatar_url = item["media_avatar_url"]
        is_feed_ad = item["is_feed_ad"]
        tag_url = item["tag_url"]
        title = item["title"]
        tag = item["tag"]
        label = str(item["label"])
        abstract = item["abstract"]

        for value in (title, chinese_ta, media_avatar_url, is_feed_ad,
                      tag_url, tag, label, abstract, source_url):
            print(value)

        # The MD5 of the full article link is stored in the ``url`` column.
        url2 = md5(str(url1))
        cx.execute(
            "INSERT INTO toutiao (title,chinese_ta,media_avatar_url,is_feed_ad,tag_url,tag,label,abstract,source_url,url,contentData)VALUES (?,?,?,?,?,?,?,?,?,?,?)",
            (str(title), str(chinese_ta), str(media_avatar_url),
             str(is_feed_ad), str(tag_url), str(tag), str(label),
             str(abstract), str(source_url), str(url2), str(contentData)))
        cx.commit()
    except Exception as e:
        # Per-item boundary: a missing field or a failed insert must not
        # abort the rest of the crawl, so report it and continue.
        print(e)
        print("cha ru shi bai")


def jinri(db_path=_DEFAULT_DB_PATH):
    """Crawl the ``news_game`` feed and store every item in SQLite.

    Requests feed pages 1 through 19.  For each item of a successful page,
    the article page is downloaded and one row is inserted into the
    ``toutiao`` table, committing after each insert.  Pages that cannot be
    fetched or do not report success print "请求失败" and are skipped.

    Args:
        db_path: Path of the SQLite database file.  Defaults to the location
            that was previously hard-coded.

    Returns:
        A list.  NOTE(review): nothing is ever appended to it, so it is
        always empty; kept because callers print the return value.
    """
    list_data = []
    # sqlite3 connections do not close themselves when used as a context
    # manager, hence ``closing``: the handle is released even on error.
    with contextlib.closing(
            sqlite3.connect(db_path, check_same_thread=False)) as cx:
        cx.text_factory = str
        for page in range(1, 20):
            feed = _fetch_feed_page(page)
            if not _feed_succeeded(feed):
                print("请求失败")
                continue
            for item in feed.get("data") or []:
                _store_item(cx, item)
    return list_data


if __name__ == "__main__":
    print(jinri())
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.