# -*- coding: utf-8 -*-
"""
Created on Thu Oct 18 09:13:32 2018

@author: Gawen

Scrapes the abstract of every paper listed on a target IEEE Xplore issue page,
translates each abstract with the Baidu Translate API, and writes the paper
link together with the translated abstract to a text file.

You must apply for your own Baidu Translate appid and secretkey; the free tier
covers 2,000,000 characters per month.

Python version: 3.6
Required packages: selenium, requests, beautifulsoup4, lxml
Required software: chrome-driver
"""
# example url https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8360187&punumber=8360187&filter=issueId%20EQ%20%228363090%22&pageNumber=9&pageNumber=10
import hashlib
import json
import random
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def writetxt(file, url, abstract):
    """Append one paper's link and translated abstract to the output file."""
    with open(file, 'a', encoding='GBK') as file_txt:
        file_txt.write('链接:\n' + url)
        file_txt.write('\n')
        file_txt.write('摘要:\n' + abstract)
        file_txt.write('\n')
        file_txt.write('\n')


def trans(q):
    """Translate an English string into Chinese via the Baidu Translate API."""
    appid = ''      # your own Baidu Translate appid
    secretkey = ''  # your own Baidu Translate secretkey
    myurl = '/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    # The API signature is the MD5 hash of appid + query + salt + secretkey
    sign = appid + q + str(salt) + secretkey
    sign = hashlib.md5(sign.encode(encoding='utf-8')).hexdigest()
    myurl = (myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(q)
             + '&from=' + fromLang + '&to=' + toLang
             + '&salt=' + str(salt) + '&sign=' + sign)
    print(myurl)
    try:
        r = requests.get('http://api.fanyi.baidu.com' + myurl)
        print(r.content.decode('utf-8'))
    except Exception as e:
        print(e)
        return 'error'
    result = json.loads(r.content.decode('utf-8'))
    # A failed call carries an 'error_code' field instead of 'trans_result'
    if 'error_code' in result:
        return 'error'
    return result['trans_result'][0]['dst']


url = input('please input the url that you want to download:\n')
fore = 'https://ieeexplore.ieee.org'
r = requests.get(url)
html = r.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

# Every paper title on the issue page sits in an <h3> inside the results list
h3 = soup.find('div', class_='cf jrnl-results-filter').find_all('h3')
h3text = []
errtitle = []   # titles/links that could not be downloaded
links = []
for h in h3:
    h3text.append(h.text.strip())
print(h3text)

# Collect the "View HTML" link for each title; papers without an HTML version are skipped
for title in h3text:
    a = soup.find('a', attrs={'aria-label': 'View HTML: ' + title})
    if a is None:
        errtitle.append(title)
        continue
    links.append(fore + a['href'])
print(links)

# Headless Chrome is needed because the article pages render their content with JavaScript
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
count = 0
for link in links:
    driver.get(link)
    driver.implicitly_wait(20)
    ps = driver.page_source
    lsoup = BeautifulSoup(ps, 'lxml')
    abstract = lsoup.select('body > div > div > div > div > div > div > xpl-root > xpl-document-details > div > div > div > div > section > div > div > xpl-document-abstract > section > div > div > div > div > div')[0].text
    abstract = trans(abstract)
    if abstract == 'error':
        errtitle.append(link)
        continue
    writetxt(r'C:\Users\Gawen\Desktop\abstract.txt', link, abstract)  # output path
    count += 1
    print(count)
    time.sleep(5)   # throttle requests to IEEE Xplore and the translate API
driver.close()
print('共有' + str(len(errtitle)) + '篇论文下载失败')
for err in errtitle:
    print(err)
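
# Usage sketch (a minimal illustration, not part of the original script: the
# file name below is assumed, and it presumes a chromedriver matching the
# installed Chrome is on PATH and that appid/secretkey in trans() are filled
# in with your own credentials):
#
#   $ python ieee_abstract_translate.py
#   please input the url that you want to download:
#   https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8360187&...
#
# Each successfully translated abstract is appended to abstract.txt as a
# 链接: / 摘要: pair; papers with no "View HTML" link or with a failed
# translation are reported at the end as download failures.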