Software: PyCharm, Navicat

Goal: fetch the records matching a given keyword search — for each record the title, article link, author, date, abstract, and database name — then save them to a database for the follow-up data processing.
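The six fields map naturally onto a single MySQL table. Below is a minimal sketch of creating it with pymysql; the database name wos, the table name article, and the connection credentials are placeholders for whatever you set up in Navicat:

import pymysql

# placeholder connection settings -- replace with your own
conn = pymysql.connect(host='localhost', user='root', password='******',
                       database='wos', charset='utf8mb4')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS article (
        id          INT AUTO_INCREMENT PRIMARY KEY,
        articalname VARCHAR(500),  -- title
        articalink  VARCHAR(500),  -- article link
        author      VARCHAR(500),
        releasetime DATETIME,
        dbname      VARCHAR(100),  -- database name
        abstract    TEXT
    )
''')
conn.commit()
conn.close()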
# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
# bs4
from bs4 import BeautifulSoup
import re
# time handling
import time
import pandas as pd
from datetime import datetime
# database connection
import pymysql
# send the request and return a ready driver
def askurl(url):
    # request headers / launch arguments
    useragent = ''  # fill in your own user agent
    ip = ''  # fill in your own proxy IP (host:port)
    options = webdriver.EdgeOptions()
    options.add_argument('user-agent={}'.format(useragent))
    options.add_argument('--proxy-server=http://{}'.format(ip))  # proxy IP
    # launch the simulated browser
    driver = webdriver.Edge(options=options)
    driver.get(url)
    # close every window we do not need
    now = driver.current_window_handle  # the current main window
    handles = driver.window_handles  # all window handles
    for i in handles:
        if i != now:
            driver.switch_to.window(i)
            driver.close()
            time.sleep(1)
    # switch back to the main window
    driver.switch_to.window(now)
    # return the driver
    return driver
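A note on the driver itself: recent Selenium versions can usually locate msedgedriver automatically, but if Edge fails to launch you can point Selenium at the binary explicitly. A sketch using Selenium 4's Service API, where the path is a placeholder for your own msedgedriver.exe:

from selenium.webdriver.edge.service import Service

service = Service('C:/tools/msedgedriver.exe')  # placeholder path
driver = webdriver.Edge(service=service, options=options)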
url = 'https://www.webofscience.com/wos/woscc/basic-search'
driver = askurl(url)
time.sleep(2)
page = driver.page_source.encode('UTF-8')
page = BeautifulSoup(page, 'lxml')
# accept the cookie banner if it pops up
if page.find_all('button', class_='cookie-setting-link'):
    driver.find_element(By.CSS_SELECTOR, '#onetrust-accept-btn-handler').click()
    time.sleep(5)
# dismiss the guided-tour pop-up if it shows
if page.find_all('button', class_='bb-button _pendo-button-primaryButton _pendo-button'):
    driver.find_element(By.CSS_SELECTOR, '#pendo-button-59b176ac').click()
# type the search keywords
driver.find_element(By.CSS_SELECTOR, '#mat-input-0').send_keys('high-entropy alloy for hydrogen storage')
time.sleep(3)
# click Search
driver.find_element(By.CSS_SELECTOR, '#snSearchType > div.button-row > button.mat-focus-indicator.cdx-but-md.search.uppercase-button.mat-flat-button.mat-button-base.mat-primary').click()
time.sleep(5)
n = 0
while True:
    # turn to the next page via change_page() (defined at the end of this
    # post; in a single script, put that definition before this loop)
    n = n + 1
    page3 = change_page(driver, n)
    # total number of result pages
    page_num = page3.find('span', class_='end-page ng-star-inserted').text
    print(page_num)
    if n > int(page_num):
        print('All pages have been read')
        break
    else:
        # record counter on the current page
        i = 0
        # walk every record on the result page
        for da in page3.find_all('app-record', class_='ng-star-inserted'):
            i = i + 1
            print('================== record ' + str(i) + ' ======================')
            # re-parse this record with lxml
            data = BeautifulSoup(str(da), 'lxml')
            # the <a> tag that holds the title and link
            data = data.select('a[class="title title-link font-size-18 ng-star-inserted"]')
            if data:
                # articalink
                data = data[0]
                link = data.get('href')
                if link:
                    articalink = 'https://www.webofscience.com' + str(link)
                    print(articalink)
                else:
                    articalink = ''
                # click the title to open the detail page
                try:
                    driver.find_element(By.CSS_SELECTOR, 'body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-records-list > app-record:nth-child(' + str(i) + ') > div > div > div.data-section > div:nth-child(1) > app-summary-title > h3 > a').click()
                    time.sleep(10)
                except:
                    # the record is not rendered yet -- scroll to refresh, then retry
                    print('Page not ready, refreshing')
                    for s in range(0, 12000, 250):
                        time.sleep(0.1)
                        driver.execute_script('window.scrollTo(0, %s)' % s)
                    time.sleep(10)
                    try:
                        driver.find_element(By.CSS_SELECTOR, 'body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-records-list > app-record:nth-child(' + str(i) + ') > div > div > div.data-section > div:nth-child(1) > app-summary-title > h3 > a').click()
                        time.sleep(6)
                    except:
                        print('Both clicks failed......')
                # parse the detail page
                page4 = driver.page_source.encode('UTF-8')
                page4 = BeautifulSoup(page4, 'lxml')
                pp = page4.find('div', class_='data-column ng-star-inserted')
                if pp:
                    page5 = BeautifulSoup(str(pp), 'lxml')
                    # articalname
                    artname = page5.find('h2', class_='title text--large')
                    if artname:
                        articalname = artname.text
                        print(articalname)
                    else:
                        articalname = ''
                    # author (strip the Chinese '作者:' label from the page text)
                    aur = page5.find('div', class_='authors-div')
                    if aur:
                        aur = aur.text
                        author = str(aur).replace(' 作者:', '')
                        print(author)
                    else:
                        author = ''
                    # releasetime
                    reltime = page5.find('span', id='FullRTa-pubdate')
                    if reltime:
                        try:
                            timet = reltime.text
                            timet = pd.to_datetime(str(timet))
                        except:
                            timet = None
                        releasetime = timet
                        print(releasetime)
                    else:
                        releasetime = None
                    # dbname
                    dbn = page5.find('span', id='FullRTa-doctype-0')
                    if dbn:
                        dbname = dbn.text
                        print(dbname)
                    else:
                        dbname = ''
                    # abstract
                    ab = page5.find('div', id='FullRTa-abstract-basic')
                    if ab:
                        abstract = ab.text
                        print(abstract)
                    else:
                        abstract = ''
                    print(articalname, articalink, author, releasetime, dbname, abstract)
                    # save the record (the information class is sketched below)
                    information(articalname, articalink, author, releasetime, dbname, abstract).connectmysql()
                # go back to the result list
                driver.back()
                time.sleep(10)
# close the browser!!!
driver.quit()
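The save step above calls information(...).connectmysql(), a small helper class that is not shown in this post. A minimal sketch of what it might look like, assuming the wos.article table from the beginning; the connection settings are placeholders:

class information:
    # bundle one scraped record and write it to MySQL
    def __init__(self, articalname, articalink, author, releasetime, dbname, abstract):
        self.row = (articalname, articalink, author, releasetime, dbname, abstract)

    def connectmysql(self):
        # placeholder credentials -- replace with your own
        conn = pymysql.connect(host='localhost', user='root', password='******',
                               database='wos', charset='utf8mb4')
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    'INSERT INTO article (articalname, articalink, author,'
                    ' releasetime, dbname, abstract)'
                    ' VALUES (%s, %s, %s, %s, %s, %s)',
                    self.row)
            conn.commit()
        finally:
            conn.close()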
def change_page(driver, n):
    print('================================= page ' + str(n) + ' ====================================')
    # on the first page there is nothing to click
    if n == 1:
        # scroll down so that all 18 records on the page get rendered
        for s in range(0, 15000, 250):
            time.sleep(0.1)
            driver.execute_script('window.scrollTo(0, %s)' % s)
        time.sleep(1)
        content = driver.page_source.encode('UTF-8')
        soup = BeautifulSoup(content, 'lxml')
        time.sleep(2)
        return soup
    else:
        # click the next-page button
        a = driver.find_element(By.CSS_SELECTOR, 'body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-page-controls:nth-child(4) > div > form > div > button:nth-child(4)')
        driver.execute_script("arguments[0].click();", a)
        time.sleep(6)
        # again, scroll so that all 18 records get rendered
        for s in range(0, 15000, 250):
            time.sleep(0.1)
            driver.execute_script('window.scrollTo(0, %s)' % s)
        time.sleep(1)
        # parse the page
        content = driver.page_source.encode('UTF-8')
        soup = BeautifulSoup(content, 'lxml')
        return soup
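One last note: the fixed time.sleep() calls above are simple but fragile; they waste time when the page loads quickly and fail when it is slow. If you want something sturdier, Selenium's explicit waits can replace them. A sketch of the keyword-input step rewritten this way, using the same #mat-input-0 selector as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 20)  # wait up to 20 seconds
box = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mat-input-0')))
box.send_keys('high-entropy alloy for hydrogen storage')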
That's all for now; if you run into any problems, feel free to message me any time~