当前位置:   article > 正文

python+selenium+bs4爬取web of science的数据_python爬取web of science

python爬取web of science


前言

软件:pycharm navicat


一、需求

  • 获取相应关键词的数据
    在这里插入图片描述

  • 数据题目 数据链接 作者 日期 摘要 数据库名字
    在这里插入图片描述

  • 将数据保存到数据库中 在进行后续的数据处理


二、步骤

1.引入库

#selenium库
from selenium import webdriver
from selenium.webdriver.common.by import By

#bs4
from bs4 import BeautifulSoup
import re

#处理时间
import time
import pandas as pd
from datetime import datetime

#连接数据库
import pymysql
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15

2.selenium打开虚拟窗口

打开窗口

# Send the initial request and hand back a ready-to-use Selenium driver.
def askurl(url, useragent='', proxy_ip=''):
    """Open *url* in Microsoft Edge and return the driver on its main window.

    Parameters
    ----------
    url : str
        Address to load.
    useragent : str, optional
        Custom User-Agent string; the switch is only added when non-empty.
    proxy_ip : str, optional
        Proxy address ("host:port"); the switch is only added when non-empty.
        The original code read an undefined global ``ip`` (NameError) and
        embedded spaces inside ``--proxy-server`` — both fixed here.

    Returns
    -------
    selenium.webdriver.Edge
        Driver focused on the main window; any extra windows that popped up
        during page load have been closed.
    """
    options = webdriver.EdgeOptions()
    if useragent:
        # Chromium switch syntax is "--user-agent=<value>" (equals sign,
        # no colon, leading dashes).
        options.add_argument("--user-agent={}".format(useragent))
    if proxy_ip:
        # No whitespace is allowed inside a Chromium command-line switch.
        options.add_argument("--proxy-server=http://{}".format(proxy_ip))

    # Start the simulated browser and load the page.
    driver = webdriver.Edge(options=options)
    driver.get(url)

    # Close every window except the one that was current after the load
    # (avoid shadowing the builtin ``all`` as the original did).
    main_handle = driver.current_window_handle
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1)

    # Return focus to the main window before handing the driver back.
    driver.switch_to.window(main_handle)
    return driver

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27

数据处理

  • 接收返回的信息
    # Open the Web of Science basic-search page through the helper module.
    # NOTE(review): ``tk`` is presumably the module that defines askurl()
    # — confirm the import name against the project layout.
    url = 'https://www.webofscience.com/wos/woscc/basic-search'
    driver = tk.askurl(url)
    time.sleep(2)
  • 1
  • 2
  • 3
  • 点击开头的无关选项
    # Dismiss the cookie banner and onboarding popup that cover the page.
    # The DOM snapshot is parsed once; the buttons are then clicked through
    # the live driver.
    page = driver.page_source.encode('UTF-8')
    page = BeautifulSoup(page,'lxml')
    # print(page)
    # Accept cookies if the OneTrust banner is present.
    if page.find_all('button',class_ = 'cookie-setting-link'):
        driver.find_element(By.CSS_SELECTOR,'#onetrust-accept-btn-handler').click()
    
    time.sleep(5)
    
    # Close the Pendo onboarding dialog if it appeared.
    # NOTE(review): bs4's class_ matches this multi-class string only as an
    # exact attribute value, and '#pendo-button-59b176ac' looks like a
    # session-generated id — both may silently stop matching; verify.
    if page.find_all('button',class_ = 'bb-button _pendo-button-primaryButton _pendo-button'):
        driver.find_element(By.CSS_SELECTOR,'#pendo-button-59b176ac').click()

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 写入关键词 点击检索按钮 复制selector路径
    在这里插入图片描述
    # Type the search keywords into the basic-search input box.
    driver.find_element(By.CSS_SELECTOR,'#mat-input-0').send_keys('high-entropy alloy for hydrogen storage')
    time.sleep(3)

    # Click the search button (selector copied from browser devtools).
    driver.find_element(By.CSS_SELECTOR,'#snSearchType > div.button-row > button.mat-focus-indicator.cdx-but-md.search.uppercase-button.mat-flat-button.mat-button-base.mat-primary').click()
    time.sleep(5)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 题目a连接的获取
    # Walk every result page: change_page() performs the pagination click
    # and returns the parsed page source as a BeautifulSoup tree.
    # (Original paste had mangled tab/space indentation; normalized here.)
    n = 0
    while True:
        # Advance to the next page (the first pass shows page 1).
        n = n+1
        page3 = change_page(driver,n)

        # Total page count, read from the paginator's "end page" label.
        page_num =  page3.find('span',class_ = 'end-page ng-star-inserted').text
        print(page_num)

        # print(page3)
        if n > int(page_num):
            print('全部读取完了')
            break
        else:
            # Per-page record counter (1-based after the increment below);
            # also used to build the nth-child() selector for clicking.
            i = 0

            # Loop over each search-result record rendered on this page.
            for da in page3.find_all('app-record',class_ = 'ng-star-inserted'):
                i = i + 1

                print('==================第'+str(i)+'条数据======================')

                # Re-parse the single record so it can be queried alone.
                data = BeautifulSoup(str(da), 'lxml')
                # Locate the title's <a> element inside the record.
                data = data.select('a[class="title title-link font-size-18 ng-star-inserted"]')

                if data:
                    # print(data)

                    # Build the absolute article link from the relative href.
                    data = data[0]
                    link = data.get('href')
                    if link:
                        articalink = 'https://www.webofscience.com' + str(link)
                        print(articalink)
                    else:
                        articalink = ''

                    # Click the title link to open the record's detail page.
                    try:
                        driver.find_element(By.CSS_SELECTOR,'body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-records-list > app-record:nth-child(' + str(i) + ') > div > div > div.data-section > div:nth-child(1) > app-summary-title > h3 > a').click()

                        time.sleep(10)

                    except:
                        # The element may not be rendered yet: scroll down to
                        # force lazy content to load, then retry the click.
                        print('页面刷新 刷新一下')
                        for s in range(0, 12000, 250):
                            time.sleep(0.1)
                            driver.execute_script('window.scrollTo(0, %s)' % s)

                        time.sleep(10)

                        try:
                            driver.find_element(By.CSS_SELECTOR,'body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-records-list > app-record:nth-child(' + str(i) + ') > div > div > div.data-section > div:nth-child(1) > app-summary-title > h3 > a').click()
                            time.sleep(6)
                        except:
                            # NOTE(review): bare except swallows all errors and
                            # the record is simply skipped — confirm intended.
                            print("两次点击失败......")

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 详情页的详情数据获取
        # Parse the detail page that the title click just opened.
        # (Original paste had mangled tab indentation; normalized here.)
        page4 = driver.page_source.encode('UTF-8')
        page4 = BeautifulSoup(page4, 'lxml')

        # Main data column of the record's detail view.
        pp = page4.find('div', class_='data-column ng-star-inserted')
        # print(type(pp))
        if pp:
            page5 = BeautifulSoup(str(pp), 'lxml')

            # Article title (empty string when the node is missing).
            artname = page5.find('h2', class_='title text--large')
            if artname:
                articalname = artname.text
                print(articalname)
            else:
                articalname = ''

            # Author list; the localized " 作者:" label is stripped off.
            aur = page5.find('div', class_='authors-div')
            # print(aur)
            if aur:
                aur = aur.text
                author = str(aur).replace(' 作者:', '')
                print(author)
            else:
                author = ''

            # Publication date, normalized via pandas to a Timestamp
            # (None when missing or unparsable).
            reltime = page5.find('span', id='FullRTa-pubdate')
            if reltime:
                try:
                    timet = reltime.text
                    timet = pd.to_datetime(str(timet))
                except:
                    timet = None
                releasetime = timet
                print(releasetime)
            else:
                releasetime = None

            # Document type, e.g. "Article" (variable is named dbname here).
            dbn = page5.find('span', id='FullRTa-doctype-0')
            if dbn:
                dbname = dbn.text
                print(dbname)
            else:
                dbname = ''

            # Abstract text (empty string when missing).
            ab = page5.find('div', id='FullRTa-abstract-basic')
            if ab:
                abstract = ab.text
                print(abstract)
            else:
                abstract = ''

            print(articalname, articalink, author, releasetime, dbname, abstract)
            # NOTE(review): information(...) is presumably a MySQL-writer
            # class defined elsewhere (pymysql) — confirm its interface.
            information(articalname, articalink, author, releasetime, dbname, abstract).connectmysql()

        # Go back to the result list before processing the next record.
        driver.back()
        time.sleep(10)

# Quit the browser once every page has been processed.
driver.quit()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65

点击翻页

def _scroll_and_parse(driver, height=15000, step=250):
    """Scroll the current page in *step*-pixel increments up to *height*
    so all lazily rendered records load, then return the page source as a
    BeautifulSoup tree (lxml parser)."""
    for offset in range(0, height, step):
        time.sleep(0.1)
        driver.execute_script('window.scrollTo(0, %s)' % offset)
    time.sleep(1)
    return BeautifulSoup(driver.page_source.encode('UTF-8'), 'lxml')


def change_page(driver,n):
    """Bring result page *n* into view and return its parsed HTML.

    Page 1 is already displayed after the search, so only pages > 1 need a
    click on the "next page" control.  In both cases the page is scrolled
    to the bottom so all ~18 records render before parsing.  (The original
    duplicated the scroll/parse logic in both branches; deduplicated here.)

    Parameters
    ----------
    driver : selenium.webdriver.Edge
        Live driver positioned on the result list.
    n : int
        1-based page number being requested.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed snapshot of the fully scrolled page.
    """
    print('=================================第'+str(n)+'页====================================')
    if n > 1:
        # Click "next page" via JavaScript: the button may sit outside the
        # viewport, which would make a plain .click() fail.
        next_btn = driver.find_element(
            By.CSS_SELECTOR,
            'body > app-wos > div > div > main > div > div.held > app-input-route > '
            'app-base-summary-component > div > div.results.ng-star-inserted > '
            'app-page-controls:nth-child(4) > div > form > div > button:nth-child(4)')
        driver.execute_script("arguments[0].click();", next_btn)
        time.sleep(6)
    return _scroll_and_parse(driver)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34

总结

暂时没有 有问题随时私信我~

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/很楠不爱3/article/detail/623746
推荐阅读
相关标签
  

闽ICP备14008679号