赞
踩
应用场景:毕业论文中的参考文献往往多达几十篇,手动调整引用格式(如标点后需加“空格”;作者超过三人时,第三人以后需改为“,等”)十分繁琐。虽然有各种文献管理软件,但能利用自己所学技能批量获取文献引用也挺有趣。
核心:Selenium + 知网。Selenium 是用 Python 写爬虫时常用的库,不了解的小伙伴可以先在 CSDN 搜索一下,做好 Selenium 的前置准备(安装库及浏览器驱动)。本文 Selenium 使用的浏览器为 Google Chrome 浏览器。
具体逻辑:首先将文献下载保存至一个文件夹,文件命名格式为 " XXXXXX_作者.pdf "(知网下载pdf默认格式),然后通过python提取文件名,利用selenium访问知网实现搜索、引用的功能,然后抓取该文献引用格式。
详细代码如下:
其中 file_address 即为保存有文献的文件夹路径,修改为自己的路径即可。另外,代码会读取相对于运行目录的 JS/stealth.min.js 文件并注入到浏览器页面中,运行前需先准备好该文件。
import os
import random
import re
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Folder that holds the downloaded papers; change this path before running.
file_address = r"C:\Users\86156\Desktop\研二下\开题报告\文献\开挖变形\监测"

# Script registered through the CDP command Page.addScriptToEvaluateOnNewDocument,
# i.e. evaluated in every new document the browser opens. The path is
# relative to the working directory.
STEALTH_JS_PATH = 'JS/stealth.min.js'

CNKI_HOME_URL = 'https://www.cnki.net/'

# Absolute XPaths / CSS selectors of the CNKI pages the script drives.
SEARCH_INPUT_XPATH = "/html/body/div[2]/div[2]/div/div[1]/input[1]"
SEARCH_BUTTON_XPATH = "/html/body/div[2]/div[2]/div/div[1]/input[2]"
RESULT_COUNT_XPATH = (
    "/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span/em"
)
AUTHOR_XPATH_TEMPLATE = (
    "/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table/tbody"
    "/tr[{row}]/td[3]/a[1]"
)
QUOTE_TEXT_SELECTOR = (
    "#layui-layer1 > div.layui-layer-content > table > tbody"
    " > tr:nth-child(1) > td.quote-r"
)

# A comma, full stop or colon that is directly followed by visible text.
_PUNCTUATION_BEFORE_TEXT = re.compile(r"([,.:])(?=\S)")


def parse_file_name(file_name):
    """Split a file name of the form "<title>_<author>.pdf".

    The title is everything before the last underscore with all underscores
    removed; the author is everything after it with ".pdf" removed.

    Returns:
        A ``(title, author)`` tuple, or ``None`` when the name contains no
        underscore and therefore does not follow the expected pattern.
    """
    head, separator, tail = file_name.rpartition("_")
    if not separator:
        return None
    return head.replace("_", ""), tail.replace(".pdf", "")


def quote_selector(result_count, row):
    """Return the CSS selector of the "quote" icon in result row *row*.

    Args:
        result_count: Number of hits reported by the result page.
        row: 1-based index of the table row whose author matched.

    The single-hit and first-of-two cases keep the dedicated selectors the
    script has always used; every other combination (including ones that
    previously matched no branch at all) uses ``tr:nth-child(row)``.
    """
    if result_count == 1 and row == 1:
        row_selector = "tr"
    elif result_count == 2 and row == 1:
        row_selector = "tr.odd"
    else:
        row_selector = "tr:nth-child(" + str(row) + ")"
    return (
        "#gridTable > table > tbody > " + row_selector
        + " > td.operat > a.icon-quote"
    )


def _find_citation(driver, title, author):
    """Search CNKI for *title* and return the citation of the matching row.

    Raises:
        NoSuchElementException: when an expected element is missing, which
            includes running past the last result row without finding a row
            whose author link text equals *author*.
        ValueError: when the hit counter is not a number.
    """
    driver.get(CNKI_HOME_URL)
    time.sleep(0.25)

    # Type the title into the search box and submit the query.
    search_box = driver.find_element(By.XPATH, SEARCH_INPUT_XPATH)
    search_button = driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
    ActionChains(driver) \
        .send_keys_to_element(search_box, title) \
        .click(search_button) \
        .perform()
    time.sleep(0.5)

    # Large counts may be rendered with thousands separators.
    count_text = driver.find_element(By.XPATH, RESULT_COUNT_XPATH).text
    result_count = int(count_text.replace(",", ""))

    # Walk the result rows until the first author link (td[3]/a[1]) equals
    # the author taken from the file name. find_element raises once the rows
    # are exhausted, which ends the walk.
    row = 1
    while True:
        row_author = driver.find_element(
            By.XPATH, AUTHOR_XPATH_TEMPLATE.format(row=row)
        ).text
        if row_author == author:
            break
        row += 1

    # Open the citation pop-up of that row and read its first entry.
    quote_icon = driver.find_element(
        By.CSS_SELECTOR, quote_selector(result_count, row)
    )
    ActionChains(driver).click(quote_icon).perform()
    time.sleep(0.5)
    return driver.find_element(By.CSS_SELECTOR, QUOTE_TEXT_SELECTOR).text


def fetch_citation(title, author, stealth_js):
    """Look one paper up on CNKI in a fresh Chrome session.

    Args:
        title: Paper title typed into the CNKI search box.
        author: Author name that the matching result row must show.
        stealth_js: JavaScript source injected into every new document.

    Returns:
        The citation text, or ``None`` when the paper could not be resolved.
    """
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(5)
        driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument', {'source': stealth_js}
        )
        return _find_citation(driver, title, author)
    except (NoSuchElementException, ValueError):
        return None
    finally:
        # Always close the browser; otherwise one Chrome process per paper
        # is left running.
        driver.quit()


def format_citation(citation):
    """Abbreviate the author list and add a space after punctuation.

    The text up to the first "." is treated as the author list. When it
    names more than three authors, everything after the third is replaced by
    ",等". The next two "."-separated segments are kept and joined with "."
    again. Finally a space is inserted after every ",", "." or ":" that is
    directly followed by a non-space character.

    A citation without any "." is only re-spaced.
    """
    authors, dot, _ = citation.partition(".")
    if not dot:
        return _PUNCTUATION_BEFORE_TEXT.sub(r"\1 ", citation)

    author_list = authors.split(",")
    if len(author_list) > 3:
        lead = ",".join(author_list[:3]) + ",等."
    else:
        lead = authors + "."

    # Keep the two segments that follow the authors. Slicing tolerates short
    # citations, and joining with "." restores the separator between them.
    segments = [part for part in citation.split(".")[1:3] if part]
    tail = ".".join(segments) + "." if segments else ""

    return _PUNCTUATION_BEFORE_TEXT.sub(r"\1 ", lead + tail)


def main():
    """Collect and print a citation for every paper in ``file_address``."""
    # The injected script never changes, so read it once up front.
    with open(STEALTH_JS_PATH, encoding="utf-8") as js_file:
        stealth_js = js_file.read()

    cite_box = []  # raw citation strings that were fetched successfully
    default = []   # file names / titles that could not be processed

    for file_name in os.listdir(file_address):
        parsed = parse_file_name(file_name)
        if parsed is None:
            default.append(file_name)
            continue
        title, author = parsed
        citation = fetch_citation(title, author, stealth_js)
        if citation is None:
            default.append(title)
        else:
            cite_box.append(citation)

    print("获取引用文献 ", len(cite_box), " 篇。")
    for citation in cite_box:
        print(format_citation(citation))

    if default:
        print("============失败文件:===========")
        for failed in default:
            print(failed)


if __name__ == "__main__":
    main()
例:
文件夹内容如下:
获取结果如下:
缺点:缺点很多,包括英文文章没有考虑,文献命名格式固定等等,不过作为一个小demo玩玩还是可以的。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。