当前位置:   article > 正文

爬虫批量获取中文文献引用[selenium+知网]_selenium知网

selenium知网

应用场景:毕业论文中的参考文献往往多达几十篇,逐条调整引用格式(如标点后需加“空格”;作者超过三人时,第三人以后需改为“,等”)十分繁琐。虽然有各种文献管理软件,但能利用自己所学技能批量获取文献引用也挺有趣。

核心:selenium+知网。selenium是用python写爬虫时常用的库,不了解的小伙伴可以先在CSDN搜索一下,做好selenium的前置准备(安装selenium及与浏览器版本匹配的驱动)。本文selenium使用的浏览器为Google Chrome浏览器。

具体逻辑:首先将文献下载并保存至同一个文件夹,文件命名格式为 "XXXXXX_作者.pdf"(其中XXXXXX为文献标题,这是知网下载pdf的默认命名格式);然后通过python提取文件名中的标题和作者,利用selenium访问知网,按标题搜索并在结果中按作者匹配,点击“引用”按钮后抓取该文献的引用格式。

详细代码如下

其中file_address即为保存有文献的文件夹路径,修改为自己的路径即可。另外,脚本会读取相对路径 JS/stealth.min.js,运行前需要先准备好该文件。

  import os
  import random
  import re
  import time

  from selenium import webdriver
  from selenium.common.exceptions import NoSuchElementException
  from selenium.webdriver import ActionChains
  from selenium.webdriver.common.by import By
  # Folder that holds the downloaded papers; edit this path before running.
  # file_address=input("文献文件夹:")
  file_address = r"C:\Users\86156\Desktop\研二下\开题报告\文献\开挖变形\监测"

  # Script injected into every new document before the page's own scripts run.
  STEALTH_JS_PATH = 'JS/stealth.min.js'

  CNKI_HOME = 'https://www.cnki.net/'
  SEARCH_INPUT_XPATH = "/html/body/div[2]/div[2]/div/div[1]/input[1]"
  SEARCH_BUTTON_XPATH = "/html/body/div[2]/div[2]/div/div[1]/input[2]"
  # First author link of the row-th search result.
  RESULT_AUTHOR_XPATH = (
      "/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table/tbody"
      "/tr[{row}]/td[3]/a[1]"
  )
  # "Quote" icon of the row-th search result. One positional selector covers
  # every row, so the result count no longer has to be read and special-cased.
  QUOTE_LINK_CSS = (
      "#gridTable > table > tbody > tr:nth-child({row})"
      " > td.operat > a.icon-quote"
  )
  # First citation style listed in the pop-up layer opened by the quote icon.
  QUOTE_TEXT_CSS = (
      "#layui-layer1 > div.layui-layer-content > table > tbody"
      " > tr:nth-child(1) > td.quote-r"
  )

  # ASCII ",", ":" or "." that is directly followed by a non-space character.
  # A "." sitting between two digits (e.g. "3.0") is left untouched.
  _SPACING_RE = re.compile(r"(?:[,:]|(?<!\d)\.|\.(?!\d))(?=\S)")


  def parse_filename(filename):
      """Split a "<title>_<author>.pdf" file name into ``(title, author)``.

      The author is the text after the last underscore (with a trailing
      ".pdf" removed, case-insensitively); the title is everything before
      it with any remaining underscores dropped.

      Returns ``None`` when the name has no underscore or when either part
      is empty, so the caller can report the file as failed.
      """
      stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
      raw_title, sep, author = stem.rpartition("_")
      if not sep:
          return None
      title = raw_title.replace("_", "")
      if not title or not author:
          return None
      return title, author


  def format_citation(citation):
      """Normalise one citation string.

      * The author list is everything before the first "."; when it holds
        more than three comma-separated names, the names after the third
        are replaced by "等".
      * The remainder of the citation is kept in full, including its "."
        separators.
      * A space is inserted after ASCII ",", "." and ":" when they are
        directly followed by text; decimals such as "3.0" are not split and
        no trailing space is appended.
      """
      head, sep, rest = citation.partition(".")
      authors = head.split(",")
      if len(authors) > 3:
          authors = authors[:3] + ["等"]
      combined = ",".join(authors) + sep + rest
      return _SPACING_RE.sub(lambda m: m.group(0) + " ", combined)


  def fetch_citation(title, author, stealth_js):
      """Search CNKI for *title* and return the raw citation text.

      The result rows are scanned top-down until the first author shown in
      a row equals *author*; that row's quote icon is clicked and the first
      citation in the pop-up is returned.

      Returns ``None`` when any expected page element is missing (no search
      box, no matching row, no quote pop-up). The browser started for this
      lookup is always closed, even on failure.
      """
      driver = webdriver.Chrome()
      try:
          driver.implicitly_wait(5)
          driver.execute_cdp_cmd(
              'Page.addScriptToEvaluateOnNewDocument', {'source': stealth_js}
          )
          # Open CNKI and submit the title through the home-page search box.
          driver.get(CNKI_HOME)
          time.sleep(0.25)
          search = driver.find_element(By.XPATH, SEARCH_INPUT_XPATH)
          search_click = driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
          ActionChains(driver) \
              .send_keys_to_element(search, title) \
              .click(search_click) \
              .perform()
          time.sleep(0.5)

          # Walk the rows; running past the last row raises
          # NoSuchElementException, which ends the lookup as "not found".
          row = 1
          while True:
              row_author = driver.find_element(
                  By.XPATH, RESULT_AUTHOR_XPATH.format(row=row)
              ).text
              if row_author == author:
                  break
              row += 1

          cite = driver.find_element(
              By.CSS_SELECTOR, QUOTE_LINK_CSS.format(row=row)
          )
          ActionChains(driver).click(cite).perform()
          time.sleep(0.5)
          return driver.find_element(By.CSS_SELECTOR, QUOTE_TEXT_CSS).text
      except NoSuchElementException:
          return None
      finally:
          driver.quit()


  def main(folder=None):
      """Fetch and print a formatted citation for every paper in *folder*.

      *folder* defaults to the module-level ``file_address``. Files whose
      name cannot be parsed are reported by file name; papers that could
      not be found on CNKI are reported by title.
      """
      if folder is None:
          folder = file_address

      # Read the stealth script once instead of once per paper.
      with open(STEALTH_JS_PATH, encoding="utf-8") as js_file:
          stealth_js = js_file.read()

      cite_box = []  # raw citation strings, in directory-listing order
      failed = []    # file names / titles that produced no citation

      for filename in os.listdir(folder):
          parsed = parse_filename(filename)
          if parsed is None:
              failed.append(filename)
              continue
          title, author = parsed
          cite_info = fetch_citation(title, author, stealth_js)
          if cite_info is None:
              failed.append(title)
          else:
              cite_box.append(cite_info)

      print("获取引用文献 ",len(cite_box)," 篇。")
      for cite_info in cite_box:
          print(format_citation(cite_info))

      if failed:
          print("============失败文件:===========")
          for item in failed:
              print(item)


  if __name__ == "__main__":
      main()

例:

文件夹内容如下:

获取结果如下:

 缺点:缺点很多,包括英文文章没有考虑,文献命名格式固定等等,不过作为一个小demo玩玩还是可以的。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/木道寻08/article/detail/843564
推荐阅读
相关标签
  

闽ICP备14008679号