当前位置:   article > 正文

bbb_333bbb

333bbb
  1. from bs4 import BeautifulSoup
  2. import urllib
  3. import urllib.request
  4. import sys
  5. import io
  6. sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')
  7. page_num=15
  8. def get_paper_url(page_url):
  9. html = urllib.request.urlopen(page_url).read()
  10. soup = BeautifulSoup(html,'html.parser')
  11. f = open('data-detail.txt','a+', encoding='utf-8')
  12. all = soup.find_all('div', class_='wz_content')
  13. for string in all:
  14. item = string.find('a', target='_blank')#文章标题与链接
  15. href = item.get('href')# 获取文章url
  16. title = item.get_text() # 获取文章标题
  17. year_count = string.find('span', class_='year-count')#获取文章出处与引用次数
  18. #year_count = year_count.get_text()
  19. publish = ''
  20. reference = ''
  21. for item in year_count:
  22. item = item.string
  23. item = item.replace('\n','')
  24. item = item.replace('\r', '')
  25. if '被引次数' in item:
  26. reference = item# 获取被引次数
  27. elif '年' in item: # 获取文章出处
  28. publish = item
  29. #print(publish)
  30. #print(reference)
  31. #print(year_count)
  32. f.write(href + '\t' + title + '\t' + publish + '\t' + reference +'\n')
  33. f.close()

  1. # -*- coding: utf-8 -*-
  2. import socket
  3. from bs4 import BeautifulSoup
  4. import urllib
  5. import requests
  6. import time
  7. import xlwt
  8. from configparser import ConfigParser
  9. def spider_paper():
  10. start = time.clock()
  11. # f=urllib2.urlopen(url, timeout=5).read()
  12. # soup=BeautifulSoup(html)
  13. # tags=soup.find_all('a')
  14. file = open("data-detail.txt", encoding='utf8')
  15. cf = ConfigParser()
  16. cf.read("Config.conf", encoding='utf-8')
  17. keyword = cf.get('base', 'keyword')# 关键词
  18. # 写入Excel
  19. wb = xlwt.Workbook("data_out.xls")
  20. sheet = wb.add_sheet("data-out")
  21. sheet.write(0, 0, '下载网址')
  22. sheet.write(0, 1, '标题')
  23. sheet.write(0, 2, '来源')
  24. sheet.write(0, 3, '引用')
  25. sheet.write(0, 4, '作者')
  26. sheet.write(0, 5, '作者单位')
  27. sheet.write(0, 6, '关键词')
  28. sheet.write(0, 7, '摘要')
  29. sheet.write(0, 8, '共引文献')
  30. lines = file.readlines()
  31. txt_num = 1
  32. lin_num = 1
  33. paper_list = []
  34. for line in lines:
  35. object = line.split('\t')
  36. paper_url = object[0]
  37. if paper_url in paper_list:
  38. continue
  39. paper_list.append(paper_url)
  40. attempts = 0
  41. success = False
  42. while attempts < 50 and not success:
  43. try:
  44. html = urllib.request.urlopen(paper_url).read()
  45. soup = BeautifulSoup(html, 'html.parser')
  46. socket.setdefaulttimeout(10) # 设置10秒后连接超时
  47. success = True
  48. except socket.error:
  49. attempts += 1
  50. print("第"+str(attempts)+"次重试!!")
  51. if attempts == 50:
  52. break
  53. except urllib.error:
  54. attempts += 1
  55. print("第"+str(attempts)+"次重试!!")
  56. if attempts == 50:
  57. break
  58. title = soup.find_all('div', style="text-align:center; width:740px; font-size: 28px;color: #0000a0; font-weight:bold; font-family:'宋体';")
  59. abstract = soup.find_all('div', style='text-align:left;word-break:break-all')
  60. author = soup.find_all('div', style='text-align:center; width:740px; height:30px;')
  61. #获取作者名字
  62. for item in author:
  63. author = item.get_text()
  64. # print(item)
  65. #获取摘要信息
  66. tmp = ''
  67. for thing in abstract:
  68. a = thing.strings
  69. for string in a:
  70. tmp = tmp + string
  71. txt_num += 1
  72. result = tmp.split(' ')
  73. tstr = ''
  74. for t in result:
  75. test = t.split('\n')
  76. # print(test)
  77. if test != '\t' and test != '\n' and test != '\r' and test != '':
  78. for i in test:
  79. if len(i) > 1:
  80. item = i.split('\r')
  81. for j in item:
  82. object = j.split('\t')
  83. for k in object:
  84. tstr += k
  85. ifreferen = soup.find_all('td', class_='b14', rowspan='2')
  86. ref = ''
  87. for i in range(len(ifreferen)):
  88. if ('【共引文献】' in ifreferen[i].get_text()):
  89. referenceList = soup.find_all('div', id='div_Ref') # 共引文献列表
  90. if len(referenceList) == 0:
  91. referenceList = soup.find_all('div', class_='div_Ref')
  92. referenceList = referenceList[i]
  93. for tdref in referenceList.find_all('td', width='676'):
  94. refitem = tdref.a.get("href")
  95. refitem = refitem.strip()
  96. print(refitem)
  97. ref = ref + refitem + ' ,'
  98. # 获取作者单位,处理字符串匹配
  99. authorUnitScope = soup.find('div', style='text-align:left;', class_='xx_font')
  100. author_unit = ''
  101. author_unit_text = authorUnitScope.get_text()
  102. # print(author_unit_text)
  103. if '【作者单位】:' in author_unit_text:
  104. auindex = author_unit_text.find('【作者单位】:', 0)
  105. else:
  106. auindex = author_unit_text.find('【学位授予单位】:', 0)
  107. for k in range(auindex, len(author_unit_text)):
  108. if author_unit_text[k] == '\n' or author_unit_text[k] == '\t' or author_unit_text[k] == '\r' or \
  109. author_unit_text[k] == '】':
  110. continue
  111. if author_unit_text[k] == ' ' and author_unit_text[k + 1] == ' ':
  112. continue
  113. if author_unit_text[k] != '【':
  114. author_unit = author_unit + author_unit_text[k]
  115. if author_unit_text[k] == '【' and k != auindex:
  116. break
  117. # 获取关键字
  118. key_word = ''
  119. kwindex = author_unit_text.find('【关键词】:', 0)
  120. for k in range(kwindex, len(author_unit_text)):
  121. if author_unit_text[k] == '\n' or author_unit_text[k] == '\t' or author_unit_text[k] == '\r' or \
  122. author_unit_text[k] == '】':
  123. continue
  124. if author_unit_text[k] == ' ' and author_unit_text[k + 1] == ' ':
  125. continue
  126. if author_unit_text[k] != '【':
  127. key_word = key_word + author_unit_text[k]
  128. if author_unit_text[k] == '【' and k != kwindex:
  129. break
  130. # print(author_unit)
  131. # print(key_word)
  132. line = line.strip('\n')
  133. line = line + '\t' + str(author) + '\t' + str(author_unit) + '\t'+ str(key_word) + '\t'+ str(tstr) + '\t' + str(ref) + '\n'
  134. outstring = line.split('\t')
  135. for i in range(len(outstring)):
  136. sheet.write(lin_num, i, outstring[i])
  137. print('写入第'+str(lin_num)+'行')
  138. lin_num += 1
  139. wb.save('data_out_'+str(keyword)+'.xls')
  140. file.close()
  141. end = time.clock()
  142. print('Running time: %s Seconds' % (end - start))

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/很楠不爱3/article/detail/192557
推荐阅读
相关标签
  

闽ICP备14008679号