当前位置:   article > 正文

Python经典基础习题(网络爬虫)_网络爬虫课后习题

网络爬虫课后习题

1.批量爬取院士信息,把每位院士的文字介绍保存到以该院士名字命名的记事本文件中,照片保存到以该院士名字命名的jpg文件中。

程序代码:

  # Exercise 1: save every academician's biography to "<name>.txt" and the
  # portrait to "<name>.jpg" in the current working directory.
  from re import findall
  from urllib.parse import urljoin
  from urllib.request import urlopen

  BASE_URL = 'https://www.cae.cn'
  LIST_URL = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
  # Profile links on the list page share this path prefix.
  PROFILE_PREFIX = '/cae/html/main/colys'

  PATTERN_LINK = 'href="(.+?)"'
  PATTERN_JPG = 'img src="(.+?)" style="width:150px;height:210px;"'
  PATTERN_MESSAGE = (
      '<p>&ensp;&ensp;&ensp;&ensp;(.+?)</p><p>&nbsp;</p>'
      '<p>&ensp;&ensp;&ensp;&ensp;(.+?)</p>'
  )
  PATTERN_NAME = '<div class="right_md_name">(.+?)</div>'


  def fetch_text(url):
      """Return the decoded body of *url*."""
      with urlopen(url) as response:
          return response.read().decode()


  def find_profile_urls(listing_html):
      """Return absolute URLs of the profile pages linked from the list page.

      A link qualifies when it starts with PROFILE_PREFIX and ends in ".html".
      """
      return [
          BASE_URL + link
          for link in findall(PATTERN_LINK, listing_html)
          if link.startswith(PROFILE_PREFIX) and link.endswith('.html')
      ]


  def parse_profile(profile_html):
      """Extract ``(name, biography, image_src)`` from one profile page.

      Each element is None when its pattern does not match, so a page with
      missing data can never inherit values parsed from an earlier page.
      When a pattern matches several times the last match is used.
      """
      names = findall(PATTERN_NAME, profile_html)
      paragraphs = findall(PATTERN_MESSAGE, profile_html)
      images = findall(PATTERN_JPG, profile_html)
      name = names[-1].strip() if names else None
      # PATTERN_MESSAGE has two groups; the biography is their concatenation.
      biography = ''.join(paragraphs[-1]) if paragraphs else None
      image_src = images[-1] if images else None
      return name, biography, image_src


  def main():
      """Crawl the list page and write one .txt and one .jpg per academician."""
      # requests is third-party; importing it here keeps the parsing helpers
      # above usable with the standard library alone.
      import requests

      for profile_url in find_profile_urls(fetch_text(LIST_URL)):
          name, biography, image_src = parse_profile(fetch_text(profile_url))
          if not name:
              # Without a name there is nothing to call the output files.
              continue
          if biography is not None:
              with open(name + '.txt', 'w', encoding='utf-8') as text_file:
                  text_file.write(biography)
          if image_src is None:
              continue
          # urljoin copes with relative, root-relative and absolute src values.
          reply = requests.get(urljoin(BASE_URL + '/', image_src))
          if not reply.ok:
              print('skipped portrait of', name, '- HTTP', reply.status_code)
              continue
          # Open the file only after a successful download so that a failed
          # request does not leave an empty .jpg behind.
          with open(name + '.jpg', 'wb') as image_file:
              image_file.write(reply.content)


  if __name__ == '__main__':
      main()


2.根据院士名单,爬取每位院士的性别、族别信息;根据院士简介提取该院士就读的本科学校和入选院士年份;将院士姓名、性别、族别信息、本科学校、入选院士年份信息写入excel文件。

程序代码:

  # Exercise 2: write name, gender, ethnic group, undergraduate university and
  # election year of each academician into one row of an Excel workbook.
  from re import findall
  from urllib.request import urlopen

  OUTPUT_PATH = r'D:\message.xlsx'
  SHEET_TITLE = "工程院士信息"
  HEADERS = ['姓名', '性别', '民族', '毕业院校', '入选年份']
  # Number of academicians processed from the list page.
  MAX_RECORDS = 39

  BASE_URL = 'https://www.cae.cn'
  LIST_URL = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
  # Profile links on the list page share this path prefix.
  PROFILE_PREFIX = '/cae/html/main/colys'

  PATTERN_LINK = 'href="(.+?)"'
  PATTERN_MESSAGE = (
      '<p>&ensp;&ensp;&ensp;&ensp;(.+?)</p><p>&nbsp;</p>'
      '<p>&ensp;&ensp;&ensp;&ensp;(.+?)</p>'
  )
  # Same as PATTERN_MESSAGE plus any number of further paragraphs.
  PATTERN_MESSAGE_YEAR = (
      PATTERN_MESSAGE
      + '(<p>&nbsp;</p><p>&ensp;&ensp;&ensp;&ensp;(.+?)</p>)*'
  )
  PATTERN_NEXT_URL = '<a href="(.+?)" target="_blank">'
  # NOTE(review): the live page may use a full-width colon inside <span> -
  # confirm against the site before relying on this pattern.
  PATTERN_DETAIL = '(<span>:</span><h4>|<span>:</span>)(.+?)(</h4></div>|</div>)'
  PATTERN_STUDY = '毕业于(.+?)大学'
  PATTERN_ENTER = r'(\d{4})年当选'


  def fetch_text(url):
      """Return the decoded body of *url*."""
      with urlopen(url) as response:
          return response.read().decode()


  def find_profile_urls(listing_html):
      """Return absolute URLs of the profile pages linked from the list page."""
      return [
          BASE_URL + link
          for link in findall(PATTERN_LINK, listing_html)
          if link.startswith(PROFILE_PREFIX) and link.endswith('.html')
      ]


  def parse_detail(detail_html):
      """Return ``(name, gender, nation)`` from the detail page, or None.

      The page lists its fields in the order name, nation, gender; None is
      returned when fewer than three fields are found.
      """
      fields = findall(PATTERN_DETAIL, detail_html)
      if len(fields) < 3:
          return None
      return fields[0][1], fields[2][1], fields[1][1]


  def extract_school(profile_html):
      """Return the first university named after "毕业于" in the biography.

      Only names of 4 to 9 characters (including "大学") are accepted, which
      filters out matches that swallowed a whole sentence. Returns None when
      the biography names no such university.
      """
      for paragraphs in findall(PATTERN_MESSAGE, profile_html):
          for paragraph in paragraphs:
              for stem in findall(PATTERN_STUDY, paragraph):
                  school = stem + "大学"
                  if 4 <= len(school) <= 9:
                      return school
      return None


  def extract_year(profile_html):
      """Return the first four-digit election year in the biography, or None."""
      matches = findall(PATTERN_MESSAGE_YEAR, profile_html)
      if not matches:
          return None
      for paragraph in matches[0]:
          years = findall(PATTERN_ENTER, paragraph)
          if years:
              return years[0]
      return None


  def main():
      """Crawl the list page and save one worksheet row per academician."""
      # openpyxl is third-party; importing it here keeps the parsing helpers
      # above usable with the standard library alone.
      from openpyxl import Workbook

      wb = Workbook()
      ws = wb.create_sheet(title=SHEET_TITLE)
      ws.append(HEADERS)
      try:
          profile_urls = find_profile_urls(fetch_text(LIST_URL))
          for profile_url in profile_urls[:MAX_RECORDS]:
              profile_html = fetch_text(profile_url)
              detail_links = findall(PATTERN_NEXT_URL, profile_html)
              if not detail_links:
                  continue
              identity = parse_detail(fetch_text(detail_links[0]))
              if identity is None:
                  continue
              name, gender, nation = identity
              # All five values go into the same row, so a missing school or
              # year leaves an empty cell instead of shifting later rows.
              ws.append([
                  name,
                  gender,
                  nation,
                  extract_school(profile_html),
                  extract_year(profile_html),
              ])
      finally:
          # Save once, and also on failure, so rows collected so far survive.
          wb.save(OUTPUT_PATH)


  if __name__ == '__main__':
      main()
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Monodyee/article/detail/475813
推荐阅读
相关标签
  

闽ICP备14008679号