赞
踩
程序代码:
# Download the profile text and portrait of every academician linked from the
# CAE index page. For each detail page, "<name>.txt" and "<name>.jpg" are
# written to the current working directory.
from urllib.request import urlopen
from re import findall
import requests

url = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
with urlopen(url) as file:
    content = file.read().decode()

# Collect every href on the index page.
pattern = 'href="(.+?)"'
result = findall(pattern, content)

# Keep only hrefs whose characters -4..-2 are "htm" (e.g. a ".html" suffix) and
# whose characters 15..19 are "colys"; they are site-relative, so the host is
# prepended.
# NOTE(review): presumably these look like /cae/html/main/colys/<id>.html -
# confirm against the live page.
end = []
for urls in result:
    if urls[-4:-1] == "htm" and urls[15:20] == "colys":
        end.append('https://www.cae.cn' + urls)

pattern_jpg = 'img src="(.+?)" style="width:150px;height:210px;"'
pattern_message = '<p>    (.+?)</p><p> </p><p>    (.+?)</p>'
pattern_name = '<div class="right_md_name">(.+?)</div>'

for urls1 in end:
    with urlopen(urls1) as file:
        content1 = file.read().decode()

    result_jpg = findall(pattern_jpg, content1)
    result_message = findall(pattern_message, content1)
    result_name = findall(pattern_name, content1)

    # The name is the file stem for both outputs. Without it nothing can be
    # saved for this page (and a stale name from the previous page must not be
    # reused), so the page is skipped.
    if not result_name:
        continue
    result_name_str = result_name[-1]  # the last match on the page wins

    if result_message:
        # Each match is a tuple of the two captured paragraphs; they are
        # concatenated without a separator.
        result_message_str = "".join(result_message[-1])
        with open(result_name_str + '.txt', 'w', encoding="utf-8") as file_message:
            file_message.write(result_message_str)

    if result_jpg:
        result_jpg_str = result_jpg[-1]
        url_new = "https://www.cae.cn/" + result_jpg_str
        # Download before opening the output file so a failed request does not
        # leave an empty or bogus .jpg behind; the timeout keeps a stalled
        # connection from hanging the script indefinitely.
        r = requests.get(url_new, timeout=30)
        if r.ok:
            with open(result_name_str + '.jpg', 'wb') as file_jpg:
                file_jpg.write(r.content)
程序代码:
# Scrape the CAE academician pages and record, per person, name, gender,
# ethnicity, university and election year in an Excel workbook.
# NOTE(review): the published listing had lost all indentation; the nesting
# below is a reconstruction and should be confirmed against the author's
# original script.
from urllib.request import urlopen
from re import findall
import openpyxl
from openpyxl import Workbook

fn = r'D:\message.xlsx'

# Create the workbook with its header row and persist it once.
wb = Workbook()
ws = wb.create_sheet(title="工程院士信息")
ws['A1'] = '姓名'
ws['B1'] = '性别'
ws['C1'] = '民族'
ws['D1'] = '毕业院校'
ws['E1'] = '入选年份'
wb.save(fn)

# Re-open the saved file. The sheet created above was appended after the
# workbook's default sheet, hence index 1.
wb = openpyxl.load_workbook(fn)
ws = wb.worksheets[1]

url = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
with urlopen(url) as file:
    content = file.read().decode()

pattern = 'href="(.+?)"'
result = findall(pattern, content)

# Keep only hrefs whose characters -4..-2 are "htm" and whose characters
# 15..19 are "colys"; they are site-relative, so the host is prepended.
end = []
for urls in result:
    if urls[-4:-1] == "htm" and urls[15:20] == "colys":
        end.append('https://www.cae.cn' + urls)

pattern_message_year = ('<p>    (.+?)</p><p> </p><p>    (.+?)</p>'
                        '(<p> </p><p>    (.+?)</p>)*')
pattern_message = '<p>    (.+?)</p><p> </p><p>    (.+?)</p>'
pattern_next_url = '<a href="(.+?)" target="_blank">'
# Loop-invariant patterns, hoisted out of the per-page loops.
pattern_name = '(<span>:</span><h4>|<span>:</span>)(.+?)(</h4></div>|</div>)'
pattern_study = '毕业于(.+?)大学'
pattern_enter = r'\d{4}年当选'  # raw string: "\d" is a regex class, not a str escape

count = 1          # last row written in column 4 (university)
number = 2         # page counter; the loop stops once it reaches 41
compare = []       # pending "<year>年当选" matches awaiting comparison
count_year = 0     # number of entries currently held in `compare`
number_year = 2    # next row to write in column 5 (election year)
count_number = 2   # next row to write in columns 1-3

for url_new in end:
    if number == 41:
        break

    with urlopen(url_new) as file:
        content1 = file.read().decode()

    # Follow the first target="_blank" link on the profile page.
    # NOTE(review): raises IndexError if the page has no such link.
    result_new_url = findall(pattern_next_url, content1)
    with urlopen(result_new_url[0]) as file_enter:
        file_enter_url = file_enter.read().decode()

    # Group 1 of each match is the field value. The first match goes to
    # column 1, the second to column 3 and the third to column 2.
    message = findall(pattern_name, file_enter_url)
    ws.cell(row=count_number, column=1, value=message[0][1])
    ws.cell(row=count_number, column=3, value=message[1][1])
    ws.cell(row=count_number, column=2, value=message[2][1])
    count_number += 1
    wb.save(fn)

    result_message = findall(pattern_message, content1)
    result_message_year = findall(pattern_message_year, content1)
    number += 1

    # University: every "毕业于...大学" hit advances the column-4 row counter,
    # but only names 4 to 9 characters long are actually written.
    # NOTE(review): a page with no hit does not advance `count`, so column 4
    # can drift out of step with columns 1-3.
    for i in result_message:
        for j in i:
            study = findall(pattern_study, j)
            # The loop variable was originally named `end`, rebinding the URL
            # list that the outer loop is iterating; it is renamed to avoid that.
            for school in study:
                endd = school + "大学"
                if count == 40:
                    break
                count += 1
                if 4 <= len(endd) < 10:
                    ws.cell(row=count, column=4, value=endd)
                    wb.save(fn)

    # Election year: scan the captured groups of the first match for
    # "<4 digits>年当选". Matches are buffered in `compare`; once two are
    # buffered the older one is written with its 3-character "年当选" suffix
    # removed. If the two are equal both are dropped, otherwise the newer one
    # stays buffered for the next comparison.
    # NOTE(review): raises IndexError if pattern_message_year found nothing.
    for year in result_message_year[0]:
        enter = findall(pattern_enter, year)
        enter_year = "".join(enter)
        if not enter_year:
            continue
        compare.append(enter_year)
        count_year += 1
        if count_year == 2:
            ws.cell(row=number_year, column=5, value=compare[0][:-3])
            number_year += 1
            wb.save(fn)
            if compare[0] == compare[1]:
                compare = []
                count_year = 0
            else:
                del compare[0]
                count_year -= 1
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。