赞
踩
代码仅仅是示例,主要演示相关方法的使用;由于目标是韩国论坛,需要代理才能访问。欢迎留言讨论。
import threading
import requests
from bs4 import BeautifulSoup
import base64
import sqlite3
import re
import datetime
from multiprocessing.dummy import Pool as ThreadPool

# Shared SQLite connection. check_same_thread=False is required because the
# worker threads created in startUp() all write through this one connection;
# db_lock serializes every execute/commit since sqlite3 objects are not
# thread-safe even with the check disabled.
conn = sqlite3.connect('reptile.db', check_same_thread=False)
c = conn.cursor()
db_lock = threading.Lock()

# "YYYY-MM-DD HH:MM" timestamp pattern, compiled once instead of per article.
_TIME_RE = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}")

# Per-request timeout (seconds) so a hung connection cannot deadlock a
# pool worker forever. NOTE(review): the site is stated to need a proxy;
# configure requests proxies externally if so.
_TIMEOUT = 30


def getBBS(page):
    """Scrape one list page of the ppomppu free board and store its articles.

    For every row on list page ``page`` this fetches the article body,
    rewrites each <img> src into an inline base64 data URI so the stored
    HTML is self-contained, and inserts one record into the ``getData``
    table. Errors on a single article are printed and skipped so one bad
    row does not abort the whole page.

    :param page: 1-based board page number.
    """
    url = ('https://www.ppomppu.co.kr/zboard/zboard.php'
           '?id=freeboard&hotlist_flag=999&page=' + str(page))
    response = requests.get(url, timeout=_TIMEOUT)
    soup = BeautifulSoup(response.content, 'html.parser')

    for article in soup.select('.list0,.list1'):
        try:
            author = article.select_one('.list_name').text.strip()
            title = article.select_one('.list_title').text.strip()
            cells = article.find_all('td', class_='eng list_vspace')
            uid = cells[0].text.strip()
            timestamp = cells[1].get('title')
            content_url = ('https://www.ppomppu.co.kr/zboard/' +
                           article.find_all('a')[1].get('href'))

            content_response = requests.get(content_url, timeout=_TIMEOUT)
            content_soup = BeautifulSoup(content_response.content,
                                         'html.parser')

            date_text = content_soup.find(
                'div', class_='sub-top-text-box').get_text()
            match = _TIME_RE.search(date_text)
            if match:
                date = match.group(0)
            else:
                # Fall back to "now", formatted exactly like a matched
                # timestamp so textTime always holds one string format
                # (the original stored a raw datetime object here).
                date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

            # The third pic_bg table holds the post body on this layout.
            # TODO confirm the index is stable across board skins.
            content_element = content_soup.find_all('table',
                                                    class_='pic_bg')[2]

            for image in content_element.find_all('img'):
                image_url = image['src']
                # Only protocol-relative ("//host/...") srcs need a scheme;
                # the original prefixed unconditionally and broke absolute URLs.
                if image_url.startswith('//'):
                    image_url = 'https:' + image_url
                image_response = requests.get(image_url, timeout=_TIMEOUT)
                base64_data = base64.b64encode(
                    image_response.content).decode()
                image['src'] = 'data:image/png;base64,' + base64_data

            content = str(content_element)
            print('作者:', author)
            print('标题:', title)
            print('发布时间:', timestamp)

            # Serialize access to the shared connection/cursor.
            with db_lock:
                c.execute(
                    "INSERT OR IGNORE INTO getData "
                    "(dataID,textID,dataName,textUrl,textTitle,textTime,"
                    "textBody,textState,textName,regTime,EncodingStr) "
                    "VALUES(:dataID,:textID,:dataName,:textUrl,:title,"
                    ":textTime,:textBody,:textState,:author,:regTime,"
                    ":EncodingStr)",
                    {"dataID": '1', "textID": uid, "dataName": 'ppomppu',
                     'textUrl': content_url, 'title': title,
                     'textTime': date, 'textBody': content,
                     'textState': '1', 'author': author,
                     'regTime': timestamp, 'EncodingStr': 'EUC-KR'})
                conn.commit()
        except Exception as exc:
            # Best-effort crawl: report and move on to the next article.
            print('skip article on page', page, '->', exc)


def startUp():
    """Crawl pages 1..9999 with a 10-thread pool, then drain the pool."""
    num_threads = 10
    pages = range(1, 10000)
    pool = ThreadPool(num_threads)
    pool.map(getBBS, pages)
    pool.close()
    pool.join()


if __name__ == '__main__':
    try:
        startUp()
    finally:
        # Close only when run as a script; the original closed at module
        # level, which would kill the connection on a mere import.
        conn.close()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。