```python
import re
import time

import pymongo
import requests
from lxml import etree as et


# Normalize the scraped publish time into a uniform "YYYY-MM-DD HH:MM"-style string
def parse_time(datetime):
    if datetime is None:
        return datetime
    if re.match(r'\d+年\d+月\d+日.+', datetime):
        # e.g. "2021年12月31日 10:30" -> "2021-12-31 10:30"
        # str.replace() returns a new string, so the result must be reassigned
        datetime = datetime.replace("年", "-").replace("月", "-").replace("日", "")
    if re.match(r'\d+月\d+日.+', datetime):
        # e.g. "12月31日 10:30" -> prepend the current year -> "2022-12-31 10:30"
        datetime = datetime.replace("月", "-").replace("日", "")
        datetime = time.strftime('%Y-', time.localtime(time.time())) + datetime
    if re.match(r'\d+分钟前', datetime):
        # "N minutes ago": subtract the elapsed seconds from time.time() to recover
        # the publish timestamp (float seconds since the 1970 epoch)
        minute = re.match(r'(\d+)', datetime).group(1)
        datetime = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
    if re.match(r'\d+秒前', datetime):
        # "N seconds ago": same idea as above
        second = re.match(r'(\d+)', datetime).group(1)
        datetime = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(second)))
    if re.match(r'今天.*', datetime):
        # "today HH:MM": prepend today's date
        temp = re.match(r"今天(\d+:\d+)", datetime).group(1)
        datetime = time.strftime('%Y-%m-%d ', time.localtime()) + temp
    return datetime


# Start the crawler
def start_crawl(base_url, pages):
    headers = request_header_article()  # request-header helper, defined elsewhere in the project
    # Iterate over the search time windows
    years = [2021, 2022]
    for year in years:
        months = [*range(1, 13)] if year != 2022 else [1]  # for 2022 only January is crawled
        days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
        for num, month in enumerate(months):
            for day in range(1, days[num], 7):
                day_end = day + 6 if day + 6 < 31 else 30
                print("Spider is crawling the period {}-{}-{} to {}-{}-{}!".format(
                    year, month, day, year, month, day_end))
                time.sleep(1)
                for page in range(1, pages + 1):
                    url = base_url.format(year_start=year, month_start=month, day_start=day,
                                          year_end=year, month_end=month, day_end=day_end,
                                          page=page)
                    response = requests.get(url=url, headers=headers, timeout=(3, 7))
                    if response.status_code == 200:
                        res = et.HTML(response.text)
                        print("Spider finished crawling page {}!".format(page))
                        yield res
                    else:
                        continue


# Save the cleaned data into MongoDB
def save_mongoDB(base_url, pages):
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client.HotSpotBD
    table = db['HotSpotInfo']
    for insert_list in data_clean(base_url, pages):  # data_clean() is defined elsewhere in the project
        table.insert_many(insert_list)
        print("Inserted one page of data!")
    client.close()
```
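The crawler is driven by calling save_mongoDB with a search URL template and the number of result pages to request per weekly window. The sketch below is a hypothetical entry point, not the project's actual one: the base_url value is a placeholder that only illustrates the named fields start_crawl fills in via str.format(), and the request_header_article/data_clean helpers are assumed to be defined elsewhere.

```python
# Minimal, hypothetical entry point; the real search URL and the
# request_header_article()/data_clean() helpers are defined elsewhere.
if __name__ == '__main__':
    base_url = ('https://example.com/search?'
                'begin={year_start}-{month_start}-{day_start}'
                '&end={year_end}-{month_end}-{day_end}&page={page}')
    save_mongoDB(base_url, pages=10)  # request up to 10 result pages per weekly window

    # parse_time() normalizes the relative timestamps scraped from result pages, e.g.:
    print(parse_time('5分钟前'))    # -> e.g. '2022-01-01 12:00' (depends on the current time)
    print(parse_time('今天10:30'))  # -> today's date + ' 10:30'
```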
This part of the data is used for data analysis.
This part of the data is used for text sentiment analysis.
Before running sentiment analysis, the text has to be preprocessed: non-Chinese characters are removed, the text is segmented into Chinese words, stop words are filtered out, and word frequencies are counted. A preprocessing sketch follows, and below it is a sample of the data after preprocessing:
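The steps listed above can be sketched roughly as follows. This is a minimal example, not the project's actual cleaning code: the jieba library is one common choice for Chinese segmentation, and the stopwords.txt path and the sample texts are assumptions.

```python
import re
from collections import Counter

import jieba  # assumed segmentation library; any Chinese tokenizer works


def preprocess(text, stopwords):
    # Keep only Chinese characters (drop punctuation, digits, Latin letters, emoji, ...)
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
    # Chinese word segmentation
    words = jieba.lcut(text)
    # Drop stop words and single-character tokens
    return [w for w in words if w not in stopwords and len(w) > 1]


# Hypothetical stop-word file, one word per line
with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

texts = ['这是一条示例微博文本！', '另一条用于演示的文本。']  # placeholder documents
tokens = [w for t in texts for w in preprocess(t, stopwords)]
word_freq = Counter(tokens)  # word-frequency statistics
print(word_freq.most_common(10))
```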
Word cloud generated from the word-frequency statistics:
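A word cloud like the one described can be produced from the frequency counter with the wordcloud package; this is only a sketch, and the font path and output file name below are assumptions.

```python
from wordcloud import WordCloud

# word_freq is the Counter built in the preprocessing step above
wc = WordCloud(
    font_path='simhei.ttf',   # a Chinese-capable font is required; this path is an assumption
    background_color='white',
    width=800,
    height=600,
)
wc.generate_from_frequencies(word_freq)
wc.to_file('wordcloud.png')   # hypothetical output file
```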