import prestodb
import pandas as pd
import numpy as np
import math
import pymysql
import pymssql
import random  # needed for random.choice(headers_lists) below
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display
from urllib import parse
import datetime
import xlwings as xw
import matplotlib.pyplot as plt
import chinese_calendar
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import requests
from scipy.stats import f_oneway
from bs4 import BeautifulSoup
import re
import time
headers_lists =(
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/65.0.3325.181 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/adr-1107051709; U; zh-cn) Presto/2.8.149 Version/11.10',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Android; Linux armv7l; rv:5.0) Gecko/ Firefox/5.0 fennec/5.0',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',)
# Scrape historical weather data (lishi.tianqi.com)
#url = 'https://lishi.tianqi.com/wujiang/202201.html'
headers = {'User-Agent': random.choice(headers_lists),
           'Cookie': 'lianjia_uuid=9d3277d3-58e4-440e-bade-5069cb5203a4; UM_distinctid=16ba37f7160390-05f17711c11c3e-454c0b2b-100200-16ba37f716618b; _smt_uid=5d176c66.5119839a; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22%24device_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga=GA1.2.1772719071.1561816174; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1561822858; _jzqa=1.2532744094467475000.1561816167.1561822858.1561870561.3; CNZZDATA1253477573=987273979-1561811144-%7C1561865554; CNZZDATA1254525948=879163647-1561815364-%7C1561869382; CNZZDATA1255633284=1986996647-1561812900-%7C1561866923; CNZZDATA1255604082=891570058-1561813905-%7C1561866148; _qzja=1.1577983579.1561816168942.1561822857520.1561870561449.1561870561449.1561870847908.0.0.0.7.3; select_city=110000; lianjia_ssid=4e1fa281-1ebf-e1c1-ac56-32b3ec83f7ca; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMzQ2MDU5ZTQ0OWY4N2RiOTE4NjQ5YmQ0ZGRlMDAyZmFhODZmNjI1ZDQyNWU0OGQ3MjE3Yzk5NzFiYTY4ODM4ZThiZDNhZjliNGU4ODM4M2M3ODZhNDNiNjM1NzMzNjQ4ODY3MWVhMWFmNzFjMDVmMDY4NWMyMTM3MjIxYjBmYzhkYWE1MzIyNzFlOGMyOWFiYmQwZjBjYjcyNmIwOWEwYTNlMTY2MDI1NjkyOTBkNjQ1ZDkwNGM5ZDhkYTIyODU0ZmQzZjhjODhlNGQ1NGRkZTA0ZTBlZDFiNmIxOTE2YmU1NTIxNzhhMGQ3Yzk0ZjQ4NDBlZWI0YjlhYzFiYmJlZjJlNDQ5MDdlNzcxMzAwMmM1ODBlZDJkNmIwZmY0NDAwYmQxNjNjZDlhNmJkNDk3NGMzOTQxNTdkYjZlMjJkYjAxYjIzNjdmYzhiNzMxZDA1MGJlNjBmNzQxMTZjNDIzNFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIzMGJlNDJiN1wifSIsInIiOiJodHRwczovL2JqLmxpYW5qaWEuY29tL3p1ZmFuZy9yY28zMS8iLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='}

def set_link(year):
    # year: the year whose twelve monthly pages we want to scrape
    link = []
    for i in range(1, 13):
        if i < 10:
            url = 'https://lishi.tianqi.com/shanghai/{}0{}.html'.format(year, i)
        else:
            url = 'https://lishi.tianqi.com/shanghai/{}{}.html'.format(year, i)
        link.append(url)
    return link

def get_page(url, headers):
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        html.encoding = html.apparent_encoding
        return html.text
    else:
        return None

date_box = []
max_temp = []
min_temp = []
weh = []
wind = []
week_box = []

def get_data(years):
    link = set_link(years)
    for url in link:
        html = get_page(url, headers)
        bs = BeautifulSoup(html, 'html.parser')
        data = bs.find_all(class_='thrui')
        date = re.compile('class="th200">(.*?)</')
        tem = re.compile('class="th140">(.*?)</')
        time = re.findall(date, str(data))
        for item in time:
            week = item[10:]
            week_box.append(week)
            date_box.append(item[:10])
        temp = re.findall(tem, str(data))
        # Each day contributes four th140 cells (max temp, min temp, weather,
        # wind). An earlier version only handled June 2019 and ignored that
        # months have different day counts; indexing by the number of parsed
        # dates fixes that.
        for i in range(len(time)):
            max_temp.append(temp[i*4+0])
            min_temp.append(temp[i*4+1])
            weh.append(temp[i*4+2])
            wind.append(temp[i*4+3])

get_data(2022)
get_data(2023)
datas = pd.DataFrame({'日期': date_box, '星期': week_box, '最高温度': max_temp,
                      '最低温度': min_temp, '天气': weh, '风向': wind})
df_climate = datas.drop_duplicates()
# Scrape weather from a backup source (tianqi.2345.com) to fill gaps
#url = 'https://lishi.tianqi.com/wujiang/202201.html'
headers = {'User-Agent': random.choice(headers_lists),
           'Cookie': 'lianjia_uuid=9d3277d3-58e4-440e-bade-5069cb5203a4; UM_distinctid=16ba37f7160390-05f17711c11c3e-454c0b2b-100200-16ba37f716618b; _smt_uid=5d176c66.5119839a; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22%24device_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga=GA1.2.1772719071.1561816174; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1561822858; _jzqa=1.2532744094467475000.1561816167.1561822858.1561870561.3; CNZZDATA1253477573=987273979-1561811144-%7C1561865554; CNZZDATA1254525948=879163647-1561815364-%7C1561869382; CNZZDATA1255633284=1986996647-1561812900-%7C1561866923; CNZZDATA1255604082=891570058-1561813905-%7C1561866148; _qzja=1.1577983579.1561816168942.1561822857520.1561870561449.1561870561449.1561870847908.0.0.0.7.3; select_city=110000; lianjia_ssid=4e1fa281-1ebf-e1c1-ac56-32b3ec83f7ca; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMzQ2MDU5ZTQ0OWY4N2RiOTE4NjQ5YmQ0ZGRlMDAyZmFhODZmNjI1ZDQyNWU0OGQ3MjE3Yzk5NzFiYTY4ODM4ZThiZDNhZjliNGU4ODM4M2M3ODZhNDNiNjM1NzMzNjQ4ODY3MWVhMWFmNzFjMDVmMDY4NWMyMTM3MjIxYjBmYzhkYWE1MzIyNzFlOGMyOWFiYmQwZjBjYjcyNmIwOWEwYTNlMTY2MDI1NjkyOTBkNjQ1ZDkwNGM5ZDhkYTIyODU0ZmQzZjhjODhlNGQ1NGRkZTA0ZTBlZDFiNmIxOTE2YmU1NTIxNzhhMGQ3Yzk0ZjQ4NDBlZWI0YjlhYzFiYmJlZjJlNDQ5MDdlNzcxMzAwMmM1ODBlZDJkNmIwZmY0NDAwYmQxNjNjZDlhNmJkNDk3NGMzOTQxNTdkYjZlMjJkYjAxYjIzNjdmYzhiNzMxZDA1MGJlNjBmNzQxMTZjNDIzNFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIzMGJlNDJiN1wifSIsInIiOiJodHRwczovL2JqLmxpYW5qaWEuY29tL3p1ZmFuZy9yY28zMS8iLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='}
url = 'https://tianqi.2345.com/wea_history/58362.htm'

def get_page(url, headers):
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        html.encoding = html.apparent_encoding
        return html.text
    else:
        return None

html = get_page(url, headers)
bs = BeautifulSoup(html, 'html.parser')
list_final = []
for i in bs.find("table", class_="history-table").find_all('tr'):
    if len(i.find_all('td')) > 0:
        list_ = []
        for i_info in i.find_all('td'):
            # the first cell holds "date weekday"; split it into two columns
            if len(i_info.text.split(' ')) == 2:
                list_.append(i_info.text.split(' ')[0])
                list_.append(i_info.text.split(' ')[1].replace('周', '星期'))
            else:
                list_.append(i_info.text)
        list_final.append(list_)
df_climate_backup = pd.DataFrame(list_final, columns=['日期', '星期', '最高温度', '最低温度', '天气', '风向', '空气质量指数', '空气质量等级'])
df_climate_backup = df_climate_backup[['日期', '星期', '最高温度', '最低温度', '天气', '风向']]
for i_date in df_climate_backup['日期']:
if i_date not in df_climate['日期'].tolist():
df_climate = pd.concat([df_climate,df_climate_backup[df_climate_backup['日期']==i_date]])
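As an aside, the row-by-row backfill above can be collapsed into one vectorized step; a minimal equivalent sketch:

# take only the backup rows whose dates are missing from df_climate
missing = df_climate_backup[~df_climate_backup['日期'].isin(df_climate['日期'])]
df_climate = pd.concat([df_climate, missing])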
df_climate = df_climate.reset_index().drop('index',axis=1)
df_climate['城市'] = '上海'
df_climate
# Scrape weather alert bulletins (tianqi.com alarm news)
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd

#url = 'https://lishi.tianqi.com/wujiang/202201.html'
headers = {'User-Agent': random.choice(headers_lists),
           'Cookie': 'lianjia_uuid=9d3277d3-58e4-440e-bade-5069cb5203a4; UM_distinctid=16ba37f7160390-05f17711c11c3e-454c0b2b-100200-16ba37f716618b; _smt_uid=5d176c66.5119839a; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22%24device_id%22%3A%2216ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga=GA1.2.1772719071.1561816174; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1561822858; _jzqa=1.2532744094467475000.1561816167.1561822858.1561870561.3; CNZZDATA1253477573=987273979-1561811144-%7C1561865554; CNZZDATA1254525948=879163647-1561815364-%7C1561869382; CNZZDATA1255633284=1986996647-1561812900-%7C1561866923; CNZZDATA1255604082=891570058-1561813905-%7C1561866148; _qzja=1.1577983579.1561816168942.1561822857520.1561870561449.1561870561449.1561870847908.0.0.0.7.3; select_city=110000; lianjia_ssid=4e1fa281-1ebf-e1c1-ac56-32b3ec83f7ca; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMzQ2MDU5ZTQ0OWY4N2RiOTE4NjQ5YmQ0ZGRlMDAyZmFhODZmNjI1ZDQyNWU0OGQ3MjE3Yzk5NzFiYTY4ODM4ZThiZDNhZjliNGU4ODM4M2M3ODZhNDNiNjM1NzMzNjQ4ODY3MWVhMWFmNzFjMDVmMDY4NWMyMTM3MjIxYjBmYzhkYWE1MzIyNzFlOGMyOWFiYmQwZjBjYjcyNmIwOWEwYTNlMTY2MDI1NjkyOTBkNjQ1ZDkwNGM5ZDhkYTIyODU0ZmQzZjhjODhlNGQ1NGRkZTA0ZTBlZDFiNmIxOTE2YmU1NTIxNzhhMGQ3Yzk0ZjQ4NDBlZWI0YjlhYzFiYmJlZjJlNDQ5MDdlNzcxMzAwMmM1ODBlZDJkNmIwZmY0NDAwYmQxNjNjZDlhNmJkNDk3NGMzOTQxNTdkYjZlMjJkYjAxYjIzNjdmYzhiNzMxZDA1MGJlNjBmNzQxMTZjNDIzNFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIzMGJlNDJiN1wifSIsInIiOiJodHRwczovL2JqLmxpYW5qaWEuY29tL3p1ZmFuZy9yY28zMS8iLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='}

def get_climate(x):
    url = 'https://www.tianqi.com/alarmnews_02/{0}/'.format(x)
    def get_page(url, headers):
        html = requests.get(url, headers=headers)
        if html.status_code == 200:
            html.encoding = html.apparent_encoding
            return html.text
        else:
            # pause, then retry once without the trailing slash
            time.sleep(1)
            url = 'https://www.tianqi.com/alarmnews_02/{0}'.format(x)
            print('time sleep')
            html = requests.get(url, headers=headers)
            if html.status_code == 200:
                return html.text
    html = get_page(url, headers)
    bs = BeautifulSoup(html, 'html.parser')
    return bs

list_alarm = []
list_timestamp = []
for i_x in range(1, 50):
    try:
        bs = get_climate(i_x)
        for i_index in bs.find('ul', class_='leftlist').find_all('li'):
            # the alert text sits either in the img alt text or in the link text
            if len(i_index.find('img').text) > 0:
                alarm_ = i_index.find('img').text.split('发布')[1]
                list_alarm.append(alarm_)
                list_timestamp.append(i_index.find('span').text.split(' ')[0])
            if len(i_index.find('img').text) == 0:
                time_str = i_index.find('span').text
                alarm_ = i_index.find('a').text.split('发布')[1].replace('\n', '').replace(time_str, '')
                time_str = time_str.split(' ')[0]
                list_alarm.append(alarm_)
                list_timestamp.append(time_str)
    except:
        pass
df_climate_alarm = pd.DataFrame([list_alarm,list_timestamp]).T.drop_duplicates()
df_climate_alarm.columns = ['Alarm_info','date']
df_climate_alarm = df_climate_alarm.groupby('date').agg({'Alarm_info':lambda x:x.str.cat(sep='/')}).reset_index()
df_climate_alarm
cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df_climate_alarm['unic_version'] = cur_time
return_to_mysql(df_climate_alarm,'raw_climate_alarm_unic')
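return_to_mysql is called here and again later but never defined in the post; a minimal sketch of what it plausibly does, built on the create_engine and urllib.parse imports from the top. The DSN, credentials, and if_exists policy are all assumptions:

def return_to_mysql(df, table_name):
    # hypothetical connection string -- replace host/user/password/database
    engine = create_engine('mysql+pymysql://user:%s@127.0.0.1:3306/db?charset=utf8mb4'
                           % parse.quote_plus('password'))
    # append (assumed) keeps earlier unic_version snapshots alongside new ones
    df.to_sql(table_name, engine, if_exists='append', index=False)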
from datetime import date
from workalendar.asia import China

# calendar with China's statutory holidays and shifted workdays
cal = China()

# date range to generate
start_date = date(2021, 10, 1)
end_date = date(2023, 12, 31)

# build one record per day; DataFrame.append was removed in pandas 2.x,
# so collect the rows in a list and construct the DataFrame once
rows = []
for dt in pd.date_range(start=start_date, end=end_date):
    dt = dt.date()
    is_work = cal.is_working_day(dt)
    is_holi = not is_work and cal.is_holiday(dt)
    holi_name = cal.get_holiday_label(dt) if is_holi else ""
    rows.append({"date": dt, "is_workday": is_work,
                 "is_holiday": is_holi, "holiday_name": holi_name})
df = pd.DataFrame(rows, columns=["date", "is_workday", "is_holiday", "holiday_name"])

# convert a date string to its weekday number (Monday = 0)
from datetime import datetime
def get_weekday(date_str):
    date_obj = datetime.strptime(str(date_str), "%Y-%m-%d")
    return date_obj.weekday()

# apply get_weekday to every date to produce a weekday column
df["weekday"] = df["date"].apply(get_weekday)

def is_holiday_(x):
    if str(x) == 'False':
        return 'holiday'
    if str(x) == 'True':
        return 'workday'

df["weekday_cat"] = df.apply(lambda x: is_holiday_(x['is_workday']), axis=1)
df_workday = df.drop_duplicates()
df_workday['date'] = pd.to_datetime(df_workday['date'])
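A quick sanity check on the calendar (assuming the installed workalendar release ships China's holiday tables through 2023; the expected outputs are my reading of that data, not from the original post):

# 2023-10-01 is National Day: expect is_working_day() to be False
print(cal.is_working_day(date(2023, 10, 1)))
# get_holiday_label() should name the holiday (None for ordinary days)
print(cal.get_holiday_label(date(2023, 10, 1)))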
import datetime
df_climate['日期'] = pd.to_datetime(df_climate['日期'])
df_workday['date'] = pd.to_datetime(df_workday['date'])
df_climate_alarm['date'] = pd.to_datetime(df_climate_alarm['date'])
df_climate_workday = pd.merge(df_climate,df_workday,left_on='日期',right_on='date',how='left')
df_climate_workday = pd.merge(df_climate_workday,df_climate_alarm.rename(columns={'date':'date_alarm'}),left_on='日期',right_on='date_alarm',how='left')
index_Shutdown = df_climate_workday[(df_climate_workday['日期']>=('2022-03-18')) &(df_climate_workday['日期']<('2022-04-27'))].index
for i_index in index_Shutdown:
df_climate_workday.loc[i_index,'weekday_cat']='lockdown day'
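The per-row loop above works at this scale; pandas can also stamp the whole slice in a single assignment, which does the same thing:

# equivalent one-line version of the labeling loop
df_climate_workday.loc[index_Shutdown, 'weekday_cat'] = 'lockdown day'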
def get_key_alarm(x):
    # flag any alert mentioning thunderstorms, fog, typhoons, or rainstorms
    if pd.isna(x) == False:
        if x.find('雷电') >= 0:
            return 'Abnormal climate'
        if x.find('雾') >= 0:
            return 'Abnormal climate'
        if x.find('台风') >= 0:
            return 'Abnormal climate'
        if x.find('暴雨') >= 0:
            return 'Abnormal climate'
        else:
            return 'Normal climate'
df_climate_workday['Alarm_info_cat'] = df_climate_workday['Alarm_info'].map(lambda x:get_key_alarm(x))
df_climate_workday = df_climate_workday.drop('date',axis=1)
df_climate_workday = df_climate_workday.rename(columns={'日期':'Date','星期':'Week','最高温度':'maximum_temperature','最低温度':'minimum_temperature','天气':'climate','风向':'wind_direction','城市':'city'})
df_climate_workday = df_climate_workday[['Date', 'Week', 'maximum_temperature', 'minimum_temperature', 'climate',
'wind_direction', 'city', 'is_workday', 'is_holiday', 'holiday_name',
'weekday', 'weekday_cat', 'date_alarm', 'Alarm_info','Alarm_info_cat', 'unic_version'
]]
# df_climate_workday['unic_version'] = cur_time
# return_to_mysql(df_climate_workday,'raw_climate_workday_unic')
def Connect_MySQL_Get_data(sql):
    # ************** omitted
    return data
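The connection body is elided in the original; for completeness, a minimal sketch of such a reader, assuming pymysql with placeholder credentials:

def Connect_MySQL_Get_data(sql):
    # hypothetical credentials -- substitute your own
    conn = pymysql.connect(host='127.0.0.1', user='user', password='password',
                           database='db', charset='utf8mb4')
    try:
        data = pd.read_sql(sql, conn)
    finally:
        conn.close()
    return data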
df_climate_workday = Connect_MySQL_Get_data(sql="""
""")
df_climate_alarm = Connect_MySQL_Get_data(sql="""
""")
df_replen = Get_df_source('2022-01-21','2023-05-15')
df_replen_open = Get_df_source(s="""
""")