Task steps
Library setup: the basic Python libraries, plus BeautifulSoup, Selenium, and pyecharts (requests and jieba are also used in the code below).
Tools: Chrome and Jupyter Notebook.
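If the environment does not have these libraries yet, they can be installed directly from a notebook cell; the package names below are the usual PyPI names and are assumed here rather than taken from the original text.

# Run once in a Jupyter cell (lxml is needed because BeautifulSoup is created with the 'lxml' parser):
%pip install requests beautifulsoup4 lxml selenium pyecharts jieba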
import requests
from bs4 import BeautifulSoup

def getDATA(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    # User-Agent is the browser's identification string, the first layer of identity in a request.
    # If only one User-Agent is used, the site can still flag the same "browser" making frequent requests
    # as a crawler, so it is common to rotate several User-Agent strings at random instead of sending
    # the same request header for a long time.
    source_file = r"C://**"   # local file to save the page HTML to (fill in a real path)
    try:
        params = {
            'keyword': 'pro'  # search keyword
        }
        r = requests.get(url=url, headers=headers, params=params)
        # requests.get takes: url (required), headers (optional), params (optional),
        # proxies (optional, proxy IP) and verify (optional, SSL certificate verification).
        r.raise_for_status()   # raise an exception if the request failed
        r.encoding = 'utf-8'   # set the character encoding
        # print(r.text)
        with open(source_file, 'w', encoding='utf-8') as f1:
            f1.write(r.text)   # save the page source
    except Exception as np:
        print(np)
    # r.raise_for_status() is an easy way to check the state of the request: if the connection fails
    # it raises an exception, which the try-except above catches and prints as np.

def extractDATA():
    source_file = r"C://**/*"    # the saved page source
    compile_file = r"C://***/**" # output file for the extracted results
    soup = BeautifulSoup(open(source_file, 'r', encoding='utf-8'), 'lxml')
    # print(soup.title)
    test = soup.select("ul[class='gl-warp clearfix'] li")   # every product <li> in the listing
    # print(test)
    with open(compile_file, 'w', encoding='utf-8') as f2:
        for x in test:
            w = x.select("div[class='p-price']")[0].get_text().strip()               # price
            a = x.select("div[class='p-name p-name-type-2']")[0].get_text().strip()  # product name
            y = x.select("div[class='p-shop']")[0].get_text().strip()                # shop
            k = x.a.attrs['href'].strip()                                            # product link
            s = '名称:' + a + '\n价格:' + w + '\n店铺:' + y + '\n链接:' + 'http:' + k + '\n----------------------------'
            print(s, '\n')
            f2.write(s)

def main():
    url = ''          # the listing page URL goes here
    getDATA(url)      # download the listing page
    extractDATA()     # parse it with CSS selectors

if __name__ == '__main__':
    main()
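The comment in getDATA suggests rotating several User-Agent strings rather than reusing one. A minimal sketch of that idea is below; the USER_AGENTS list and the build_headers helper are illustrative placeholders, not part of the original script.

import random

# A small pool of User-Agent strings; these example values are placeholders, not a vetted list.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0',
]

def build_headers():
    # Pick a different User-Agent at random for each request.
    return {'User-Agent': random.choice(USER_AGENTS)}

getDATA could then call requests.get(url=url, headers=build_headers(), params=params) so that each request goes out with a different header.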
The CSS selectors are the same as above; the version below extracts the same fields with regular expressions instead.
import re

def getData():
    pass

def extractData(html_path, result_list):
    with open(html_path, 'r', encoding='UTF-8') as f:
        html = f.read()
    # print(html)

    # Price: the text inside the <i> tag of each p-price div.
    r_price = r'<div class="p-price">.*?<i.*?>(.*?)</i>.*?</div>'
    price_list = re.findall(r_price, html, re.S)
    print(price_list)

    # Product link: the href of the <a> inside each p-name div.
    r_url = r'<div class="p-name.*?"><a.*?href="(.*?)">.*?</div>'
    url_list = re.findall(r_url, html, re.S)
    url_list = ["https:" + url for url in url_list]
    print(url_list)

    # Product description: the <em> contents, with any remaining tags stripped out.
    r_desc = r'<div class="p-name.*?">.*?<em>(.*?)</em>.*?</div>'
    desc_list = re.findall(r_desc, html, re.S)
    r_sub = r'<.*?>'
    # flags must be passed by keyword: the fourth positional argument of re.sub is count, not flags.
    desc_list = [re.sub(r_sub, '', desc, flags=re.S) for desc in desc_list]
    print(desc_list)

    dengxian = open("C://**", 'w', encoding="utf-8")
    for item in zip(price_list, url_list, desc_list):
        print(item)
        print(item, file=dengxian)
    dengxian.close()

def saveResults():
    pass

def main():
    html_path = r'source_file'
    r_list = list()
    # getData
    extractData(html_path, r_list)

if __name__ == '__main__':
    main()
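The re.S (DOTALL) flag matters because the product divs span several lines in the saved page source. A self-contained toy example of the difference is below; the sample HTML string is invented for illustration and is much simpler than a real listing page.

import re

# A toy page fragment; the tag spans several lines, as it would in real page source.
sample = '<div class="p-price">\n  <i>\n    199.00\n  </i>\n</div>'

pattern = r'<div class="p-price">.*?<i.*?>(.*?)</i>'
print(re.findall(pattern, sample))         # [] -- without re.S, '.' stops at newlines
print(re.findall(pattern, sample, re.S))   # ['\n    199.00\n  '] -- re.S lets '.' cross line breaks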
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def scrollDown(driver, step=10):
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    # Scroll down step by step so lazily loaded content gets rendered.
    for i in range(1, total_height - 1200, step):
        driver.execute_script("window.scrollTo(0, {});".format(i))
        time.sleep(0.005)

def getComments(url):
    # The old find_element_by_* helpers are deprecated in Selenium 4, so By locators are used throughout.
    driver = webdriver.Chrome()     # drive Chrome
    driver.get(url)                 # open the product page
    driver.maximize_window()        # maximize the window
    time.sleep(3)                   # give the page time to load
    comment_btn = driver.find_element(By.XPATH, '//li[@data-anchor="#comment"]')  # the tab we need (comments)
    comment_btn.click()             # switch to the comments tab
    time.sleep(3)
    scrollDown(driver)
    with open(r'./***', 'a+', encoding='utf-8') as f:
        for i in range(1, 5):       # number of comment pages to crawl
            comments = driver.find_elements(By.CLASS_NAME, 'comment-item')
            for comment in comments:
                user_info = comment.find_element(By.CLASS_NAME, 'user-info').text
                text_info = comment.find_element(By.CLASS_NAME, 'comment-con').text
                star_info = comment.find_element(By.XPATH, './/div[@class="comment-column J-comment-column"]/div').get_attribute('class').strip()[-5:]
                days_info = comment.find_element(By.XPATH, './/div[@class="order-info"]/span[last()]').text
                xian = comment.find_element(By.CLASS_NAME, 'user-level').text
                if xian == "PLUS会员":
                    print('\n' "---------------" '\n名称', user_info, '\n会员状态:', xian, '\n评论', text_info, '\n星级', star_info, '\n评论时间', days_info)
                    f.writelines('\n' "---------------" '\n名称:' + user_info + '\n会员状态:' + xian + '\n评论:' + text_info + '\n星级:' + star_info + '\n评论时间:' + days_info)
                else:
                    print('\n' "---------------" '\n名称', user_info, '\n会员状态:', "普通会员", '\n评论', text_info, '\n星级', star_info, '\n评论时间', days_info)
                    f.writelines('\n' "---------------" '\n名称:' + user_info + '\n会员状态:' + "普通会员" + '\n评论:' + text_info + '\n星级:' + star_info + '\n评论时间:' + days_info)
            next_btn = driver.find_element(By.XPATH, './/a[@class="ui-pager-next"]')  # the "next page" button
            next_btn.click()
            time.sleep(3)
            scrollDown(driver)

def main():
    url = r'https://item.jd.com/100021707422.html'
    getComments(url)

if __name__ == '__main__':
    main()
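The fixed time.sleep(3) pauses work, but they waste time when the page loads faster and fail when it loads slower. A sketch of an alternative using Selenium's explicit waits (WebDriverWait plus expected_conditions) is below; it is an optional refinement, not part of the original script, and open_comment_tab is a name introduced here for illustration.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def open_comment_tab(driver, timeout=10):
    # Wait until the comment tab is clickable (up to `timeout` seconds) instead of sleeping a fixed 3 s.
    wait = WebDriverWait(driver, timeout)
    comment_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//li[@data-anchor="#comment"]')))
    comment_btn.click()
    # Then wait until at least one comment item is present before scraping.
    wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'comment-item')))

getComments could call open_comment_tab(driver) in place of the click plus time.sleep(3) pair.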
# jieba is used to tokenize the comment text; the word frequencies are then rendered with pyecharts.
# (The original note also mentions pandas for xls data sources, but pandas is not used in this script.)
from pyecharts.charts import WordCloud
import pyecharts.options as opts
import jieba

def getWord(word_list):
    with open(r'C://**', encoding='utf-8') as f:
        txt = f.read()   # read the comment file
    re_move = [",", "。", " ", '\n', '\xa0', ':', '*', '/d']   # characters treated as noise
    for i in re_move:
        txt = txt.replace(i, " ")   # replace each noise character with a space
    word_lists = jieba.lcut(txt)    # tokenize the text with jieba in precise mode
    getWordFreq(word_lists, word_count={})   # count each word in a dict of word -> occurrences

def getWordFreq(word_list, word_count):
    # readlines() reads every line of the stop-word file; strip() removes surrounding whitespace.
    stopwords = [line.strip() for line in open('A:\\1\评论1.txt', encoding='utf-8').readlines()]
    # sjstopwords = [line.strip() for line in open('B:\\3206574001\FileRecv\hit_stopwords.txt', encoding='utf-8').readlines()]
    for word in word_list:
        if word not in stopwords:
            # if word not in sjstopwords:
            if len(word) == 1:   # single-character tokens are not counted
                continue
            word_count[word] = word_count.get(word, 0) + 1   # add 1 every time the word appears
    items = list(word_count.items())
    items.sort(key=lambda x: x[1], reverse=True)   # sort words by frequency, descending
    print(items)
    drawWordCLoud(word_count)

def drawWordCLoud(word_count):
    wCloud = WordCloud()
    # The commented-out block in the original configured the standalone `wordcloud` library
    # (font, background color, mask image, max_words, max_font_size, prefer_horizontal);
    # this script renders the cloud with pyecharts instead.
    wCloud.add(
        series_name='评论',
        data_pair=list(word_count.items()),
        shape='pentagon'
    )
    wCloud.set_global_opts(title_opts=opts.TitleOpts(title="高频评论词"))   # chart title
    wCloud.render(r'wordCloud.html')   # write the word cloud to an HTML file
    print("已成功构建 wordCloud.html 文件,请访问它以查看最终效果。")

def main():
    word_file = open('C:/**', encoding="utf-8")   # open the comment file
    word_list = word_file.read()   # read() with no size argument reads the whole file
    # print(word_list)
    # word_count = {}
    getWord(word_list)
    # getWordFreq(word_list, word_count)

if __name__ == '__main__':
    main()
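To make the key/value counting in getWordFreq concrete, here is a tiny standalone example of the word_count.get(word, 0) + 1 pattern; the token list is invented for illustration.

word_list = ['质量', '很', '好', '质量', '不错', '好']
word_count = {}
for word in word_list:
    if len(word) == 1:    # skip single-character tokens, as in getWordFreq
        continue
    word_count[word] = word_count.get(word, 0) + 1   # add 1 each time a word reappears

print(word_count)   # {'质量': 2, '不错': 1}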