赞
踩
# Maoyan Top-100 board scraper.
#
# Opens the board in Chrome, walks every page by clicking the "next page"
# pager item, collects the five text lines of each movie entry and writes
# them to a CSV file under ``data/``.
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# One <dd> per movie entry on the board page.
_MOVIE_ENTRY_XPATH = '//div[@id="app"]/div/div/div/dl/dd'
# Last item of the pager; its text is '下一页' while another page exists.
_LAST_PAGER_ITEM_XPATH = '//div[@id="app"]/div/div/div[last()]/ul/li[last()]'
_NEXT_PAGE_TEXT = '下一页'
_OUTPUT_DIR = 'data'


def get_one_page_data(driver, id_list, movie_list, star_list, movie_time_list, score_list):
    """Append the movie entries of the currently displayed page to the lists.

    Each entry's visible text is split on newlines; the first five lines are
    appended, in order, to ``id_list``, ``movie_list``, ``star_list``,
    ``movie_time_list`` and ``score_list``.

    Args:
        driver: Selenium WebDriver currently showing a board page.
        id_list: Receives line 1 of each entry (saved as the rank column).
        movie_list: Receives line 2 of each entry (movie title column).
        star_list: Receives line 3 of each entry (cast column).
        movie_time_list: Receives line 4 of each entry (release time column).
        score_list: Receives line 5 of each entry (score column).
    """
    # Give the page time to render before querying it.
    time.sleep(1.5)

    columns = (id_list, movie_list, star_list, movie_time_list, score_list)
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3; this form also works on Selenium 3.
    for dd in driver.find_elements(By.XPATH, _MOVIE_ENTRY_XPATH):
        infos_list = str(dd.text).split('\n')
        print(infos_list)
        print('----------------------------------')
        # Pad short entries with '' and append the row as a whole, so a
        # malformed entry can neither raise IndexError nor leave the five
        # lists with different lengths (which would break the DataFrame).
        fields = (infos_list + [''] * len(columns))[:len(columns)]
        for column, value in zip(columns, fields):
            column.append(value)
    print('----------------------------------')


def quit_driver(driver):
    """Wait five seconds, then close the whole browser session."""
    time.sleep(5)
    driver.quit()


def main():
    """Scrape every page of the Maoyan board and save the result as CSV."""
    id_list = []
    movie_list = []
    star_list = []
    movie_time_list = []
    score_list = []

    driver = webdriver.Chrome()
    try:
        driver.get('https://www.maoyan.com/board/4')
        time.sleep(5)

        page_num = 1
        print(f'******************正在爬取第{page_num}页***********************')
        get_one_page_data(driver, id_list, movie_list, star_list, movie_time_list, score_list)

        while True:
            # find_elements returns [] instead of raising when the pager is
            # absent, so a missing pager is treated like "no next page".
            pager_items = driver.find_elements(By.XPATH, _LAST_PAGER_ITEM_XPATH)
            if not pager_items or str(pager_items[0].text) != _NEXT_PAGE_TEXT:
                print('结束爬取,正在保存到csv表中')
                break

            page_num += 1
            pager_items[0].click()
            print(f'======================正在爬取第{page_num}页==============================================')
            get_one_page_data(driver, id_list, movie_list, star_list, movie_time_list, score_list)
    finally:
        # Always release the browser, even when scraping fails midway.
        quit_driver(driver)

    dict1 = {
        '排名': id_list,
        '电影名': movie_list,
        '主演': star_list,
        '上映时间': movie_time_list,
        '评分': score_list,
    }
    df1 = pd.DataFrame(dict1)
    # to_csv does not create missing directories.
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    df1.to_csv('data/猫眼电影榜Top100.csv', index=False)


if __name__ == '__main__':
    main()
# JD.com product search scraper.
#
# Searches JD for a user-supplied keyword, walks up to ten result pages and
# saves name, price, review count, shop and tags of every product to a CSV
# file under ``data/``.
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By

_GOODS_ITEM_XPATH = '//div[@id="J_goodsList"]/ul/li'
# "Reload" link and status text shown at the bottom of the result list.
_BOTTOM_RELOAD_LINK_XPATH = '//div[@id="J_scroll_loading"]/span/a'
_BOTTOM_STATUS_XPATH = '//div[@id="J_scroll_loading"]/span'
# "Reload" link shown at the top of the list after switching pages.
_TOP_RELOAD_LINK_XPATH = '//div[@id="J_loading"]/div/span/a'
_NEXT_PAGE_XPATH = '//div[@id="J_bottomPage"]/span[1]/a[last()]'
# Status text on which the reload loop stops.
_STATUS_STOP_TEXT = '正在加载中,请稍后~~'
_OUTPUT_DIR = 'data'


def _field_text(item, xpath, newline_replacement=''):
    """Return the text of ``xpath`` under ``item``, or '' if it is missing.

    Newlines in the text are replaced by ``newline_replacement``.
    """
    try:
        element = item.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return ''
    return str(element.text).replace('\n', newline_replacement)


def _scroll_to_bottom(driver):
    """Scroll the page to the bottom and wait for the lazy-loaded items."""
    driver.execute_script(
        'window.scrollTo(0,document.body.scrollHeight)'
    )
    time.sleep(3)


def _click_reload_until_done(driver, link_xpath, retry_message, stop_message=None,
                             status_xpath=None, done_message=None):
    """Click a "reload" link repeatedly until it can no longer be clicked.

    Args:
        driver: Selenium WebDriver showing the result page.
        link_xpath: XPath of the reload link to click.
        retry_message: Printed after every successful click.
        stop_message: Printed when the link cannot be clicked any more;
            nothing is printed when ``None``.
        status_xpath: Optional XPath of a status element checked after each
            click; the loop stops once its text equals the stop text.
        done_message: Printed when the status check ends the loop.
    """
    while True:
        try:
            driver.find_element(By.XPATH, link_xpath).click()
        except WebDriverException:
            # Link absent or not clickable: nothing left to reload.
            # WebDriverException (not a bare except) keeps Ctrl-C working.
            if stop_message is not None:
                print(stop_message)
            return
        print(retry_message)
        time.sleep(0.2)

        if status_xpath is None:
            continue
        try:
            status = driver.find_element(By.XPATH, status_xpath).text
        except WebDriverException:
            # Status element vanished or went stale; let the next click
            # attempt decide whether loading is finished.
            continue
        if status == _STATUS_STOP_TEXT:
            print(done_message)
            return


def get_one_page_data(driver, gname_list, price_list, commit_list, shop_list, icons_list):
    """Append every product of the current result page to the given lists.

    Fields are read from these children of each product card:
    ``p-name p-name-type-2`` (name), ``p-price`` (price), ``p-commit``
    (review count), ``p-shop`` (shop) and ``p-icons`` (tags). Newlines are
    removed from the first four and turned into ``|`` for the tags.

    Args:
        driver: Selenium WebDriver showing a fully loaded result page.
        gname_list: Receives the product names.
        price_list: Receives the prices.
        commit_list: Receives the review counts.
        shop_list: Receives the shop names.
        icons_list: Receives the ``|``-joined tags.
    """
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3; this form also works on Selenium 3.
    for li in driver.find_elements(By.XPATH, _GOODS_ITEM_XPATH):
        # Read all five fields before appending anything: a card lacking one
        # of the sections yields '' for it instead of aborting the run and
        # leaving the lists with different lengths.
        gname = _field_text(li, './div/div[@class="p-name p-name-type-2"]')
        price = _field_text(li, './div/div[@class="p-price"]')
        commit = _field_text(li, './div/div[@class="p-commit"]')
        shop = _field_text(li, './div/div[@class="p-shop"]')
        icons = _field_text(li, './div/div[@class="p-icons"]', '|')

        gname_list.append(gname)
        price_list.append(price)
        commit_list.append(commit)
        shop_list.append(shop)
        icons_list.append(icons)
        print(f'商品:{gname}, 价格:{price}, 评价数:{commit}, 店铺:{shop}, 标签:{icons}')
        print('--------------------------')


def load_data(driver):
    """Scroll to the bottom of the page and retry the bottom reload link."""
    _scroll_to_bottom(driver)
    _click_reload_until_done(
        driver,
        _BOTTOM_RELOAD_LINK_XPATH,
        retry_message='加载不完全,正在点击加载...',
        stop_message='没有重新加载。。。。',
        status_xpath=_BOTTOM_STATUS_XPATH,
        done_message='数据加载完毕!!',
    )


def get_data(driver, gname_list, price_list, commit_list, shop_list, icons_list):
    """Load the first result page completely, then collect its products."""
    load_data(driver)
    time.sleep(1)
    get_one_page_data(driver, gname_list, price_list, commit_list, shop_list, icons_list)


def load_Data2(driver):
    """Load a page reached through the "next page" link.

    First retries the reload link that can appear at the top of the list
    after a page switch, then scrolls to the bottom and retries the bottom
    reload link for the second half of the products.
    """
    _click_reload_until_done(
        driver,
        _TOP_RELOAD_LINK_XPATH,
        retry_message='前30条数据加载不完全,正在点击加载...',
    )
    time.sleep(1)

    _scroll_to_bottom(driver)
    _click_reload_until_done(
        driver,
        _BOTTOM_RELOAD_LINK_XPATH,
        retry_message='加载不完全,正在点击加载...',
        stop_message='没有重新加载。。。。',
        status_xpath=_BOTTOM_STATUS_XPATH,
        done_message='所有数据加载完毕!!',
    )


def get_data2(driver, gname_list, price_list, commit_list, shop_list, icons_list):
    """Load a follow-up result page completely, then collect its products."""
    load_Data2(driver)
    time.sleep(1)
    get_one_page_data(driver, gname_list, price_list, commit_list, shop_list, icons_list)


def get_all_data(driver, gname_list, price_list, commit_list, shop_list, icons_list):
    """Collect the products of the first ten result pages (or fewer).

    Page 1 is handled separately because it needs no page switch; pages
    2-10 are reached by clicking the "next page" link. Pagination stops
    early when that link is missing or its class is no longer ``pn-next``.
    """
    print('====================正在获取第1页的数据...===============================')
    get_data(driver, gname_list, price_list, commit_list, shop_list, icons_list)

    for i in range(2, 11):
        # find_elements returns [] instead of raising, so a result list
        # without a pager simply ends the loop.
        candidates = driver.find_elements(By.XPATH, _NEXT_PAGE_XPATH)
        if not candidates:
            break
        next_page = candidates[0]
        if str(next_page.get_attribute('class')) != 'pn-next':
            break

        next_page.click()
        print('----已经点击----------')
        time.sleep(2)
        print(f'====================正在获取第{i}页的数据...===============================')
        get_data2(driver, gname_list, price_list, commit_list, shop_list, icons_list)


def quit_driver(driver):
    """Wait five seconds, then close the whole browser session."""
    time.sleep(5)
    driver.quit()


def main():
    """Search JD for a keyword typed by the user and save the results.

    Start URL: https://www.jd.com/
    """
    gname_list = []
    price_list = []
    commit_list = []
    shop_list = []
    icons_list = []

    driver = webdriver.Chrome()
    try:
        driver.get('https://www.jd.com/')
        goods_name = input('请输入你要查询的商品名: ')
        # Type the keyword into the search box and press the search button.
        driver.find_element(By.XPATH, '//input[@id="key"]').send_keys(goods_name)
        driver.find_element(
            By.XPATH, '//button[@clstag="h|keycount|h|keycount|head|search_a"]'
        ).click()
        # Leave time for the user to log in manually.
        time.sleep(15)

        get_all_data(driver, gname_list, price_list, commit_list, shop_list, icons_list)
    finally:
        # Always release the browser, even when scraping fails midway.
        quit_driver(driver)

    dict1 = {
        '商品名称': gname_list,
        '价格': price_list,
        '评价数': commit_list,
        '店铺': shop_list,
        '标签': icons_list,
    }
    df1 = pd.DataFrame(dict1)
    # to_csv does not create missing directories.
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    df1.to_csv(f'data/京东{goods_name}商品前10页信息.csv')


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。