The full script below drives headless Chrome via Selenium to scrape the first three pages of Baidu PC search results for a keyword, screenshots each page, and appends the parsed results to an Excel file:

```python
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from pyquery import PyQuery as pq
import pandas as pd
import requests
import re
import os
import common  # project-local helper module


class crawl_infos:
    def __init__(self):
        self.browser = None
        # request headers; also used by get_real_url(), so they are set here
        # rather than inside crawl_baidu(). Replace the Cookie with your own.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "Connection": "keep-alive",
            "Accept-Encoding": "gzip, deflate, br",
            "Host": "www.baidu.com",
            "Cookie": "BIDUPSID=729E480F1B8CEB5347D8572AE6495CFA; PSTM=1645237046; BAIDUID=729E480F1B8CEB53DEEB6344B7C88A22:FG=1; BD_UPN=123253; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjs_duid=1_695315baa9a328fc73db6db6ba9ee8781645357087938; MSA_WH=1324_311; H_PS_PSSID=35106_31660_35765_34584_35872_35818_35948_35954_35315_26350_22159; H_PS_645EC=ab89Uk1B6EQVOEBnfF64C5jyWp40Rge9HGeQ8Q2fEodX81kjh6WtOKBhR2A; BAIDUID_BFESS=729E480F1B8CEB53DEEB6344B7C88A22:FG=1; BA_HECTOR=2g8g040k818g0l21a31h1g5g60r; baikeVisitId=9a933a90-dc5c-4192-93d2-10526d401267; WWW_ST=1645745708722"
        }
        # initialize the browser
        self.init_chrome()

    def init_chrome(self):
        # Alternative: attach to an already-running Chrome (useful for e.g. the
        # Taobao login slider). Right-click the Chrome shortcut, open Properties,
        # and append " --remote-debugging-port=9222" to the Target field.
        # options = webdriver.ChromeOptions()
        # options.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
        # options.add_argument("headless")
        # options.binary_location = 'C:/Program Files/Google/Chrome/Application'
        # self.browser = webdriver.Chrome(options=options)
        options = webdriver.ChromeOptions()
        # hide the "Chrome is being controlled by automated software" infobar
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # important: developer-mode flag that strips the webdriver fingerprint,
        # making it harder for sites to detect Selenium
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--headless")
        # options.add_argument("--disable-gpu")
        # options.add_argument('--start-maximized')
        options.add_experimental_option('useAutomationExtension', False)
        # create the browser object
        self.browser = webdriver.Chrome(service=Service('chromedriver.exe'), options=options)
        self.browser.maximize_window()
        # mask navigator.webdriver before any page script runs
        self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })

    # simulate scrolling down the page
    def swipe_down(self, second):
        for i in range(int(second / 0.1)):
            js = "var q=document.documentElement.scrollTop=" + str(300 + 200 * i)
            self.browser.execute_script(js)
            sleep(0.1)
        js = "var q=document.documentElement.scrollTop=100000"
        self.browser.execute_script(js)
        sleep(0.2)

    def crawl_baidu(self, key_word, file_name):
        search_by = "百度"
        search_pot = "PC"
        self.browser.get("https://www.baidu.com/")
        self.browser.implicitly_wait(30)  # implicit wait
        sleep(3)
        # type the keyword and submit the search
        input_text = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="kw"]')))
        sleep(1)
        input_text.send_keys(key_word)
        confirm_btn = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="su"]')))
        confirm_btn.click()
        self.browser.implicitly_wait(5)  # implicit wait
        sleep(3)
        # key step for full-page screenshots: read the page size via JS and
        # resize the window to match (execute_script works for any other JS too)
        width = self.browser.execute_script("return document.documentElement.scrollWidth")
        height = self.browser.execute_script("return document.documentElement.scrollHeight")
        self.browser.set_window_size(width, height)
        num = 1
        result_dic = []
        for page in range(1, 4):
            self.browser.implicitly_wait(10)  # implicit wait
            sleep(3)
            html = self.browser.page_source
            doc = pq(html)
            # parse the organic results first
            result__items = doc.find("#content_left .c-container").items()
            for result__item in result__items:
                title = result__item.find("h3 a").text()
                url = result__item.attr("mu")
                pt = result__item.find(".siteLink_9TPP3").text()
                tt = result__item.find(".c-color-gray2").text()
                if title and url:
                    dic = [search_by, search_pot, key_word, str(page), str(num), tt, pt, title, url]
                    print(dic)
                    result_dic.append(dic)
                    num += 1
                class_name = result__item.attr("class")
                if class_name and "result-op" in class_name:
                    # special result cards carry sub-items inside
                    tpl = result__item.attr("tpl")
                    if tpl == 'short_video':
                        # short-video card
                        xsmall__items = result__item.find(".c-span4").items()
                        for xsmall__item in xsmall__items:
                            url = xsmall__item.find(".c-gap-top-xsmall a").attr("href")
                            title = xsmall__item.find(".c-gap-top-xsmall a").text()
                            if url and title:
                                pt = xsmall__item.find(".c-color-gray").text()
                                dic = [search_by, search_pot, key_word, str(page), str(num), '', pt, title, url]
                                print(dic)
                                result_dic.append(dic)
                                num += 1
                    elif tpl == 'news-realtime':
                        # realtime-news card
                        items = result__item.find(".c-row").items()
                        for item in items:
                            title = item.find(".tts-title").text()
                            url = item.find(".tts-button_1V9FA").attr("data-url")
                            pt = item.find(".c-gap-right-small").text()
                            tt = item.find(".c-color-gray2").text()
                            dic = [search_by, search_pot, key_word, str(page), str(num), tt, pt, title, url]
                            print(dic)
                            result_dic.append(dic)
                            num += 1
            # screenshot the current page
            os.makedirs(f"image/{search_by}/{search_pot}", exist_ok=True)
            self.browser.get_screenshot_as_file(f"image/{search_by}/{search_pot}/{key_word}_{str(page)}.png")
            if page == 3:
                break
            # click the "next page" link
            WebDriverWait(self.browser, 10).until(
                EC.visibility_of(
                    self.browser.find_elements(by=By.CLASS_NAME, value='n')[-1])).click()
        self.browser.quit()
        os.makedirs('result', exist_ok=True)
        exists = os.path.exists(f'result/{file_name}.xlsx')
        result_dic = common.calc(result_dic)  # project-local post-processing
        df_new = pd.DataFrame(result_dic, columns=['搜索引擎', '端口', '关键词', '页码', '排名', '发布时间', '收录平台', '标题', '链接'])
        if not exists:
            df_new.to_excel(f'result/{file_name}.xlsx', index=False)  # write the data
        else:
            df = pd.read_excel(f'result/{file_name}.xlsx', header=0, sheet_name='Sheet1')  # read existing data
            # concat merges DataFrames that share the same column names
            df_all = pd.concat([df, df_new], ignore_index=True)
            df_all.to_excel(f'result/{file_name}.xlsx', index=False)  # write the data

    def get_real_url(self, v_url):
        """
        Resolve the real address behind a Baidu link.
        :param v_url: Baidu redirect link
        :return: real address
        """
        try:
            print(v_url)
            r = requests.get(v_url, headers=self.headers, allow_redirects=False)  # do not follow redirects
            if r.status_code == 302:
                # on a 302 the real address is in the Location response header
                real_url = r.headers.get('Location')
            else:
                # otherwise extract it from the response body with a regex
                real_url = re.findall("URL='(.*?)'", r.text)[0]
        except Exception as e:
            print(e)
            real_url = v_url
        return real_url


if __name__ == '__main__':
    infos = crawl_infos()
    infos.crawl_baidu("中国联塑", "202303016")
```