
Python Keyword Rank Tracking: Scraping Baidu PC Search Results with a Selenium Script (crawl_baidu.py)

Fetching Baidu search results with Selenium: the script below searches a keyword on Baidu's PC site, parses the first three result pages with pyquery, screenshots each page, and appends the ranked results to an Excel file.
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from pyquery import PyQuery as pq
import pandas as pd
import requests
import re
import os
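# `common` is the author's local helper module, not included in the post
# (a minimal hypothetical stand-in is sketched after the script)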
import common

"""

"""

class crawl_infos:
    def __init__(self):
        self.browser = None
        # initialize the browser
        self.init_chrome()

    def init_chrome(self):
        # options = webdriver.ChromeOptions()
        # # Attach to an already-running Chrome (useful e.g. for Taobao's login slider):
        # # right-click the Chrome shortcut, open Properties, and append
        # # " --remote-debugging-port=9222" (with a leading space) to the Target field
        # options.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
        # # configure the Chrome binary path
        # options.add_argument("headless")
        # options.binary_location = 'C:/Program Files/Google/Chrome/Application'
        # self.browser = webdriver.Chrome(options=options)
        options = webdriver.ChromeOptions()
        # the two experimental options below are standard anti-detection boilerplate
        options.add_experimental_option('excludeSwitches',
                                        ['enable-automation'])  # important: hides the "controlled by automated software" banner so sites are less likely to detect Selenium
        options.add_argument("--disable-blink-features=AutomationControlled")  # removes Chrome's webdriver fingerprint
        options.add_argument("headless")
        # options.add_argument("--disable-gpu")
        # options.add_argument('--start-maximized')
        options.add_experimental_option('useAutomationExtension', False)
        # create the browser object (executable_path is the Selenium 3 style; Selenium 4 uses a Service object instead)
        exec_path = 'chromedriver.exe'
        self.browser = webdriver.Chrome(executable_path=exec_path, options=options)
        self.browser.maximize_window()
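        # inject JS before every document loads so navigator.webdriver reads as undefined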
        self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                                Object.defineProperty(navigator, 'webdriver', {
                                    get:() => undefined
                                })
                            """
        })

    # simulate scrolling down the page (helper; not called by crawl_baidu below)
    def swipe_down(self, second):
        for i in range(int(second / 0.1)):
            js = "var q=document.documentElement.scrollTop=" + str(300 + 200 * i)
            self.browser.execute_script(js)
            sleep(0.1)
        js = "var q=document.documentElement.scrollTop=100000"
        self.browser.execute_script(js)
        sleep(0.2)

    def crawl_baidu(self, key_word, file_name):
        search_by = "百度"
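        # headers are kept on self because get_real_url() reuses them for its redirect request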
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "Connection": "keep-alive",
            "Accept-Encoding": "gzip, deflate, br",
            "Host": "www.baidu.com",
            # replace this with your own, current Baidu Cookie (it expires)
            "Cookie": "BIDUPSID=729E480F1B8CEB5347D8572AE6495CFA; PSTM=1645237046; BAIDUID=729E480F1B8CEB53DEEB6344B7C88A22:FG=1; BD_UPN=123253; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjs_duid=1_695315baa9a328fc73db6db6ba9ee8781645357087938; MSA_WH=1324_311; H_PS_PSSID=35106_31660_35765_34584_35872_35818_35948_35954_35315_26350_22159; H_PS_645EC=ab89Uk1B6EQVOEBnfF64C5jyWp40Rge9HGeQ8Q2fEodX81kjh6WtOKBhR2A; BAIDUID_BFESS=729E480F1B8CEB53DEEB6344B7C88A22:FG=1; BA_HECTOR=2g8g040k818g0l21a31h1g5g60r; baikeVisitId=9a933a90-dc5c-4192-93d2-10526d401267; WWW_ST=1645745708722"
        }
        search_pot = "PC"
        self.browser.get("https://www.baidu.com/")
        self.browser.implicitly_wait(30)  # implicit wait: applies to every element lookup
        sleep(3)
        # locate the search box and type the keyword
        input_text = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="kw"]')))
        sleep(1)
        input_text.send_keys(key_word)
        # click the search button
        confirm_btn = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="su"]')))
        confirm_btn.click()
        self.browser.implicitly_wait(5)  # implicit wait
        sleep(3)
        # key to a full-page screenshot: read the page's width/height via JS and resize the window to match (the same execute_script trick works anywhere else you need JS)
        width = self.browser.execute_script("return document.documentElement.scrollWidth")
        height = self.browser.execute_script("return document.documentElement.scrollHeight")
        self.browser.set_window_size(width, height)
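        # num is a running rank counter across all pages; result_dic collects one row per result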
        num = 1
        result_dic = []
        for page in range(1, 4):
            self.browser.implicitly_wait(10)  # implicit wait
            sleep(3)
            html = self.browser.page_source
            doc = pq(html)
            # parse the organic result containers first
            result__items = doc.find("#content_left .c-container").items()
            for result__item in result__items:
                title = result__item.find("h3 a").text()
                url = result__item.attr("mu")  # Baidu stores the original landing URL in the container's "mu" attribute
                pt = result__item.find(".siteLink_9TPP3").text()  # source platform
                tt = result__item.find(".c-color-gray2").text()  # publish time
                if title and url:
                    dic = [search_by, search_pot, key_word, str(page), str(num), tt, pt, title, url]
                    print(dic)
                    result_dic.append(dic)
                    num += 1
                class_name = result__item.attr("class")
                if class_name and "result-op" in class_name:
                    # special result cards carry sub-items
                    tpl = result__item.attr("tpl")
                    if tpl == 'short_video':
                        # short-video card
                        xsmall__items = result__item.find(".c-span4").items()
                        for xsmall__item in xsmall__items:
                            url = xsmall__item.find(".c-gap-top-xsmall a").attr("href")
                            title = xsmall__item.find(".c-gap-top-xsmall a").text()
                            if url and title:
                                pt = xsmall__item.find(".c-color-gray").text()
                                dic = [search_by, search_pot, key_word, str(page), str(num), '', pt, title, url]
                                print(dic)
                                result_dic.append(dic)
                                num += 1
                    elif tpl == 'news-realtime':
                        items = result__item.find(".c-row").items()
                        for item in items:
                            title = item.find(".tts-title").text()
                            url = item.find(".tts-button_1V9FA").attr("data-url")
                            pt = item.find(".c-gap-right-small").text()
                            tt = item.find(".c-color-gray2").text()
                            if title and url:  # guard against empty rows, matching the other branches
                                dic = [search_by, search_pot, key_word, str(page), str(num), tt, pt, title, url]
                                print(dic)
                                result_dic.append(dic)
                                num += 1
            # screenshot the page (create the target directory first)
            os.makedirs(f"image/{search_by}/{search_pot}", exist_ok=True)
            self.browser.get_screenshot_as_file(f"image/{search_by}/{search_pot}/{key_word}_{str(page)}.png")
            if page == 3:
                break
            # click "next page" (the last element with class "n")
            WebDriverWait(self.browser, 10).until(
                EC.visibility_of(
                    self.browser.find_elements(by=By.CLASS_NAME, value='n')[-1])).click()
        self.browser.quit()  # quit() ends the session and closes every window; a separate close() is redundant
        os.makedirs('result', exist_ok=True)
        exists = os.path.exists(f'result/{file_name}.xlsx')
        result_dic = common.calc(result_dic)  # post-process the rows via the local common module
        df_new = pd.DataFrame(result_dic, columns=['搜索引擎', '端口', '关键词', '页码', '排名', '发布时间', '收录平台', '标题', '链接'])
        if not exists:
            df_new.to_excel(f'result/{file_name}.xlsx', index=False)  # write a new file
        else:
            df = pd.read_excel(f'result/{file_name}.xlsx', header=0, sheet_name='Sheet1')  # load existing data
            df_all = pd.concat([df, df_new], ignore_index=True)  # concat merges DataFrames that share column names
            df_all.to_excel(f'result/{file_name}.xlsx', index=False)  # write back

    def get_real_url(self, v_url):
        """
        Resolve a Baidu redirect link to its real destination.
        :param v_url: Baidu link address
        :return: real address
        Note: relies on self.headers, which is set inside crawl_baidu().
        """
        try:
            print(v_url)
            r = requests.get(v_url, headers=self.headers, allow_redirects=False)  # do not follow redirects
            if r.status_code == 302:  # on a 302, the real URL is in the Location header
                real_url = r.headers.get('Location')
            else:  # otherwise extract it from the response body with a regex
                real_url = re.findall("URL='(.*?)'", r.text)[0]
        except Exception as e:
            print(e)
            real_url = v_url
        return real_url


if __name__ == '__main__':
    infos = crawl_infos()
    infos.crawl_baidu("中国联塑", "202303016")
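
The script imports a local `common` module that is not included in the post; `common.calc` is called exactly once, to post-process the collected rows before they are written to Excel. Below is a minimal, hypothetical stand-in so the script runs end to end. It only deduplicates rows; whatever ranking or scoring logic the original module contained would go here instead:

# common.py: hypothetical placeholder for the author's local module
def calc(result_rows):
    """Post-process scraped rows before export.

    The real common.calc is not shown in the post; this stand-in simply
    drops exact-duplicate rows while preserving order, which is enough
    to keep crawl_baidu.py runnable.
    """
    seen = set()
    deduped = []
    for row in result_rows:
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            deduped.append(row)
    return deduped

Save this as common.py next to crawl_baidu.py (or drop in the real module) and the script will run as shown in the __main__ block above.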
