当前位置:   article > 正文

Selenium获取PTA平台所有题集和答案存为json文件_pta怎么查看答案

pta怎么查看答案

注意:cv2 库对应的 pip 包名是 opencv-python(安装命令:pip install opencv-python)。
在这里插入图片描述
使用方法:在代码最下方的 url 列表中填写要爬取的题集地址,该题集必须已经提交完毕。
最后用来保存结果的 json 文件需要自己提前创建,文件内容为一对花括号 {}。

网站的网页结构变了,部分 css 类名也变了,代码已做相应修改。
爬取代码(2021-12-12 更新,可用)

import json
import os
import time
import traceback

import cv2
import numpy
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Global delay (in seconds) slept before each page request, so that requests
# are not rejected for being sent too quickly.
my_time = 3.5


def login_PTA(my_account, my_password):
    """Log in to PTA and try to solve the slider captcha until the page changes.

    Uses the module-level ``web`` driver (expected to already show the login
    page) and the module-level ``login_url``.

    Args:
        my_account: Text typed into the account input of the login form.
        my_password: Text typed into the password input of the login form.
    """
    # Type the account and password into the login form.
    account = web.find_element(
        By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[1]/div/div/div[1]/div/div/div/input')
    password = web.find_element(
        By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[1]/div/div/div[2]/div/div/div/input')
    account.send_keys(my_account)
    password.send_keys(my_password)
    # Click the login button, then the element nested inside it.
    web.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[2]/button').click()
    web.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[2]/button/div/div').click()
    print("ok")
    for _ in range(5):
        # Give the captcha time to load; tune the delay to the network speed.
        time.sleep(3)
        # A changed URL means the login already succeeded.
        if web.current_url != login_url:
            break
        cracking_captcha()


def cracking_captcha():
    """Attempt to solve the slider captcha shown by the module-level ``web`` driver.

    Downloads the background image and the draggable piece to ``bg.jpg`` and
    ``front.jpg``, finds the best template match of the piece inside the
    background, and drags the slider horizontally by the matched column
    (scaled by 0.946). The match is not always correct, which is why the
    caller retries in a loop.
    """
    # Background image of the captcha.
    bg_img_src = web.find_element(
        By.XPATH, '/html/body/div[3]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/img[1]').get_attribute('src')
    # Draggable puzzle piece.
    front_img_src = web.find_element(
        By.XPATH, '/html/body/div[3]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/img[2]').get_attribute('src')

    # Save both images to disk; the timeout keeps a stalled download from
    # hanging the whole run.
    with open("bg.jpg", mode="wb") as f:
        f.write(requests.get(bg_img_src, timeout=10).content)
    with open("front.jpg", mode="wb") as f:
        f.write(requests.get(front_img_src, timeout=10).content)

    # Load both images and reduce them to a single grayscale channel.
    bg = cv2.cvtColor(cv2.imread("bg.jpg"), cv2.COLOR_BGR2GRAY)
    front = cv2.cvtColor(cv2.imread("front.jpg"), cv2.COLOR_BGR2GRAY)
    # Drop the rows of the piece image that are entirely zero.
    front = front[front.any(1)]

    # Position of the highest normalized correlation score.
    result = cv2.matchTemplate(bg, front, cv2.TM_CCOEFF_NORMED)
    # unravel_index yields (row, col): the column is the horizontal offset.
    _row, col = numpy.unravel_index(numpy.argmax(result), result.shape)

    # Slider handle to drag.
    slider = web.find_element(By.XPATH, '/html/body/div[3]/div[2]/div/div/div[2]/div/div[2]/div[2]')
    # NOTE(review): 0.946 is presumably the ratio between the downloaded image
    # size and its rendered size - confirm. The offset is cast to a plain int
    # because pointer offsets are whole pixels.
    ActionChains(web).drag_and_drop_by_offset(slider, xoffset=int(col // 0.946), yoffset=0).perform()

def get_question_type_url(headers_collection_url):
    """Open a problem-set page and collect the URL of each question-type tab.

    Sleeps ``my_time`` seconds first, then loads the page in the module-level
    ``web`` driver.

    Args:
        headers_collection_url: URL of the problem-set page to open.

    Returns:
        A dict with the keys ``single_choice_url``, ``judgment_url``,
        ``fill_in_the_blanks_url``, ``program_fill_in_the_blanks_url``,
        ``function_url`` and ``programming_url``. Each value is the tab's
        ``href``, or ``""`` when the page has no tab of that type.
    """
    time.sleep(my_time)
    web.get(headers_collection_url)

    # Tab caption shown on the page -> key used in the returned dict.
    key_by_type_name = {
        '单选题': "single_choice_url",
        '判断题': "judgment_url",
        '填空题': "fill_in_the_blanks_url",
        '程序填空题': "program_fill_in_the_blanks_url",
        '函数题': "function_url",
        '编程题': "programming_url",
    }
    question_type_dict = dict.fromkeys(key_by_type_name.values(), "")

    tab_links = web.find_element(
        By.CSS_SELECTOR, "[class='pc-h container_3U5RB pc-gap-default']").find_elements(By.CSS_SELECTOR, "a")
    for link in tab_links:
        type_name = link.find_element(By.CSS_SELECTOR, "[class='pc-text-raw']").text
        this_url = link.get_attribute('href')
        key = key_by_type_name.get(type_name)
        # Tabs with an unrecognized caption are ignored.
        if key is not None:
            question_type_dict[key] = this_url

    return question_type_dict


def get_judgment(judgment_url):
    """Scrape the true/false questions of a problem set, correcting wrong answers.

    Sleeps ``my_time`` seconds, loads the page in the module-level ``web``
    driver, and records an answer for every question. When the page does not
    report the submitted answer as correct, the opposite choice is recorded.

    Args:
        judgment_url: URL of the true/false question page.

    Returns:
        A dict mapping each question text to ``"T"`` or ``"F"``. Questions
        that fail to parse are skipped and counted as failures.
    """
    time.sleep(my_time)
    web.get(judgment_url)
    judgment_question_list = web.find_elements(By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for judgment in judgment_question_list:
        try:
            question = judgment.find_element(
                By.CSS_SELECTOR, "[class='rendered-markdown']").find_element(By.XPATH, 'p').text
            # NOTE(review): the first matching input is presumably the "T"
            # option - confirm against the page markup.
            is_select_T = judgment.find_element(
                By.CSS_SELECTOR, "[class='mr-2 mt-1 focus:outline-none']").is_selected()
            verdict = judgment.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']").find_element(
                By.CSS_SELECTOR, "[class='pc-text-raw']").text

            # The right answer is "T" exactly when "T was selected" agrees
            # with "the submission was marked correct".
            answer = "T" if (verdict == "答案正确") == is_select_T else "F"

            print(question)
            print(answer)
            questions_dict[question] = answer
            success_num += 1
        except Exception:
            traceback.print_exc()
            fail_num += 1
            print("当前题目获取失败, 上一题序号: " + str(success_num) + "-序号, 当前题集:" + judgment_url + ", 程序跳过该题继续执行")
            continue
    print("判断题--题集: " + judgment_url + "获取题目数量--成功: " + str(success_num) + " 失败: " + str(fail_num))
    return questions_dict


def get_single_choice(single_choice_url):
    """Scrape the single-choice questions of a problem set.

    Sleeps ``my_time`` seconds and loads the page in the module-level ``web``
    driver.

    Args:
        single_choice_url: URL of the single-choice question page.

    Returns:
        A dict mapping each question text to
        ``[option_texts, selected_option_text, verdict_text]``. The selected
        option text is ``""`` when no option is selected. Questions that fail
        to parse are skipped and counted as failures.
    """
    time.sleep(my_time)
    web.get(single_choice_url)
    single_choice_list = web.find_elements(By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for single_choice in single_choice_list:
        try:
            question = single_choice.find_element(By.CSS_SELECTOR, "[class='pc-x min-w-0 shrink']").text

            option_labels = single_choice.find_elements(
                By.CSS_SELECTOR,
                "[class='flex items-start p-2 rounded hover:bg-gray-100 focus:bg-gray-200 focus-within:bg-gray-100 focus-within:ring focus-within:ring-blue-300 multiple-choice-label min-w-0 items-baseline']")
            option = []
            answer = ""
            for label in option_labels:
                this_answer = label.find_element(By.CSS_SELECTOR, "[class='rendered-markdown']").text
                option.append(this_answer)
                # The selected input marks the submitted answer.
                if label.find_element(
                        By.CSS_SELECTOR, "[class='mr-2 mt-1 focus:outline-none']").is_selected():
                    answer = this_answer
            is_true = single_choice.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']").find_element(
                By.CSS_SELECTOR, "[class='pc-text-raw']").text
            print(question)
            print(option)
            print(answer)
            print(is_true)

            questions_dict[question] = [option, answer, is_true]
            success_num += 1
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C still stops
            # the run, and show why the question was skipped.
            traceback.print_exc()
            fail_num += 1
            print("当前题目获取失败, 上一题序号: " + str(success_num) + "-序号, 当前题集:" + single_choice_url + ", 程序跳过该题继续执行")
            continue

    print("选择题--题集: " + single_choice_url + "获取题目数量--成功: " + str(success_num) + " 失败: " + str(fail_num))
    return questions_dict


def get_fill_or_program_in_the_blanks(fill_or_program_in_the_blanks_url):
    """Scrape fill-in-the-blank (or program fill-in) questions of a problem set.

    Sleeps ``my_time`` seconds and loads the page in the module-level ``web``
    driver.

    Args:
        fill_or_program_in_the_blanks_url: URL of the question page.

    Returns:
        A dict mapping each question text to ``[input_values, verdict_text]``,
        where ``input_values`` lists the ``value`` of every input found inside
        the question body. Questions that fail to parse are skipped and
        counted as failures.
    """
    time.sleep(my_time)
    web.get(fill_or_program_in_the_blanks_url)
    fill_or_program_in_the_blanks_list = web.find_elements(By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for fill_or_program_in_the_blanks in fill_or_program_in_the_blanks_list:
        try:
            # The question body holds both the text and the answer inputs.
            body = fill_or_program_in_the_blanks.find_element(By.CSS_SELECTOR, "[class='rendered-markdown']")
            question = body.text
            answer = [blank.get_attribute("value") for blank in body.find_elements(By.CSS_SELECTOR, "input")]
            is_true = fill_or_program_in_the_blanks.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']").find_element(
                By.CSS_SELECTOR, "[class='pc-text-raw']").text

            print(question)
            print(answer)
            print(is_true)

            questions_dict[question] = [answer, is_true]
            success_num += 1
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C still stops
            # the run, and show why the question was skipped.
            traceback.print_exc()
            fail_num += 1
            print("当前题目获取失败, 上一题序号: " + str(
                success_num) + "-序号, 当前题集:" + fill_or_program_in_the_blanks_url + ", 程序跳过该题继续执行")
            continue

    print("填空/程序填空题--题集: " + fill_or_program_in_the_blanks_url + "获取题目数量--成功: " + str(success_num) + " 失败: " + str(
        fail_num))
    return questions_dict


def get_function_or_programming(function_or_programming_url):
    """Scrape function/programming problems and their submitted code.

    Sleeps ``my_time`` seconds, loads the problem list in the module-level
    ``web`` driver, then visits every problem link. Each problem is tried up
    to three times, because requesting pages too quickly can be rejected.

    Args:
        function_or_programming_url: URL of the problem list page.

    Returns:
        A dict mapping each problem title to ``[problem_text, submitted_code]``.
        Problems that still fail after three attempts are skipped and counted
        as failures.
    """
    time.sleep(my_time)
    web.get(function_or_programming_url)
    questions_dict = {}

    # Collect every link first: navigating away invalidates the row elements.
    problem_rows = web.find_elements(
        By.XPATH, '/html/body/div/div[2]/div[1]/div/div[2]/div[2]/div/div[1]/table//tbody/tr')
    problems_href = [row.find_element(By.XPATH, 'td[3]/a').get_attribute('href') for row in problem_rows]

    success_num = 0
    fail_num = 0
    for problem in problems_href:
        for _ in range(3):
            try:
                # Tune the delay to the network speed; overly fast requests
                # get an error page.
                time.sleep(my_time)
                web.get(problem)
                problem_title = web.find_element(
                    By.CSS_SELECTOR, "[class='text-center text-light text-base font-bold my-4']").text
                answer = web.find_element(
                    By.CSS_SELECTOR, "[class='codeEditor_2kCM6 grow shrink']").find_element(
                    By.CSS_SELECTOR, 'textarea').get_attribute('value')
                problem_content = web.find_element(By.CSS_SELECTOR, "[class='rendered-markdown']").text

                questions_dict[problem_title] = [problem_content, answer]

                print(problem_title)
                print(problem_content)
                print(answer)

                success_num += 1
                break
            except Exception:
                continue
        else:
            # The loop ended without ``break``: all three attempts failed.
            fail_num += 1
            print("当前题目获取失败: " + problem + ", 程序跳过该题继续执行")

    print("函数/编程题--题集: " + function_or_programming_url + "获取题目数量--成功: " + str(success_num) + " 失败: " + str(fail_num))
    return questions_dict


def _merge_into_json_file(file_name, new_entries):
    """Merge ``new_entries`` into the JSON object stored in ``file_name``.

    A missing or empty file is treated as an empty object, and a missing
    parent directory is created.
    """
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    file_dict = {}
    # Check the path itself (relative paths resolve against the working
    # directory) instead of joining with a Windows-only backslash.
    if os.path.exists(file_name):
        with open(file_name, 'r', encoding="utf-8") as f:
            content = f.read()
        if content.strip():
            file_dict = json.loads(content)

    file_dict.update(new_entries)

    with open(file_name, mode='w', encoding="utf-8") as f:
        f.write(json.dumps(file_dict, ensure_ascii=False))
    print("-----当前题记长度-----------------------------------------------------" + str(len(new_entries)))
    print("-----写入文件--总长度-------------------------------------------------" + str(len(file_dict)))


def write_question_file(url_list, judgment_file_name, single_choice_file_name, fill_in_the_blanks_name,
                        program_fill_in_the_blanks_name, function_name, programming_name):
    """Scrape every problem set in ``url_list`` and merge the results into JSON files.

    For each problem set, the question types present on the page are scraped
    in a fixed order (true/false, single choice, fill in the blanks, program
    fill-in, function, programming). Each type's results are merged into its
    own JSON file.

    Args:
        url_list: Problem-set page URLs to scrape.
        judgment_file_name: JSON file for true/false questions.
        single_choice_file_name: JSON file for single-choice questions.
        fill_in_the_blanks_name: JSON file for fill-in-the-blank questions.
        program_fill_in_the_blanks_name: JSON file for program fill-in questions.
        function_name: JSON file for function problems.
        programming_name: JSON file for programming problems.
    """
    # (key in the type-url dict, scraper for that type, destination file)
    scrape_plan = [
        ('judgment_url', get_judgment, judgment_file_name),
        ('single_choice_url', get_single_choice, single_choice_file_name),
        ('fill_in_the_blanks_url', get_fill_or_program_in_the_blanks, fill_in_the_blanks_name),
        ('program_fill_in_the_blanks_url', get_fill_or_program_in_the_blanks, program_fill_in_the_blanks_name),
        ('function_url', get_function_or_programming, function_name),
        ('programming_url', get_function_or_programming, programming_name),
    ]

    for url in url_list:
        this_question_type_dict = get_question_type_url(url)
        for type_key, scrape, file_name in scrape_plan:
            type_url = this_question_type_dict[type_key]
            # An empty URL means this problem set has no questions of the type.
            if type_url != "":
                _merge_into_json_file(file_name, scrape(type_url))




if __name__ == '__main__':
    # Create the WebDriver object, using the Chrome browser driver.
    web = webdriver.Chrome(service=Service(r'C:\Users\Cat\AppData\Local\Google\Chrome\Application\chromedriver.exe'))
    try:
        web.implicitly_wait(5)
        login_url = 'https://pintia.cn/auth/login'
        # Open the login page in the browser.
        web.get(login_url)

        login_PTA('zzz@qq.com', 'xxx')

        # # Problem-set types (database course)
        # # 1 true/false; single choice; multiple choice!!!; fill in the blanks
        # # 2 true/false; single choice; fill in the blanks
        # # 3 true/false; single choice
        # # 4 single choice
        # # 5 single choice
        # # 6 true/false; single choice
        # # 7 true/false; single choice; fill in the blanks
        # # 8 true/false; single choice
        # # 9 true/false; single choice
        # sql_url_list_walking = [
        #     'https://pintia.cn/problem-sets/1343789975057166336/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343794588401487872/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343799990153117696/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343806731523719168/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343807501140754432/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343808640402018304/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343811518420176896/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343798231569530880/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343819242272718848/problems/type/1'
        # ]
        #
        # # Problem-set types (java)
        # # 1 true/false; single choice
        # # 2 true/false; single choice; fill in the blanks; function; programming
        # # 3 true/false; single choice; fill in the blanks; program fill-in; function; programming
        # # 4 true/false; single choice; fill in the blanks; program fill-in; function; programming
        # # 5 true/false; single choice; fill in the blanks; function; programming
        # # 6 true/false; single choice; fill in the blanks; program fill-in; programming
        # # 7 true/false; single choice; fill in the blanks; programming
        # # 8 true/false; single choice; fill in the blanks; program fill-in; function; programming
        # java_url_list_lgr = [
        #     'https://pintia.cn/problem-sets/1368832382463172608/problems/type/1',
        #     'https://pintia.cn/problem-sets/1368833022220361728/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369164346714021888/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165326734123008/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165872660537344/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166179822002176/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166486127828992/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166803779248128/problems/type/1'
        # ]

        # Problem sets to scrape; each one must already have been submitted.
        java_url_list_lxf = [
            'https://pintia.cn/problem-sets/1468315811752116224/problems/type/1'
        ]

        write_question_file(java_url_list_lxf, "dataSql\\judgment.json", "dataSql\\single_choice.json",
                            "dataSql\\fill_in_the_blanks.json", "dataSql\\program_fill_in_the_blanks.json",
                            "dataSql\\function.json", "dataSql\\programming.json")
    finally:
        # Always shut the browser and the chromedriver process down, even
        # when scraping fails part-way through.
        web.quit()

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231
  • 232
  • 233
  • 234
  • 235
  • 236
  • 237
  • 238
  • 239
  • 240
  • 241
  • 242
  • 243
  • 244
  • 245
  • 246
  • 247
  • 248
  • 249
  • 250
  • 251
  • 252
  • 253
  • 254
  • 255
  • 256
  • 257
  • 258
  • 259
  • 260
  • 261
  • 262
  • 263
  • 264
  • 265
  • 266
  • 267
  • 268
  • 269
  • 270
  • 271
  • 272
  • 273
  • 274
  • 275
  • 276
  • 277
  • 278
  • 279
  • 280
  • 281
  • 282
  • 283
  • 284
  • 285
  • 286
  • 287
  • 288
  • 289
  • 290
  • 291
  • 292
  • 293
  • 294
  • 295
  • 296
  • 297
  • 298
  • 299
  • 300
  • 301
  • 302
  • 303
  • 304
  • 305
  • 306
  • 307
  • 308
  • 309
  • 310
  • 311
  • 312
  • 313
  • 314
  • 315
  • 316
  • 317
  • 318
  • 319
  • 320
  • 321
  • 322
  • 323
  • 324
  • 325
  • 326
  • 327
  • 328
  • 329
  • 330
  • 331
  • 332
  • 333
  • 334
  • 335
  • 336
  • 337
  • 338
  • 339
  • 340
  • 341
  • 342
  • 343
  • 344
  • 345
  • 346
  • 347
  • 348
  • 349
  • 350
  • 351
  • 352
  • 353
  • 354
  • 355
  • 356
  • 357
  • 358
  • 359
  • 360
  • 361
  • 362
  • 363
  • 364
  • 365
  • 366
  • 367
  • 368
  • 369
  • 370
  • 371
  • 372
  • 373
  • 374
  • 375
  • 376
  • 377
  • 378
  • 379
  • 380
  • 381
  • 382
  • 383
  • 384
  • 385
  • 386
  • 387
  • 388
  • 389
  • 390
  • 391
  • 392
  • 393
  • 394
  • 395
  • 396
  • 397
  • 398
  • 399
  • 400
  • 401
  • 402
  • 403
  • 404
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小桥流水78/article/detail/995904
推荐阅读
相关标签
  

闽ICP备14008679号