赞
踩
注意:cv2 库对应的 pip 包名是 opencv-python
使用方法:在代码最下方的 url 列表里填写要爬取的题集链接,这些题集必须已经提交完毕
最后用到的 json 文件需要自己新建,文件内容只写一对花括号 {}
网站的网页结构变了,有些 css 类名也变了,所以代码稍微改了一下
爬取代码 2021-12-12更新可用
import json
import os
import time
import traceback

import cv2
import numpy
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
# Global delay (in seconds) slept before each page request, so that requests
# are not sent too quickly and rejected by the site.
my_time = 3.5
def login_PTA(my_account, my_password):
    """Fill in the PTA login form and retry the slider captcha until logged in.

    Uses the module-level ``web`` driver, which must already be showing the
    login page, and the module-level ``login_url`` to detect that the browser
    has been redirected away from the login page.

    Args:
        my_account: Account name typed into the first input of the form.
        my_password: Password typed into the second input of the form.
    """
    form_xpath = '/html/body/div[1]/div[2]/div/div[2]/form'

    # Type the account and password into the form.
    account = web.find_element(
        By.XPATH, form_xpath + '/div[1]/div/div/div[1]/div/div/div/input')
    password = web.find_element(
        By.XPATH, form_xpath + '/div[1]/div/div/div[2]/div/div/div/input')
    account.send_keys(my_account)
    password.send_keys(my_password)

    # Click the login button, then the element nested inside it (the original
    # script clicks both; kept as-is).
    web.find_element(By.XPATH, form_xpath + '/div[2]/button').click()
    web.find_element(By.XPATH, form_xpath + '/div[2]/button/div/div').click()
    print("ok")

    # The captcha solver is not 100% accurate, so try up to five times.
    for _ in range(5):
        # Wait for the captcha to load; adjust to the network speed.
        time.sleep(3)
        # A changed URL means the login succeeded.
        if web.current_url != login_url:
            break
        cracking_captcha()
def cracking_captcha():
    """Attempt to solve the slider captcha shown by the module-level ``web`` driver.

    Downloads the background and slider images to ``bg.jpg`` / ``front.jpg``
    in the current directory, locates the slider piece inside the background
    with OpenCV template matching, and drags the slider handle horizontally
    by the matched offset.

    The match is not always exact, so the caller is expected to retry.

    Raises:
        requests.RequestException: If an image download fails, times out or
            returns an HTTP error status.
    """
    captcha_xpath = '/html/body/div[3]/div[2]/div/div/div[2]/div'

    # Background image and the draggable piece image.
    bg_img_src = web.find_element(
        By.XPATH, captcha_xpath + '/div[1]/div/div[1]/img[1]').get_attribute('src')
    front_img_src = web.find_element(
        By.XPATH, captcha_xpath + '/div[1]/div/div[1]/img[2]').get_attribute('src')

    # Save both images to disk. A timeout keeps a stalled download from
    # hanging the whole run, and an HTTP error is raised instead of being
    # written to disk as if it were image data.
    for file_name, src in (("bg.jpg", bg_img_src), ("front.jpg", front_img_src)):
        response = requests.get(src, timeout=10)
        response.raise_for_status()
        with open(file_name, mode="wb") as f:
            f.write(response.content)

    # Load the images and reduce them to single-channel grayscale.
    bg = cv2.cvtColor(cv2.imread("bg.jpg"), cv2.COLOR_BGR2GRAY)
    front = cv2.cvtColor(cv2.imread("front.jpg"), cv2.COLOR_BGR2GRAY)
    # Keep only the rows of the piece image that contain a non-zero pixel.
    front = front[front.any(1)]

    # Find the best-matching position of the piece inside the background.
    result = cv2.matchTemplate(bg, front, cv2.TM_CCOEFF_NORMED)
    # unravel_index yields (row, column); the column is the horizontal offset.
    _row, col = numpy.unravel_index(numpy.argmax(result), result.shape)

    # Locate the draggable handle and drag it horizontally.
    handle = web.find_element(By.XPATH, captcha_xpath + '/div[2]/div[2]')
    # 0.946 is the original author's scale factor -- presumably the ratio
    # between displayed and downloaded image width; TODO confirm.
    # Pointer offsets are whole pixels, so pass an int rather than a float.
    ActionChains(web).drag_and_drop_by_offset(
        handle, xoffset=int(col // 0.946), yoffset=0).perform()
def get_question_type_url(headers_collection_url):
    """Return the URL of each question-type tab of a problem set.

    Opens ``headers_collection_url`` with the module-level ``web`` driver
    (after sleeping ``my_time`` seconds) and reads the links in the
    question-type bar.

    Args:
        headers_collection_url: URL of a problem-set page.

    Returns:
        A dict with the keys ``single_choice_url``, ``judgment_url``,
        ``fill_in_the_blanks_url``, ``program_fill_in_the_blanks_url``,
        ``function_url`` and ``programming_url``. A value is ``""`` when the
        problem set has no tab of that type.
    """
    # Tab label as shown on the page -> key in the returned dict.
    label_to_key = {
        '单选题': "single_choice_url",
        '判断题': "judgment_url",
        '填空题': "fill_in_the_blanks_url",
        '程序填空题': "program_fill_in_the_blanks_url",
        '函数题': "function_url",
        '编程题': "programming_url",
    }
    # Fixed key order, every type defaulting to "not present".
    question_type_dict = {
        "single_choice_url": "",
        "judgment_url": "",
        "fill_in_the_blanks_url": "",
        "program_fill_in_the_blanks_url": "",
        "function_url": "",
        "programming_url": "",
    }

    time.sleep(my_time)
    web.get(headers_collection_url)

    tab_bar = web.find_element(
        By.CSS_SELECTOR, "[class='pc-h container_3U5RB pc-gap-default']")
    for link in tab_bar.find_elements(By.CSS_SELECTOR, "a"):
        label = link.find_element(By.CSS_SELECTOR, "[class='pc-text-raw']").text
        key = label_to_key.get(label)
        if key is not None:
            # A missing href is treated like an absent tab.
            question_type_dict[key] = link.get_attribute('href') or ""
    return question_type_dict
def get_judgment(judgment_url):
    """Scrape the true/false questions of a problem set, correcting wrong answers.

    Opens ``judgment_url`` with the module-level ``web`` driver (after
    sleeping ``my_time`` seconds). For each question the submitted choice is
    kept when the page marks it "答案正确" and flipped otherwise, so the
    stored answer is the correct one.

    Args:
        judgment_url: URL of the true/false tab of a submitted problem set.

    Returns:
        A dict mapping question text to ``"T"`` or ``"F"``. Questions that
        could not be parsed are skipped.
    """
    time.sleep(my_time)
    web.get(judgment_url)
    judgment_question_list = web.find_elements(
        By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")

    questions_dict = {}
    success_num = 0
    fail_num = 0
    for judgment in judgment_question_list:
        try:
            question = judgment.find_element(
                By.CSS_SELECTOR, "[class='rendered-markdown']"
            ).find_element(By.XPATH, 'p').text
            # First matching input of the question -- presumably the "T"
            # radio button; verify against the page markup.
            is_select_T = judgment.find_element(
                By.CSS_SELECTOR, "[class='mr-2 mt-1 focus:outline-none']"
            ).is_selected()
            verdict = judgment.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']"
            ).find_element(By.CSS_SELECTOR, "[class='pc-text-raw']").text
        except Exception:
            # Best effort: report the failure and carry on with the next one.
            traceback.print_exc()
            fail_num += 1
            print(f"当前题目获取失败, 上一题序号: {success_num}-序号, 当前题集:{judgment_url}, 程序跳过该题继续执行")
            continue

        # Keep the submitted choice if it was correct, otherwise flip it.
        submitted_is_correct = verdict == "答案正确"
        answer = "T" if is_select_T == submitted_is_correct else "F"
        print(question)
        print(answer)
        questions_dict[question] = answer
        success_num += 1

    print(f"判断题--题集: {judgment_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict
def get_single_choice(single_choice_url):
    """Scrape the single-choice questions of a problem set.

    Opens ``single_choice_url`` with the module-level ``web`` driver (after
    sleeping ``my_time`` seconds).

    Args:
        single_choice_url: URL of the single-choice tab of a submitted
            problem set.

    Returns:
        A dict mapping question text to ``[options, answer, verdict]`` where
        ``options`` is the list of option texts, ``answer`` is the text of
        the selected option (``""`` if none is selected) and ``verdict`` is
        the result text shown by the page. Questions that could not be
        parsed are skipped.
    """
    option_selector = (
        "[class='flex items-start p-2 rounded hover:bg-gray-100 "
        "focus:bg-gray-200 focus-within:bg-gray-100 focus-within:ring "
        "focus-within:ring-blue-300 multiple-choice-label min-w-0 items-baseline']"
    )

    time.sleep(my_time)
    web.get(single_choice_url)
    single_choice_list = web.find_elements(
        By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")

    questions_dict = {}
    success_num = 0
    fail_num = 0
    for single_choice in single_choice_list:
        try:
            question = single_choice.find_element(
                By.CSS_SELECTOR, "[class='pc-x min-w-0 shrink']").text
            option = []
            answer = ""
            for option_label in single_choice.find_elements(By.CSS_SELECTOR, option_selector):
                this_answer = option_label.find_element(
                    By.CSS_SELECTOR, "[class='rendered-markdown']").text
                option.append(this_answer)
                # The selected input marks the submitted answer.
                if option_label.find_element(
                        By.CSS_SELECTOR, "[class='mr-2 mt-1 focus:outline-none']"
                ).is_selected():
                    answer = this_answer
            is_true = single_choice.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']"
            ).find_element(By.CSS_SELECTOR, "[class='pc-text-raw']").text
        except Exception:
            # Best effort: show why it failed, then skip this question.
            traceback.print_exc()
            fail_num += 1
            print(f"当前题目获取失败, 上一题序号: {success_num}-序号, 当前题集:{single_choice_url}, 程序跳过该题继续执行")
            continue

        print(question)
        print(option)
        print(answer)
        print(is_true)
        questions_dict[question] = [option, answer, is_true]
        success_num += 1

    print(f"选择题--题集: {single_choice_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict
def get_fill_or_program_in_the_blanks(fill_or_program_in_the_blanks_url):
    """Scrape fill-in-the-blank (or program fill-in-the-blank) questions.

    Opens the given URL with the module-level ``web`` driver (after sleeping
    ``my_time`` seconds).

    Args:
        fill_or_program_in_the_blanks_url: URL of the fill-in-the-blank or
            program fill-in-the-blank tab of a submitted problem set.

    Returns:
        A dict mapping question text to ``[answers, verdict]`` where
        ``answers`` is the list of values of the question's ``input``
        elements and ``verdict`` is the result text shown by the page.
        Questions that could not be parsed are skipped.
    """
    time.sleep(my_time)
    web.get(fill_or_program_in_the_blanks_url)
    question_blocks = web.find_elements(
        By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")

    questions_dict = {}
    success_num = 0
    fail_num = 0
    for block in question_blocks:
        try:
            # The same element provides both the question text and the inputs.
            markdown = block.find_element(By.CSS_SELECTOR, "[class='rendered-markdown']")
            question = markdown.text
            answer = [
                blank.get_attribute("value")
                for blank in markdown.find_elements(By.CSS_SELECTOR, "input")
            ]
            is_true = block.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']"
            ).find_element(By.CSS_SELECTOR, "[class='pc-text-raw']").text
        except Exception:
            # Best effort: show why it failed, then skip this question.
            traceback.print_exc()
            fail_num += 1
            print(f"当前题目获取失败, 上一题序号: {success_num}-序号, 当前题集:{fill_or_program_in_the_blanks_url}, 程序跳过该题继续执行")
            continue

        print(question)
        print(answer)
        print(is_true)
        questions_dict[question] = [answer, is_true]
        success_num += 1

    print(f"填空/程序填空题--题集: {fill_or_program_in_the_blanks_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict
def get_function_or_programming(function_or_programming_url):
    """Scrape function / programming problems together with the submitted code.

    Opens the problem list with the module-level ``web`` driver, collects the
    link of every row, then visits each problem page (sleeping ``my_time``
    seconds before every request).

    Args:
        function_or_programming_url: URL of the function or programming tab
            of a submitted problem set.

    Returns:
        A dict mapping problem title to ``[problem_content, answer]`` where
        ``answer`` is the value of the code editor's ``textarea``. Problems
        that still fail after three attempts are skipped.
    """
    time.sleep(my_time)
    web.get(function_or_programming_url)

    # Collect every problem link first: navigating away would invalidate the
    # row elements.
    rows = web.find_elements(
        By.XPATH,
        '/html/body/div/div[2]/div[1]/div/div[2]/div[2]/div/div[1]/table//tbody/tr')
    problems_href = [
        row.find_element(By.XPATH, 'td[3]/a').get_attribute('href') for row in rows
    ]

    questions_dict = {}
    success_num = 0
    fail_num = 0
    for problem in problems_href:
        # Up to three attempts per problem, in case requests are throttled.
        for _ in range(3):
            try:
                # Adjust to the network speed; requesting too fast is rejected.
                time.sleep(my_time)
                web.get(problem)
                problem_title = web.find_element(
                    By.CSS_SELECTOR,
                    "[class='text-center text-light text-base font-bold my-4']").text
                answer = web.find_element(
                    By.CSS_SELECTOR, "[class='codeEditor_2kCM6 grow shrink']"
                ).find_element(By.CSS_SELECTOR, 'textarea').get_attribute('value')
                problem_content = web.find_element(
                    By.CSS_SELECTOR, "[class='rendered-markdown']").text
            except Exception:
                continue
            questions_dict[problem_title] = [problem_content, answer]
            print(problem_title)
            print(problem_content)
            print(answer)
            success_num += 1
            break
        else:
            # Runs only when no attempt reached ``break``: the problem failed.
            fail_num += 1

    print(f"函数/编程题--题集: {function_or_programming_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict
def _merge_questions_into_file(this_name, questions_dict):
    """Merge ``questions_dict`` into the JSON object stored in ``this_name``."""
    # Create the target directory if the path has one and it is missing.
    parent = os.path.dirname(this_name)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # A missing or empty file counts as an empty JSON object.
    file_dict = {}
    if os.path.exists(this_name):
        with open(this_name, 'r', encoding="utf-8") as f:
            content = f.read()
        if content.strip():
            file_dict = json.loads(content)

    file_dict.update(questions_dict)
    with open(this_name, mode='w', encoding="utf-8") as f:
        f.write(json.dumps(file_dict, ensure_ascii=False))
    print("-----当前题记长度-----------------------------------------------------" + str(len(questions_dict)))
    print("-----写入文件--总长度-------------------------------------------------" + str(len(file_dict)))


def write_question_file(url_list, judgment_file_name, single_choice_file_name, fill_in_the_blanks_name,
                        program_fill_in_the_blanks_name, function_name, programming_name):
    """Scrape every problem set in ``url_list`` and merge the results into JSON files.

    For each problem set the question-type URLs are resolved with
    ``get_question_type_url``; every type that is present is scraped and
    merged into the JSON file for that type. Existing entries with the same
    question text are overwritten, all other entries are kept.

    Args:
        url_list: Problem-set URLs to scrape.
        judgment_file_name: JSON file for true/false questions.
        single_choice_file_name: JSON file for single-choice questions.
        fill_in_the_blanks_name: JSON file for fill-in-the-blank questions.
        program_fill_in_the_blanks_name: JSON file for program
            fill-in-the-blank questions.
        function_name: JSON file for function problems.
        programming_name: JSON file for programming problems.
    """
    # (key in the question-type dict, scraper, output file), in the order the
    # types are processed.
    plan = (
        ('judgment_url', get_judgment, judgment_file_name),
        ('single_choice_url', get_single_choice, single_choice_file_name),
        ('fill_in_the_blanks_url', get_fill_or_program_in_the_blanks, fill_in_the_blanks_name),
        ('program_fill_in_the_blanks_url', get_fill_or_program_in_the_blanks,
         program_fill_in_the_blanks_name),
        ('function_url', get_function_or_programming, function_name),
        ('programming_url', get_function_or_programming, programming_name),
    )
    for url in url_list:
        this_question_type_dict = get_question_type_url(url)
        for key, scrape, file_name in plan:
            type_url = this_question_type_dict[key]
            # An empty string means the problem set has no such question type.
            if type_url != "":
                _merge_questions_into_file(file_name, scrape(type_url))
if __name__ == '__main__':
    # ``web`` and ``login_url`` are module-level names read by the functions
    # above.
    login_url = 'https://pintia.cn/auth/login'
    # Create the WebDriver object, using the Chrome browser driver.
    web = webdriver.Chrome(service=Service(r'C:\Users\Cat\AppData\Local\Google\Chrome\Application\chromedriver.exe'))
    try:
        web.implicitly_wait(5)
        # Open the login page and log in.
        web.get(login_url)
        login_PTA('zzz@qq.com', 'xxx')

        # Problem-set types (database course):
        #   1: true/false; single choice; multiple choice (!!!); fill-in-the-blank
        #   2: true/false; single choice; fill-in-the-blank
        #   3: true/false; single choice
        #   4: single choice
        #   5: single choice
        #   6: true/false; single choice
        #   7: true/false; single choice; fill-in-the-blank
        #   8: true/false; single choice
        #   9: true/false; single choice
        # sql_url_list_walking = [
        #     'https://pintia.cn/problem-sets/1343789975057166336/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343794588401487872/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343799990153117696/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343806731523719168/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343807501140754432/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343808640402018304/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343811518420176896/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343798231569530880/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343819242272718848/problems/type/1'
        # ]
        #
        # Problem-set types (java course):
        #   1: true/false; single choice
        #   2: true/false; single choice; fill-in-the-blank; function; programming
        #   3: true/false; single choice; fill-in-the-blank; program fill-in-the-blank; function; programming
        #   4: true/false; single choice; fill-in-the-blank; program fill-in-the-blank; function; programming
        #   5: true/false; single choice; fill-in-the-blank; function; programming
        #   6: true/false; single choice; fill-in-the-blank; program fill-in-the-blank; programming
        #   7: true/false; single choice; fill-in-the-blank; programming
        #   8: true/false; single choice; fill-in-the-blank; program fill-in-the-blank; function; programming
        # java_url_list_lgr = [
        #     'https://pintia.cn/problem-sets/1368832382463172608/problems/type/1',
        #     'https://pintia.cn/problem-sets/1368833022220361728/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369164346714021888/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165326734123008/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165872660537344/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166179822002176/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166486127828992/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166803779248128/problems/type/1'
        # ]
        java_url_list_lxf = [
            'https://pintia.cn/problem-sets/1468315811752116224/problems/type/1'
        ]
        write_question_file(java_url_list_lxf, "dataSql\\judgment.json", "dataSql\\single_choice.json",
                            "dataSql\\fill_in_the_blanks.json", "dataSql\\program_fill_in_the_blanks.json",
                            "dataSql\\function.json", "dataSql\\programming.json")
    finally:
        # Always close the browser and stop the driver, even if scraping fails.
        web.quit()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。