For work I needed this, so I studied an open-source project by Liu Huanyong of the Institute of Software, Chinese Academy of Sciences: QASystemOnMedicalKG, a question-answering system for the medical domain built on a knowledge graph.
Original project: https://github.com/liuhuanyong/QASystemOnMedicalKG
I set up the environment by hand and have worked through the crawler part so far. This post records that process; comments and suggestions are welcome.
For installing MongoDB and creating a user, see the referenced blog post on MongoDB installation and user creation.
Download and configure MongoDB as described there and start the service; opening the service URL in a browser (http://localhost:27017 by default) should show the following message, which means the service started successfully:
It looks like you are trying to access MongoDB over HTTP on the native driver port.
To make it easier to verify that the database was created, here are a few common MongoDB shell commands:
show dbs — list the existing databases
use db_name — switch to db_name; if it does not exist it is created (the database actually appears once data is first inserted)
db.dropDatabase() — drop the current database
db.jc.find() — list the documents in the jc collection of the current database
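The same checks can also be run from Python. A minimal sketch with pymongo, assuming MongoDB is running on the default localhost:27017 and that the medical database and jc collection from the crawler below already exist:

import pymongo

# Connect to the local MongoDB instance (default host and port).
conn = pymongo.MongoClient('localhost', 27017)

# Equivalent of `show dbs`: list the databases that currently exist.
print(conn.list_database_names())

# Equivalent of `use medical` followed by `db.jc.find()`; databases and
# collections are created lazily, on the first insert.
db = conn['medical']
for doc in db['jc'].find().limit(5):
    print(doc)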
In the symptom-parsing function, the original code extracted person names from some pages, so I modified it; see symptom_spider in the code below. After running the script, the database is created and the crawled data is stored in it.
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from lxml import etree
import pymongo

# Create the database connection. Once the MongoDB service is running,
# the database and collection are created automatically on first insert.
conn = pymongo.MongoClient()
db = conn['medical']
col = db['data']


def get_html(url):
    '''Fetch a page and decode it (the site is GBK-encoded).'''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'}
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('gbk')
    return html


def spider_main():
    for page in range(1, 11000):
        try:
            basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm' % page     # overview
            cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm' % page      # cause
            prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm' % page  # prevention
            symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm' % page  # symptoms
            inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm' % page  # examinations
            treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm' % page      # treatment
            food_url = 'http://jib.xywy.com/il_sii/food/%s.htm' % page        # diet
            drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm' % page        # well-rated drugs
            data = {}
            data['url'] = basic_url
            data['basic_info'] = basicinfo_spider(basic_url)
            data['cause_info'] = common_spider(cause_url)
            data['prevent_info'] = common_spider(prevent_url)
            # The key must be 'symptom_info': build_data.py reads item['symptom_info'].
            data['symptom_info'] = symptom_spider(symptom_url)
            data['inspect_info'] = inspect_spider(inspect_url)
            data['treat_info'] = treat_spider(treat_url)
            data['food_info'] = food_spider(food_url)
            data['drug_info'] = drug_spider(drug_url)
            print(page, basic_url)
            col.insert_one(data)  # insert_one replaces the deprecated insert()
        except Exception as e:
            print('error', e)
    return


def basicinfo_spider(url):
    '''Parse the basic-information page.'''
    html = get_html(url)
    selector = etree.HTML(html)
    title = selector.xpath('//title/text()')[0]
    category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
    desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')
    ps = selector.xpath('//div[@class="mt20 articl-know"]/p')
    infobox = []
    for p in ps:
        info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
        infobox.append(info)
    basic_data = {}
    basic_data['category'] = category
    basic_data['name'] = title.split('的简介')[0]
    basic_data['desc'] = desc
    basic_data['attributes'] = infobox
    return basic_data


def treat_spider(url):
    '''Parse the treatment page (renamed from the original treat_sipder typo).'''
    html = get_html(url)
    selector = etree.HTML(html)
    ps = selector.xpath('//div[starts-with(@class, "mt20 articl-know")]/p')
    infobox = []
    for p in ps:
        info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
        infobox.append(info)
    return infobox


def drug_spider(url):
    '''Parse the recommended-drug page.'''
    html = get_html(url)
    selector = etree.HTML(html)
    drugs = [i.replace('\n', '').replace('\t', '').replace(' ', '')
             for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
    return drugs


def food_spider(url):
    '''Parse the diet page (suitable food, food to avoid, recommended recipes).'''
    html = get_html(url)
    selector = etree.HTML(html)
    divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
    try:
        food_data = {}
        food_data['good'] = divs[0].xpath('./div/p/text()')
        food_data['bad'] = divs[1].xpath('./div/p/text()')
        food_data['recommand'] = divs[2].xpath('./div/p/text()')
    except IndexError:
        return {}
    return food_data


def symptom_spider(url):
    '''Parse the symptom page. The original XPath sometimes returned person
    names, so the expression below was modified.'''
    html = get_html(url)
    selector = etree.HTML(html)
    symptoms = selector.xpath('//span[@class="db f12 lh240 mb15 "]/a/text()')
    ps = selector.xpath('//p')
    detail = []
    for p in ps:
        info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
        detail.append(info)
    return symptoms, detail


def inspect_spider(url):
    '''Collect the links of examination items; some pages have them, some do not.'''
    html = get_html(url)
    selector = etree.HTML(html)
    inspects = selector.xpath('//li[@class="check-item"]/a/@href')
    return inspects


def common_spider(url):
    '''Generic parser used for the cause and prevention pages.'''
    html = get_html(url)
    selector = etree.HTML(html)
    ps = selector.xpath('//p')
    infobox = []
    for p in ps:
        info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
        if info:
            infobox.append(info)
    return '\n'.join(infobox)


def inspect_crawl():
    '''Crawl the examination-item pages and store the raw HTML.'''
    for page in range(1, 3685):
        try:
            url = 'http://jck.xywy.com/jc_%s.html' % page
            html = get_html(url)
            data = {}
            data['url'] = url
            data['html'] = html
            db['jc'].insert_one(data)
        except Exception as e:
            print(e)


spider_main()
inspect_crawl()
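With the raw pages stored in MongoDB, the project's build_data.py then cleans the documents in the data collection, resolves the examination links against the jc collection, and writes the structured records into a medical collection: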
#!/usr/bin/env python3
# coding: utf-8
# File: build_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3
import pymongo
from lxml import etree
import os
from max_cut import *


class MedicalGraph:
    def __init__(self):
        self.conn = pymongo.MongoClient()
        # Must be the same database the crawler wrote to ('medical' above);
        # the original post used 'test' here, which does not match the crawler.
        self.db = self.conn['medical']
        self.col = self.db['data']
        first_words = [i.strip() for i in open('../dict/disease.txt', encoding='utf-8')]
        alphabets = list('abcdefghijklmnopqrstuvwxyz')
        nums = list('1234567890')
        self.stop_words = first_words + alphabets + nums
        # Map Chinese attribute names to English field names.
        self.key_dict = {
            '医保疾病': 'yibao_status',
            '患病比例': 'get_prob',
            '易感人群': 'easy_get',
            '传染方式': 'get_way',
            '就诊科室': 'cure_department',
            '治疗方式': 'cure_way',
            '治疗周期': 'cure_lasttime',
            '治愈率': 'cured_prob',
            '药品明细': 'drug_detail',
            '药品推荐': 'recommand_drug',
            '推荐': 'recommand_eat',
            '忌食': 'not_eat',
            '宜食': 'do_eat',
            '症状': 'symptom',
            '检查': 'check',
            '成因': 'cause',
            '预防措施': 'prevent',
            '所属类别': 'category',
            '简介': 'desc',
            '名称': 'name',
            '常用药品': 'common_drug',
            '治疗费用': 'cost_money',
            '并发症': 'acompany'
        }
        self.cuter = CutWords()

    def collect_medical(self):
        cates = []
        inspects = []
        count = 0
        for item in self.col.find():
            data = {}
            basic_info = item['basic_info']
            name = basic_info['name']
            if not name:
                continue
            # Basic information.
            data['名称'] = name
            data['简介'] = '\n'.join(basic_info['desc']).replace('\r\n\t', '').replace('\r\n\n\n', '').replace(' ', '').replace('\r\n', '\n')
            category = basic_info['category']
            data['所属类别'] = category
            cates += category
            inspect = item['inspect_info']
            inspects += inspect
            attributes = basic_info['attributes']
            # Cause and prevention.
            data['预防措施'] = item['prevent_info']
            data['成因'] = item['cause_info']
            # Symptoms (drop entries whose first character is a stop word).
            data['症状'] = list(set([i for i in item['symptom_info'][0] if i and i[0] not in self.stop_words]))
            for attr in attributes:
                attr_pair = attr.split(':')
                if len(attr_pair) == 2:
                    key = attr_pair[0]
                    value = attr_pair[1]
                    data[key] = value
            # Examinations: resolve each link against the jc collection.
            inspects = item['inspect_info']
            jcs = []
            for inspect in inspects:
                jc_name = self.get_inspect(inspect)
                if jc_name:
                    jcs.append(jc_name)
            data['检查'] = jcs
            # Food.
            food_info = item['food_info']
            if food_info:
                data['宜食'] = food_info['good']
                data['忌食'] = food_info['bad']
                data['推荐'] = food_info['recommand']
            # Drugs.
            drug_info = item['drug_info']
            data['药品推荐'] = list(set([i.split('(')[-1].replace(')', '') for i in drug_info]))
            data['药品明细'] = drug_info
            data_modify = {}
            for attr, value in data.items():
                attr_en = self.key_dict.get(attr)
                if attr_en:
                    data_modify[attr_en] = value
                    if attr_en in ['yibao_status', 'get_prob', 'easy_get', 'get_way', 'cure_lasttime', 'cured_prob']:
                        data_modify[attr_en] = value.replace(' ', '').replace('\t', '')
                    elif attr_en in ['cure_department', 'cure_way', 'common_drug']:
                        data_modify[attr_en] = [i for i in value.split(' ') if i]
                    elif attr_en in ['acompany']:
                        acompany = [i for i in self.cuter.max_biward_cut(data_modify[attr_en]) if len(i) > 1]
                        data_modify[attr_en] = acompany
            try:
                self.db['medical'].insert_one(data_modify)
                count += 1
                print(count)
            except Exception as e:
                print(e)
        return

    def get_inspect(self, url):
        res = self.db['jc'].find_one({'url': url})
        if not res:
            return ''
        else:
            return res['name']

    def modify_jc(self):
        '''Parse the stored examination HTML and add name/desc fields.'''
        for item in self.db['jc'].find():
            url = item['url']
            content = item['html']
            selector = etree.HTML(content)
            name = selector.xpath('//title/text()')[0].split('结果分析')[0]
            desc = selector.xpath('//meta[@name="description"]/@content')[0].replace('\r\n\t', '')
            self.db['jc'].update_one({'url': url}, {'$set': {'name': name, 'desc': desc}})


if __name__ == '__main__':
    handler = MedicalGraph()
    handler.modify_jc()
    handler.collect_medical()
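Note that max_cut.py (which provides the CutWords class whose max_biward_cut method splits the complication strings) and dict/disease.txt both come from the original repository, so the relative import and the ../dict/disease.txt path must match your local checkout layout.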
Once this finishes, the data can be exported with the following command:
mongoexport -d medical -c medical -o medical.json
where -d medical names the database (the one build_data.py wrote to above),
-c medical names the collection to export,
-o medical.json names the output file; a directory can be prepended to the filename.
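By default mongoexport writes one JSON document per line, so the exported file can be read back line by line. A minimal sketch (medical.json is the file produced above; name and category are fields written by build_data.py):

import json

# Each line of the mongoexport output is a standalone JSON document.
with open('medical.json', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record.get('name'), record.get('category'))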