# Collect the landing-page URL path for every district.
def get_region_url(self):
    """Scrape the base page and map district name -> district URL path.

    Returns a dict like ``{'东城': '/ershoufang/dongcheng/'}``; empty
    when no district links are found on the page.
    """
    html = self.get_content(self.base_url)
    link_re = re.compile('<a href="(/ershoufang/\w+/)" title=".*?">(.*?)</a>',re.S)
    # findall yields (path, name) tuples; invert them into name -> path.
    return {name: path for path, name in re.findall(link_re, html)}
# Build the complete list of result-page URLs for one district.
def get_region_url_list(self, region_url):
    """Return every result-page URL for *region_url*, first page included.

    Page 1 lives at *region_url* itself; pages 2..N live at
    ``<region_url>pgN/``.

    Bug fixes: the first page used to be silently skipped (the list
    started at pg2), and a ``None`` page count (total-page pattern not
    found) raised ``TypeError`` in ``range``.

    Args:
        region_url: district base URL ending with '/'.
    Returns:
        list[str]: page URLs, always at least ``[region_url]``.
    """
    page_num = self.get_page_num(region_url)
    if not page_num:
        # None (pattern not found) or 0 pages: only the base page.
        return [region_url]
    return [region_url] + [region_url + 'pg' + str(i) + '/' for i in range(2, page_num + 1)]
# Total number of result pages for a listing URL.
def get_page_num(self, url):
    """Return the total page count parsed from the page's embedded JSON
    blob (``{"totalPage":N,"curPage":1}``), or ``None`` when absent."""
    content = self.get_content(url)
    pattern = re.compile('{"totalPage":(\d+),"curPage":1}',re.S)
    result = re.search(pattern, content)
    if result:
        return int(result.group(1).strip())
    return None


# Extract and persist the price record of every house on one page.
def get_house_info(self, url, region):
    """Parse one listing page and append one '|'-joined record per house
    to ``self.out_put_file``.

    Record layout: region|name|rooms|area|direction|other|price|unit.
    Returns ``None`` in all cases.
    """
    content = self.get_content(url)
    pattern = re.compile('<a href=".*?" target="_blank" data-log_index="\d+" data-el="region">' + '(.*?)</a>(.*?)</div>.*?</a></div>.*?</div>' + '.*?<div class="totalPrice"><span>(\d+)(\S+)</div>',re.S)
    result = re.findall(pattern, content)
    if not result:
        return None
    for x in result:
        l = x[1].split('|')
        # Bug fix: malformed descriptions can have fewer than 5 fields;
        # skip them instead of raising IndexError.
        if len(l) < 5:
            continue
        rooms, area, direct, other = l[1], l[2], l[3], l[4]
        s_str = '|'.join([region, x[0], rooms, area, direct, other, x[2], x[3]])
        self.writeStr2File(self.out_put_file, s_str)


# Crawl every district and every result page of Lianjia Beijing.
def start_scrapy(self):
    """Entry point: iterate districts, then their pages, writing each
    house record to the output file."""
    d_region_url = self.get_region_url()
    for region in d_region_url:
        region_init_url = 'http://bj.lianjia.com' + d_region_url[region]
        l_region_url = self.get_region_url_list(region_init_url)
        for url in l_region_url:
            time.sleep(1)  # be polite: throttle one request per second
            self.get_house_info(url.strip(), region)


# Append one line to the output file, creating parent dirs as needed.
def writeStr2File(self, out_put_file, str1, append='a'):
    """Append ``str1`` (stripped, newline-terminated) to *out_put_file*.

    Args:
        out_put_file: destination path; missing parent directories are
            created first.
        str1: text to write.
        append: open mode, default ``'a'`` (append).
    """
    # Bug fix: derive the directory from the *parameter*, not from
    # self.out_put_file, and use os.path.dirname so a bare filename
    # (no '/') no longer produces a mangled path via rfind()[:-1].
    sub_path = os.path.dirname(out_put_file)
    if sub_path and not os.path.exists(sub_path):
        os.makedirs(sub_path)
    with open(out_put_file, append) as f:
        f.write(str1.strip() + '\n')


# Guarded so importing this module no longer triggers a full crawl.
if __name__ == '__main__':
    url = 'http://bj.lianjia.com/ershoufang/'
    home = HomeLink(url)
    home.start_scrapy()