赞
踩
"""
Created on Sun Dec 3 19:15:30 2023
@Dr. Wen
"""
第一步:导入工具包,并设定路径
import requests
from bs4 import BeautifulSoup
import re
import os
# Query the current working directory; the return value is discarded, so this
# only has a visible effect when run interactively.
os.getcwd()
# 'Your Path' looks like a placeholder -- replace it with the real project
# directory before running. Relative paths used later in the script
# (e.g. 'results/...') are resolved against this directory.
os.chdir('Your Path')
第二步:请求网页
def get_html(url, timeout=10):
    """Download ``url`` with browser-like cookies/headers and return its HTML.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The response body decoded as text.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status, so the body is an error page rather than listing HTML.
        requests.exceptions.RequestException: Any other network failure,
            including a timeout.
    """
    # NOTE(review): these values look like a captured browser session; they
    # presumably expire and need refreshing from time to time -- confirm.
    cookies = {
        'lianjia_uuid': 'adafbbb9-352e-434f-bee9-88c547ce45cd',
        '_smt_uid': '638ef566.35a25657',
        '_jzqa': '1.2347015318386996000.1670560356.1670560356.1670560356.1',
        '_qzja': '1.1432031133.1670560355864.1670560355864.1670560355864.1670560355864.1670560355864.0.0.0.1.1',
        'UM_distinctid': '188a9401ac02ea-0fecbfd3c3fe3a-26031d51-19fe24-188a9401ac1b83',
        '_ga_TJZVFLS7KV': 'GS1.1.1686467190.1.0.1686467194.0.0.0',
        'CNZZDATA1255633284': '499438434-1686466187-https%253A%252F%252Fwww.lianjia.com%252F%7C1686466187',
        '_ga': 'GA1.2.598951696.1670313344',
        'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%22184e66ef435183-0b0757be768ca4-26021151-1890293-184e66ef436938%22%2C%22%24device_id%22%3A%22184e66ef435183-0b0757be768ca4-26021151-1890293-184e66ef436938%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D',
        'select_city': '440100',
        'lianjia_ssid': '3f62f49f-fc8c-4db1-9460-df293a433478',
        '_gid': 'GA1.2.1739010512.1699960383',
        'Hm_lvt_9152f8221cb6243a53c83b956842be8a': '1698799665,1699960404',
        '_gat': '1',
        '_gat_global': '1',
        '_gat_new_global': '1',
        '_gat_dianpu_agent': '1',
        '_ga_654P0WDKYN': 'GS1.2.1699960383.2.1.1699960462.0.0.0',
        'Hm_lpvt_9152f8221cb6243a53c83b956842be8a': '1699960466',
        'srcid': 'eyJ0Ijoie1wiZGF0YVwiOlwiM2YyNGJhM2UwMjEyN2IwZGRjMjYwM2NhNzU0YmIyNTQ5NGQ3ZGI5NmI3MGVkODE1MjY5OGQ3M2JmZTA1MDNlNmM0NDhlYjU2YmRiZmU0YjI2M2FjODdiMjc0OTIwNTI5YTdiNmU4ZTUyNzZlZjdkZjc3OTM0NjU0NzkwZmNjMDlmMzEzOTYwOWFlNTdhM2Q2MDFmNDQzYTJhZDBmZmJjNzE0NzgyYjczMDA1ZWVjZTRmNzhjNDQ5MzI0YmU2NmNiNzY2M2VlNGZjYTdiMWUxYmVmNTRiNTkxMzEzMzY5NzA4NWJlMWYwYWY5Y2Y5YmMzMGY2NmU5MjgwYWQ1MjIxZVwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIwMDdkMDBjNVwifSIsInIiOiJodHRwczovL2d6LmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvcGcxMDAvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=',
    }
    # Headers copied from a desktop Chrome 119 request so the request looks
    # like ordinary browser navigation.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8',
        'Connection': 'keep-alive',
        'Referer': 'https://gz.lianjia.com/ershoufang/pg100/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    response = requests.get(
        url, cookies=cookies, headers=headers, timeout=timeout
    )
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text
第三步:内容提取函数,是你的手术刀
def _listing_row(info):
    """Build one output line from a listing's info container, or None if malformed."""
    title = info.find('div', class_='title').a.text.replace(" ", '')
    flood = info.find('div', class_='flood').text
    flood = '--'.join(part.strip() for part in flood.split('-'))

    fields = info.find('div', class_='address').text.split('|')
    if len(fields) == 7:
        htype, area, direction, decoration, floor, byear, tower = fields
    elif len(fields) == 6:
        # Six fields: the build-year column is absent, so leave it empty.
        htype, area, direction, decoration, floor, tower = fields
        byear = ''
    else:
        # Unknown layout: returning None avoids reusing values left over from
        # a previous listing (or hitting unbound names on the first one).
        return None

    price_info = info.find('div', class_='priceInfo')
    total_price = price_info.div.text.strip()
    unit_price = price_info.find('div', class_='unitPrice').span.text

    columns = [
        title, flood, htype, area, direction, decoration,
        floor, byear, tower, total_price, unit_price,
    ]
    return ';\t'.join(columns) + '\n'


def pagecontent(lis):
    """Extract the fields of every listing in ``lis`` and append them to the CSV.

    Args:
        lis: Sequence of parsed ``<li>`` elements (objects offering
            BeautifulSoup's ``find`` API), one per listing.

    Returns:
        None. Rows are appended to ``results/链家二手房信息.csv`` as
        ``;\\t``-separated lines. Elements without an ``info clear`` container
        or with an unexpected number of ``|``-separated address fields are
        skipped. Nothing is written when no row was produced.
    """
    rows = []
    for item in lis:
        info = item.find('div', class_='info clear')
        if info is None:
            # No info container on this element: nothing to extract.
            continue
        row = _listing_row(info)
        if row is not None:
            rows.append(row)

    if not rows:
        return None

    # Make sure the output directory exists, then open the file once per call
    # rather than once per row.
    os.makedirs('results', exist_ok=True)
    with open('results/链家二手房信息.csv', 'a', encoding='utf-8-sig') as f:
        f.writelines(rows)
    return None
第四步:爬取多页数据函数
def ljhouses(pgstart, pgend):
    """Scrape a range of second-hand listing pages and save their rows.

    Args:
        pgstart: First page number to fetch (inclusive).
        pgend: Page number to stop at (exclusive), as with ``range``.

    Returns:
        None. Each page's ``<li>`` elements are handed to ``pagecontent``.
    """
    for page in range(pgstart, pgend):
        url = 'https://gz.lianjia.com/ershoufang/pg{}/'.format(page)
        print(f'第{page}页:', url)
        # Fetch the page and parse it with the lxml parser.
        soup = BeautifulSoup(get_html(url), 'lxml')
        listing = soup.find('ul', class_='sellListContent')
        if listing is None:
            # The page has no listing container; skip it instead of crashing
            # on None.
            print(f'第{page}页: 未找到房源列表,已跳过')
            continue
        pagecontent(listing.find_all('li'))
    return None
最后:运行代码,提取所需内容,比如,提取前30页数据
if __name__ == '__main__':
    # Script entry point: scrape pages 1 through 30 (the stop value is exclusive).
    first_page, stop_page = 1, 31
    ljhouses(first_page, stop_page)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。