当前位置:   article > 正文

python如何爬取链家二手房数据_python链家二手房爬取ppt

python链家二手房爬取ppt

# -*- coding: utf-8 -*-

"""
Created on Sun Dec 3 19:15:30 2023

@Dr. Wen
"""

====================================

第一步:请求网页

====================================

第一步:导入工具包,并设定路径

import requests
from bs4 import BeautifulSoup
import re

import os
os.getcwd()
os.chdir('Your Path')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

第二步:请求网页

def get_html(url):
    """Fetch one Lianjia listing page and return its HTML text.

    Args:
        url: Full listing-page URL, e.g. 'https://gz.lianjia.com/ershoufang/pg1/'.

    Returns:
        The response body as a decoded string.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        requests.exceptions.HTTPError: when the server answers with a 4xx/5xx
            status (e.g. the anti-crawler page), instead of silently returning
            an error page that the parser would misread as empty results.
    """
    # Session cookies copied from a logged-in browser visit; Lianjia rejects
    # cookie-less requests with a captcha page.  NOTE(review): these expire —
    # refresh them from the browser when crawling starts failing.
    cookies = {
        'lianjia_uuid': 'adafbbb9-352e-434f-bee9-88c547ce45cd',
        '_smt_uid': '638ef566.35a25657',
        '_jzqa': '1.2347015318386996000.1670560356.1670560356.1670560356.1',
        '_qzja': '1.1432031133.1670560355864.1670560355864.1670560355864.1670560355864.1670560355864.0.0.0.1.1',
        'UM_distinctid': '188a9401ac02ea-0fecbfd3c3fe3a-26031d51-19fe24-188a9401ac1b83',
        '_ga_TJZVFLS7KV': 'GS1.1.1686467190.1.0.1686467194.0.0.0',
        'CNZZDATA1255633284': '499438434-1686466187-https%253A%252F%252Fwww.lianjia.com%252F%7C1686466187',
        '_ga': 'GA1.2.598951696.1670313344',
        'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%22184e66ef435183-0b0757be768ca4-26021151-1890293-184e66ef436938%22%2C%22%24device_id%22%3A%22184e66ef435183-0b0757be768ca4-26021151-1890293-184e66ef436938%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D',
        'select_city': '440100',
        'lianjia_ssid': '3f62f49f-fc8c-4db1-9460-df293a433478',
        '_gid': 'GA1.2.1739010512.1699960383',
        'Hm_lvt_9152f8221cb6243a53c83b956842be8a': '1698799665,1699960404',
        '_gat': '1',
        '_gat_global': '1',
        '_gat_new_global': '1',
        '_gat_dianpu_agent': '1',
        '_ga_654P0WDKYN': 'GS1.2.1699960383.2.1.1699960462.0.0.0',
        'Hm_lpvt_9152f8221cb6243a53c83b956842be8a': '1699960466',
        'srcid': 'eyJ0Ijoie1wiZGF0YVwiOlwiM2YyNGJhM2UwMjEyN2IwZGRjMjYwM2NhNzU0YmIyNTQ5NGQ3ZGI5NmI3MGVkODE1MjY5OGQ3M2JmZTA1MDNlNmM0NDhlYjU2YmRiZmU0YjI2M2FjODdiMjc0OTIwNTI5YTdiNmU4ZTUyNzZlZjdkZjc3OTM0NjU0NzkwZmNjMDlmMzEzOTYwOWFlNTdhM2Q2MDFmNDQzYTJhZDBmZmJjNzE0NzgyYjczMDA1ZWVjZTRmNzhjNDQ5MzI0YmU2NmNiNzY2M2VlNGZjYTdiMWUxYmVmNTRiNTkxMzEzMzY5NzA4NWJlMWYwYWY5Y2Y5YmMzMGY2NmU5MjgwYWQ1MjIxZVwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIwMDdkMDBjNVwifSIsInIiOiJodHRwczovL2d6LmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvcGcxMDAvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=',
    }

    # Browser-like headers so the request passes Lianjia's UA filtering.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8',
        'Connection': 'keep-alive',
        'Referer': 'https://gz.lianjia.com/ershoufang/pg100/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # timeout prevents the crawler from hanging forever on a stalled
    # connection; raise_for_status surfaces blocked/erroneous responses
    # instead of handing an error page to the HTML parser.
    response = requests.get(url, cookies=cookies, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
'
运行

第三步:#内容提取函数,是你的手术刀

def pagecontent(lis):
    """内容提取函数 — extract listing fields from each <li> and append to CSV.

    Args:
        lis: list of bs4 Tag objects, one per <li> in the sellListContent <ul>.

    Returns:
        None.  Side effect: appends one ';\\t'-separated line per listing to
        'results/链家二手房信息.csv' (the 'results' directory must exist).
    """
    rows = []
    for item in lis:
        info = item.find('div', class_='info clear')
        # Some <li> entries are ads/placeholders without an info div; skip
        # them instead of crashing on attribute access.
        if info is None:
            continue

        # Title text, whitespace stripped out.
        title = info.find('div', class_='title').a.text.replace(' ', '')

        # Normalize the estate-name/location separator to '--' with trimmed parts.
        flood = info.find('div', class_='flood').text
        flood = '--'.join(part.strip() for part in flood.split('-'))

        # The address line is '|'-separated: 7 fields when the build year is
        # present, 6 when it is missing.
        fields = info.find('div', class_='address').text.split('|')
        if len(fields) == 7:
            htype, area, direction, decoration, floor, byear, tower = fields
        elif len(fields) == 6:
            byear = ''
            htype, area, direction, decoration, floor, tower = fields
        else:
            # Unexpected layout: fill placeholders so the row is still written
            # (the original 'pass' left the variables undefined or stale from
            # the previous iteration, causing a NameError or corrupted rows).
            htype = area = direction = decoration = floor = byear = tower = ''

        price_info = info.find('div', class_='priceInfo')
        totalPrice = price_info.div.text.strip()
        unitPrice = price_info.find('div', class_='unitPrice').span.text

        rows.append(title + ';\t' + flood + ';\t' + htype + ';\t' + area
                    + ';\t' + direction + ';\t' + decoration + ';\t' + floor
                    + ';\t' + byear + ';\t' + tower + ';\t' + totalPrice
                    + ';\t' + unitPrice + '\n')

    # Open the output file once per page instead of once per listing.
    with open('results/链家二手房信息.csv', 'a', encoding='utf-8-sig') as f:
        f.writelines(rows)
    return None
      
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
'
运行

第四步:爬取多页数据函数

def ljhouses(pgstart, pgend):
    """Crawl listing pages pgstart .. pgend-1 and persist their contents.

    Args:
        pgstart: first page number (inclusive).
        pgend: last page number (exclusive), per range() semantics.

    Returns:
        None.  Side effect: pagecontent() appends rows to the CSV file.
    """
    for page in range(pgstart, pgend):
        page_url = 'https://gz.lianjia.com/ershoufang/pg{}/'.format(page)
        print(f'第{page}页:', page_url)
        # Fetch the page, parse it, and hand every <li> to the extractor.
        soup = BeautifulSoup(get_html(page_url), 'lxml')
        listing_items = soup.find('ul', class_='sellListContent').findAll('li')
        pagecontent(listing_items)
    return None

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
'
运行

最后:运行代码,提取所需内容,比如,提取前30页数据

# Entry point: crawl listing pages 1 through 30 (range end is exclusive).
if __name__=='__main__':
    ljhouses(1,31)
  • 1
  • 2
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/weixin_40725706/article/detail/794599
推荐阅读
相关标签
  

闽ICP备14008679号