当前位置:   article > 正文

python爬虫基础 使用urllib库抓取高德接口边界信息_高德建筑物爬取

高德建筑物爬取

使用的是python3

需要pip install pymysql

#爬取逻辑

1.根据name查询匹配建筑物

2.通过接口返回的建筑物信息获取建筑物ID

3.用建筑物ID查询边界信息

4.数据返回格式是json字符串,直接转成json处理

5.遍历出边界信息保存

6.存储边界信息

  1. #coding=utf-8
  2. import urllib.request
  3. from urllib.parse import quote
  4. import time
  5. import json
  6. import pymysql
  7. import random
  8. import string
  9. COON = pymysql.connect(
  10. host='127.0.0.1',
  11. port=3306,
  12. user='root',
  13. passwd='root',
  14. db='navi_scrapy',
  15. charset='utf8')
  16. pagestart = 1
  17. '''
  18. 1.数据表增加border字段
  19. 2.修改数据库链接
  20. 3.修改对应中的name和city字段
  21. 4.启动run_gaode_border.py
  22. '''
  23. start_url = 'https://restapi.amap.com/v3/place/text?key=4b86820a7590de60e4f81f53e59ae17f&citylimit=true&output=json&' #开始网址
  24. url = "https://ditu.amap.com/detail/get/detail?id="
  25. def hello():
  26. citys = get_data() #查询数据
  27. print(citys)
  28. for city in citys:
  29. tempurl = quote(start_url + "keywords="+str(city[2])+"&city="+str(city[3])+"", safe=string.printable)
  30. request = urllib.request.Request(url=tempurl, headers=get_header(),method='GET')
  31. time.sleep(0.8)
  32. response = urllib.request.urlopen(request)
  33. parse(response,{"id": city[0], "name": city[2]})
  34. def parse(response,meta):
  35. try:
  36. data = json.loads(response.read().decode("utf8"))
  37. print(data)
  38. if data["status"] == "1":
  39. poi = data["pois"][0] # 一般第一个就是查找的
  40. print( poi )
  41. if poi["parent"] != []:
  42. print("查询parent ID~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
  43. id = poi["parent"]
  44. else:
  45. id = poi["id"]
  46. gaode_url = url + str(id)
  47. print(gaode_url)
  48. request = urllib.request.Request(url=gaode_url, headers=get_header(), method='GET')
  49. time.sleep(0.8)
  50. response = urllib.request.urlopen(request)
  51. info(response,{"id": meta["id"], "url": gaode_url})
  52. else:
  53. print("接口返回异常~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·")
  54. except Exception as e:
  55. print("查询失败~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·" + e)
  56. def info(response,meta):
  57. try:
  58. data = json.loads(response.read().decode("utf8"))
  59. print(data)
  60. if data["status"] == "1":
  61. spec = data["data"]["spec"]
  62. border = spec["mining_shape"]["shape"]
  63. print("border :~~~~~~~~~~~~~~~~~~~",border)
  64. update_data((meta["id"]),border)
  65. except Exception as e:
  66. print("查询错误~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + meta["url"] + " error: "+ e)
  67. def get_data():
  68. # 创建游标
  69. cursor = COON.cursor()
  70. try:
  71. sql = "SELECT * FROM new_shopping WHERE border is null"
  72. # 执行SQL,并返回受影响行数,执行多次
  73. cursor.execute(sql)
  74. infoList = cursor.fetchall()
  75. return infoList
  76. except Exception as e:
  77. print(e)
  78. finally:
  79. # 关闭
  80. cursor.close()
  81. def update_data(id,border):
  82. # 创建游标
  83. cursor = COON.cursor()
  84. try:
  85. sql = "update new_shopping set border='" +str(border)+ " 'where id=" + str(id)
  86. # 执行SQL,并返回受影响行数,执行多次
  87. cursor.execute(sql)
  88. except Exception as e:
  89. print(e)
  90. finally:
  91. # 关闭游标
  92. cursor.close()
  93. def get_header():
  94. '''
  95. 随机生成User-Agent
  96. :return:
  97. '''
  98. head_connection = ['Keep-Alive', 'close']
  99. head_accept = ['text/html, application/xhtml+xml, */*',
  100. 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8']
  101. head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
  102. head_user_agent = ['Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
  103. 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
  104. 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
  105. 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
  106. 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
  107. 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
  108. 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
  109. 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
  110. 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
  111. 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
  112. 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
  113. 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
  114. 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
  115. 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
  116. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
  117. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
  118. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
  119. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
  120. 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
  121. 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
  122. 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
  123. 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
  124. ]
  125. result = {
  126. 'Connection': head_connection[0],
  127. 'Accept': head_accept[0],
  128. 'Accept-Language': head_accept_language[1],
  129. 'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
  130. }
  131. return result
  132. hello()

 

本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号