当前位置:   article > 正文

go实践十 爬虫抓取网页数据_htmlquery.findone 获取 text

htmlquery.findone 获取 text

需要用到的第三方包有:

go get github.com/antchfx/htmlquery

编辑一个 testspider.go 文件,内容如下

使用 go run testspider.go 运行该文件即可

  1. package main
  2. import (
  3. "log"
  4. "time"
  5. "strings"
  6. "strconv"
  7. "net/http"
  8. "golang.org/x/net/html"
  9. //htmlquery 包
  10. "github.com/antchfx/htmlquery"
  11. )
  12. //测试爬虫
  13. func main() {
  14. htmlquerypath()
  15. }
  16. //使用htmlquery 包
  17. func htmlquerypath(){
  18. start := time.Now()
  19. ch := make(chan bool)
  20. for i := 0; i < 10; i++ {
  21. go htmlparseUrls("https://movie.douban.com/top250?start="+strconv.Itoa(25*i), ch)
  22. }
  23. for i := 0; i < 10; i++ {
  24. <-ch
  25. }
  26. elapsed := time.Since(start)
  27. log.Printf("Took %s", elapsed)
  28. }
  29. func htmlfetch(url string) *html.Node {
  30. log.Println("Fetch Url", url)
  31. client := &http.Client{}
  32. req, _ := http.NewRequest("GET", url, nil)
  33. req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
  34. resp, err := client.Do(req)
  35. if err != nil {
  36. log.Fatal("Http get err:", err)
  37. }
  38. if resp.StatusCode != 200 {
  39. log.Fatal("Http status code:", resp.StatusCode)
  40. }
  41. defer resp.Body.Close()
  42. doc, err := htmlquery.Parse(resp.Body)
  43. if err != nil {
  44. log.Fatal(err)
  45. }
  46. return doc
  47. }
  48. func htmlparseUrls(url string, ch chan bool) {
  49. doc := htmlfetch(url)
  50. pic := htmlquery.Find(doc, `//ol[@class="grid_view"]/li//div[@class="pic"]`)
  51. nodes := htmlquery.Find(doc, `//ol[@class="grid_view"]/li//div[@class="hd"]`)
  52. for key, node := range nodes {
  53. num := htmlquery.FindOne(pic[key], `./em[@class=""]/text()`)
  54. url := htmlquery.FindOne(node, "./a/@href")
  55. title := htmlquery.FindOne(node, `.//span[@class="title"]/text()`)
  56. log.Println(htmlquery.InnerText(num),
  57. strings.Split(htmlquery.InnerText(url), "/")[4],
  58. htmlquery.InnerText(title))
  59. }
  60. time.Sleep(2 * time.Second)
  61. ch <- true
  62. }

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/菜鸟追梦旅行/article/detail/607778
推荐阅读
相关标签
  

闽ICP备14008679号