前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >golang源码分析:chromedp

golang源码分析:chromedp

作者头像
golangLeetcode
发布2023-08-09 15:18:00
4250
发布2023-08-09 15:18:00
举报

chromedp 是一个用 Go 编写的浏览器驱动库,它基于 Chrome DevTools Protocol(CDP)来控制浏览器,项目地址:https://github.com/chromedp/chromedp 。随着前端 SPA 应用的普及,传统爬虫很难抓取到由 JavaScript 动态渲染的内容;CDP 提供了一套完整的浏览器控制接口,使我们可以在与真实浏览器相同的环境中模拟请求,从而抓取动态生成的网页。那么 CDP 本质上是什么呢?它本质上是一种基于 WebSocket 的协议。

下面我们通过两个例子看下如何使用chromedp:

代码语言:go
复制
// Command click is a chromedp example demonstrating how to use a selector to
// click on an element.
package main

import (
  "context"
  "log"
  "time"

  "github.com/chromedp/chromedp"
)

// main drives a browser through chromedp: it opens the documentation page of
// the time package, clicks the "After" example and logs the example source
// read from the example's textarea. The whole run is limited to 15 seconds.
func main() {
  // create chrome instance
  ctx, cancel := chromedp.NewContext(
    context.Background(),
    // chromedp.WithDebugf(log.Printf),
  )
  defer cancel()

  // create a timeout; it gets its own cancel name so that both cancel
  // functions stay reachable on the error path below
  ctx, cancelTimeout := context.WithTimeout(ctx, 15*time.Second)
  defer cancelTimeout()

  // navigate to a page, wait for an element, click
  var example string
  err := chromedp.Run(ctx,
    chromedp.Navigate(`https://pkg.go.dev/time`),
    // wait until the footer element is visible (i.e., the page has loaded)
    chromedp.WaitVisible(`body > footer`),
    // find and click "Example" link
    chromedp.Click(`#example-After`, chromedp.NodeVisible),
    // retrieve the text of the textarea
    chromedp.Value(`#example-After textarea`, &example),
  )
  if err != nil {
    // log.Fatal exits through os.Exit, which skips deferred calls, so the
    // contexts are cancelled explicitly before exiting.
    cancelTimeout()
    cancel()
    log.Fatal(err)
  }
  log.Printf("Go's time.After example:\n%s", example)
}

返回结果如下:

代码语言:text
复制
2023/07/02 23:29:57 Go's time.After example:
package main

import (
        "fmt"
        "time"
)

var c chan int

func handle(int) {}

func main() {
        select {
        case m := <-c:
                handle(m)
        case <-time.After(10 * time.Second):
                fmt.Println("timed out")
        }
}

可以看到,chromedp 完整地模拟了浏览器中的点击操作,并返回了我们想要的内容。当然,它也可以用来设置 cookie,示例如下:

代码语言:go
复制
// Command cookie is a chromedp example demonstrating how to set an HTTP cookie
// on requests.
package main

import (
  "context"
  "encoding/json"
  "flag"
  "fmt"
  "log"
  "net/http"
  "time"

  "github.com/chromedp/cdproto/cdp"
  "github.com/chromedp/cdproto/network"
  "github.com/chromedp/cdproto/storage"
  "github.com/chromedp/chromedp"
)

// main starts the local cookie server on the port given by -port (default
// 8544), then uses chromedp to set two cookies, load the server's page and
// log the cookies that the page reports back.
func main() {
  port := flag.Int("port", 8544, "port")
  flag.Parse()

  // parent context for the browser; it is cancelled when the cookie server
  // stops, so that the chromedp run below does not keep waiting for a page
  // that will never be served
  base, stop := context.WithCancel(context.Background())
  defer stop()

  // start cookie server and report its error instead of dropping it
  go func() {
    if err := cookieServer(fmt.Sprintf(":%d", *port)); err != nil {
      log.Printf("cookie server: %v", err)
      stop()
    }
  }()

  // create context
  ctx, cancel := chromedp.NewContext(base)
  defer cancel()

  // run task list
  var res string
  err := chromedp.Run(ctx, setcookies(
    fmt.Sprintf("http://localhost:%d", *port), &res,
    "cookie1", "value1",
    "cookie2", "value2",
  ))
  if err != nil {
    // log.Fatal exits through os.Exit, which skips deferred calls, so the
    // contexts are cancelled explicitly before exiting.
    cancel()
    stop()
    log.Fatal(err)
  }

  log.Printf("chrome received cookies: %s", res)
}

// cookieServer runs an HTTP server on addr whose only page logs every cookie
// sent with the request and writes the cookies back as indented JSON embedded
// in the indexHTML template. It blocks while serving and returns the error
// reported by ListenAndServe.
func cookieServer(addr string) error {
  mux := http.NewServeMux()
  mux.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
    // parse the Cookie header once; the same slice is logged and encoded
    cookies := req.Cookies()
    for i, cookie := range cookies {
      log.Printf("from %s, server received cookie %d: %v", req.RemoteAddr, i, cookie)
    }
    buf, err := json.MarshalIndent(cookies, "", "  ")
    if err != nil {
      http.Error(res, err.Error(), http.StatusInternalServerError)
      return
    }
    fmt.Fprintf(res, indexHTML, string(buf))
  })

  // an explicit http.Server is used so that timeouts can be set; the
  // package-level http.ListenAndServe offers no way to configure them
  srv := &http.Server{
    Addr:              addr,
    Handler:           mux,
    ReadHeaderTimeout: 10 * time.Second,
    ReadTimeout:       10 * time.Second,
    WriteTimeout:      10 * time.Second,
  }
  return srv.ListenAndServe()
}

// setcookies builds the task list that stores the given cookies in the
// browser, navigates to host, copies the text of the #result element into res
// and finally logs every cookie reported by storage.GetCookies.
//
// cookies is a flat list of name/value pairs; the function panics when its
// length is odd.
func setcookies(host string, res *string, cookies ...string) chromedp.Tasks {
  if len(cookies)%2 != 0 {
    panic("length of cookies must be divisible by 2")
  }

  // install adds each name/value pair as an HTTP-only cookie for the
  // domain "localhost"
  install := chromedp.ActionFunc(func(ctx context.Context) error {
    // every cookie shares one expiry, 180 days after the action runs
    expires := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
    for n := 0; n < len(cookies); n += 2 {
      name, value := cookies[n], cookies[n+1]
      request := network.SetCookie(name, value).
        WithExpires(&expires).
        WithDomain("localhost").
        WithHTTPOnly(true)
      if err := request.Do(ctx); err != nil {
        return err
      }
    }
    return nil
  })

  // dump logs the cookies returned by storage.GetCookies
  dump := chromedp.ActionFunc(func(ctx context.Context) error {
    stored, err := storage.GetCookies().Do(ctx)
    if err != nil {
      return err
    }
    for n := range stored {
      log.Printf("chrome cookie %d: %+v", n, stored[n])
    }
    return nil
  })

  return chromedp.Tasks{
    install,
    // load the page
    chromedp.Navigate(host),
    // capture what the page displays in #result
    chromedp.Text(`#result`, res, chromedp.ByID, chromedp.NodeVisible),
    dump,
  }
}

// indexHTML is the page template written by the cookie server's handler; its
// single %s verb is filled with the text shown inside the #result element.
const indexHTML = `<!doctype html>
<html>
<body>
  <div id="result">%s</div>
</body>
</html>`
本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2023-07-02,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 golang算法架构leetcode技术php 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档