Colly 入门实战

Colly 入门实战

成果展示

先看结果, 这是要爬的页面

这是结果

安装 Colly

1
go get -u github.com/gocolly/colly/...

简单例子

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
func main() {
	// A collector drives the crawl and dispatches the callbacks below.
	collector := colly.NewCollector()

	// Follow every hyperlink discovered in the fetched HTML.
	collector.OnHTML("a[href]", func(el *colly.HTMLElement) {
		el.Request.Visit(el.Attr("href"))
	})

	// Log each request just before it is sent.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL)
	})

	// Kick off the crawl with a GET request to the start page.
	collector.Visit("http://go-colly.org/")
}

全部有哪些回调方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// OnRequest: runs before every request is sent.
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})

// OnError: runs if the request fails or returns an error status.
c.OnError(func(_ *colly.Response, err error) {
log.Println("Something went wrong:", err)
})

// OnResponseHeaders: runs once the response headers have arrived,
// before the body is read.
c.OnResponseHeaders(func(r *colly.Response) {
fmt.Println("Visited", r.Request.URL)
})

// OnResponse: runs after the full response body has been received.
c.OnResponse(func(r *colly.Response) {
fmt.Println("Visited", r.Request.URL)
})

// OnHTML: runs for each element matching the CSS selector,
// here every anchor that has an href attribute.
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})

// OnHTML callbacks of the same collector fire in registration order;
// this one extracts the first cell of each table row.
c.OnHTML("tr td:nth-of-type(1)", func(e *colly.HTMLElement) {
fmt.Println("First column of a table row:", e.Text)
})

// OnXML: runs for each node matching the XPath expression.
c.OnXML("//h1", func(e *colly.XMLElement) {
fmt.Println(e.Text)
})

// OnScraped: runs after all other callbacks have finished for a page.
c.OnScraped(func(r *colly.Response) {
fmt.Println("Finished", r.Request.URL)
})

调用顺序

1.OnRequest
在请求之前调用

2.OnError
如果请求期间发生错误,则调用

3.OnResponseHeaders
在收到响应标头后调用

4.OnResponse
在收到完整响应后调用

5.OnHTML
如果收到的内容是 HTML,则在 OnResponse 之后调用

6.OnXML
如果接收到的内容是 HTML 或 XML,则在 OnHTML 之后调用

7.OnScraped
在 OnXML 回调之后调用

实战

这是地址页 地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
package main

import (
zipUtils "Spider/utils"
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"regexp"
"strconv"
"strings"

"github.com/gocolly/colly"
)

func main() {
	// Output layout: downloaded images go under temp/<title>/<chapter>/,
	// finished .cbz archives under zipDir/<title>/.
	temp := "./out/temp"
	chapter := temp // current chapter directory; updated by the chapter-list callback
	zipDir := "./out/zip"
	bookId := "25797"
	bookHome := "https://www.manhuadb.com/manhua/" + bookId

	// Matches the "共 N 页" (N pages total) marker in the breadcrumb.
	// Compiled once here instead of on every pagination callback.
	pageCountRe := regexp.MustCompile("共 ([0-9]*?) 页")

	// Instantiate default collector
	c := colly.NewCollector()

	// Log every outgoing request.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Attach browser-like headers so the site serves regular pages.
	c.OnRequest(func(rq *colly.Request) {
		rq.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
		rq.Headers.Add("Accept-Encoding", "gzip, deflate")
		rq.Headers.Add("Accept-Language", "zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6")
		rq.Headers.Add("Cache-Control", "no-cache")
		rq.Headers.Add("Connection", "keep-alive")
		rq.Headers.Add("Host", "www.manhuadb.com")
		rq.Headers.Add("Pragma", "no-cache")
		rq.Headers.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36")
	})

	// Save image responses into the current chapter directory.
	c.OnResponse(func(r *colly.Response) {
		if strings.Contains(r.Headers.Get("content-type"), "image") {
			fileName := path.Base(r.Request.URL.String())
			f, err := os.Create(chapter + "/" + fileName)
			if err != nil {
				panic(err)
			}
			// Close the file (original leaked the handle) and surface write errors.
			defer f.Close()
			if _, err := io.Copy(f, bytes.NewReader(r.Body)); err != nil {
				fmt.Println("write image:", err)
			}
		}
	})

	// Book title: derive the per-book output directories from it.
	c.OnHTML("h1[class=\"comic-title\"]", func(e *colly.HTMLElement) {
		fmt.Printf("返回值: %s \n", e.Text)
		text := strings.Replace(e.Text, " ", "", -1) // strip spaces
		// BUG FIX: the original replaced on e.Text again, discarding the
		// space-stripping above; chain on text instead.
		text = strings.Replace(text, "?", "", -1) // strip '?'
		temp = temp + "/" + text
		os.MkdirAll(temp, 0711)
		zipDir = zipDir + "/" + text
		os.MkdirAll(zipDir, 0711)
	})

	// Chapter list: create a directory per chapter and crawl its pages.
	c.OnHTML("li[class=\"sort_div fixed-wd-num\"] > a", func(e *colly.HTMLElement) {
		chapterTitle := e.Attr("title")
		if chapterTitle != "" {
			println("章节==:", chapterTitle)
			url := e.Attr("href")

			chapter = temp + "/" + chapterTitle
			os.MkdirAll(chapter, 0711)

			chapterUrl := "https://www.manhuadb.com" + url
			// Visit errors (e.g. already-visited) are deliberately ignored here.
			c.Visit(chapterUrl)
		}
	})

	// Image list: fetch each page image (handled by the OnResponse above).
	c.OnHTML("div[class=\"text-center pjax-container\"] > img", func(e *colly.HTMLElement) {
		imgUrl := e.Attr("src")
		if imgUrl != "" {
			c.Visit(imgUrl)
		}
	})

	// Pagination: follow the next page while the current page number is
	// below the total extracted from the breadcrumb.
	c.OnHTML("div[class=\"container-fluid comic-detail p-0\"]", func(e *colly.HTMLElement) {
		pageHref := e.ChildAttr("li[class=\"breadcrumb-item active\"] > a", "href")
		currentPage := e.ChildText("li[class=\"breadcrumb-item active\"] > span")

		sumPageSize := pageCountRe.FindStringSubmatch(e.ChildText("li[class=\"breadcrumb-item active\"]"))
		if len(sumPageSize) < 2 {
			return // no "共 N 页" marker on this page
		}

		hrefs := strings.Split(pageHref, ".")
		if len(hrefs) < 2 {
			return // href lacks an extension; cannot build the next-page URL
		}

		// BUG FIX: the original overwrote err from the first Atoi before
		// checking it, and printed errors with println (pointer, not message).
		sumPageSizeNum, err := strconv.Atoi(sumPageSize[1])
		if err != nil {
			fmt.Println(err)
			return
		}
		currentPageNum, err := strconv.Atoi(currentPage)
		if err != nil {
			fmt.Println(err)
			return
		}
		if currentPageNum < sumPageSizeNum {
			p := strconv.Itoa(currentPageNum + 1)
			next := "https://www.manhuadb.com" + hrefs[0] + "_p" + p + "." + hrefs[1]

			c.Visit(next)
		}
	})

	// Start scraping from the book home page.
	if err := c.Visit(bookHome); err != nil {
		fmt.Println("visit:", err)
		return
	}

	// Pack every downloaded chapter directory into a .cbz archive.
	items, err := ioutil.ReadDir(temp)
	if err != nil {
		fmt.Println("read temp dir:", err)
		return
	}
	for _, item := range items {
		if item.IsDir() {
			src := temp + "/" + item.Name()
			dest := zipDir + "/" + item.Name() + ".cbz"
			zipUtils.CompressFile(src, dest)
		}
	}
}

代码仓库地址 https://github.com/fillpit/Spider

作者

坑 飞

发布于

2021-09-06

更新于

2021-09-06

许可协议

评论