Colly 入门实战

Colly 入门实战

成果展示

先看结果, 这是要爬的页面

这是结果

安装 Colly

1
go get -u github.com/gocolly/colly/...

简单例子

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
func main() {
	// A collector drives the crawl and dispatches the callbacks below.
	collector := colly.NewCollector()

	// Follow every hyperlink discovered in the fetched HTML.
	collector.OnHTML("a[href]", func(el *colly.HTMLElement) {
		el.Request.Visit(el.Attr("href"))
	})

	// Log each request just before it is sent.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL)
	})

	// Kick off the crawl with a GET request to the start page.
	collector.Visit("http://go-colly.org/")
}

全部有哪些回调方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// OnRequest: runs before every request is sent.
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})

// OnError: runs if the request fails or returns an error status.
c.OnError(func(_ *colly.Response, err error) {
log.Println("Something went wrong:", err)
})

// OnResponseHeaders: runs once the response headers have arrived,
// before the body is read.
c.OnResponseHeaders(func(r *colly.Response) {
fmt.Println("Visited", r.Request.URL)
})

// OnResponse: runs after the full response body has been received.
c.OnResponse(func(r *colly.Response) {
fmt.Println("Visited", r.Request.URL)
})

// OnHTML: runs for each element matching the CSS selector,
// here every anchor that has an href attribute.
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})

// OnHTML callbacks of the same collector fire in registration order;
// this one extracts the first cell of each table row.
c.OnHTML("tr td:nth-of-type(1)", func(e *colly.HTMLElement) {
fmt.Println("First column of a table row:", e.Text)
})

// OnXML: runs for each node matching the XPath expression.
c.OnXML("//h1", func(e *colly.XMLElement) {
fmt.Println(e.Text)
})

// OnScraped: runs after all other callbacks have finished for a page.
c.OnScraped(func(r *colly.Response) {
fmt.Println("Finished", r.Request.URL)
})

调用顺序

1.OnRequest
在请求之前调用

2.OnError
如果请求期间发生错误,则调用

3.OnResponseHeaders
在收到响应标头后调用

4.OnResponse
在收到完整响应后调用

5.OnHTML
如果收到的内容是 HTML,则在 OnResponse 之后调用

6.OnXML
如果接收到的内容是 HTML 或 XML,则在 OnHTML 之后调用

7.OnScraped
在 OnXML 回调之后调用

实战

这是地址页 地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
package main

import (
zipUtils "Spider/utils"
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"regexp"
"strconv"
"strings"

"github.com/gocolly/colly"
)

func main() {
	// Output layout: downloaded images go under temp/<title>/<chapter>/,
	// finished .cbz archives under zipDir/<title>/.
	temp := "./out/temp"
	chapter := temp // current chapter directory; updated by the chapter-list callback
	zipDir := "./out/zip"
	bookId := "25797"
	bookHome := "https://www.manhuadb.com/manhua/" + bookId

	// Matches the "共 N 页" (N pages total) marker in the breadcrumb.
	// Compiled once here instead of on every pagination callback.
	pageCountRe := regexp.MustCompile("共 ([0-9]*?) 页")

	// Instantiate default collector
	c := colly.NewCollector()

	// Log every outgoing request.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Attach browser-like headers so the site serves regular pages.
	c.OnRequest(func(rq *colly.Request) {
		rq.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
		rq.Headers.Add("Accept-Encoding", "gzip, deflate")
		rq.Headers.Add("Accept-Language", "zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6")
		rq.Headers.Add("Cache-Control", "no-cache")
		rq.Headers.Add("Connection", "keep-alive")
		rq.Headers.Add("Host", "www.manhuadb.com")
		rq.Headers.Add("Pragma", "no-cache")
		rq.Headers.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36")
	})

	// Save image responses into the current chapter directory.
	c.OnResponse(func(r *colly.Response) {
		if strings.Contains(r.Headers.Get("content-type"), "image") {
			fileName := path.Base(r.Request.URL.String())
			f, err := os.Create(chapter + "/" + fileName)
			if err != nil {
				panic(err)
			}
			// Close the file (original leaked the handle) and surface write errors.
			defer f.Close()
			if _, err := io.Copy(f, bytes.NewReader(r.Body)); err != nil {
				fmt.Println("write image:", err)
			}
		}
	})

	// Book title: derive the per-book output directories from it.
	c.OnHTML("h1[class=\"comic-title\"]", func(e *colly.HTMLElement) {
		fmt.Printf("返回值: %s \n", e.Text)
		text := strings.Replace(e.Text, " ", "", -1) // strip spaces
		// BUG FIX: the original replaced on e.Text again, discarding the
		// space-stripping above; chain on text instead.
		text = strings.Replace(text, "?", "", -1) // strip '?'
		temp = temp + "/" + text
		os.MkdirAll(temp, 0711)
		zipDir = zipDir + "/" + text
		os.MkdirAll(zipDir, 0711)
	})

	// Chapter list: create a directory per chapter and crawl its pages.
	c.OnHTML("li[class=\"sort_div fixed-wd-num\"] > a", func(e *colly.HTMLElement) {
		chapterTitle := e.Attr("title")
		if chapterTitle != "" {
			println("章节==:", chapterTitle)
			url := e.Attr("href")

			chapter = temp + "/" + chapterTitle
			os.MkdirAll(chapter, 0711)

			chapterUrl := "https://www.manhuadb.com" + url
			// Visit errors (e.g. already-visited) are deliberately ignored here.
			c.Visit(chapterUrl)
		}
	})

	// Image list: fetch each page image (handled by the OnResponse above).
	c.OnHTML("div[class=\"text-center pjax-container\"] > img", func(e *colly.HTMLElement) {
		imgUrl := e.Attr("src")
		if imgUrl != "" {
			c.Visit(imgUrl)
		}
	})

	// Pagination: follow the next page while the current page number is
	// below the total extracted from the breadcrumb.
	c.OnHTML("div[class=\"container-fluid comic-detail p-0\"]", func(e *colly.HTMLElement) {
		pageHref := e.ChildAttr("li[class=\"breadcrumb-item active\"] > a", "href")
		currentPage := e.ChildText("li[class=\"breadcrumb-item active\"] > span")

		sumPageSize := pageCountRe.FindStringSubmatch(e.ChildText("li[class=\"breadcrumb-item active\"]"))
		if len(sumPageSize) < 2 {
			return // no "共 N 页" marker on this page
		}

		hrefs := strings.Split(pageHref, ".")
		if len(hrefs) < 2 {
			return // href lacks an extension; cannot build the next-page URL
		}

		// BUG FIX: the original overwrote err from the first Atoi before
		// checking it, and printed errors with println (pointer, not message).
		sumPageSizeNum, err := strconv.Atoi(sumPageSize[1])
		if err != nil {
			fmt.Println(err)
			return
		}
		currentPageNum, err := strconv.Atoi(currentPage)
		if err != nil {
			fmt.Println(err)
			return
		}
		if currentPageNum < sumPageSizeNum {
			p := strconv.Itoa(currentPageNum + 1)
			next := "https://www.manhuadb.com" + hrefs[0] + "_p" + p + "." + hrefs[1]

			c.Visit(next)
		}
	})

	// Start scraping from the book home page.
	if err := c.Visit(bookHome); err != nil {
		fmt.Println("visit:", err)
		return
	}

	// Pack every downloaded chapter directory into a .cbz archive.
	items, err := ioutil.ReadDir(temp)
	if err != nil {
		fmt.Println("read temp dir:", err)
		return
	}
	for _, item := range items {
		if item.IsDir() {
			src := temp + "/" + item.Name()
			dest := zipDir + "/" + item.Name() + ".cbz"
			zipUtils.CompressFile(src, dest)
		}
	}
}

代码仓库地址 https://github.com/fillpit/Spider

作者

坑 飞

发布于

2021-09-06

更新于

2021-09-06

许可协议

评论