Step 1: Import the required packages
First, import the following packages:
- `net/http`: for sending HTTP requests.
- `io`: for reading the response body (in Go 1.16 and later, use `io.ReadAll` instead of `ioutil.ReadAll`).
- `regexp`: for regular-expression matching (used here to pull specific information out of the HTML).
- `fmt`: for printing output.
- `log`: for reporting fatal errors.
```go
import (
    "fmt"
    "io"
    "log"
    "net/http"
    "regexp"
)
```
Step 2: Define a function that fetches the page source
Create a function that sends a GET request and returns the page's HTML source:
```go
// fetch sends a GET request to the given URL and returns the response body as a string.
func fetch(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}
```
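In practice it often helps to also check the HTTP status code, so that an error page is not silently treated as page source. A minimal sketch of that variant, using only the standard library (the name `fetchChecked` is made up for illustration and is not part of the tutorial code):

```go
// fetchChecked is like fetch, but treats any non-2xx status as an error
// instead of returning the HTML of an error page.
func fetchChecked(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return "", fmt.Errorf("unexpected status: %s", resp.Status)
    }

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}
```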
Example: Setting the User-Agent header
Some sites restrict crawler access based on the User-Agent header. You can mimic a browser by building the request with `http.NewRequest`, setting the header, and sending it through an `http.Client`:
```go
// fetchWithUserAgent builds the request manually so a browser-like
// User-Agent header can be attached before the request is sent.
func fetchWithUserAgent(url string) (string, error) {
    client := &http.Client{}
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return "", err
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

    resp, err := client.Do(req)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}
```
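A related tweak: the default `http.Client` has no timeout, so a slow or unresponsive site can stall the crawler indefinitely. A hedged sketch combining the User-Agent header with an explicit timeout (the name `fetchWithTimeout` and the 10-second value are illustrative only, and it additionally requires importing `time`):

```go
// fetchWithTimeout works like fetchWithUserAgent, but the client gives up
// after a fixed duration instead of waiting forever.
func fetchWithTimeout(url string) (string, error) {
    client := &http.Client{Timeout: 10 * time.Second} // example value, tune as needed
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return "", err
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

    resp, err := client.Do(req)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}
```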
Step 3: Parse the HTML content
If you need to pull specific information out of the HTML, you can use regular expressions. For example, to extract all links:
```go
// extractLinks returns the value of every href="..." attribute found in the HTML.
func extractLinks(html string) []string {
    re := regexp.MustCompile(`href="([^"]+)"`)
    matches := re.FindAllStringSubmatch(html, -1)

    var links []string
    for _, match := range matches {
        links = append(links, match[1])
    }
    return links
}
```
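A quick way to sanity-check the pattern is to run it on a small inline snippet. This illustrative example (the wrapper function `exampleExtractLinks` is not part of the tutorial code) prints the two href values:

```go
func exampleExtractLinks() {
    // A tiny inline document to show what extractLinks returns.
    html := `<a href="https://example.com/a">A</a> <a href="/about">About</a>`
    for _, link := range extractLinks(html) {
        fmt.Println(link)
    }
    // Output:
    // https://example.com/a
    // /about
}
```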
Example: Extracting all image URLs
You can extract all image URLs the same way:
```go
// extractImages returns the value of every src="..." attribute found in the HTML.
func extractImages(html string) []string {
    re := regexp.MustCompile(`src="([^"]+)"`)
    matches := re.FindAllStringSubmatch(html, -1)

    var images []string
    for _, match := range matches {
        images = append(images, match[1])
    }
    return images
}
```
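Note that `href` and `src` values are often relative (e.g. `/images/logo.png`) rather than full URLs. If you need absolute URLs, the standard `net/url` package can resolve a link against the page it was found on; a small sketch (the helper name `resolveLink` is made up, and it requires importing `net/url`):

```go
// resolveLink turns a possibly relative href/src value into an absolute URL,
// resolved against the page it was extracted from.
func resolveLink(pageURL, link string) (string, error) {
    base, err := url.Parse(pageURL)
    if err != nil {
        return "", err
    }
    ref, err := url.Parse(link)
    if err != nil {
        return "", err
    }
    return base.ResolveReference(ref).String(), nil
}
```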
Step 4: Run the crawler
Call the functions above from the `main` function:
```go
func main() {
    url := "https://example.com"
    html, err := fetch(url)
    if err != nil {
        log.Fatal(err)
    }

    links := extractLinks(html)
    fmt.Println("Links:")
    for _, link := range links {
        fmt.Println(link)
    }

    images := extractImages(html)
    fmt.Println("Images:")
    for _, image := range images {
        fmt.Println(image)
    }
}
```
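To try the crawler, save the complete code below as a single file in a `main` package (the file name is up to you, e.g. `main.go`) and run it with `go run main.go`.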
Complete code
```go
package main

import (
    "fmt"
    "io"
    "log"
    "net/http"
    "regexp"
)

// fetch sends a GET request to the given URL and returns the response body as a string.
func fetch(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}

// fetchWithUserAgent builds the request manually so a browser-like
// User-Agent header can be attached before the request is sent.
func fetchWithUserAgent(url string) (string, error) {
    client := &http.Client{}
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return "", err
    }
    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

    resp, err := client.Do(req)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}

// extractLinks returns the value of every href="..." attribute found in the HTML.
func extractLinks(html string) []string {
    re := regexp.MustCompile(`href="([^"]+)"`)
    matches := re.FindAllStringSubmatch(html, -1)

    var links []string
    for _, match := range matches {
        links = append(links, match[1])
    }
    return links
}

// extractImages returns the value of every src="..." attribute found in the HTML.
func extractImages(html string) []string {
    re := regexp.MustCompile(`src="([^"]+)"`)
    matches := re.FindAllStringSubmatch(html, -1)

    var images []string
    for _, match := range matches {
        images = append(images, match[1])
    }
    return images
}

func main() {
    url := "https://example.com"
    html, err := fetch(url)
    if err != nil {
        log.Fatal(err)
    }

    links := extractLinks(html)
    fmt.Println("Links:")
    for _, link := range links {
        fmt.Println(link)
    }

    images := extractImages(html)
    fmt.Println("Images:")
    for _, image := range images {
        fmt.Println(image)
    }
}
```
Notes
- User-Agent header: some sites restrict crawler access based on the User-Agent header; you can mimic a browser by setting the header on the request, as shown above.
- robots.txt: before crawling, check the target site's robots.txt file to make sure your crawling stays within the site's rules.
- Rate limiting: avoid sending requests too quickly, or the target site may block you (see the sketch after this list).
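As a concrete illustration of the rate-limiting point, the crawl loop can simply pause between requests. A minimal sketch reusing the `fetch` function from Step 2 (the name `crawlPolitely`, the URL list, and the 2-second delay are all illustrative, and it requires importing `time`):

```go
// crawlPolitely fetches each URL in turn, waiting between requests so the
// target site is not hammered. Failed fetches are logged and skipped.
func crawlPolitely(urls []string) {
    for _, u := range urls {
        html, err := fetch(u)
        if err != nil {
            log.Println(err)
            continue
        }
        fmt.Printf("fetched %d bytes from %s\n", len(html), u)
        time.Sleep(2 * time.Second) // simple fixed delay between requests
    }
}
```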