如何用 Go 语言和正则表达式爬取数据

文章精选推荐

文章正文

Go语言结合正则表达式可以构建高效的数据爬取工具。下面我将提供几个完整的实例，涵盖不同场景下的数据爬取需求。

基础网页内容爬取

1.1 获取网页中所有链接

go 复制代码

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// main downloads https://example.com and prints the href value of every
// <a> tag found in the response body, one per line.
func main() {
	// Issue the GET request; a transport-level failure ends the program.
	response, err := http.Get("https://example.com")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer response.Body.Close()

	// Read the complete body into memory so it can be scanned as one string.
	raw, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}
	page := string(raw)

	// Capture group 1 holds the quoted href value of each anchor tag.
	hrefRe := regexp.MustCompile(`<a[^>]+href=["'](.*?)["']`)
	found := hrefRe.FindAllStringSubmatch(page, -1)

	fmt.Println("找到的链接:")
	for i := range found {
		groups := found[i]
		if len(groups) <= 1 {
			continue
		}
		fmt.Println(groups[1])
	}
}

1.2 提取特定模式的文本

go 复制代码

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// main downloads https://example.com and prints the text of every
// <h1>–<h6> element in the response body, with nested tags removed.
func main() {
	resp, err := http.Get("https://example.com")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	// A failed read leaves body incomplete, so stop instead of scanning it.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}

	// (?s) makes "." match newlines as well, so a heading whose content is
	// spread over several source lines is still captured.
	headingRe := regexp.MustCompile(`(?s)<h[1-6][^>]*>(.*?)</h[1-6]>`)
	// Compiled once here rather than on every loop iteration.
	tagRe := regexp.MustCompile(`<[^>]+>`)

	fmt.Println("网页标题:")
	for _, heading := range headingRe.FindAllStringSubmatch(string(body), -1) {
		// heading[1] is the inner HTML; strip any tags nested inside it.
		fmt.Println(tagRe.ReplaceAllString(heading[1], ""))
	}
}

结构化数据爬取

2.1 爬取表格数据

go 复制代码

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strings"
)

// main downloads https://example.com/table-page, locates the first <table>
// element and prints it row by row, with each cell's text followed by a tab.
func main() {
	resp, err := http.Get("https://example.com/table-page")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	// A failed read leaves body incomplete, so stop instead of scanning it.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}
	content := string(body)

	// All patterns are compiled once, up front. (?s) makes "." match
	// newlines too; without it a table, row or cell whose markup spans
	// several lines would never match.
	tableRe := regexp.MustCompile(`(?s)<table[^>]*>(.*?)</table>`)
	rowRe := regexp.MustCompile(`(?s)<tr[^>]*>(.*?)</tr>`)
	cellRe := regexp.MustCompile(`(?s)<t[dh][^>]*>(.*?)</t[dh]>`)
	tagRe := regexp.MustCompile(`<[^>]+>`)

	// Only the first table in the document is processed.
	tableMatch := tableRe.FindStringSubmatch(content)
	if tableMatch == nil {
		fmt.Println("未找到表格")
		return
	}

	fmt.Println("表格数据:")
	for _, row := range rowRe.FindAllStringSubmatch(tableMatch[1], -1) {
		for _, cell := range cellRe.FindAllStringSubmatch(row[1], -1) {
			// Drop nested tags, then surrounding whitespace.
			cleanCell := strings.TrimSpace(tagRe.ReplaceAllString(cell[1], ""))
			fmt.Printf("%s\t", cleanCell)
		}
		fmt.Println() // end of row
	}
}

2.2 爬取JSON数据中的特定字段

go 复制代码

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// Product is the decoded form of one element of the JSON array that main
// unmarshals; the struct tags bind the fields to the "name" and "price" keys.
type Product struct {
	Name  string  `json:"name"`
	Price float64 `json:"price"`
}

// main downloads https://api.example.com/products and prints each product's
// name and price. It first tries to decode the body as a JSON array of
// Product; if that fails it falls back to a regular expression that pulls
// "name"/"price" pairs out of the raw text.
func main() {
	resp, err := http.Get("https://api.example.com/products")
	if err != nil {
		fmt.Println("HTTP请求失败:", err)
		return
	}
	defer resp.Body.Close()

	// A failed read leaves body incomplete; parsing it would only produce a
	// misleading decode failure or partial results.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取响应失败:", err)
		return
	}

	// Method 1: decode the body directly as JSON.
	var products []Product
	if err := json.Unmarshal(body, &products); err == nil {
		fmt.Println("产品列表(JSON解析):")
		for _, p := range products {
			fmt.Printf("%s - $%.2f\n", p.Name, p.Price)
		}
		return
	}

	// Method 2: the body did not decode into []Product, so extract the
	// fields with a regular expression instead.
	fmt.Println("\n尝试使用正则表达式提取:")

	// Group 1 is the name, group 2 the numeric price; [^}]+ keeps both
	// inside the same JSON object.
	re := regexp.MustCompile(`"name"\s*:\s*"([^"]+)"[^}]+"price"\s*:\s*(\d+\.?\d*)`)
	for _, match := range re.FindAllStringSubmatch(string(body), -1) {
		fmt.Printf("%s - $%s\n", match[1], match[2])
	}
}

高级爬虫技巧

3.1 带并发控制的爬虫

go 复制代码

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"sync"
)

// main fetches a fixed list of pages concurrently, with at most three
// requests in flight at once, and prints the <title> of each page.
func main() {
	urls := []string{
		"https://example.com/page1",
		"https://example.com/page2",
		"https://example.com/page3",
	}

	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 3) // at most 3 concurrent fetches

	titleRe := regexp.MustCompile(`<title[^>]*>(.*?)</title>`)

	for _, u := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()

			// Acquire a slot and release it on every exit path, so an early
			// return can never leak a slot.
			semaphore <- struct{}{}
			defer func() { <-semaphore }()

			resp, err := http.Get(u)
			if err != nil {
				fmt.Printf("获取 %s 失败: %v\n", u, err)
				return
			}
			defer resp.Body.Close()

			// A failed read leaves body incomplete, so skip this page.
			body, err := ioutil.ReadAll(resp.Body)
			if err != nil {
				fmt.Printf("读取 %s 失败: %v\n", u, err)
				return
			}

			if title := titleRe.FindStringSubmatch(string(body)); title != nil {
				fmt.Printf("%s 的标题: %s\n", u, title[1])
			}
		}(u)
	}

	wg.Wait()
}

3.2 处理分页内容

go 复制代码

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
)

// main crawls a paginated news listing: it asks getTotalPages how many pages
// exist, then fetches each page in turn and prints the <h2> title of every
// <div class="news-item"> block it finds.
func main() {
	baseURL := "https://example.com/news?page="
	// (?s) makes "." match newlines too, so items and titles whose markup
	// spans several lines are still captured.
	pageRe := regexp.MustCompile(`(?s)<div class="news-item">(.*?)</div>`)
	titleRe := regexp.MustCompile(`(?s)<h2>(.*?)</h2>`)

	// Determine the number of pages from the first page.
	totalPages := getTotalPages(baseURL + "1")

	fmt.Printf("共发现 %d 页内容\n", totalPages)

	// Crawl every page; a page that fails is reported and skipped.
	for page := 1; page <= totalPages; page++ {
		pageURL := baseURL + strconv.Itoa(page)
		fmt.Printf("\n正在爬取第 %d 页: %s\n", page, pageURL)

		resp, err := http.Get(pageURL)
		if err != nil {
			fmt.Printf("获取第 %d 页失败: %v\n", page, err)
			continue
		}

		// Close right after reading: a defer inside the loop would keep
		// every body open until main returns.
		body, err := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			fmt.Printf("读取第 %d 页失败: %v\n", page, err)
			continue
		}

		for _, item := range pageRe.FindAllStringSubmatch(string(body), -1) {
			// item[1] is the inner HTML of one news item.
			if title := titleRe.FindStringSubmatch(item[1]); title != nil {
				fmt.Println("新闻标题:", title[1])
			}
		}
	}
}

// getTotalPages fetches url and returns the page count announced in the
// response body by text of the form "共 N 页".
//
// It returns 1 whenever a usable count cannot be determined: the request
// fails, the response status is not 2xx, the body cannot be read, the text
// is absent, or N cannot be parsed as an int or is less than 1.
func getTotalPages(url string) int {
	resp, err := http.Get(url)
	if err != nil {
		return 1 // default to a single page
	}
	defer resp.Body.Close()

	// An error page is not a reliable source for the page count.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return 1
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return 1
	}

	// The page is expected to contain text such as "共 5 页".
	re := regexp.MustCompile(`共\s*(\d+)\s*页`)
	match := re.FindSubmatch(body)
	if match == nil {
		return 1
	}

	// \d+ can still describe a number too large for int, and a literal 0
	// would make the caller's page loop do nothing; fall back in both cases.
	total, err := strconv.Atoi(string(match[1]))
	if err != nil || total < 1 {
		return 1
	}
	return total
}

实用技巧与注意事项

User-Agent设置：

go 复制代码

// Build the request by hand (instead of calling http.Get) so that a custom
// User-Agent header can be attached before the request is sent.
// NOTE(review): the errors returned by http.NewRequest and client.Do are
// discarded with "_" here; check them before using req or resp.
client := &http.Client{}
req, _ := http.NewRequest("GET", "https://example.com", nil)
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; MyBot/1.0)")
resp, _ := client.Do(req)

处理相对链接：

go 复制代码

import "net/url"

// Resolve the relative reference "/page1" against the base URL and render
// the result as an absolute URL string.
// NOTE(review): both url.Parse errors are discarded with "_" here.
base, _ := url.Parse("https://example.com")
rel, _ := url.Parse("/page1")
absURL := base.ResolveReference(rel).String()

正则表达式优化：

预编译正则表达式：re := regexp.MustCompile(pattern)
使用非贪婪匹配：.*?（注意：Go 的正则中 . 默认不匹配换行符，需要跨行匹配时应在表达式开头加上 (?s) 标志）
避免过度复杂的正则表达式

错误处理增强：

go 复制代码

// Fragment meant to live inside a function that returns an error: a failed
// request is wrapped with %w so the underlying error stays in the chain.
resp, err := http.Get(url)
if err != nil {
    return fmt.Errorf("请求失败: %w", err)
}
// Close the body when the enclosing function returns, and log a failed
// Close instead of silently dropping it.
defer func() {
    if err := resp.Body.Close(); err != nil {
        log.Printf("关闭响应体失败: %v", err)
    }
}()

反爬虫策略应对

设置合理的请求间隔：

go 复制代码

import "time"

// crawlWithDelay starts crawlPage for every entry of urls in its own
// goroutine, sleeping for delay after each start, and returns only after
// every started crawl has finished.
//
// Waiting matters: without it the caller (or the whole program) could carry
// on and exit while crawls were still running.
func crawlWithDelay(urls []string, delay time.Duration) {
	// Buffered to len(urls) so a finishing goroutine never blocks on its
	// completion signal.
	done := make(chan struct{}, len(urls))

	// The loop variable is named u rather than url so it does not shadow
	// the net/url package name.
	for _, u := range urls {
		go func(target string) {
			// Signal completion even if crawlPage panics or exits early.
			defer func() { done <- struct{}{} }()
			crawlPage(target)
		}(u)
		time.Sleep(delay)
	}

	// Collect one completion signal per started goroutine.
	for range urls {
		<-done
	}
}

使用代理IP：

go 复制代码

// Send the client's requests through an HTTP proxy by installing a Transport
// whose Proxy function always returns proxyUrl.
// NOTE(review): "http://proxy-ip:port" looks like a placeholder — substitute
// a real host and numeric port. The errors from url.Parse and client.Get are
// discarded with "_" here, so a bad proxy address would go unnoticed.
proxyUrl, _ := url.Parse("http://proxy-ip:port")
client := &http.Client{
    Transport: &http.Transport{
        Proxy: http.ProxyURL(proxyUrl),
    },
}
resp, _ := client.Get("https://example.com")

处理Cookies：

go 复制代码

// Give the client a cookie jar so it is shared by both requests below.
// NOTE(review): the error from cookiejar.New and both return values of each
// client.Get call are discarded here, so the response bodies are never
// closed and request failures go unnoticed.
jar, _ := cookiejar.New(nil)
client := &http.Client{Jar: jar}
// The first request obtains the cookie.
client.Get("https://example.com/login")
// Later requests carry the cookie.
client.Get("https://example.com/protected-page")

总结

以上实例展示了Go语言结合正则表达式进行数据爬取的多种方法：

基础网页爬取：获取链接、提取特定内容
结构化数据提取：表格数据、JSON数据
高级技巧：并发控制、分页处理
实用技巧：User-Agent设置、相对链接处理
反爬应对：请求间隔、代理IP、Cookies处理

在实际项目中，建议：

对于结构化数据优先使用API而非HTML解析
复杂的HTML解析考虑使用goquery等专门库
遵守网站的robots.txt规则
设置合理的爬取频率，避免对目标网站造成负担

这些实例可以作为基础模板，根据具体需求进行调整和扩展。