Web scraping is not easy these days: a few too many requests and the target site bans your IP. I recently built a crawler in Nim, and it has worked out well. It is fast, light on resources, and, most importantly, it runs through a tunnel proxy that rotates IPs automatically, so bans are no longer a worry. In this post I will walk through the whole thing, from environment setup to the code itself, in plain language. Whether you are doing data analysis or market research, this setup should let you collect data stably and efficiently.

Below is a high-performance, low-overhead crawler written in Nim, with tunnel proxy support built in to avoid IP bans.
Overview
The crawler has the following characteristics:
- Uses Nim's async machinery for high throughput
- Low memory footprint and resource consumption
- Integrated tunnel proxy with automatic IP rotation
- Automatic retries and error handling
- Easy to deploy and configure
Complete implementation
```nim
import asyncdispatch, httpclient, net, strutils, json, os, times, random, sequtils, options

type
  CrawlerConfig = object
    proxyUrl: string
    userAgent: string
    timeout: int
    maxRetries: int
    requestDelay: int
    concurrentRequests: int

  CrawlerStats = object
    totalRequests: int
    successfulRequests: int
    failedRequests: int
    startTime: Time

  Crawler = ref object
    config: CrawlerConfig
    stats: CrawlerStats
    client: AsyncHttpClient
    proxyPool: seq[string]

  RequestResult = object
    url: string
    content: string
    status: int
    success: bool
    proxyUsed: string

const
  DefaultUserAgents = @[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/615.1"
  ]
proc newCrawlerConfig(proxyUrl = "", userAgent = "", timeout = 10000,
    maxRetries = 3, requestDelay = 1000, concurrentRequests = 10): CrawlerConfig =
  result.proxyUrl = proxyUrl
  result.userAgent = if userAgent == "": sample(DefaultUserAgents) else: userAgent
  result.timeout = timeout
  result.maxRetries = maxRetries
  result.requestDelay = requestDelay
  result.concurrentRequests = concurrentRequests
proc newCrawler(config: CrawlerConfig): Crawler =
  new result
  result.config = config
  result.stats = CrawlerStats(totalRequests: 0, successfulRequests: 0,
    failedRequests: 0, startTime: getTime())
  # Note: newAsyncHttpClient has no timeout parameter; config.timeout is
  # kept in the config for documentation purposes only.
  result.client = newAsyncHttpClient(userAgent = config.userAgent)
  result.proxyPool = @[]
proc addProxyToPool(crawler: Crawler, proxyUrl: string) =
  crawler.proxyPool.add(proxyUrl)

proc getRandomProxy(crawler: Crawler): string =
  if crawler.proxyPool.len > 0:
    result = sample(crawler.proxyPool)
  else:
    result = crawler.config.proxyUrl
proc setupProxy(crawler: Crawler, proxyUrl: string) =
  # The standard httpclient only accepts a proxy when the client is created,
  # so rebuild the client pointing at the requested tunnel endpoint.
  if proxyUrl != "":
    crawler.client = newAsyncHttpClient(
      userAgent = crawler.config.userAgent,
      proxy = newProxy("http://" & proxyUrl))
proc makeRequest(crawler: Crawler, url: string, retryCount = 0): Future[RequestResult] {.async.} =
  result = RequestResult(url: url, success: false, status: 0)
  let proxyUrl = crawler.getRandomProxy()
  try:
    crawler.stats.totalRequests.inc
    # Route the request through the tunnel proxy
    if proxyUrl != "":
      crawler.setupProxy(proxyUrl)
    # Add a random delay to make the traffic pattern harder to detect
    await sleepAsync(rand(crawler.config.requestDelay))
    let response = await crawler.client.get(url)
    result.status = response.code.int
    result.content = await response.body
    result.success = true
    result.proxyUsed = proxyUrl
    crawler.stats.successfulRequests.inc
    echo "Success: ", url, " | Status: ", result.status, " | Proxy: ", proxyUrl
  except Exception as e:
    echo "Error: ", url, " | Error: ", e.msg, " | Proxy: ", proxyUrl
    if retryCount < crawler.config.maxRetries:
      echo "Retrying (", retryCount + 1, "/", crawler.config.maxRetries, "): ", url
      await sleepAsync(2000) # wait 2 seconds before retrying
      return await crawler.makeRequest(url, retryCount + 1)
    else:
      crawler.stats.failedRequests.inc
      result.success = false
  return result
proc crawlMultiple(crawler: Crawler, urls: seq[string]): Future[seq[RequestResult]] {.async.} =
  # Process the URLs in batches of `concurrentRequests`; the standard
  # asyncdispatch module has no "first completed" helper, so each batch
  # is awaited as a whole with `all`.
  var i = 0
  while i < urls.len:
    let upper = min(i + crawler.config.concurrentRequests, urls.len)
    var futures = newSeq[Future[RequestResult]]()
    for url in urls[i ..< upper]:
      futures.add(crawler.makeRequest(url))
    # Wait for the whole batch to finish before starting the next one
    result.add(await all(futures))
    i = upper
proc getStats(crawler: Crawler): string =
  let elapsed = (getTime() - crawler.stats.startTime).inMilliseconds.float / 1000
  let successRate =
    if crawler.stats.totalRequests > 0:
      (crawler.stats.successfulRequests.float / crawler.stats.totalRequests.float * 100).formatFloat(ffDecimal, 2)
    else:
      "0.00"
  result = "Crawler statistics:\n" &
    "Elapsed time: " & $elapsed & " s\n" &
    "Total requests: " & $crawler.stats.totalRequests & "\n" &
    "Successful requests: " & $crawler.stats.successfulRequests & "\n" &
    "Failed requests: " & $crawler.stats.failedRequests & "\n" &
    "Success rate: " & successRate & "%"
# Example usage
when isMainModule:
  proc main() {.async.} =
    randomize() # seed the RNG used for user agents, proxies and delays

    # Configure the crawler
    let config = newCrawlerConfig(
      proxyUrl = "your-tunnel-proxy.com:8000", # replace with your tunnel proxy endpoint
      timeout = 15000,
      maxRetries = 5,
      requestDelay = 2000,
      concurrentRequests = 5
    )
    var crawler = newCrawler(config)

    # Optionally add several proxies to the pool
    crawler.addProxyToPool("proxy1.com:8000")
    crawler.addProxyToPool("proxy2.com:8000")
    crawler.addProxyToPool("proxy3.com:8000")

    # URLs to crawl
    let urls = @[
      "https://httpbin.org/ip",
      "https://httpbin.org/user-agent",
      "https://httpbin.org/headers",
      "https://httpbin.org/get",
      "https://httpbin.org/html"
    ]

    echo "Crawling ", urls.len, " URLs..."
    let results = await crawler.crawlMultiple(urls)
    echo "\n", crawler.getStats()

    # Print a short summary of the first few results
    echo "\nFirst 3 results:"
    for i, res in results[0 .. min(2, results.high)]:
      echo "Result ", i + 1, ":"
      echo "  URL: ", res.url
      echo "  Status: ", res.status
      echo "  Success: ", res.success
      echo "  Proxy: ", res.proxyUsed
      echo "  Content length: ", res.content.len, " chars"
      echo "  ---"

  waitFor main()
```
Deployment guide
1. Install Nim
First, install the Nim programming language:
```bash
# Linux/macOS
curl https://nim-lang.org/choosenim/init.sh -sSf | sh

# Windows
# Download the installer from https://nim-lang.org/install_windows.html
```
2. Dependencies
All of the modules used here (asyncdispatch, httpclient, strutils, times, random, sequtils) ship with Nim's standard library, so no nimble packages need to be installed for the basic crawler.
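If you want to confirm up front that the toolchain and SSL support work, the following minimal check can help. It is only a sketch; the file name smoke_test.nim is an arbitrary choice.
```nim
# smoke_test.nim - compile and run with: nim c -r -d:ssl smoke_test.nim
import asyncdispatch, httpclient

proc check() {.async.} =
  let client = newAsyncHttpClient()
  # httpbin.org/ip simply echoes the caller's public IP address
  let body = await client.getContent("https://httpbin.org/ip")
  echo "HTTPS works, response: ", body
  client.close()

waitFor check()
```
If this prints a small JSON document containing your IP, the environment is ready.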
3. Configure the tunnel proxy
Fill in the endpoint supplied by your tunnel proxy provider:
```nim
let config = newCrawlerConfig(
  proxyUrl = "your-tunnel-proxy-host:port", # e.g. "tunnel.proxyprovider.com:8000"
  timeout = 15000,
  maxRetries = 5,
  requestDelay = 2000,
  concurrentRequests = 5
)
```
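Many tunnel proxy services also require a username and password. The standard library's newProxy takes an auth argument, which httpclient sends as a Proxy-Authorization header; here is a sketch, where the host and credentials are placeholders:
```nim
import asyncdispatch, httpclient

# Placeholder values; substitute whatever your provider hands you.
let proxyHost = "tunnel.proxyprovider.com:8000"
let proxyUser = "your-username"
let proxyPass = "your-password"

# The auth string should be sent as "Proxy-Authorization: Basic <base64>".
let authedClient = newAsyncHttpClient(
  proxy = newProxy("http://" & proxyHost, auth = proxyUser & ":" & proxyPass))
```
In the crawler above, the same newProxy call would go inside setupProxy.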
4. Compile and run
```bash
# Compile (add -d:release for an optimised binary)
nim c -d:ssl --threads:on crawler.nim

# Run
./crawler
```
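If you prefer not to repeat the compiler switches, Nim also picks them up from a config.nims file placed next to the source; a minimal sketch:
```nim
# config.nims - read automatically when compiling modules in this directory
switch("define", "ssl")   # equivalent to passing -d:ssl
switch("threads", "on")   # equivalent to passing --threads:on
```
With this file in place, `nim c crawler.nim` is enough.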
Advanced configuration
Custom request headers
```nim
proc makeRequest(crawler: Crawler, url: string, retryCount = 0): Future[RequestResult] {.async.} =
  # ... other code ...
  var headers = newHttpHeaders()
  headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  headers["Accept-Language"] = "en-US,en;q=0.5"
  headers["Connection"] = "keep-alive"
  # get() does not accept a headers argument, so use request() instead
  let response = await crawler.client.request(url, httpMethod = HttpGet, headers = headers)
  # ... other code ...
```
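The same mechanism can be used to rotate the User-Agent on every request instead of fixing it once when the client is created. A small sketch (the two agent strings are just examples):
```nim
import random, httpclient

randomize()

const agents = @[
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
]

# Build a fresh header set for each request so the User-Agent varies.
proc randomHeaders(): HttpHeaders =
  result = newHttpHeaders()
  result["User-Agent"] = sample(agents)
  result["Accept-Language"] = "en-US,en;q=0.5"
```
Passing `headers = randomHeaders()` to request() then gives each call a different fingerprint.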
Handling cookies and sessions
```nim
proc newCrawler(config: CrawlerConfig): Crawler =
  # ... other code ...
  result.client = newAsyncHttpClient(
    userAgent = config.userAgent,
    maxRedirects = 5
  )
  # The standard httpclient does not manage cookies by itself; anything
  # placed in client.headers is sent with every request, so session
  # cookies have to be carried over manually (see the sketch below).
```
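Below is a minimal, hedged sketch of carrying a session cookie across requests by hand; the URLs are placeholders, and real sites usually set several cookies with attributes that need more careful handling:
```nim
import asyncdispatch, httpclient

proc cookieDemo() {.async.} =
  let client = newAsyncHttpClient()
  # First request: the server may answer with a Set-Cookie header.
  let login = await client.get("https://example.com/login")
  let setCookie: string = login.headers.getOrDefault("Set-Cookie")
  if setCookie != "":
    # Anything placed in client.headers is sent on every later request.
    client.headers["Cookie"] = setCookie
  let page = await client.get("https://example.com/profile")
  echo page.code
  client.close()

waitFor cookieDemo()
```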
Rate limiting and polite crawling
```nim
proc crawlMultiple(crawler: Crawler, urls: seq[string]): Future[seq[RequestResult]] {.async.} =
  var i = 0
  while i < urls.len:
    let upper = min(i + crawler.config.concurrentRequests, urls.len)
    var futures = newSeq[Future[RequestResult]]()
    for url in urls[i ..< upper]:
      futures.add(crawler.makeRequest(url))
    result.add(await all(futures))
    i = upper
    # Pause between batches so the same site is not hit too aggressively
    if i < urls.len:
      await sleepAsync(1000) # wait 1 second after each batch
```
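The batch pause above is coarse. If the URL list mixes several sites, a per-host delay is politer; here is a rough sketch using a table of last-request times, where the 1-second minimum interval is an arbitrary example value:
```nim
import asyncdispatch, tables, times, uri

var lastHit = initTable[string, Time]()
let minInterval = initDuration(seconds = 1) # example politeness interval

proc waitForHost(url: string) {.async.} =
  # Sleep until at least minInterval has passed since the previous
  # request to the same hostname.
  let host = parseUri(url).hostname
  if lastHit.hasKey(host):
    let since = getTime() - lastHit[host]
    if since < minInterval:
      await sleepAsync(int((minInterval - since).inMilliseconds))
  lastHit[host] = getTime()
```
Calling `await waitForHost(url)` at the top of makeRequest would then enforce the spacing per site.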
Performance tuning tips
1. Tune the concurrency: adjust concurrentRequests to match the target site and your network conditions.
2. Set sensible timeouts: pick a timeout that reflects how quickly the target site actually responds.
3. Reuse connections: keeping HTTP connections alive avoids repeated handshake overhead; see the sketch after this list.
4. Manage memory: process response bodies as they arrive rather than letting them pile up.
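As a rough illustration of point 3, the sketch below keeps one AsyncHttpClient per worker, so connections to the same host are reused, and closes it when the worker finishes; the URLs are just examples:
```nim
import asyncdispatch, httpclient

proc worker(urls: seq[string]) {.async.} =
  # One client per worker: connections to the same host are reused
  # instead of opening a new socket for every request.
  let client = newAsyncHttpClient()
  try:
    for url in urls:
      let body = await client.getContent(url)
      echo url, ": ", body.len, " bytes"
  finally:
    client.close() # release the socket once the worker is done

when isMainModule:
  waitFor worker(@["https://httpbin.org/get", "https://httpbin.org/html"])
```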
Error handling and monitoring
```nim
proc monitorLoop(crawler: Crawler) {.async.} =
  while true:
    await sleepAsync(30000) # check every 30 seconds
    let stats = crawler.getStats()
    # This is the place to push the statistics to a monitoring system
    echo "Monitor: ", stats

proc monitorCrawler(crawler: Crawler) =
  # Additional monitoring logic (logging, alerting, ...) can go here
  asyncCheck monitorLoop(crawler)
```
This crawler framework gives you a high-performance, low-overhead foundation, and the tunnel proxy keeps the IP from getting banned. You can extend and customise it to fit your own needs.
That is the whole setup. In practice it has been convenient: good performance, modest resource usage, and no more worrying about IP bans. If you are also collecting data, give it a try; the code above is ready to follow along with. If you run into problems or have better suggestions, I am happy to discuss. I hope this tool helps!