A Lightweight Crawler in Nim: Async Performance with Tunnel Proxy Rotation to Avoid IP Bans

Data collection is not easy these days: sites will ban your IP at the slightest provocation. I recently built a crawler in Nim and it has worked out well. It is fast, light on resources, and, most importantly, it goes through a tunnel proxy that rotates IPs automatically, so bans stop being a worry. This post walks through the whole setup, from installing the toolchain to the code itself, in plain terms. Whether you are doing data analysis or market research, this approach gives you a stable, efficient way to collect data.

Below is a high-performance, low-overhead crawler written in Nim, with tunnel proxy support built in to keep the IP from being banned.

Overview

This crawler has the following characteristics:

  • Nim's async machinery for high throughput
  • Low memory footprint and light resource usage
  • Tunnel proxy integration with automatic IP rotation
  • Automatic retries and error handling
  • Easy to deploy and configure

Complete implementation

nim
import asyncdispatch, httpclient, net, strutils, json, os, times, random, sequtils, options

type
  CrawlerConfig = object
    proxyUrl: string
    userAgent: string
    timeout: int
    maxRetries: int
    requestDelay: int
    concurrentRequests: int

  CrawlerStats = object
    totalRequests: int
    successfulRequests: int
    failedRequests: int
    startTime: Time

  Crawler = ref object
    config: CrawlerConfig
    stats: CrawlerStats
    client: AsyncHttpClient
    proxyPool: seq[string]

  RequestResult = object
    url: string
    content: string
    status: int
    success: bool
    proxyUsed: string

const
  DefaultUserAgents = @[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/615.1"
  ]

proc newCrawlerConfig(proxyUrl = "", userAgent = "", timeout = 10000, 
                     maxRetries = 3, requestDelay = 1000, concurrentRequests = 10): CrawlerConfig =
  result.proxyUrl = proxyUrl
  result.userAgent = if userAgent == "": sample(DefaultUserAgents) else: userAgent
  result.timeout = timeout
  result.maxRetries = maxRetries
  result.requestDelay = requestDelay
  result.concurrentRequests = concurrentRequests

proc newCrawler(config: CrawlerConfig): Crawler =
  new result
  result.config = config
  result.stats = CrawlerStats(totalRequests: 0, successfulRequests: 0, 
                             failedRequests: 0, startTime: getTime())
  # Note: the stdlib AsyncHttpClient takes no timeout parameter;
  # config.timeout is kept for use with withTimeout on individual requests.
  result.client = newAsyncHttpClient(userAgent = config.userAgent)
  result.proxyPool = @[]

proc addProxyToPool(crawler: Crawler, proxyUrl: string) =
  crawler.proxyPool.add(proxyUrl)

proc getRandomProxy(crawler: Crawler): string =
  if crawler.proxyPool.len > 0:
    result = sample(crawler.proxyPool)
  else:
    result = crawler.config.proxyUrl

proc setupProxy(crawler: Crawler, proxyUrl: string) =
  # The stdlib client takes its proxy at construction time via newProxy,
  # so rebuilding the client is the simplest way to switch proxies.
  if proxyUrl != "":
    crawler.client = newAsyncHttpClient(
      userAgent = crawler.config.userAgent,
      proxy = newProxy("http://" & proxyUrl)
    )

proc makeRequest(crawler: Crawler, url: string, retryCount = 0): Future[RequestResult] {.async.} =
  result = RequestResult(url: url, success: false, status: 0)
  let proxyUrl = crawler.getRandomProxy()
  
  try:
    crawler.stats.totalRequests.inc
    
    # Select the proxy for this request (rebuilds the client)
    if proxyUrl != "":
      crawler.setupProxy(proxyUrl)
    
    # Add a random delay so the request timing looks less automated
    await sleepAsync(rand(crawler.config.requestDelay))
    
    let response = await crawler.client.get(url)
    result.status = response.code.int
    result.content = await response.body
    result.success = true
    result.proxyUsed = proxyUrl
    
    crawler.stats.successfulRequests.inc
    echo "Success: ", url, " | Status: ", result.status, " | Proxy: ", proxyUrl
    
  except Exception as e:
    echo "Error: ", url, " | Error: ", e.msg, " | Proxy: ", proxyUrl
    
    if retryCount < crawler.config.maxRetries:
      echo "Retrying (", retryCount + 1, "/", crawler.config.maxRetries, "): ", url
      await sleepAsync(2000)  # wait 2 seconds before retrying
      return await crawler.makeRequest(url, retryCount + 1)
    else:
      crawler.stats.failedRequests.inc
      result.success = false
  
  return result

proc crawlMultiple(crawler: Crawler, urls: seq[string]): Future[seq[RequestResult]] {.async.} =
  # Process the URLs in batches of `concurrentRequests`, awaiting each batch
  # with `all` before starting the next one.
  result = @[]
  var i = 0
  while i < urls.len:
    let batchEnd = min(i + crawler.config.concurrentRequests, urls.len)
    var futures = newSeq[Future[RequestResult]]()
    for url in urls[i ..< batchEnd]:
      futures.add(crawler.makeRequest(url))
    result.add(await all(futures))
    i = batchEnd

proc getStats(crawler: Crawler): string =
  let elapsed = (getTime() - crawler.stats.startTime).inMilliseconds.float / 1000
  let successRate = if crawler.stats.totalRequests > 0:
    (crawler.stats.successfulRequests.float / crawler.stats.totalRequests.float * 100).formatFloat(ffDecimal, 2)
  else:
    "0.00"
  
  result = "爬虫统计:\n" &
           "运行时间: " & $elapsed & " 秒\n" &
           "总请求数: " & $crawler.stats.totalRequests & "\n" &
           "成功请求: " & $crawler.stats.successfulRequests & "\n" &
           "失败请求: " & $crawler.stats.failedRequests & "\n" &
           "成功率: " & successRate & "%"

# Example usage
when isMainModule:
  proc main() {.async.} =
    randomize()  # seed the RNG used for user agents, proxy choice and delays
    # Configure the crawler
    let config = newCrawlerConfig(
      proxyUrl = "your-tunnel-proxy.com:8000",  # replace with your tunnel proxy endpoint
      timeout = 15000,
      maxRetries = 5,
      requestDelay = 2000,
      concurrentRequests = 5
    )
    
    var crawler = newCrawler(config)
    
    # Optionally add more proxies to the pool
    crawler.addProxyToPool("proxy1.com:8000")
    crawler.addProxyToPool("proxy2.com:8000")
    crawler.addProxyToPool("proxy3.com:8000")
    
    # URLs to crawl
    let urls = @[
      "https://httpbin.org/ip",
      "https://httpbin.org/user-agent",
      "https://httpbin.org/headers",
      "https://httpbin.org/get",
      "https://httpbin.org/html"
    ]
    
    echo "开始爬取 ", urls.len, " 个URL..."
    let results = await crawler.crawlMultiple(urls)
    
    echo "\n", crawler.getStats()
    
    # Print a few sample results
    echo "\nSummary of the first 3 results:"
    for i, res in results[0..min(2, results.high)]:
      echo "Result ", i+1, ":"
      echo "  URL: ", res.url
      echo "  Status: ", res.status
      echo "  Success: ", res.success
      echo "  Proxy: ", res.proxyUsed
      echo "  Content length: ", res.content.len, " chars"
      echo "  ---"
  
  waitFor main()

Deployment

1. Install Nim

First, install the Nim toolchain:

bash
# Linux/Mac
curl https://nim-lang.org/choosenim/init.sh -sSf | sh

# Windows
# Download the installer from https://nim-lang.org/install_windows.html

2. Dependencies

Everything used here (asyncdispatch, httpclient, strutils, json, times, random, sequtils) ships with the Nim standard library, so there are no nimble packages to install; compiling with -d:ssl (step 4) is all that HTTPS support requires.
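
If you prefer to manage the project with Nimble anyway, a minimal, hypothetical crawler.nimble is enough; every field below is a placeholder:

nim
# crawler.nimble -- NimScript; adjust the metadata to your own project
version       = "0.1.0"
author        = "you"
description   = "Async crawler with tunnel proxy rotation"
license       = "MIT"
bin           = @["crawler"]

requires "nim >= 1.6.0"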

3. Configure the tunnel proxy

Configure the proxy with the details from your tunnel proxy provider:

nim
let config = newCrawlerConfig(
  proxyUrl = "大家的隧道代理地址:端口",  # 例如: "tunnel.proxyprovider.com:8000"
  timeout = 15000,
  maxRetries = 5,
  requestDelay = 2000,
  concurrentRequests = 5
)
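
If your provider requires authentication, the standard library's newProxy takes the credentials as a separate auth string rather than in the URL; a brief sketch, where host, port and credentials are placeholders and the Basic Proxy-Authorization handling is my understanding of how the stdlib client behaves:

nim
import httpclient

# Credentials go in the auth parameter; the client sends them as
# a Proxy-Authorization header on proxied requests.
let tunnel = newProxy("http://tunnel.proxyprovider.com:8000", auth = "user:pass")
let client = newAsyncHttpClient(proxy = tunnel)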

4. Compile and run

bash
# Compile (add -d:release for an optimized build)
nim c -d:ssl --threads:on crawler.nim

# Run
./crawler

Advanced configuration

Custom request headers

nim
proc makeRequest(crawler: Crawler, url: string, retryCount = 0): Future[RequestResult] {.async.} =
  # ... other code ...
  # get() takes no headers argument, so set them on the client itself
  crawler.client.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  crawler.client.headers["Accept-Language"] = "en-US,en;q=0.5"
  crawler.client.headers["Connection"] = "keep-alive"

  let response = await crawler.client.get(url)
  # ... other code ...

Cookies and sessions

nim
proc newCrawler(config: CrawlerConfig): Crawler =
  # ... other code ...
  result.client = newAsyncHttpClient(
    userAgent = config.userAgent,
    maxRedirects = 5
  )
  # The stdlib client has no automatic cookie jar; persist the Set-Cookie
  # value from a response yourself and send it back via the Cookie header.
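
A minimal sketch of carrying a session cookie across requests by hand; the URLs are placeholders, and keeping only the name=value part of the first Set-Cookie header is a deliberate simplification:

nim
import asyncdispatch, httpclient, strutils

proc fetchWithSession(client: AsyncHttpClient,
                      loginUrl, dataUrl: string): Future[string] {.async.} =
  # Grab the Set-Cookie header from the first response and replay it
  let loginResp = await client.get(loginUrl)
  if loginResp.headers.hasKey("Set-Cookie"):
    let rawCookie: string = loginResp.headers["Set-Cookie"]
    client.headers["Cookie"] = rawCookie.split(";")[0]
  result = await client.getContent(dataUrl)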

Rate limiting and polite crawling

nim
proc crawlMultiple(crawler: Crawler, urls: seq[string]): Future[seq[RequestResult]] {.async.} =
  # Same batching as before, but pause between batches so the target site
  # is not hit too frequently.
  result = @[]
  var i = 0
  while i < urls.len:
    let batchEnd = min(i + crawler.config.concurrentRequests, urls.len)
    var futures = newSeq[Future[RequestResult]]()
    for url in urls[i ..< batchEnd]:
      futures.add(crawler.makeRequest(url))
    result.add(await all(futures))
    i = batchEnd
    if i < urls.len:
      await sleepAsync(1000)  # wait 1 second between batches

Performance tuning tips

1. Tune the concurrency: adjust concurrentRequests to the target site and your network conditions.

2. Set sensible timeouts: pick a value that matches how quickly the target site responds; see the sketch after this list for applying a timeout per request.

3. Reuse connections: keeping HTTP connections alive avoids repeated handshake overhead.

4. Watch memory: process response bodies promptly instead of letting them accumulate.
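
Since the stdlib AsyncHttpClient has no built-in timeout parameter, a per-request timeout can be layered on with asyncdispatch's withTimeout; a minimal sketch, where the proc name and the error type are my own choices rather than part of the crawler above:

nim
import asyncdispatch, httpclient

proc getWithTimeout(client: AsyncHttpClient, url: string,
                    timeoutMs: int): Future[string] {.async.} =
  # Race the request against a deadline; withTimeout yields false if the
  # deadline fires first, in which case this request is abandoned.
  let fut = client.getContent(url)
  if await fut.withTimeout(timeoutMs):
    result = fut.read()
  else:
    raise newException(IOError, "request timed out: " & url)

when isMainModule:
  let client = newAsyncHttpClient()
  echo waitFor client.getWithTimeout("https://httpbin.org/get", 15000)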

Error handling and monitoring

nim
proc monitorLoop(crawler: Crawler) {.async.} =
  while true:
    await sleepAsync(30000)  # check every 30 seconds
    let stats = crawler.getStats()
    # Hook logging, alerting or metrics export in here
    echo "Monitor: ", stats

proc monitorCrawler(crawler: Crawler) =
  # monitorLoop is declared above so it can be referenced here
  asyncCheck monitorLoop(crawler)

This crawler framework gives you a high-performance, low-overhead starting point, and the tunnel proxy keeps IP bans at bay. You can extend and customize it to fit your own requirements.

That wraps up the crawler. In practice it has been genuinely convenient: good performance, frugal with resources, and IP bans are no longer a concern. If you are also collecting data, give it a try; the code above is complete, so you can follow it as written. If you hit problems or have better ideas, I'd be glad to hear them. I hope the tool helps!
