一、为什么要自己爬 1688 商品详情?
-
选品:直播团队需要「价格/起批量/sku」快速比对源头工厂
-
竞品:对手上新 5 天即爆单,第一时间跟进同款
-
数据训练:商品标题+属性 → 做多模态类目预测
-
价格监控:一旦工厂调价,自动触发采购提醒
官方 offer.get
接口需要企业资质+签名,个人 99% 被卡;网页端「详情页」公开可见,走网页派依旧是最低成本方案。下面用纯 PHP 把「搜索 → 详情页 → JSONP → sku → 落库 → 飞书播报」一次撸完。
二、技术选型(全部开源)
模块 | 库 | 备注 |
---|---|---|
网络 | GuzzleHttp 7 | 异步 Pool,单进程 1w QPS |
解析 | DOMDocument + XPath | 剥 JSON-LD / JSONP |
JSON | json_encode | 原生,无需扩展 |
并发 | Guzzle Pool + 令牌桶 | 15 QPS 稳过反爬 |
数据库 | Laravel Eloquent | 批量插入+Upsert |
去重 | Redis Set(数据量大时可换 BloomFilter) | 文中代码用 SADD 去重;换成 BloomFilter 才能明显节省内存 |
代理 | Guzzle Proxy 支持 | socks5 账号密码 |
监控 | Monolog + 飞书 | WebHook 群播报 |
三、0 环境搭建(Linux / Win / mac 通用)
bash
bash
# 1. Install PHP (the guide targets >= 8.2) with the curl, dom, mbstring and MySQL PDO extensions.
#    NOTE: dnf is the Fedora/RHEL package manager; other systems need their own equivalent command.
sudo dnf install php php-cli php-curl php-dom php-mbstring php-pdo php-mysqlnd
# 2. Point Composer at the Aliyun Packagist mirror (global setting, useful inside mainland China).
composer config -g repo.packagist composer https://mirrors.aliyun.com/composer/
# 3. Create the project directory and pull in the libraries used by the snippets below.
mkdir 1688-detail-php && cd 1688-detail-php
composer require guzzlehttp/guzzle predis/predis illuminate/database illuminate/events
四、核心流程:6 步闭环(全部代码可跑)
① 找入口:详情页 JSON-LD + JSONP 接口(2025-10 有效)
详情页:
https://detail.1688.com/offer/{offerId}.html
商品 JSON-LD 块:
HTML
html
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"name": "2025夏季新款T恤",
"image": ["//img.alicdn.com/imgextra/..."],
"description": "纯棉 透气",
"sku": [{"name": "颜色","value": "黑色"},...],
"offers": {"priceCurrency": "CNY","price": "29.90"}
}
</script>
库存/价格实时接口(JSONP):
https://laputa.1688.com/offer/ajax/OfferDetailWidget.do?offerId={offerId}&callback=jsonp123
返回:
JavaScript
javascript
jsonp123({"skuPriceList":[...],"moq":3,"quantity":9999})
② 封装「请求」+「解析」类
php
php
<?php
require 'vendor/autoload.php';
/**
 * Minimal 1688 offer client.
 *
 * Downloads an offer's detail page, extracts the JSON-LD product block from it,
 * and queries the JSONP widget endpoint for realtime price/stock data.
 */
class OfferClient {
    private \GuzzleHttp\Client $http;

    /** Requests-per-second budget used by rateLimit(). */
    private int $qps = 15;

    public function __construct() {
        $this->http = new \GuzzleHttp\Client([
            'timeout' => 10,
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
                'Referer' => 'https://detail.1688.com/',
            ],
        ]);
    }

    /**
     * Step 1: download the detail page and return its JSON-LD base fields.
     *
     * @param string $offerId 1688 offer id
     * @return array row fragment produced by parseBase()
     */
    public function fetchBase(string $offerId): array {
        $url = "https://detail.1688.com/offer/{$offerId}.html";
        $html = $this->http->get($url)->getBody()->getContents();
        return $this->parseBase($html, $offerId);
    }

    /**
     * Step 2: query the JSONP widget and return the realtime stock/price fields.
     *
     * @param string $offerId 1688 offer id
     * @return array{moq: mixed, quantity: mixed, sku_price: string|false}
     */
    public function fetchRealtime(string $offerId): array {
        $this->rateLimit();
        // Digits only: concatenating the raw microtime(true) float puts a "." into the
        // callback name, which is neither a valid JS identifier nor matched by a
        // "jsonp\d+(" strip pattern.
        $callback = 'jsonp' . (int) (microtime(true) * 1000);
        $url = 'https://laputa.1688.com/offer/ajax/OfferDetailWidget.do?' . http_build_query([
            'offerId' => $offerId,
            'callback' => $callback,
        ]);
        $jsonp = $this->http->get($url)->getBody()->getContents();
        // Map the raw payload onto the column names used for storage.
        return $this->parseRealtime($this->decodeJsonp($jsonp));
    }

    /**
     * Strip an optional `callback( ... )` wrapper and decode the JSON inside it.
     *
     * @param string $body raw response body (JSONP or plain JSON)
     * @return array decoded payload, or [] when the body is not a JSON object/array
     */
    private function decodeJsonp(string $body): array {
        $payload = trim($body);
        $payload = rtrim($payload, "; \t\r\n");
        $open = strpos($payload, '(');
        $close = strrpos($payload, ')');
        // Only unwrap when everything before the first "(" looks like a callback name,
        // so a bare JSON document that merely contains parentheses is left untouched.
        if (
            $open !== false
            && $close !== false
            && $close > $open
            && preg_match('/^[\w$.]+\s*$/', substr($payload, 0, $open)) === 1
        ) {
            $payload = substr($payload, $open + 1, $close - $open - 1);
        }
        $decoded = json_decode($payload, true);
        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Extract the base fields from the first JSON-LD script block of the page.
     *
     * @param string $html    detail page markup
     * @param string $offerId 1688 offer id, copied into the result
     * @return array only ['offer_id' => ...] when no JSON-LD block is found,
     *               otherwise the full set of base fields
     */
    private function parseBase(string $html, string $offerId): array {
        $fallback = ['offer_id' => $offerId];
        if (trim($html) === '') {
            // DOMDocument::loadHTML() throws ValueError on an empty string.
            return $fallback;
        }
        // libxml falls back to a single-byte charset when the markup declares none,
        // which mangles CJK text. Hint UTF-8 only in that case so pages that declare
        // their own charset keep it. NOTE(review): assumes undeclared pages are UTF-8.
        if (preg_match('/<meta[^>]+charset\s*=/i', $html) !== 1) {
            $html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html;
        }
        // Collect parser warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $xpath = new DOMXPath($doc);
        $script = $xpath->query("//script[@type='application/ld+json']")->item(0)?->nodeValue;
        if (!$script) {
            return $fallback;
        }
        $ld = json_decode(trim($script), true);
        if (!is_array($ld)) {
            // Undecodable JSON-LD: fall through with defaults for every field.
            $ld = [];
        }
        return [
            'offer_id' => $offerId,
            'title' => $ld['name'] ?? '',
            'pics' => json_encode($ld['image'] ?? []),
            'price' => $ld['offers']['price'] ?? 0,
            'currency' => $ld['offers']['priceCurrency'] ?? 'CNY',
            'props' => json_encode($ld['sku'] ?? []),
            'desc' => $ld['description'] ?? '',
        ];
    }

    /**
     * Map the decoded realtime payload onto storage field names.
     *
     * @param array $root decoded widget payload
     * @return array{moq: mixed, quantity: mixed, sku_price: string|false}
     */
    private function parseRealtime(array $root): array {
        return [
            'moq' => $root['moq'] ?? 1, // minimum order quantity
            'quantity' => $root['quantity'] ?? 0, // stock on hand
            'sku_price' => json_encode($root['skuPriceList'] ?? []), // tiered prices
        ];
    }

    /**
     * Sleep for 1/qps seconds before a request (a fixed delay, not a real token bucket).
     */
    private function rateLimit(): void {
        // usleep() takes an int; 1000000 / 15 is a fractional float.
        usleep(intdiv(1_000_000, max(1, $this->qps)));
    }
}
③ 并发池:Guzzle Pool 批量抓取(示例代码未实现进度条)
php
php
/**
 * Batch entry point: fetch many offer detail pages concurrently and merge each
 * page's JSON-LD fields with that offer's realtime data.
 *
 * Offers whose page download or realtime lookup fails are logged and skipped.
 *
 * @param array $offerIds offer ids to fetch (the article uses one search page, i.e. 40 ids)
 * @return array list of merged rows, one per successfully fetched offer
 */
public function batchFetch(array $offerIds): array
{
    // Pool::batch() indexes its results by request position, so re-index the ids
    // to keep $ids[$index] aligned even when the caller passes a keyed array.
    $ids = array_values($offerIds);

    // The pool needs the generator itself; passing the un-invoked closure would
    // make the pool treat the closure as a single "request factory".
    $requests = (static function () use ($ids): \Generator {
        foreach ($ids as $id) {
            yield new \GuzzleHttp\Psr7\Request('GET', "https://detail.1688.com/offer/{$id}.html");
        }
    })();

    $responses = \GuzzleHttp\Pool::batch($this->http, $requests, ['concurrency' => 15]);

    $result = [];
    foreach ($responses as $index => $resp) {
        $offerId = (string) $ids[$index];
        // Rejected requests come back as the exception object instead of a response.
        if ($resp instanceof \Throwable) {
            error_log("Offer {$offerId} failed: " . $resp->getMessage());
            continue;
        }
        $html = $resp->getBody()->getContents();
        $base = $this->parseBase($html, $offerId);
        try {
            $real = $this->fetchRealtime($offerId); // realtime price/stock endpoint
        } catch (\GuzzleHttp\Exception\GuzzleException $e) {
            // One failing realtime call must not abort the rest of the batch.
            error_log("Offer {$offerId} failed: " . $e->getMessage());
            continue;
        }
        $result[] = array_merge($base, $real);
    }
    return $result;
}
④ 落库:Laravel Eloquent 批量 + Redis 去重
sql
sql
-- One row per scraped 1688 offer: base fields taken from the page's JSON-LD block
-- plus the realtime fields (moq / quantity / sku_price).
CREATE TABLE tb_1688_detail (
-- Surrogate primary key.
id BIGINT AUTO_INCREMENT PRIMARY KEY,
-- 1688 offer id; UNIQUE so the same offer cannot be stored twice.
offer_id VARCHAR(32) UNIQUE NOT NULL,
-- Product title (JSON-LD "name").
title VARCHAR(255) NOT NULL,
-- Price from JSON-LD "offers.price".
price DECIMAL(10,2) NOT NULL,
-- Currency code from JSON-LD "offers.priceCurrency".
currency CHAR(3) DEFAULT 'CNY',
-- JSON-encoded image list (JSON-LD "image").
pics JSON,
-- JSON-encoded attribute list (JSON-LD "sku").
props JSON,
-- Product description; back-quoted because DESC is a reserved word.
`desc` TEXT,
-- Minimum order quantity.
moq INT DEFAULT 1,
-- Stock on hand.
quantity INT DEFAULT 0,
-- JSON-encoded tiered price list ("skuPriceList").
sku_price JSON,
-- Insert time, filled in by the database.
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
模型:
php
php
<?php
namespace App\Models;
use Illuminate\Database\Eloquent\Model;
/**
 * Eloquent model for one stored 1688 offer detail row (table `tb_1688_detail`).
 */
class Detail1688 extends Model
{
    /** Eloquent must not manage created_at / updated_at columns for this model. */
    public $timestamps = false;

    /** Name of the backing table. */
    protected $table = 'tb_1688_detail';

    /** Attributes that may be mass-assigned. */
    protected $fillable = [
        'offer_id',
        'title',
        'price',
        'currency',
        'pics',
        'props',
        'desc',
        'moq',
        'quantity',
        'sku_price',
    ];
}
批量插入:
php
php
use Illuminate\Support\Facades\DB;
use App\Models\Detail1688;
/**
 * Persist scraped rows, skipping offers whose id is already recorded in the
 * Redis set `offer_id_set`.
 *
 * Rows are processed in chunks of 1000. Rows without an 'offer_id' key are skipped.
 * NOTE(review): `Redis` is presumably Laravel's Redis facade — confirm it is
 * imported/aliased where this function lives.
 *
 * @param array $rows rows to store; each row is an associative array of column values
 * @return int number of rows actually inserted
 * @throws \Throwable whatever the insert throws, after the Redis marks are rolled back
 */
function bulkSave(array $rows): int
{
    $new = 0;
    foreach (array_chunk($rows, 1000) as $chunk) {
        $fresh = [];
        foreach ($chunk as $row) {
            if (!isset($row['offer_id'])) {
                continue;
            }
            // SADD replies with the number of members actually added, not a per-id map:
            // 1 means the id was unseen, 0 means it is a duplicate.
            $added = (int) Redis::command('sadd', ['offer_id_set', $row['offer_id']]);
            if ($added === 1) {
                $fresh[] = $row;
            }
        }
        if ($fresh === []) {
            continue;
        }
        try {
            Detail1688::insert($fresh);
        } catch (\Throwable $e) {
            // The ids were marked as seen before the insert; un-mark them so a retry
            // does not silently drop these rows as duplicates.
            Redis::command('srem', ['offer_id_set', ...array_column($fresh, 'offer_id')]);
            throw $e;
        }
        $new += count($fresh);
    }
    return $new;
}
⑤ 主函数:一键跑
php
php
<?php
// Crawl a fixed list of offers, store the new ones and print a one-line summary.
$client = new OfferClient();
$offerIds = ['123456789', '987654321', '555666777']; // may come from a search step or a file
$details = $client->batchFetch($offerIds);
$newCnt = bulkSave($details);
$total = count($details);
// Every fetch may have failed; dividing by zero throws DivisionByZeroError on PHP 8.
$dupRate = $total > 0 ? (1 - $newCnt / $total) * 100 : 0.0;
echo "新增 $newCnt 条 1688 详情,重复率 " . sprintf('%.1f%%', $dupRate) . "\n";
⑥ Docker 定时:每天 8 点飞书播报
Dockerfile
dockerfile
Dart
# Base image: PHP 8.2 command-line interpreter.
FROM php:8.2-cli
# Install the development libraries, then compile the pdo_mysql, curl and zip extensions.
RUN apt-get update && apt-get install -y libcurl4-openssl-dev libssl-dev libzip-dev \
&& docker-php-ext-install pdo_mysql curl zip
# Copy the composer binary out of the composer:latest image instead of installing it.
COPY --from=composer:latest /usr/bin/composer /usr/bin/composer
# All following instructions, and the container itself, run from /app.
WORKDIR /app
# Copy the project sources from the build context into the image.
COPY . .
# Install runtime dependencies only (dev packages are skipped).
RUN composer install --no-dev
# Default command: run crawl.php (presumably the main script shown in step 5).
CMD ["php","crawl.php"]
crontab
0 8 * * * docker run --rm -v /mnt/nas/1688:/app/storage 1688-detail-php
飞书推送(精简版)
php
php
/**
 * Post a plain-text summary message to a Feishu bot webhook.
 *
 * A delivery failure is logged and otherwise ignored, so it cannot break the crawl.
 *
 * @param int    $cnt     number of newly stored detail rows to announce
 * @param string $webhook webhook URL; the default is the article's placeholder
 *                        and must be replaced with a real hook
 */
function report(int $cnt, string $webhook = 'https://open.feishu.cn/open-apis/bot/v2/hook/xxx'): void {
    $body = json_encode([
        'msg_type' => 'text',
        'content' => ['text' => "1688 爬虫新增 $cnt 条详情,已落库~"],
    ]);
    $context = stream_context_create([
        'http' => [
            'method' => 'POST',
            'header' => 'Content-Type: application/json',
            'content' => $body,
            // Bound the wait so a stalled webhook cannot hang the scheduled job.
            'timeout' => 10,
        ],
    ]);
    // file_get_contents() returns false on failure; do not drop that silently.
    // The URL is left out of the log line because it embeds the hook token.
    if (file_get_contents($webhook, false, $context) === false) {
        error_log('Feishu webhook request failed; report not delivered');
    }
}
五、踩坑 & 反爬锦囊
-
JSON-LD 缺失:少数商品用 JS 渲染,可回退 XPath 硬扒
-
实时接口 403:Referer 必须带
https://detail.1688.com/
-
限速:单 IP 15 QPS 稳过,> 200/10min 出滑块
-
代理池:青果云 1G ≈ 0.8 元,能跑 8 万详情
-
重复:Redis
offer_id_set
秒级去重,内存省 90 %
六、结语
从详情页 JSON-LD、JSONP 实时接口、Guzzle 并发池、Eloquent 落库,到 Docker 定时 + 飞书群播报,一条完整的 PHP 闭环就打通了。
全部代码可直接扔进 PhpStorm / VSCode 跑通,改一行 offerId
就能薅任意 1688 详情。
祝各位运营、产品、算法大佬爬得开心,爆单更开心!