本博客旨在分享关于爬虫技术的学习和实践经验,仅供学习使用,请使用爬虫技术的用户自行承担相应的法律责任,务必在进行任何网络数据抓取操作之前,仔细审查相关法律法规,并取得相应的授权或同意。请确保你的行为符合道德和法律的双重标准,尊重知识产权和网站的服务协议,仅将此技术应用于正当、合法的学习和研究目的。
爬取b站评论
1.单视频评论
爬取某个视频的评论时,只需要对请求载荷的w_rid和wts进行加密,然后请求网络拿到数据,拿到的数据不需要解密。
这里的加密我直接抠js代码破解,比较简单,首先在comment_url.js里面写加密函数。这里有一个参数原本是从浏览器本地的localStorage里面拿到的,这里我直接写死了,但其实也没影响,因为o和i都是常量。
javascript
/**
 * Build the `w_rid` / `wts` signature fields for a request payload.
 *
 * Steps, all visible below:
 *   1. Resolve the two keys: either taken verbatim from `wbiKeys` (when
 *      `useAssignKey` is set) or derived with `ft` from the two hard-coded
 *      image URLs.
 *   2. Concatenate the keys, pick characters in the order given by the fixed
 *      index table and keep the first 32 of them (the "mixin key").
 *   3. Add `wts` (current Unix time in seconds) to a copy of the payload,
 *      serialise it as a key-sorted, URI-encoded query string, append the
 *      mixin key and hash the result with `at`.
 *
 * @param {Object} e - Request parameters to sign. Not mutated.
 * @param {Object} [wbiKeys] - Optional `{ useAssignKey, wbiImgKey, wbiSubKey }`
 *   overrides; defaults to empty keys, which makes the hard-coded URLs win.
 * @returns {{w_rid: *, wts: string}|string} `w_rid` is whatever `at` returns;
 *   the literal string "sssss" is returned when either key is empty.
 */
function lt(e, wbiKeys = { wbiImgKey: "", wbiSubKey: "" }) {
  // Hard-coded stand-in for the value the original page kept under the
  // "wbi_img_urls" storage key: "<img url>-<sub url>".
  const WBI_IMG_URLS =
    "https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png";
  // Character order used to scramble imgKey + subKey.
  const MIXIN_KEY_ORDER = [46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52];

  let imgKey;
  let subKey;
  if (wbiKeys.useAssignKey) {
    imgKey = wbiKeys.wbiImgKey;
    subKey = wbiKeys.wbiSubKey;
  } else {
    const urls = WBI_IMG_URLS.split("-");
    imgKey = urls[0] ? ft(urls[0]) : wbiKeys.wbiImgKey;
    subKey = urls[1] ? ft(urls[1]) : wbiKeys.wbiSubKey;
  }

  // Kept for backward compatibility: callers receive this sentinel string
  // (not an exception) when no usable key pair is available.
  if (!imgKey || !subKey) {
    return "sssss";
  }

  const rawKey = imgKey + subKey;
  // charAt() yields "" for out-of-range indices, so those simply vanish in join().
  const mixinKey = MIXIN_KEY_ORDER.map((index) => rawKey.charAt(index))
    .join("")
    .slice(0, 32);

  const wts = Math.round(Date.now() / 1e3);
  const signedParams = Object.assign({}, e, { wts: wts });
  const strippedChars = /[!'()*]/g;
  const pairs = [];
  for (const key of Object.keys(signedParams).sort()) {
    let value = signedParams[key];
    // Only non-empty strings are filtered; numbers such as 0 pass through untouched.
    if (value && typeof value === "string") {
      value = value.replace(strippedChars, "");
    }
    // null / undefined values are left out of the signed query entirely.
    if (value != null) {
      pairs.push(`${encodeURIComponent(key)}=${encodeURIComponent(value)}`);
    }
  }

  return {
    w_rid: at(pairs.join("&") + mixinKey),
    wts: wts.toString(),
  };
}
/**
 * Return the file name of a URL/path without its extension(s): everything
 * after the last "/" and before the first "." that follows it.
 *
 * @param {string} e - URL or path.
 * @returns {string} Bare file name.
 */
function ft(e) {
  const fileName = e.slice(e.lastIndexOf("/") + 1);
  const [stem] = fileName.split(".");
  return stem;
}
/**
 * Report whether `e` is a Buffer-like value, i.e. its constructor exposes an
 * `isBuffer` function that accepts it.
 *
 * The hash routine `o` calls this as `r(i)` to decide whether its input must
 * be copied into a plain array. The previous stub ignored its argument and
 * returned the unrelated global `e`, so every non-string input was treated as
 * a buffer.
 *
 * @param {*} e - Value to test.
 * @returns {boolean} True only for Buffer-like objects.
 */
function r(e) {
  return (
    e != null &&
    e.constructor != null &&
    typeof e.constructor.isBuffer === "function" &&
    e.constructor.isBuffer(e)
  );
}
/**
 * Expand an array of 32-bit words into bytes, most significant byte first.
 *
 * @param {number[]} e - Words; each contributes four bytes.
 * @returns {number[]} Byte values in the range 0-255.
 */
function EwordsToBytes(e) {
  const bytes = [];
  for (let bit = 0; bit < 32 * e.length; bit += 8) {
    // bit >>> 5 selects the word, 24 - bit % 32 the byte inside it.
    bytes.push((e[bit >>> 5] >>> (24 - (bit % 32))) & 255);
  }
  return bytes;
}
/**
 * Pack bytes into 32-bit words, most significant byte first. A trailing
 * partial word is left zero-padded in its low bytes.
 *
 * @param {ArrayLike<number>} e - Byte values.
 * @returns {number[]} Signed 32-bit words.
 */
function EbytesToWords(e) {
  const words = [];
  for (let index = 0, bit = 0; index < e.length; index++, bit += 8) {
    // `undefined | x` is `x`, so the first write to each slot initialises it.
    words[bit >>> 5] |= e[index] << (24 - (bit % 32));
  }
  return words;
}
/**
 * Encode a string as UTF-8 bytes.
 *
 * `unescape(encodeURIComponent(...))` turns the string into one character per
 * UTF-8 byte, which `NstringToBytes` then converts to numbers.
 *
 * @param {string} e - Text to encode.
 * @returns {number[]} UTF-8 byte values.
 */
function TstringToBytes(e) {
  return NstringToBytes(unescape(encodeURIComponent(e)));
}

/**
 * Decode UTF-8 bytes back into a string (inverse of `TstringToBytes`).
 *
 * Previously this reached for `rt.bin.bytesToString`, but no `rt` object
 * exists in this file; the local `NbytesToString` is the same helper.
 *
 * @param {ArrayLike<number>} e - UTF-8 byte values.
 * @returns {string} Decoded text.
 */
function TbytesToString(e) {
  return decodeURIComponent(escape(NbytesToString(e)));
}

/**
 * Convert a string to bytes by keeping the low 8 bits of each UTF-16 code unit.
 *
 * @param {string} e - Input string.
 * @returns {number[]} One byte per code unit.
 */
function NstringToBytes(e) {
  const bytes = [];
  for (let index = 0; index < e.length; index++) {
    bytes.push(255 & e.charCodeAt(index));
  }
  return bytes;
}

/**
 * Convert bytes to a string, one character per byte.
 *
 * @param {ArrayLike<number>} e - Byte values.
 * @returns {string} String whose char codes are the given bytes.
 */
function NbytesToString(e) {
  const chars = [];
  for (let index = 0; index < e.length; index++) {
    chars.push(String.fromCharCode(e[index]));
  }
  return chars.join("");
}
/**
 * One step of the first round used by the hash routine `o`.
 *
 * Mixes `t`, `r`, `n` with `(t & r) | (~t & n)`, adds it to `e`, the unsigned
 * message word `o` and the constant `a`, rotates the sum left by `i` bits and
 * adds `t`.
 *
 * @returns {number} New value for the state variable passed as `e`.
 */
function hFF(e, t, r, n, o, i, a) {
  const mixed = (t & r) | (~t & n);
  const sum = e + mixed + (o >>> 0) + a;
  const rotated = (sum << i) | (sum >>> (32 - i));
  return rotated + t;
}
/**
 * One step of the second round used by the hash routine `o`.
 *
 * Same shape as `hFF`, but the mixing function is `(t & n) | (r & ~n)`.
 *
 * @returns {number} New value for the state variable passed as `e`.
 */
function yGG(e, t, r, n, o, i, a) {
  const mixed = (t & n) | (r & ~n);
  const sum = e + mixed + (o >>> 0) + a;
  const rotated = (sum << i) | (sum >>> (32 - i));
  return rotated + t;
}
/**
 * One step of the third round used by the hash routine `o`.
 *
 * Same shape as `hFF`, but the mixing function is `t ^ r ^ n`.
 *
 * @returns {number} New value for the state variable passed as `e`.
 */
function vHH(e, t, r, n, o, i, a) {
  const mixed = t ^ r ^ n;
  const sum = e + mixed + (o >>> 0) + a;
  const rotated = (sum << i) | (sum >>> (32 - i));
  return rotated + t;
}
/**
 * One step of the fourth round used by the hash routine `o`.
 *
 * Same shape as `hFF`, but the mixing function is `r ^ (t | ~n)`.
 *
 * @returns {number} New value for the state variable passed as `e`.
 */
function bII(e, t, r, n, o, i, a) {
  const mixed = r ^ (t | ~n);
  const sum = e + mixed + (o >>> 0) + a;
  const rotated = (sum << i) | (sum >>> (32 - i));
  return rotated + t;
}
// Core hash routine: turns a message into four 32-bit state words.
// The initial state (1732584193, -271733879, -1732584194, 271733878), the
// padding and the 4 x 16 round structure match MD5.
//
// i - message: a string, a Buffer-like value (as judged by `r`), an array,
//     a Uint8Array, or anything else with a toString().
// a - optional options; only `a.encoding === "binary"` is read here.
// Returns the result of `endian([c, l, f, d])`, i.e. the four state words
// after byte-swapping.
//
// NOTE(review): `o` is assigned without a declaration, so it becomes an
// implicit global and this line throws in strict mode / ES modules.
o = function o(i, a) {
// Normalise the input to a byte sequence: strings are converted with
// NstringToBytes ("binary" encoding) or TstringToBytes (default); values for
// which r(i) is truthy are copied into a plain array; arrays and Uint8Array
// are used as-is; everything else is replaced by its toString() result.
i.constructor == String ? i = a && "binary" === a.encoding ? NstringToBytes(i) : TstringToBytes(i) : r(i) ? i = Array.prototype.slice.call(i, 0) : Array.isArray(i) || i.constructor === Uint8Array || (i = i.toString());
// u: message words, s: message length in bits, c/l/f/d: running state.
// The loop body (next line) byte-swaps every word of u.
for (var u = EbytesToWords(i), s = 8 * i.length, c = 1732584193, l = -271733879, f = -1732584194, d = 271733878, p = 0; p < u.length; p++)
u[p] = 16711935 & (u[p] << 8 | u[p] >>> 24) | 4278255360 & (u[p] << 24 | u[p] >>> 8);
// Padding: set a single 1 bit right after the message and store the bit
// length in word 14 of the final 16-word block.
u[s >>> 5] |= 128 << s % 32,
u[14 + (s + 64 >>> 9 << 4)] = s;
// NOTE(review): o._ff/_gg/_hh/_ii are not assigned anywhere in the visible
// source and h, y, v, b are never used below; the rounds call the top-level
// hFF/yGG/vHH/bII functions directly.
var h = o._ff
, y = o._gg
, v = o._hh
, b = o._ii;
// Process the message 16 words at a time.
for (p = 0; p < u.length; p += 16) {
// Remember the state at the start of the block; it is added back at the end.
var m = c
, w = l
, g = f
, x = d;
// Round 1 (hFF), 16 steps.
c = hFF(c, l, f, d, u[p + 0], 7, -680876936),
d = hFF(d, c, l, f, u[p + 1], 12, -389564586),
f = hFF(f, d, c, l, u[p + 2], 17, 606105819),
l = hFF(l, f, d, c, u[p + 3], 22, -1044525330),
c = hFF(c, l, f, d, u[p + 4], 7, -176418897),
d = hFF(d, c, l, f, u[p + 5], 12, 1200080426),
f = hFF(f, d, c, l, u[p + 6], 17, -1473231341),
l = hFF(l, f, d, c, u[p + 7], 22, -45705983),
c = hFF(c, l, f, d, u[p + 8], 7, 1770035416),
d = hFF(d, c, l, f, u[p + 9], 12, -1958414417),
f = hFF(f, d, c, l, u[p + 10], 17, -42063),
l = hFF(l, f, d, c, u[p + 11], 22, -1990404162),
c = hFF(c, l, f, d, u[p + 12], 7, 1804603682),
d = hFF(d, c, l, f, u[p + 13], 12, -40341101),
f = hFF(f, d, c, l, u[p + 14], 17, -1502002290),
// The last hFF step is nested in the first yGG call; round 2 (yGG) starts here.
c = yGG(c, l = hFF(l, f, d, c, u[p + 15], 22, 1236535329), f, d, u[p + 1], 5, -165796510),
d = yGG(d, c, l, f, u[p + 6], 9, -1069501632),
f = yGG(f, d, c, l, u[p + 11], 14, 643717713),
l = yGG(l, f, d, c, u[p + 0], 20, -373897302),
c = yGG(c, l, f, d, u[p + 5], 5, -701558691),
d = yGG(d, c, l, f, u[p + 10], 9, 38016083),
f = yGG(f, d, c, l, u[p + 15], 14, -660478335),
l = yGG(l, f, d, c, u[p + 4], 20, -405537848),
c = yGG(c, l, f, d, u[p + 9], 5, 568446438),
d = yGG(d, c, l, f, u[p + 14], 9, -1019803690),
f = yGG(f, d, c, l, u[p + 3], 14, -187363961),
l = yGG(l, f, d, c, u[p + 8], 20, 1163531501),
c = yGG(c, l, f, d, u[p + 13], 5, -1444681467),
d = yGG(d, c, l, f, u[p + 2], 9, -51403784),
f = yGG(f, d, c, l, u[p + 7], 14, 1735328473),
// The last yGG step is nested in the first vHH call; round 3 (vHH) starts here.
c = vHH(c, l = yGG(l, f, d, c, u[p + 12], 20, -1926607734), f, d, u[p + 5], 4, -378558),
d = vHH(d, c, l, f, u[p + 8], 11, -2022574463),
f = vHH(f, d, c, l, u[p + 11], 16, 1839030562),
l = vHH(l, f, d, c, u[p + 14], 23, -35309556),
c = vHH(c, l, f, d, u[p + 1], 4, -1530992060),
d = vHH(d, c, l, f, u[p + 4], 11, 1272893353),
f = vHH(f, d, c, l, u[p + 7], 16, -155497632),
l = vHH(l, f, d, c, u[p + 10], 23, -1094730640),
c = vHH(c, l, f, d, u[p + 13], 4, 681279174),
d = vHH(d, c, l, f, u[p + 0], 11, -358537222),
f = vHH(f, d, c, l, u[p + 3], 16, -722521979),
l = vHH(l, f, d, c, u[p + 6], 23, 76029189),
c = vHH(c, l, f, d, u[p + 9], 4, -640364487),
d = vHH(d, c, l, f, u[p + 12], 11, -421815835),
f = vHH(f, d, c, l, u[p + 15], 16, 530742520),
// The last vHH step is nested in the first bII call; round 4 (bII) starts here.
c = bII(c, l = vHH(l, f, d, c, u[p + 2], 23, -995338651), f, d, u[p + 0], 6, -198630844),
d = bII(d, c, l, f, u[p + 7], 10, 1126891415),
f = bII(f, d, c, l, u[p + 14], 15, -1416354905),
l = bII(l, f, d, c, u[p + 5], 21, -57434055),
c = bII(c, l, f, d, u[p + 12], 6, 1700485571),
d = bII(d, c, l, f, u[p + 3], 10, -1894986606),
f = bII(f, d, c, l, u[p + 10], 15, -1051523),
l = bII(l, f, d, c, u[p + 1], 21, -2054922799),
c = bII(c, l, f, d, u[p + 8], 6, 1873313359),
d = bII(d, c, l, f, u[p + 15], 10, -30611744),
f = bII(f, d, c, l, u[p + 6], 15, -1560198380),
l = bII(l, f, d, c, u[p + 13], 21, 1309151649),
c = bII(c, l, f, d, u[p + 4], 6, -145523070),
d = bII(d, c, l, f, u[p + 11], 10, -1120210379),
f = bII(f, d, c, l, u[p + 2], 15, 718787259),
l = bII(l, f, d, c, u[p + 9], 21, -343485551),
// Add the block's starting state back in, kept as unsigned 32-bit values.
c = c + m >>> 0,
l = l + w >>> 0,
f = f + g >>> 0,
d = d + x >>> 0
}
// Byte-swap the four state words before handing them to the caller.
return endian([c, l, f, d])
};
/**
 * Rotate the 32-bit value `e` left by `t` bits.
 *
 * @param {number} e - Value, treated as a 32-bit integer.
 * @param {number} t - Bit count.
 * @returns {number} Rotated value as a signed 32-bit integer.
 */
function rotl(e, t) {
  return (e << t) | (e >>> (32 - t));
}

/**
 * Rotate the 32-bit value `e` right by `t` bits.
 *
 * @param {number} e - Value, treated as a 32-bit integer.
 * @param {number} t - Bit count.
 * @returns {number} Rotated value as a signed 32-bit integer.
 */
function rotr(e, t) {
  return (e << (32 - t)) | (e >>> t);
}
/**
 * Reverse the byte order of a 32-bit word, or of every element of an array.
 *
 * A number is swapped and returned; any other value is treated as an
 * array-like whose elements are swapped in place (recursively) and which is
 * then returned itself.
 *
 * @param {number|Array} e - Word or array of words.
 * @returns {number|Array} Swapped word, or the same array after mutation.
 */
function endian(e) {
  if (e.constructor == Number) {
    const oddBytes = 16711935 & rotl(e, 8);
    const evenBytes = 4278255360 & rotl(e, 24);
    return oddBytes | evenBytes;
  }
  let index = 0;
  while (index < e.length) {
    e[index] = endian(e[index]);
    index += 1;
  }
  return e;
}
/**
 * Render bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param {ArrayLike<number>} e - Byte values.
 * @returns {string} Hex string.
 */
function bytesToHex(e) {
  // Array.from keeps this working for typed arrays and other array-likes.
  return Array.from(e, (byte) => (byte >>> 4).toString(16) + (15 & byte).toString(16)).join("");
}
/**
 * Hash `t` with `o` and return the digest in the requested representation.
 *
 * @param {*} t - Message; must not be null or undefined.
 * @param {Object} [r] - Options, forwarded to `o`. `asBytes` selects a byte
 *   array, `asString` a byte string; otherwise a hex string is returned.
 * @returns {number[]|string} Digest.
 * @throws {Error} When `t` is null or undefined.
 */
var Qe = function (t, r) {
  if (t === null || t === undefined) {
    throw new Error("Illegal argument " + t);
  }
  var digestBytes = EwordsToBytes(o(t, r));
  if (r && r.asBytes) {
    return digestBytes;
  }
  if (r && r.asString) {
    return NbytesToString(digestBytes);
  }
  return bytesToHex(digestBytes);
};
/**
 * Unwrap an ES-module-style namespace object.
 *
 * @param {*} e - Any value.
 * @returns {*} `e.default` when `e` is truthy, flagged `__esModule` and owns a
 *   `default` property; otherwise `e` itself.
 */
function Ze(e) {
  const flaggedAsModule = Boolean(e) && Boolean(e.__esModule);
  if (flaggedAsModule && Object.prototype.hasOwnProperty.call(e, "default")) {
    return e.default;
  }
  return e;
}
// Hash function used by `lt`; `Ze` returns `Qe` unchanged unless it is an
// ES-module wrapper.
var at = Ze(Qe);

// Demo payload for the first page of comments. For later pages the
// `pagination_str` value carries the session id instead, e.g.
//   "{\"offset\":\"{\\\"type\\\":1,\\\"direction\\\":1,\\\"session_id\\\":\\\"1778169679258543\\\",\\\"data\\\":{}}\"}"
// (and `seek_rpid` was not part of that captured payload).
// The name `e` is kept because other top-level code in this file reads it.
const e = {
  "oid": "1906333968",
  "type": 1,
  "mode": 3,
  "pagination_str": "{\"offset\":\"\"}",
  "plat": 1,
  "seek_rpid": "",
  "web_location": 1315875
};

// Smoke test: prints the signature for the demo payload whenever the file runs.
console.log(lt(e));
然后在py里面调用js文件,获得加密后的载荷数据,然后发起请求,并对数据进行分析和保存,这里我存成了csv文件。这里需要注意的是参数有的是字符串有的是数字,所以要严格按照输出的格式来写,否则加密结果会一直通不过验证;还有就是pagination_str的格式,一定要按照控制台的输出格式写,否则验证失败。评论是懒加载的,第一页拿到session_id后,后续请求都要带着这个session_id。
python
import csv
import json
import os
import urllib.parse

import execjs
import requests
# Id of the video whose comments are fetched. Author's note: must be a str.
oid = "1906333968"
# Author's note: must be an int.
web_location = 1315875
# Empty for the first page; later pages are expected to carry a session id.
session_id = ""

# Browser cookies sent with every request. Login credentials are deliberately
# NOT stored here (see _CREDENTIAL_ENV_VARS below).
cookies = {
    'buvid3': '6C16A34E-4B78-F350-03AA-71E6B21A703519906infoc',
    'b_nut': '1726211919',
    '_uuid': '828DDCCD-F3CD-3997-11077-1729B6881A6120884infoc',
    'enable_web_push': 'DISABLE',
    'buvid4': '3CB58DB4-B2F0-07C1-06FA-E452949C4A8942274-024082300-j2Owk+KrE1E0oCXj+7DzqA%3D%3D',
    'header_theme_version': 'CLOSE',
    'rpdid': "|(u|kkmlu~ll0J'u~kYkukl|m",
    'fingerprint': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
    'buvid_fp_plain': 'undefined',
    'buvid_fp': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
    'DedeUserID': '37611353',
    'DedeUserID__ckMd5': 'af2f5320e5c29dea',
    'home_feed_column': '5',
    'browser_resolution': '2048-1023',
    'bili_ticket_expires': '1736494298',
    'CURRENT_FNVAL': '4048',
    # TODO
    'sid': '6rzu47nf',  # 8 characters
    'b_lsid': 'EF10A7B92_1944D9191B0',
    'bp_t_offset_37611353': '1020612344109072384',
}

# Session secrets must never be committed to source: they grant access to the
# account they were copied from. Each cookie below is added only when the
# matching environment variable is set; without them requests go out anonymously.
_CREDENTIAL_ENV_VARS = {
    'SESSDATA': 'BILI_SESSDATA',
    'bili_jct': 'BILI_JCT',
    'bili_ticket': 'BILI_TICKET',
}
for _cookie_name, _env_name in _CREDENTIAL_ENV_VARS.items():
    _cookie_value = os.environ.get(_env_name)
    if _cookie_value:
        cookies[_cookie_name] = _cookie_value

# Request headers copied from a desktop Chrome session. Cookies are passed
# separately through `cookies`, so there is no 'cookie' header here.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'origin': 'https://www.bilibili.com',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.bilibili.com/video/BV1xU411U7PW/?spm_id_from=333.1391.0.0&vd_source=fd84ddc58aead0485969c92933b61484',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}
def save2csv(*args):
    """Append one comment row to ``<all_count>_<name>.csv`` in the working directory.

    The positional values are expected in the order of the header row:
    rpid, replay_count, message, like, avatar, sex, uname, oid, parent,
    is_end, all_count, name. ``args[10]`` and ``args[11]`` form the file name.
    The header is written first when the file is still empty.

    Raises:
        ValueError: if fewer than 12 values are given.
    """
    # args[11] is read below, so 12 values are required (the old check allowed 11
    # and then failed with an IndexError).
    if len(args) < 12:
        raise ValueError("参数错误.")
    with open(f"{args[10]}_{args[11]}.csv", "a", newline='', encoding="utf-8") as f:
        f_csv = csv.writer(f)
        if f.tell() == 0:  # empty file: emit the header row first
            headers_csv = ['rpid', 'replay_count', 'message', 'like', 'avatar', 'sex',
                           'uname', 'oid', 'parent', 'is_end', 'all_count', 'name']
            f_csv.writerow(headers_csv)
        f_csv.writerow(list(args))
def handle_content(list_comment, is_end, all_count, name):
    """Print every comment in ``list_comment``, nested replies first.

    Args:
        list_comment: iterable of comment dicts, or None/empty for "no comments".
        is_end, all_count, name: page metadata, only forwarded to the
            recursive call for nested replies.

    Returns:
        None. Output goes to stdout, one space-separated line per comment.
    """
    # A page without comments may carry no list at all; treat that as empty.
    for comment in list_comment or []:
        rpid = comment["rpid"]  # id of this comment
        replay_count = comment["rcount"]  # number of replies
        message = comment["content"]["message"]  # comment text
        like = comment["like"]  # number of likes
        avatar = comment["member"]["avatar"]  # author's avatar
        sex = comment["member"]["sex"]  # author's gender
        uname = comment["member"]["uname"]  # author's nickname
        oid = comment["oid"]  # id shared by a comment and its replies
        parent = comment["parent"]  # id of the comment being replied to
        # Nested replies may be missing or None; .get() covers both.
        if comment.get("replies"):
            handle_content(comment["replies"], is_end, all_count, name)
        # save2csv(rpid,replay_count,message,like,avatar,sex,uname,oid,parent,is_end,all_count,name)
        print(rpid, replay_count, message, like, avatar, sex, uname, oid, parent)
def handle_cursor(cursor):
    """Pull the paging fields out of a response cursor.

    Returns:
        A ``(is_end, all_count, name)`` tuple: whether this is the last page,
        the total number of comments, and the section name.

    Raises:
        KeyError: if any of the three keys is missing.
    """
    wanted_keys = ("is_end", "all_count", "name")
    return tuple(cursor[key] for key in wanted_keys)
def get_params(session_id):
    """Build the signed query parameters for one page of comments.

    Args:
        session_id: session id for follow-up pages; falsy for the first page.

    Returns:
        dict of request parameters, including the ``w_rid`` and ``wts``
        fields produced by the ``lt`` function in ``./bili/comment_url.js``.
    """
    # pagination_str is JSON whose "offset" value is itself a JSON string, e.g.
    #   {"offset":"{\"type\":1,\"direction\":1,\"session_id\":\"1778143604964054\",\"data\":{}}"}
    # Compact separators reproduce the exact form shown above (no spaces).
    compact = (",", ":")
    if session_id:
        offset = json.dumps(
            {"type": 1, "direction": 1, "session_id": str(session_id), "data": {}},
            separators=compact,
        )
    else:
        offset = ""
    pagination_str = json.dumps({"offset": offset}, separators=compact)

    params = {
        "oid": oid,
        "type": 1,
        "mode": 3,
        "pagination_str": pagination_str,
        "plat": 1,
        'seek_rpid': '',
        "web_location": web_location
    }
    # Close the script file deterministically instead of leaking the handle.
    with open('./bili/comment_url.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    signature = execjs.compile(js_source).call('lt', params)
    params.update({
        'w_rid': signature["w_rid"],
        'wts': signature["wts"]
    })
    return params
# Script entry point: fetch comment pages one after another until the API
# reports the last page, printing every comment along the way.
if __name__ == "__main__":
    count = 1
    while True:
        params = get_params(session_id)
        print(params)
        response = requests.get(
            'https://api.bilibili.com/x/v2/reply/wbi/main',
            cookies=cookies,
            headers=headers,
            params=params,
            timeout=10,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        # Parse the body once and reuse it.
        data = response.json()["data"]
        cursor = data["cursor"]
        is_end, all_count, name = handle_cursor(cursor)
        handle_content(data["replies"] or [], is_end, all_count, name)
        print(f"第{count}页爬完了")
        count += 1
        if is_end:
            print(f"爬取完成,一共有{all_count}条")
            break
        # Carry the session id into the next request. Without this every
        # iteration asked for the first page again and the loop never ended.
        # NOTE(review): assumes the response cursor exposes the next page's
        # `session_id` — confirm against a live response.
        next_session_id = cursor.get("session_id")
        if not next_session_id:
            print("未获取到session_id,停止翻页")
            break
        session_id = next_session_id