以前老觉得爬虫就是 Python 的专利,我想做的事情都会因为 Python 太难、自己又不想学而畏首畏尾,今天就来个 Node 的爬虫测试看看。(爬取 BOSS 直聘的数据)
狠人话少,上干货!
安装node我就不说了,但凡有点基础都知道。建立文件夹,执行初始化。
js
npm init -y
npm install --save puppeteer exceljs
建立index.js文件
js
import puppeteer from 'puppeteer';
import ExcelJS from 'exceljs';
// Crawls front-end job postings from zhipin.com with Puppeteer, visits every
// posting for its description, and writes the result to output.xlsx with
// ExcelJS. Runs as an ES module (relies on top-level await).
const LIST_URL = 'https://www.zhipin.com/web/geek/job?query=前端&city=100010000';

const browser = await puppeteer.launch({
  headless: false,
  defaultViewport: {
    width: 0,
    height: 0,
  },
});

try {
  const page = await browser.newPage();
  await page.goto(LIST_URL);
  await page.waitForSelector('.job-list-box');

  // The script treats the second-to-last pager link as the last page number.
  const parsedTotal = await page.$eval('.options-pages a:nth-last-child(2)', (el) => {
    return Number.parseInt(el.textContent, 10);
  });
  // Fall back to a single page when the pager text is not a usable number,
  // otherwise the loop below would silently collect nothing.
  const totalPage = Number.isInteger(parsedTotal) && parsedTotal > 0 ? parsedTotal : 1;

  const allJobs = [];
  for (let pageNo = 1; pageNo <= totalPage; pageNo++) {
    await page.goto(`${LIST_URL}&page=${pageNo}`);
    await page.waitForSelector('.job-list-box');
    const jobs = await page.$eval('.job-list-box', (listEl) => {
      // Runs in the browser context, so the helper must be defined here.
      // A card missing a sub-element yields '' instead of throwing and
      // discarding the whole page of results.
      const textOf = (root, selector) => root.querySelector(selector)?.textContent ?? '';
      return [...listEl.querySelectorAll('.job-card-wrapper')].map((card) => ({
        job: {
          name: textOf(card, '.job-name'),
          area: textOf(card, '.job-area'),
          salary: textOf(card, '.salary'),
        },
        link: card.querySelector('a')?.href ?? '',
        company: {
          name: textOf(card, '.company-name'),
        },
      }));
    });
    allJobs.push(...jobs);
  }

  // Visit each posting for its description. A failure on one posting is
  // logged and skipped so the data collected so far is still exported.
  for (const entry of allJobs) {
    if (!entry.link) {
      continue;
    }
    try {
      await page.goto(entry.link);
      await page.waitForSelector('.job-sec-text');
      entry.desc = await page.$eval('.job-sec-text', (el) => el.textContent);
    } catch (err) {
      console.warn(`Could not read description for ${entry.link}: ${err.message}`);
    }
  }

  // Create the workbook and a single worksheet.
  const workbook = new ExcelJS.Workbook();
  const worksheet = workbook.addWorksheet('爬取数据');
  // Column headers; `key` selects the property of each row object.
  // NOTE(review): `job` and `company` are nested objects in this version, so
  // those cells are not plain strings — confirm how ExcelJS renders them.
  worksheet.columns = [
    { header: '工作名称', key: 'job', width: 20 },
    { header: '链接', key: 'link', width: 15 },
    { header: '公司名称', key: 'company', width: 15 },
    { header: '描述', key: 'desc', width: 15 },
  ];
  // Write one row per posting.
  for (const entry of allJobs) {
    worksheet.addRow(entry);
  }
  // Save the file.
  await workbook.xlsx.writeFile('output.xlsx');
} finally {
  // Always release the browser process, even when crawling or export fails.
  await browser.close();
}
成果如下
如果想让导出的 Excel 更规整,用 JS 把数据解析得更细致一些就好了。
js
import puppeteer from 'puppeteer';
import ExcelJS from 'exceljs';
// Crawls front-end job postings from zhipin.com with Puppeteer, flattens each
// posting into one row (name, area, salary, link, company, description), and
// writes the rows to output.xlsx with ExcelJS. Runs as an ES module (relies on
// top-level await).
const LIST_URL = 'https://www.zhipin.com/web/geek/job?query=前端&city=100010000';
const PAGE_DELAY_MS = 2000;

// Plain timer-based pause; used instead of page.waitForTimeout, which is not
// available in recent Puppeteer releases.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

const browser = await puppeteer.launch({
  headless: false,
  defaultViewport: {
    width: 0,
    height: 0,
  },
});

try {
  const page = await browser.newPage();
  await page.goto(LIST_URL);
  await page.waitForSelector('.job-list-box');

  // The script treats the second-to-last pager link as the last page number.
  const parsedTotal = await page.$eval('.options-pages a:nth-last-child(2)', (el) => {
    return Number.parseInt(el.textContent, 10);
  });
  // Fall back to a single page when the pager text is not a usable number,
  // otherwise the loop below would silently collect nothing.
  const totalPage = Number.isInteger(parsedTotal) && parsedTotal > 0 ? parsedTotal : 1;

  const allJobs = [];
  for (let pageNo = 1; pageNo <= totalPage; pageNo++) {
    // Pause before each list-page request, as the original script did.
    await sleep(PAGE_DELAY_MS);
    await page.goto(`${LIST_URL}&page=${pageNo}`);
    await page.waitForSelector('.job-list-box');
    const jobs = await page.$eval('.job-list-box', (listEl) => {
      // Runs in the browser context, so the helper must be defined here.
      // A card missing a sub-element yields '' instead of throwing and
      // discarding the whole page of results.
      const textOf = (root, selector) => root.querySelector(selector)?.textContent ?? '';
      return [...listEl.querySelectorAll('.job-card-wrapper')].map((card) => ({
        job: {
          name: textOf(card, '.job-name'),
          area: textOf(card, '.job-area'),
          salary: textOf(card, '.salary'),
        },
        link: card.querySelector('a')?.href ?? '',
        company: {
          name: textOf(card, '.company-name'),
        },
      }));
    });
    allJobs.push(...jobs);
  }

  // Flatten every posting into a row whose keys match the worksheet columns.
  // The fields are set on each entry (not on the array), and the company name
  // is read from the entry's own `company` object.
  for (const entry of allJobs) {
    entry.name = entry.job.name;
    entry.area = entry.job.area;
    entry.salary = entry.job.salary;
    entry.company = entry.company.name;

    if (!entry.link) {
      continue;
    }
    // A failure on one posting is logged and skipped so the data collected
    // so far is still exported.
    try {
      await page.goto(entry.link);
      await page.waitForSelector('.job-sec-text');
      entry.desc = await page.$eval('.job-sec-text', (el) => el.textContent);
    } catch (err) {
      console.warn(`Could not read description for ${entry.link}: ${err.message}`);
    }
  }

  // Create the workbook and a single worksheet.
  const workbook = new ExcelJS.Workbook();
  const worksheet = workbook.addWorksheet('爬取数据');
  // Column headers; `key` selects the property of each row object.
  worksheet.columns = [
    { header: '工作名称', key: 'name', width: 20 },
    { header: '工作地点', key: 'area', width: 20 },
    { header: '工资', key: 'salary', width: 20 },
    { header: '链接', key: 'link', width: 15 },
    { header: '公司名称', key: 'company', width: 15 },
    { header: '描述', key: 'desc', width: 15 },
  ];
  // Write one row per posting.
  for (const entry of allJobs) {
    worksheet.addRow(entry);
  }
  // Save the file.
  await workbook.xlsx.writeFile('output.xlsx');
} finally {
  // Always release the browser process, even when crawling or export fails.
  await browser.close();
}