# nodejs爬取豆瓣读书Top250
需要用到的依赖:
- request Node.js 第三方 HTTP 请求工具
- iconv-lite 用于在node当中处理各种奇特编码
- cheerio jquery核心功能的一个快速灵活实现
原文链接: https://book.douban.com/top250?start=0
实现对网址的爬取,获取当前页的书籍信息
const originRequest = require('request')
const iconv = require('iconv-lite')
const cheerio = require('cheerio')
/**
 * Sends an HTTP request for `url` through `originRequest`, identifying as a
 * desktop Chrome browser.
 *
 * @param {string} url - Address to fetch.
 * @param {Function} callback - Handed to `originRequest` unchanged.
 */
function request(url, callback) {
  // Pretend to be a regular browser.
  const headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
  };
  // encoding: null asks the request library for the raw body so the caller
  // can decode it itself.
  originRequest({ encoding: null, url, headers }, callback);
}
// Book records collected so far; every successful getList() call appends to it.
const lists = [];

/**
 * Fetches one page of the Douban book Top250 list and appends the books found
 * on it to `lists`.
 *
 * A failed request is logged and the page is skipped, so `lists` is left
 * untouched in that case.
 *
 * @param {number} [start=0] - Value of the `start` query parameter sent to Douban.
 */
function getList(start = 0) {
  request(`https://book.douban.com/top250?start=${start}`, (err, res, body) => {
    // On a transport error there is no body to decode; bail out instead of
    // letting iconv.decode throw inside the callback.
    if (err || !body) {
      console.error(`Failed to fetch Top250 page (start=${start}):`, err);
      return;
    }

    const $ = cheerio.load(iconv.decode(body, 'utf8'));
    const books = Array.from($('.indent table tr'), (row) => {
      const $row = $(row);
      const textOf = (selector) => $row.find(selector).text().trim();
      return {
        name: textOf('.pl2 a'), // title
        img: $row.find('img').attr('src'), // cover image URL
        bookDesc: textOf('p.pl'), // description line
        rating_num: textOf('.rating_nums'), // rating
        pl: textOf('.star .pl'), // number of people who rated
        quote: textOf('.quote'), // quoted text
      };
    });
    lists.push(...books);
  });
}
读取全部 250 条并写入 books.json
const originRequest = require('request')
const iconv = require('iconv-lite')
const cheerio = require('cheerio')
const fs = require('fs')
/**
 * Thin wrapper around `originRequest` that fetches `url` with a browser-like
 * User-Agent header.
 *
 * @param {string} url - Address to fetch.
 * @param {Function} callback - Forwarded to `originRequest` as-is.
 */
function request(url, callback) {
  const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36';
  const option = {
    // null encoding: the response body is not decoded by the request library.
    encoding: null,
    url,
    headers: { 'User-Agent': userAgent },
  };
  originRequest(option, callback);
}
// Every book scraped so far, across all pages.
const lists = [];

// The list holds 250 books at 25 per page, so the final page starts at 225.
const PAGE_SIZE = 25;
const LAST_PAGE_START = 225;

/**
 * Reads the fields of one book out of a single result row.
 *
 * @param {Function} $ - Cheerio instance the row belongs to.
 * @param {*} row - One element matched by `.indent table tr`.
 * @returns {{name: string, img: (string|undefined), bookDesc: string,
 *   rating_num: string, pl: string, quote: string}} The book record.
 */
function parseBook($, row) {
  const $row = $(row);
  const textOf = (selector) => $row.find(selector).text().trim();
  return {
    name: textOf('.pl2 a'), // title
    img: $row.find('img').attr('src'), // cover image URL
    bookDesc: textOf('p.pl'), // description line
    rating_num: textOf('.rating_nums'), // rating
    pl: textOf('.star .pl'), // number of people who rated
    quote: textOf('.quote'), // quoted text
  };
}

/**
 * Crawls the Douban book Top250 page by page, beginning at offset `start`,
 * collecting every book into `lists`. Once the last page has been processed
 * the whole of `lists` is written to `books.json`.
 *
 * If a page cannot be fetched the crawl stops, the error is logged and no
 * file is written.
 *
 * @param {number} [start=0] - Value of the `start` query parameter for the
 *   first page to fetch; following pages advance by 25.
 */
function getList(start = 0) {
  request(`https://book.douban.com/top250?start=${start}`, (err, res, body) => {
    // No body means nothing to decode; stop rather than crash in iconv.decode.
    if (err || !body) {
      console.error(`Failed to fetch Top250 page (start=${start}):`, err);
      return;
    }

    const $ = cheerio.load(iconv.decode(body, 'utf8'));
    lists.push(...Array.from($('.indent table tr'), (row) => parseBook($, row)));

    // `>=` rather than an exact match so a start value that is not a multiple
    // of 25 still terminates instead of paging forever.
    if (start < LAST_PAGE_START) {
      getList(start + PAGE_SIZE);
      return;
    }

    // Overwrite rather than append: appending a second array on a re-run
    // would leave books.json as invalid JSON.
    fs.writeFile('books.json', JSON.stringify(lists, null, '\t'), (writeErr) => {
      if (writeErr) {
        console.error('Failed to write books.json:', writeErr);
        return;
      }
      console.log('文件已被保存');
    });
  });
}
// Kick off the crawl from the first page (start=0).
getList(0);
爬取电影Top250也是一样的思路,只是文档结构不一样。