micro-scraper
Node.js 爬虫示例 (for:百度百科)
app.js
var request = require('request'),
cheerio = require('cheerio'),
http = require('http'),
url = require('url');
// URL of the Baidu Baike entry to scrape; may be changed to another Baike entry.
var host = 'http://baike.baidu.com/view/39744.htm';
// Cache of rendered HTML pages, indexed by section number.
var html = [];
// Populate the cache once at startup, then refresh it every 15 minutes.
// setInterval needs a function: passing `scraper(host)` would pass the
// call's return value (undefined) instead of scheduling the scrape.
scraper(host);
setInterval(function () {
    scraper(host);
}, 1000*60*15);
/**
 * Escapes the characters that are significant in HTML so scraped text can be
 * embedded in generated markup without being interpreted as tags or entities.
 *
 * @param {*} text - Value to escape; it is converted to a string first.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
    return String(text)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

/**
 * Collects the siblings that follow a level-1 headline and renders each one
 * as a paragraph (level-2 headlines are rendered in bold).
 *
 * @param {Object} $headline - Cheerio selection of one `.headline-1` element.
 * @returns {string} HTML paragraphs for the section.
 */
function collectSectionBody($headline) {
    var str = '',
        $next = $headline.next();
    // Stop at the next level-1 headline, or when the element after the current
    // one has class "clear". The length check ends the walk when the siblings
    // run out; without it an empty selection would loop forever.
    while ($next.length > 0 &&
            !$next.hasClass('headline-1') &&
            !$next.next().hasClass('clear')) {
        if ($next.hasClass('headline-2')) {
            str += "<p><strong>" + escapeHtml($next.text()) + "</strong></p>";
        } else {
            str += "<p>" + escapeHtml($next.text()) + "</p>";
        }
        $next = $next.next();
    }
    return str;
}

/**
 * Renders one complete HTML page for a section.
 *
 * @param {string} title - Plain-text document title.
 * @param {string} heading - Plain-text section heading.
 * @param {string} navHtml - Already-built navigation markup.
 * @param {string} bodyHtml - Already-built section body markup.
 * @returns {string} The full HTML document.
 */
function renderPage(title, heading, navHtml, bodyHtml) {
    return "" +
        "<!DOCTYPE html>" +
        "<html>" +
        "<head>" +
        "<meta charset='UTF-8' />" +
        "<title>" + escapeHtml(title) + "</title>" +
        "<style type='text/css'>" +
        "body{width:600px;margin:2em auto;font-family:'Microsoft YaHei';}" +
        "p{line-height:24px;margin:1em 0;}" +
        "header{border-bottom:1px solid #cccccc;font-size:2em;font-weight:bold;padding-bottom:.2em;}" +
        "nav{float:left;font-family:'Microsoft YaHei';margin-left:-12em;width:9em;text-align:right;}" +
        "nav a{display:block;text-decoration:none;padding:.7em 1em;color:#000000;}" +
        "nav a:hover{background-color:#003f00;color:#f9f9f9;-webkit-transition:color .2s linear;}" +
        "</style>" +
        "</head>" +
        "<body>" +
        "<header>" + escapeHtml(heading) + "</header>" +
        "<nav>" + navHtml + "</nav>" +
        "<article>" + bodyHtml + "</article>" +
        "</body>" +
        "</html>";
}

/**
 * Downloads the page at `host`, splits it into sections by level-1 headline
 * and stores one rendered HTML page per section in the module-level `html`
 * array. Failures are logged and leave the existing pages untouched.
 *
 * @param {string} host - URL of the page to scrape.
 */
function scraper (host) {
    request(host, function (error, response, data) {
        if (error) {
            console.error('Scrape of ' + host + ' failed: ' + error.message);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('Scrape of ' + host + ' failed: HTTP ' + response.statusCode);
            return;
        }
        var $ = cheerio.load(data);
        var title = $('.title').first().text(),
            header = [],
            nav = [],
            body = [];
        // Remove elements that should not end up in the extracted text.
        $('.title').remove();
        $('.pic-info').remove();
        $('.count').remove();
        $('sup').remove();
        // Extract one heading, one nav link and one body per level-1 headline.
        $('#lemmaContent-0 .headline-1').each(function (i) {
            header.push($(this).find('.headline-content').text());
            nav.push("<span><a href='/" + i + "'>" + escapeHtml(header[i]) + "</a></span>");
            body.push(collectSectionBody($(this)));
        });
        // The page count follows the number of catalog entries, capped at the
        // number of sections actually collected so no page is rendered from
        // missing (undefined) heading/body data.
        var len = Math.min($('#catalog-holder-0 .catalog-item').length, header.length);
        var navHtml = nav.join('');
        for (var i = 0; i < len; i++) {
            html[i] = renderPage(title, header[i], navHtml, body[i]);
        }
    });
}
// Serve the cached pages: "/" maps to page 0 and "/<n>" to page n.
http.createServer(function (req, res) {
    var pathname = url.parse(req.url).pathname;
    // Always parse with radix 10 so the page index is read as a decimal number.
    var index = pathname === '/' ? 0 : parseInt(pathname.slice(1), 10);
    var page = html[index];
    // Unknown, non-numeric or not-yet-scraped pages get a 404 instead of an
    // empty 200 response.
    if (typeof page !== 'string') {
        res.writeHead(404, {"Content-Type":"text/plain; charset=UTF-8"});
        res.end('Not Found');
        return;
    }
    res.writeHead(200, {"Content-Type":"text/html; charset=UTF-8"});
    res.end(page);
}).listen(3000);
console.log('Server running at localhost:3000');