Node.js - Web Crawling

Updated: 2018-05-27

Crawling

Use the request package or Node's built-in http module to fetch the raw HTML.

Request

var request = require('request');

var url = 'http://foo.com';

// Plain text: the callback receives the error (if any), the response
// object and the response body.
request(url, function (err, res, body) {
  // Check `err` first, then work with `res` / `body`.
});

// gzip: with `gzip: true`, request asks the server for a compressed
// response and decodes it before handing `body` to the callback.
request({url: url, gzip: true}, function (err, res, body) {
  // Check `err` first, then work with `res` / `body`.
});

http

var http = require('http');
// `qs` was used below without being defined; Node's built-in querystring
// module provides the stringify() needed to build the query string.
var qs = require('querystring');

// NOTE(review): `search` (the query text) is assumed to be defined by the
// surrounding code - confirm before running this snippet.
var req = http.request({
    host: 'search.twitter.com',
    path: '/search.json?' + qs.stringify({ q: search })
}, function (res) {
    // Consume the response stream `res` here.
});

// http.request() does not send anything until end() is called on the
// returned request object.
req.end();

If the response is plain text:

// Fetch `url` and pass the complete response body to parsePage() as a string.
http.get(url, function(res) {
  // Keep the raw Buffer chunks; they are decoded only once, at the end.
  var chunks = [];
  res
    .on("data", function(chunk) {
      chunks.push(chunk);
    })
    .on("end", function() {
      // Concatenate before decoding: converting each chunk to a string on
      // its own (as Array#join does) corrupts multi-byte UTF-8 characters
      // that happen to be split across two chunks.
      parsePage(Buffer.concat(chunks).toString());
    });
});

If the response is gzipped:

var zlib = require("zlib");

// Fetch `url`, decompress the gzip-encoded response and pass the complete
// decoded body to parsePage() as a string.
http.get(url, function(res) {
  // Keep the decompressed Buffer chunks; they are decoded only once, at the end.
  var chunks = [];
  var gunzip = zlib.createGunzip();
  // Route the compressed response through the gunzip stream.
  res.pipe(gunzip);

  gunzip
    .on("data", function(chunk) {
      chunks.push(chunk);
    })
    .on("end", function() {
      // Concatenate before decoding: converting each chunk to a string on
      // its own (as Array#join does) corrupts multi-byte UTF-8 characters
      // that happen to be split across two chunks.
      parsePage(Buffer.concat(chunks).toString());
    });
});

Parsing

Use cheerio to parse the HTML; after that, everything works like jQuery.

var cheerio = require("cheerio");
request(url, function(err, res, body) {
  // Declare `$` locally: assigning to an undeclared name creates an
  // implicit global (and throws a ReferenceError in strict mode).
  var $ = cheerio.load(body);
  // `$` can now be used to query the parsed document.
  //...
});

Each

// Print the inner HTML of every table row.
var rows = $("table tr");
rows.each(function(index, row) {
  // `row` is the same element that `this` refers to inside the callback.
  var rowHtml = $(row).html();
  console.log(rowHtml);
});

To get the list of cells in each row and output the first column:

// For every table row, collect its cells and print the text of the first one.
var tableRows = $("table tr");
tableRows.each(function(index, row) {
  // `row` is the same element that `this` refers to inside the callback.
  var cells = $(row).find("td");
  var firstCell = cells.eq(0);
  console.log(firstCell.text());
});