
Question:
I am in the market for a new vehicle. Instead of repeatedly searching the dealerships' websites, I thought this would be an interesting and fun opportunity to learn a little Node and MongoDB, so I'm scraping my local dealerships' websites to grab the makes and models that I am interested in.
The problem that I am running into is that node won't terminate after my final callback has run through.
var cheerio = require('cheerio');
var request = require('request');
var db = require('mongodb');
var S = require('string');
var log = require('console').log;
var async = require('async');
// Detail-page URLs collected by start() and consumed by getDetailInfo().
var links = [];
// Entry page to scrape; start() also reuses it for the additional page URLs.
var website = 'http://www.yahoo.com';
// Intended flow: connect to MongoDB, scrape, then close the connection in the
// final callback.
// NOTE(review): neither array entry has the task shape the async library
// documents for series (a function that receives a callback and calls it when
// done), so the final callback below is presumably never reached - this matches
// the hang described in the question. Confirm against the async documentation.
async.series(
[
// Task 1: declares no callback parameter, so it has no way to report completion.
function(){
log('starting');
db.connect('mongodb://127.0.0.1:27017/test',
function(err, base){
if(err) throw err;
// Rebinds the module-level `db` (initially the mongodb module) to the value
// handed to the connect callback; addToDB() later calls db.collection() on it.
db = base;
});
},
// This is a call expression, not a function: the request is issued while the
// array literal is being built (before async.series itself runs), and the array
// slot holds whatever request() returns.
request(website, start)
],
// Final callback: closes the database connection.
function(){
log('closing DB');
db.close();
});
/**
 * Callback for the initial request(): collects detail links from the first
 * page, fetches the remaining result pages, and calls getDetailInfo() once
 * every page has been handled.
 *
 * @param {?Error} err - error reported by request(), if any
 * @param {Object} resp - response object (unused)
 * @param {string} body - markup of the first page
 */
function start(err, resp, body) {
  if (err) {
    // Without a body there is nothing to parse; report and stop instead of
    // handing undefined to cheerio.
    log(err);
    return;
  }

  var $ = cheerio.load(body);
  var numPages = 2;
  $('.gbps').each(function () {
    links.push('http://www.yahoo.com');
  });

  // Page 1 is the response handled above; build URLs for pages 2..numPages.
  var pageURLS = [];
  for (var i = 2; i <= numPages; i++) {
    //create URLs for additional pages
    pageURLS.push(website);
  }

  log('getting page URLs');

  // With no additional pages no request callback would ever fire, so move on
  // to the detail pages right away.
  if (pageURLS.length === 0) {
    getDetailInfo();
    return;
  }

  // Count outstanding page requests down; the last one to finish (successfully
  // or not) triggers the detail-page stage.
  var pending = pageURLS.length;
  pageURLS.forEach(function (url) {
    request(url, function (error, response, bodies) {
      if (error) {
        // A failed page contributes no links but must still be counted,
        // otherwise the detail stage would never start.
        log(error);
      } else {
        var $page = cheerio.load(bodies);
        $page('.tab').each(function () {
          links.push('http://www.yahoo.com');
        });
      }
      pending--;
      if (pending === 0) {
        getDetailInfo();
      }
    });
  });
}
/**
 * Logs how many detail links were collected, then requests each of them,
 * handing every response to doStuff().
 */
function getDetailInfo() {
  var total = links.length;
  log(total);
  for (var n = 0; n < total; n++) {
    request(links[n], doStuff);
  }
}
/**
 * request() callback for a single detail page: parses the page and stores
 * the result via addToDB().
 *
 * @param {?Error} err - error reported by request(), if any
 * @param {Object} response - response object, passed through to the parser
 * @param {string} body - markup of the detail page
 */
function doStuff(err, response, body) {
  if (err) {
    log(err);
    // A failed request has no usable body; stop here rather than parsing
    // undefined and writing an empty record.
    return;
  }
  parseDetailResponse(err, response, body, addToDB);
}
/**
 * Parses one detail page and hands the extracted fields to `callback` as a
 * single array: [picture, description, price, specs, makeAndModel].
 * In this trimmed-down version every field is a placeholder; only the number
 * of spec entries depends on the page.
 *
 * @param {?Error} err - unused
 * @param {Object} resp - unused
 * @param {string} body - markup of the detail page
 * @param {function(Array)} callback - receives the extracted fields
 */
function parseDetailResponse(err, resp, body, callback) {
  log('parsing');
  var page = cheerio.load(body);
  var specNodes = page('.specifications').children();
  // One placeholder name/value pair per child of the specifications element.
  var specs = specNodes.map(function (index, node) {
    return { name: 'key', value: 'value' };
  });
  callback(['picture url', 'vehicle description', 100, specs, 'makeAndModel']);
}
/**
 * Extracts name/value pairs from the `.gbps` elements of a page.
 * Elements 0-2 are labelled year, make and model; element 3 keeps its text
 * but is labelled 'ignore'; any further element becomes an 'ignore'/'ignore'
 * pair.
 *
 * @param {string} stuff - markup to parse
 * @returns {Object} the result of cheerio's map(): one {name, value} entry per
 *     matched element
 */
function getMakeAndModel(stuff) {
  var $ = cheerio.load(stuff);
  // Label for each position whose text is kept.
  var labels = ['year', 'make', 'model', 'ignore'];
  // Returned directly; the original stored this in an undeclared `temp`,
  // which leaked a global (and throws in strict mode).
  return $('.gbps').map(function (i) {
    if (i < labels.length) {
      // `this` is the current element, as bound by map().
      return { name: labels[i], value: $(this).text() };
    }
    return { name: 'ignore', value: 'ignore' };
  });
}
/**
 * Upserts one vehicle into the `carsTest` collection, keyed by VIN.
 *
 * @param {Array} arr - [picture, description, price, specs, makeAndModel],
 *     where specs and makeAndModel are indexable lists of {name, value} pairs
 */
function addToDB(arr) {
  log('adding to DB');
  // Declared locally: as undeclared assignments these leaked globals that
  // were shared between concurrent calls (and throw in strict mode).
  var pic = arr[0];
  var description = arr[1];
  var price = arr[2];
  var specs = arr[3];
  var makeAndModel = arr[4];

  // Flatten both pair lists into one lookup object. Iterating from the end
  // means that, for a repeated name, the earliest entry wins.
  var obj = {};
  var i;
  for (i = specs.length - 1; i >= 0; i--) {
    obj[specs[i].name] = specs[i].value;
  }
  for (i = makeAndModel.length - 1; i >= 0; i--) {
    obj[makeAndModel[i].name] = makeAndModel[i].value;
  }

  db.collection('carsTest').update(
    {VIN: obj.VIN},
    {
      $set: {
        VIN: obj.VIN,
        make: obj.make,
        model: obj.model,
        year: obj.year,
        price: price,
        engine: obj.Engine,
        interior: obj.Interior,
        exterior: obj.Exterior,
        'model code': obj['Model Code'],
        // NOTE(review): appears to yield NaN when the spec is absent (the
        // sample document shows "stock number" : NaN) - confirm whether a
        // missing value should be stored differently.
        'stock number': S(obj['Stock Number']).toInt(),
        transmission: obj.Transmission,
        mileage: obj.Mileage ? obj.Mileage : 0,
        description: description,
        picture: pic
      }
    },
    {upsert: true, safe: true},
    function (err) {
      if (err) {
        throw err;
      }
    });
  // Logged right after the update is issued, not from its callback.
  log('finished with this one!');
}
I've omitted and changed a fair amount to make this a proof of concept, without much error checking, but even this stripped-down version adds the document and then won't quit. Node just sits there, waiting for something to happen; it never calls the final callback to close the DB and exit.
> db.carsTest.find().pretty()
{
"_id" : ObjectId("52139aa7c9b7a39e0f1eb61d"),
"VIN" : null,
"description" : "vehicle description",
"engine" : null,
"exterior" : null,
"interior" : null,
"make" : null,
"mileage" : 0,
"model" : null,
"model code" : null,
"picture" : "picture url",
"price" : 100,
"stock number" : NaN,
"transmission" : null,
"year" : null
}
Answer1: I think that you misunderstand how `async.series` works.
Your functions in `async.series` don't take `callback` as an argument and they don't call it. And that `request(...)` entry is probably not a function at all. That's probably why it breaks the async loop. Try this:
// Corrected task list: every task accepts `callback` and calls it when its
// asynchronous work is done, which lets async.series reach the final callback.
async.series(
  [
    function connectToDb(callback) { // <--- missing callback
      log('starting');
      db.connect('mongodb://127.0.0.1:27017/test', function onConnected(err, base) {
        if (err) {
          throw err;
        }
        db = base;
        callback(); // <--- missing callback
      });
    },
    function fetchFirstPage(callback) { // <--- missing function with callback
      // Pass the task's callback along so start() can signal completion.
      request(website, function onFirstPage(err, resp, body) {
        start(err, resp, body, callback);
      });
    }
  ],
  function closeDb() {
    log('closing DB');
    db.close();
  }
);
Note that I've added a `callback` argument when calling `start`. Thus you will have to refactor your code heavily so that every function accepts a `callback` which can be called at the end, when you know that all jobs are done. For example, you can add `async.parallel` inside `start`, and this function may look like this:
/**
 * Sketch of start() reworked to report completion: builds one job per page
 * URL, runs the jobs through async.parallel, and calls `callback` once all
 * of them have finished.
 *
 * @param {?Error} err - error from the initial request (unused in this sketch)
 * @param {Object} resp - response object (unused in this sketch)
 * @param {string} body - page markup (unused in this sketch)
 * @param {function()} callback - called after every job has completed
 */
function start(err, resp, body, callback) {
  // some stuff happens here
  var jobs = pageURLS.map(function (url) {
    return function (clb) {
      request(url, function (error, response, bodies) {
        // some stuff
        clb(); // <--- this refers to the local callback for the job
      });
    };
  });
  async.parallel(jobs, function () {
    // all jobs are done, let's finalize everything
    callback();
  });
}