I'm trying to write some parser on node. I can't understand something here.
I need to stop requesting next pages if previous result was wrong.
My current code is something like this:
var request = require('request');
var cheerio = require('cheerio');
var links = ['http://link1','http://link2','http://link3'];
for(l in links) {
var link = links[l];
request(link, function(err, page) {
if(err) throw err;
$ = cheerio.load(page.body);
if($('a').length < 2) {
// here i need to stop requesting next url(s) from links array somehow!
// (if this is the case of link1 then link2 and link3 will not request)
} else {
// do something...
}
$.html();
});
}
Node.js is an asynchronous language, so you can't stop it in native javascript like break;, so you probably try using the async module like this:
var request = require('request');
var cheerio = require('cheerio');
var links = ['http://link1', 'http://link2', 'http://link3'];
var async = require('async');
async.eachSeries(links, function(link, callback) {
request(link, function(err, page) {
if (err) {
callback(err);
return;
}
$ = cheerio.load(page.body);
if ($('a').length < 2) {
// here i need to stop requesting next url(s) from links array somehow!
// (if this is the case of link1 then link2 and link3 will not request)
callback(err);
return;
} else {
// do something...
}
$.html();
callback();
});
}, function(err) {
if (err) {
//do something id some of the links throws err
return;
}
//do womething if every request was success
});
The above code will run a function for each element in the array(links), if one of the links will call callback with error, it will immidiately stop the iteration and will call the callback function with the error. Otherwise if all of the elements will call callback without any params, the callback function will be called without any params.
Please look at async docs to see other options that might help you in other ways.
Related
I think the rendering takes place before the searching of the string on the files, i have tried different methods but don't seems to get this working. any help will be appreciated. im a noob on to the nodejs. im trying to get the id of the user and query and get all the data and there after see if he is in any of the lists given and finally render the page.
const j = [];
let name = '';
const filename = [];
var ext = '';
module.exports = function(app, express) {
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());
app.post('/cusdetails', isLoggedIn, function (req, res) {
var cusid=req.body.cusid;
var insertQuerys = "SELECT * FROM customer WHERE cusid=? ORDER BY rowid DESC LIMIT 1";
connection.query(insertQuerys,[cusid],
function(err, rows){
rows.forEach( (row) => {
name=row.fncus;
});
fs.readdir('./views/iplist', function(err, files) {
if (err)
throw err;
for (var index in files) {
j.push(files[index])
}
j.forEach(function(value) {
var k = require('path').resolve(__dirname, '../views/iplist/',value);
fs.exists(k, function(fileok){
if(fileok) {
fs.readFile(k, function(err, content) {
if (err) throw err;
if (content.indexOf(name) > -1) {
ext = path.extname(k);
filename.push(path.basename(k, ext));
}
});
}
else {
console.log(" FileNotExist ");
}
});
});
});
console.log(filename);
res.render('cusdetails.ejs', {rows: rows, user:req.user , aml: filename });
});
})
You can create simple Promise wrapper and then use it inside async/await function to pause execution until resolved.
// use mysql2 package as it provides promise, less work to write promise wrappers
const mysql = require('mysql2/promise');
// create the connection to database
const connection = mysql.createConnection({
host: 'localhost',
user: 'root',
database: 'test'
});
// sample wrapper
function some(k) {
// more advisable to have local variables, why do you need this to be array?
var filename = [];
return new Promise((resolve, reject) => {
// doing this is also not recommended check nodejs documentation **fs.exists** for more info
fs.exists(k, function(fileok){
if(fileok) {
fs.readFile(k, function(err, content) {
if (err) reject(err);
if (content.indexOf(name) > -1) {
ext = path.extname(k);
filename.push(path.basename(k, ext));
resolve(filename)
}
});
}
else {
// reject(new Error("FileNotExist"))
console.log(" FileNotExist ");
}
});
})
}
// note the use of async
app.post('/cusdetails', isLoggedIn, async function (req, res) {
var cusid=req.body.cusid;
var insertQuerys = "SELECT * FROM customer WHERE cusid=? ORDER BY rowid DESC LIMIT 1";
// using await to pause excution, waits till query is finished
const [rows] = await connection.query(insertQuerys,[cusid])
rows.forEach( (row) => {
name=row.fncus;
});
// then you can
var result = await some(k)
...
Note however this way you loose the advantage of concurrent execution, as it's kindoff blocking. If the result of one call is not used in another, you can execute in parallel and await for result to achieve sequencing like
const [rows] = connection.query(insertQuerys,[cusid])
var result = some(k)
console.log(await rows) // do something
console.log(await result) // do something
JavaScript is asynchronous. This means that if you have a function with a callback (i.e. your query), the callback will be called asynchronously, at an unknown time, while the other code executes.
You need to look up some tutorials how to deal with callbacks, to get a proper understanding of it. Another method is using async/await and/or promises.
Basically, if you take the following code:
console.log("this will print first");
setTimeout(function () {
console.log("this will print last");
}, 1000);
console.log("this will print second");
If you run the code above, the top level is executed synchronously, so, it first calls console.log, then it executes setTimeout, which is synchronous. It sets a timeout, then says "I'm ready", and the code continues to the other console.log. After 1 second (1000 milliseconds), the callback in the setTimeout function is executed, and only then that console.log is called. You can not make the rest of the code wait this way, you need to restructure your code or read into promises.
I'm using cheerio, request and Node.js.
When I run the script below, it outputs names in a wrong order. I believe that it's caused by asynchronous nature of it, how can I make it work in the "right" order? Do I need to use a sync package or is there a way to change it in a way so it'll work in a sync way?
app.get('/returned', function (req, res) {
for (var y = 0; y < 10; y++) {
var url = "http://example.com" + y + "/person.html";
request(url, function (err, resp, body) {
$ = cheerio.load(body);
var links = $('#container');
var name = links.find('span[itemprop="name"]').html(); // name
if (name == null) {
console.log("returned null");
} else {
console.log(name);
}
});
}
});
Promise makes this relatively easy:
app.get('/returned', function (req, res) {
let urls = [];
for (let y = 0; y < 10; y++) {
urls.push('http://example.com' + y + '/person.html');
}
Promise.all(urls.map(function (url) {
return new Promise(resolve, reject) {
request(url, function (err, resp, body) {
if (err) {return reject(err);}
let $ = cheerio.load(body);
let links = $('#container');
let name = links.find('span[itemprop="name"]').html(); // name
resolve({name: name, links: links, url: url});
});
});
}).then(function (result) {
result.forEach(function (obj) {
if (obj.name == null) {
console.log(obj.url, "returned null");
} else {
console.log(obj.url, obj.name);
}
});
}).catch(function (err) {
console.log(err);
});
});
I started by creating an array of urls to get, then I mapped that to an array of promises. When each of the requests are complete, i resolved the promise with the name, url, and links. When all promises were complete, I then looped over the result which will will be in the original order. This runs in parallel.
Nope, you shouldn't have to use a sync package. IMO the cleanest way is to use a mature 3rd party library.
I'd recommend async.
The async.series method would execute all request functions in the order they are given, then allow you to register a callback to fire when all requests have been made, or when an error has occurred.
https://github.com/caolan/async#seriestasks-callback
I have been trying to figure the following for the last couple of days and just can't seem to figure out the answer. I am new to node and JS (only experience is online tutorials).
I am trying to create a class (function) to scrape the source code from websites. I want to read in a url from the command line and return the html content. However, I seem to be getting different results when running the code different ways (which I think I should be getting the same results).
I have been reading about events in node and so I have used them a little in the code. One listener event prompts the me for the url and then after setting the url it (the listener function) emits a message, which is picked up by another listener which goes out and fetches the html content.
The problem I am having is that when I create an instance of the object, it seems like the request portion of the code does not execute. However, if I call the method from the instance I get the print out of the html content of the page.
Any help is appreciated. Thanks.
function test() {
var events = require('events').EventEmitter;
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
console.log("Input the URL: ");
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', function (text) {
that.url = util.inspect(text);
that.url = that.url.substr(1, that.url.length - 4);
that.eventEmitter.emit('Get url html');
process.exit();
});
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
scrapper.httpGet(); // This gives the desired results from that.httpGet
I solved using the Prompt library https://www.npmjs.com/package/prompt
function test() {
var events = require('events').EventEmitter;
var prompt = require('prompt');
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
prompt.start();
process.stdin.setEncoding('utf8');
prompt.get(['url'], function( err, result ) {
that.url = result.url;
that.eventEmitter.emit('Get url html');
} );
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body); // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
// scrapper.httpGet(); // This gives the desired results from that.httpGet
I ran the script from the commandline, input http://www.google.com and it retrieved the results without the additional call to scrapper.httpGet();
i have a sample function in which i want to trigger 3 sql queries in one express post request
function getRelatedSalespersonByCardcode(req, res) {
var reqJson = JSON.parse(req.body.json);
var count = Object.keys(reqJson.cardcode).length;
var tmpResult1, tmpResult2, tmpResult3 = [];
var q = sql.open(connstr, function (err) {
if (err) {
console.log(err);
return;
}
for (var i = 0; i < count; i++) {
q.queryRaw("SELECT Division, Salesperson FROM SomeDB.dbo.MS2_Rel_BusinessPartnerSalesperson WHERE CardCode = " + reqJson.cardcode[i], function (e, results) {
if (e) {
console.log(e);
return;
}
tmpResult1.push(results);
});
q.queryRaw("SELECT SlpCode, SlpName, Memo, Commission, GroupCode, UserSign, Active, U_wpABIS, U_sweDW," +
" U_sweATT, U_sweDIV, U_sweEMPLOYEE, U_sweRETAILER FROM SomeDB.dbo.OSLP WHERE U_sweId = " + tmpResult1[1], function (e, results) {
if (e) {
console.log(e);
return;
}
tmpResult2.push(results);
});
q.queryRaw("SELECT Code, Name, U_sweSALES FROM SomeDB.dbo.[#SWEDIV] WHERE Code = " + tmpResult3[0], function (e, results) {
if (e) {
console.log(e);
return;
}
tmpResult3.push(results);
});
}
});
res.send(200, tmpResult2);
}
Anyway after 1 function is called inside my req, res function the callback is triggert.. so it jumps directly to the res.send(...) line.
i've played a bit around and it seems that this is how express works.
after a bit of googleing around i found out that i have to use the async lib.
i'd like to ask why express is working like this and if anybody maybe have a better solution than the async approach. i simply need a way to realize my scenario.
This is no issue of express and no issue at all. This is like node.js async programming is working. Let's examine what's happening in your code:
var tmpResult2;
// 1. async open sql connection
var q = sql.open(connstr, function (err) {
// 3. query sql connection
});
// 2. render result
res.send(200, tmpResult2);
SQL connection is opened asynchronously.
Express render is called.
SQL queries are executed.
As a consequence this code piece sends data to the client before the data was fetched. So the most simple solution is to invoke res.send inside of the callback like this:
var tmpResult2;
// 1. async open sql connection
var q = sql.open(connstr, function (err) {
// 2. render result
res.send(200, tmpResult2);
});
You can use the wonderful async module to deal with the queries (Take a look at the async waterfall function).
I have been playing with nodejs and zombiejs to fetch some personal data from a site. Unfortunately I am stuck at a point where zombiejs only gets me the data from first link and then hangsup.
The steps I follow are-
Go to to the base url
Get the number of pages
Use async library to fetch them in series by opening a new browser window everytime. Note I only create a browser window instead of a totally new browser instance as it expensive to create one.
This is my code
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
});
},
function(e) {
console.log(e);
});
});
Results
>node main_zombie.js
..... HTML DUMP
http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=1
>
Any suggestions would be appreciated
Found the mistake
As per https://github.com/caolan/async#each
One needs to call the callback function with empty arguments or null if there is no error.
So the correct code would be
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
// Add callback and check if we reached the last page
if (k == 10) {
browser.close();
}
callback();
});
},
function(e) {
console.log(e);
});
});