Zombiejs - fetching contents of links synchronouly - node.js

I have been playing with nodejs and zombiejs to fetch some personal data from a site. Unfortunately I am stuck at a point where zombiejs only gets me the data from first link and then hangsup.
The steps I follow are-
Go to to the base url
Get the number of pages
Use async library to fetch them in series by opening a new browser window everytime. Note I only create a browser window instead of a totally new browser instance as it expensive to create one.
This is my code
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
});
},
function(e) {
console.log(e);
});
});
Results
>node main_zombie.js
..... HTML DUMP
http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=1
>
Any suggestions would be appreciated

Found the mistake
As per https://github.com/caolan/async#each
One needs to call the callback function with empty arguments or null if there is no error.
So the correct code would be
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
// Add callback and check if we reached the last page
if (k == 10) {
browser.close();
}
callback();
});
},
function(e) {
console.log(e);
});
});

Related

page renders before getting all the values sorted

I think the rendering takes place before the searching of the string on the files, i have tried different methods but don't seems to get this working. any help will be appreciated. im a noob on to the nodejs. im trying to get the id of the user and query and get all the data and there after see if he is in any of the lists given and finally render the page.
const j = [];
let name = '';
const filename = [];
var ext = '';
module.exports = function(app, express) {
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());
app.post('/cusdetails', isLoggedIn, function (req, res) {
var cusid=req.body.cusid;
var insertQuerys = "SELECT * FROM customer WHERE cusid=? ORDER BY rowid DESC LIMIT 1";
connection.query(insertQuerys,[cusid],
function(err, rows){
rows.forEach( (row) => {
name=row.fncus;
});
fs.readdir('./views/iplist', function(err, files) {
if (err)
throw err;
for (var index in files) {
j.push(files[index])
}
j.forEach(function(value) {
var k = require('path').resolve(__dirname, '../views/iplist/',value);
fs.exists(k, function(fileok){
if(fileok) {
fs.readFile(k, function(err, content) {
if (err) throw err;
if (content.indexOf(name) > -1) {
ext = path.extname(k);
filename.push(path.basename(k, ext));
}
});
}
else {
console.log(" FileNotExist ");
}
});
});
});
console.log(filename);
res.render('cusdetails.ejs', {rows: rows, user:req.user , aml: filename });
});
})
You can create simple Promise wrapper and then use it inside async/await function to pause execution until resolved.
// use mysql2 package as it provides promise, less work to write promise wrappers
const mysql = require('mysql2/promise');
// create the connection to database
const connection = mysql.createConnection({
host: 'localhost',
user: 'root',
database: 'test'
});
// sample wrapper
function some(k) {
// more advisable to have local variables, why do you need this to be array?
var filename = [];
return new Promise((resolve, reject) => {
// doing this is also not recommended check nodejs documentation **fs.exists** for more info
fs.exists(k, function(fileok){
if(fileok) {
fs.readFile(k, function(err, content) {
if (err) reject(err);
if (content.indexOf(name) > -1) {
ext = path.extname(k);
filename.push(path.basename(k, ext));
resolve(filename)
}
});
}
else {
// reject(new Error("FileNotExist"))
console.log(" FileNotExist ");
}
});
})
}
// note the use of async
app.post('/cusdetails', isLoggedIn, async function (req, res) {
var cusid=req.body.cusid;
var insertQuerys = "SELECT * FROM customer WHERE cusid=? ORDER BY rowid DESC LIMIT 1";
// using await to pause excution, waits till query is finished
const [rows] = await connection.query(insertQuerys,[cusid])
rows.forEach( (row) => {
name=row.fncus;
});
// then you can
var result = await some(k)
...
Note however this way you loose the advantage of concurrent execution, as it's kindoff blocking. If the result of one call is not used in another, you can execute in parallel and await for result to achieve sequencing like
const [rows] = connection.query(insertQuerys,[cusid])
var result = some(k)
console.log(await rows) // do something
console.log(await result) // do something
JavaScript is asynchronous. This means that if you have a function with a callback (i.e. your query), the callback will be called asynchronously, at an unknown time, while the other code executes.
You need to look up some tutorials how to deal with callbacks, to get a proper understanding of it. Another method is using async/await and/or promises.
Basically, if you take the following code:
console.log("this will print first");
setTimeout(function () {
console.log("this will print last");
}, 1000);
console.log("this will print second");
If you run the code above, the top level is executed synchronously, so, it first calls console.log, then it executes setTimeout, which is synchronous. It sets a timeout, then says "I'm ready", and the code continues to the other console.log. After 1 second (1000 milliseconds), the callback in the setTimeout function is executed, and only then that console.log is called. You can not make the rest of the code wait this way, you need to restructure your code or read into promises.

Structure of a synchronous application in Node.js and MongoDb

I need to build an application that does these things (in order):
on load:
01- connect to MongoDB 'db'
02- creates a collection 'cas'
03- check if a web page has updates, if yes go to step 04, if not go to step 07
04- do web scraping (using Cheerio) of the web site and get a $ variable like that $ = cheerio.load(body);
05- elaborate this object to get only informations I'm interested in and organize them in a jsons object like this one:
var jsons = [
{year: 2015, country: Germany, value: 51},
{year: 2015, country: Austria, value: 12},
{year: 2016, country: Germany, value: 84},
{year: 2016, country: Bulgaria, value: 104},
...
];
06- insert each of these elements ({year: 2015, country: Germany, value: 51}, ...) in the collection 'cas' of database 'db'
07- download the data (for example in a csv file)
08- create a web page for data visualization of these data using D3.js
09- disconnect from 'db'
If Node.js were synchronous, I could write something like this:
var url = 'http://...';
var jsons = [];
connectDb('db');
createCollection('db', 'cas');
if(checkForUpdates(url)) {
var $ = scrape(url);
jsons = elaborate($);
for(var i = 0; i < jsons.length; i++) {
saveDocumentOnDbIfNotExistsYet('db', 'cas', jsons[i]);
}
}
downloadCollectionToFile('db', 'cas', './output/casData.csv');
createBarChart('./output/casData.csv');
disconnectDb('db');
But Node.js is asynchronous so this code would not work properly.
I've read that I can use Promise to get the code to run in a certain order.
I read the documentation about the Promise and some sites that showed simple tutorials.
The structure of a Promise is:
// some code (A)
var promise = new Promise(function(resolve, reject) {
// some code (B)
});
promise.then(function() {
// some code (C)
});
promise.catch(function() {
// some code (D)
});
// some code (E)
If I understood correctly, in this case the execution (if Node.js were synchronous) would be equivalent to:
// some code (A)
// some code (E)
if(some code (B) not produce errors) {
// some code (C)
}
else {
// some code (D)
}
or (swap between code A and E, because they are asynchronous)
// some code (E)
// some code (A)
if(some code (B) not produce errors) {
// some code (C)
}
else {
// some code (D)
}
So now I wonder what is the right structure for my application.
I thought about:
var cheerio = require('cheerio');
var express = require('express');
var fs = require('fs');
var MongoClient = require('mongodb').MongoClient;
var dbUrl = 'mongodb://localhost:27017/';
var dbName = 'db';
var collectionName = 'cas';
const app = express(); // run using > node app.js
// connect to db
var connect = function(url) {
return new Promise(function(resolve, reject) {
MongoClient.connect(url + dbName, function(err, db) {
if(err) {
reject(err);
}
else {
console.log('Connected');
resolve(db);
}
});
});
}
// create collection
connect.then(function(db) {
db.createCollection(collectionName, function(err, res) {
if(err) {
throw err;
}
else {
console.log('Collection', collectionName, 'created!');
}
});
});
// connection error
connect.catch(function(err) {
console.log('Error during connection...');
throw err;
});
It's right? If yes, how can I proceed with other steps?
I can I improve my code?
EDIT 1
Following the example of Андрей Щербаков, I modified my code in this way:
app.js:
// my files
var db = require('./middlewares/db.js');
var url = 'mongodb://localhost:27017/';
var dbName = 'db';
var collectionName = 'cas';
const start = async function() {
const connect = await db.connectToMongoDb(url, dbName);
const cas = await connect.createYourCollection(collectionName);
const isPageHasUpdates = oneMoreFunction(); // i don't know how you gonna check it
if(isPageHasUpdates) {
await step 4;
await step 5;
await step 6;
}
await step 7
return something; // if you want
}
start()
.then(res => console.log(res)) // here you can use result of your start function if you return something or skip this then
.catch(err => console.log(err)); // do something with your error
middlewares/db.js:
var MongoClient = require('mongodb').MongoClient;
let dbInstance;
var methods = {};
methods.connectToMongoDb = function(url, dbName) {
if(dbInstance) {
return dbInstance;
}
else {
MongoClient.connect(url + dbName, function(err, db) {
if(!err) {
dbInstance = db;
return db;
}
});
}
}
methods.createYourCollection = function(collectionName) {
?.createCollection(collectionName, function(err, res) {
if(err) {
throw err;
}
});
}
module.exports = methods;
But I'm not sure I'm doing well.
How can I separate function in different files? For example I want to put all the function about db in file middlewares/db.js. But I have some problems in line ?.createCollection(collectionName, function(err, res).
If you are running node version 7.6 or higher, better way will be to use async await which works with promises.
So your code will look like
const start = async() => {
const connect = await connectToMongoDb(url);
const cas = await connect.createYourCollection();
const isPageHasUpdates = oneMoreFunction(); // i don't know how you gonna check it
if(isPageHasUpdates) {
await step 4;
await step 5;
await step 6;
}
await step 7
return something; // if you want
}
start()
.then(res => console.log(res)) // here you can use result of your start function if you return something or skip this then
.catch(err => console.log(err)); // do something with your error
Sure any function you are gonna await should be promisified as you did with your connect function( but if you are using https://www.npmjs.com/package/mongodb functions already promisified)
Update
The best way will be to use mongoose, but if you want to work with native mongodb you can write your mongodb like this https://pastebin.com/BHHc0uVN (just an example)
You can expand this example as you want.
You can create function createCollection
const createCollection = (connection, collectionName) => {
return connection.createCollection(collectionName); // actually i'm not sure that this function exists in mongodb driver
}
And usage will be:
const mongodbLib = require('./lib/mongodb'); //path to db.js file
mongodbLib.init()
.then(connection => mongodbLib.createCollection(connection, 'cas'))
.then(() => doSmthElse())
Or if you are sure that init is done(you can do it once before you main script like starting server or whatever you doing)
const mongodbLib = require('./lib/mongodb'); //path to db.js file
const connection = mongodbLib.getConnection();
Or if you want to simple work with collection like in step 6, add your cas collection(like user in example file). But this you can use when your init function is done as well.
So usage will be
const mongodbLib = require('./lib/mongodb');
const cas = mongodbLib.collections.cas;
cas().insertMany(docs)
.then()
.catch()

How to Synchronize the file writes in Node.Js

I am using the EJS compile to create notification templates and I would like to know how to write the file to the file system in parallel and send the notification once all the files are saved.
Please see the below code snippet which I used
var fs = require('fs');
var ejs = require('ejs');
var arrayOfData = [someData]; //Prepare data from database
//Iterate through the data
for (var i = 0; i < arrayOfData.length; i++) {
generateFileFromTemplate(arrayOfData[i],function(){});
}
function generateFileFromTemplate(templateData,callback)
{
var outputFile = fileData.Id + ".html";
var compiled = ejs.compile(fs.readFileSync('email-template.ejs', 'utf8'));
var html = compiled(templateData);
fs.writeFile(outputFile, html, callback);
}
Please help.
Use async.each for your use case
async.each(arrayOfData,
function(ele, next){
generateFileFromTemplate(ele,function(){});
},
function(err){
if(err) console.log('err', err);
sendNotification();
}
);
You can use a great utility library called Async, particularly its parallel method: https://github.com/caolan/async#parallel.
Here's an example:
var async = require('async');
/*-------------*/
var tasks = arrayOfData.map(function(data) {
return function(cb) {
generateFileFromTemplate(data,function(){});
cb(null);
}
});
async.parallel(tasks, function(err) {
console.log('My job is done');
})

Inconsistent method result in node js

I have been trying to figure the following for the last couple of days and just can't seem to figure out the answer. I am new to node and JS (only experience is online tutorials).
I am trying to create a class (function) to scrape the source code from websites. I want to read in a url from the command line and return the html content. However, I seem to be getting different results when running the code different ways (which I think I should be getting the same results).
I have been reading about events in node and so I have used them a little in the code. One listener event prompts the me for the url and then after setting the url it (the listener function) emits a message, which is picked up by another listener which goes out and fetches the html content.
The problem I am having is that when I create an instance of the object, it seems like the request portion of the code does not execute. However, if I call the method from the instance I get the print out of the html content of the page.
Any help is appreciated. Thanks.
function test() {
var events = require('events').EventEmitter;
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
console.log("Input the URL: ");
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', function (text) {
that.url = util.inspect(text);
that.url = that.url.substr(1, that.url.length - 4);
that.eventEmitter.emit('Get url html');
process.exit();
});
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
scrapper.httpGet(); // This gives the desired results from that.httpGet
I solved using the Prompt library https://www.npmjs.com/package/prompt
function test() {
var events = require('events').EventEmitter;
var prompt = require('prompt');
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
prompt.start();
process.stdin.setEncoding('utf8');
prompt.get(['url'], function( err, result ) {
that.url = result.url;
that.eventEmitter.emit('Get url html');
} );
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body); // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
// scrapper.httpGet(); // This gives the desired results from that.httpGet
I ran the script from the commandline, input http://www.google.com and it retrieved the results without the additional call to scrapper.httpGet();

WebRTC getStat() API Set UP

I am trying to use getStat() from WebRTC's api to see if it provides any useful info measure latency and other video streaming data. The problem is that there's not much info of how to use it.
Even older existing examples are pretty rare but the api has changed since then.
For example, my set up:
peerconnection.getStats(function(stats) {
console.log(stats); } ));
This returns a RTCStatsResponse object with 2 functions
RTCStatsResponse {result: function, namedItem: function}
Trying to call that result() function returns an array of RTCStatsReport objects with type 'googLibjingleSession' for the 1st object and type 'googTrack' for the 2nd object. The other nameItem function is undefined when trying to call it
[RTCStatsReport, RTCStatsReport]
From what little info available (https://groups.google.com/forum/#!topic/discuss-webrtc/fpr4yn4-3sg), I would be getting alot more RTCStatObjects with more useful info than I am currently getting.
Does anyone have experience with using webrtc's getStats? I believe I may not be doing this correctly
The following solution works for me.
Creating peer connection
pc = new RTCPeerConnection(pc_config, pc_constraints);
Adding onaddstream handler
pc.onaddstream = onRemoteStreamAdded;
The handler itself
var onRemoteStreamAdded = function(event) {
attachMediaStream(remoteVideo, event.stream);
remoteStream = event.stream;
getStats(pc);
};
Pay attention on the getStats function called from the handler, the function is following
function getStats(peer) {
myGetStats(peer, function (results) {
for (var i = 0; i < results.length; ++i) {
var res = results[i];
console.log(res);
}
setTimeout(function () {
getStats(peer);
}, 1000);
});
}
The myGetStats function is a wrapper to make it possible universal in different browsers;
function myGetStats(peer, callback) {
if (!!navigator.mozGetUserMedia) {
peer.getStats(
function (res) {
var items = [];
res.forEach(function (result) {
items.push(result);
});
callback(items);
},
callback
);
} else {
peer.getStats(function (res) {
var items = [];
res.result().forEach(function (result) {
var item = {};
result.names().forEach(function (name) {
item[name] = result.stat(name);
});
item.id = result.id;
item.type = result.type;
item.timestamp = result.timestamp;
items.push(item);
});
callback(items);
});
}
};
Every second it will get statistics and print raw object into console log. You can parse the log and then change the code, getting necessary object's field.

Resources