Scraping with PhantomJS and NodeJS

I'm following the tutorial listed here:
http://code.tutsplus.com/tutorials/screen-scraping-with-nodejs--net-25560
When I run the code:
var host = 'http://www.shoutcast.com/?action=sub&cat=Hindi#134';
var phantom = require('phantom');

phantom.create(function(ph) {
    return ph.createPage(function(page) {
        return page.open(host, function(status) {
            console.log("opened site? ", status);
            page.injectJs('http://ajax.googleapis.com/ajax/libs/jquery/1.11.0/jquery.min.js', function() {
                // jQuery loaded.
                // Wait a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
                setTimeout(function() {
                    return page.evaluate(function() {
                        // Get what you want from the page using jQuery. A good way is to populate an object
                        // with all the jQuery commands that you need and then return the object.
                        console.log(document.getElementsByClassName('transition')[0]);
                        return document.getElementsByClassName('transition')[0];
                    }, function(result) {
                        console.log(result);
                        ph.exit();
                    });
                }, 5000);
            });
        });
    });
});
I get the following error:
phantom stdout: ReferenceError: Can't find variable: $
phantom stdout: phantomjs://webpage.evaluate():7
phantomjs://webpage.evaluate():10
phantomjs://webpage.evaluate():10
I have no idea what this means, and I haven't found any help on how to resolve it.
How can this be solved?
Basically, I want all the 'a' tags with class 'transition' from the site I'm scraping. All these tags are loaded asynchronously on the site.

The $ error comes from jQuery (and possible conflicts with it). You hardly need to inject jQuery just to scrape 'a' tags with the class 'transition'; you can always use document.querySelector or document.querySelectorAll.
var host = 'http://www.shoutcast.com/?action=sub&cat=Hindi#134';
var phantom = require('phantom');

phantom.create(function(ph) {
    ph.createPage(function(page) {
        page.open(host, function(status) {
            console.log("opened site? ", status);
            // Wait a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
            setTimeout(function() {
                page.evaluate(function() {
                    // here you need to add more code to get the html/text
                    // more code in case you use querySelectorAll
                    return document.querySelector('a.transition');
                    //return document.querySelectorAll('a.transition');
                },
                function(result) {
                    console.log(result);
                    ph.exit();
                });
            }, 5000);
        });
    });
});
However, I am not sure about the way function (result) { console.log(result); ... } is coded; I don't know whether page.evaluate takes a callback function as its second parameter. Please check that against the documentation.
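One thing worth noting: page.evaluate can only pass serializable values back to Node, so returning a DOM node itself tends to come back as an empty object. A minimal sketch of just the evaluate step (an assumption on my part, extracting the href and text of every matching link; the rest of the script stays the same):
page.evaluate(function () {
    // Map the NodeList to plain objects; DOM nodes do not survive
    // serialization out of the PhantomJS page context.
    var links = document.querySelectorAll('a.transition');
    return Array.prototype.map.call(links, function (a) {
        return { href: a.href, text: a.textContent };
    });
}, function (result) {
    console.log(result); // array of { href, text } objects
    ph.exit();
});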

Related

How to Get Window Variable Using WebdriverIO

I am trying to run webdriverio with PhantomJS/Chrome to load a page and then grab the window object for use with other scripts. For some reason I am unable to get the window object. Every time I run it, I end up seeing output like this:
Title is: XXXXX
{ state: 'pending' }
Using the following script:
var webdriverio = require('webdriverio');
var options = {
    desiredCapabilities: {
        browserName: 'chrome',
        logLevel: 'verbose'
    }
};
var client = webdriverio.remote(options);

client
    .init()
    .url('https://xxxx.com')
    .waitUntil(function () {
        return client.execute(function () {
            return Date.now() - window.performance.timing.loadEventEnd > 40000;
        }).then(function (result) {
            console.log(window);
            return window;
        });
    })
    .end();
Does anyone know how I can fix my code so that the window object is returned to my NodeJS console app after the page is completely loaded?
Thanks!
Window is an object in the browser's DOM, so it's only available inside of the 'execute' function. If you wanted access to it, you could return it from your 'execute' function:
return client.execute(function () {
    return window;
}).then(function (result) {
    console.log(result);
});
This works as well:
browser.execute('return window');
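Note that the full window object usually does not serialize cleanly over the WebDriver protocol, so in practice you return only the properties you need. A minimal sketch along those lines (the chosen properties are just examples, and depending on your webdriverio version the data may arrive wrapped in result.value):
client
    .init()
    .url('https://xxxx.com')
    .execute(function () {
        // Runs in the browser: return only serializable values.
        return {
            loadEventEnd: window.performance.timing.loadEventEnd,
            userAgent: window.navigator.userAgent
        };
    })
    .then(function (result) {
        console.log(result.value); // data returned from the browser
    })
    .then(function () {
        return client.end();
    });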

Gather POST data asynchronously

I've got a node.js-based webserver running at home and I'm trying to implement a login form.
So basically I need to access POSTed data (login/password). I found this:
How do you extract POST data in Node.js?
(req.on('data'...) & req.on('end'...))
But I need to do this asynchronously; can someone tell me how to do that?
(I need this code to be blocking, not non-blocking)
EDIT: All my code is available on GitHub:
https://github.com/Fointard/NodeJs/tree/authentication
The problem lies here: https://github.com/Fointard/NodeJs/blob/authentication/js/reqHandlers/auth.js. On lines 98 and 104, I'm relying on the 'data' and 'end' events, but I'd like to do that asynchronously so that checkID() (line 95) is able to return true or false.
You can't. HTTP requests are I/O operations and will always be resolved asynchronously. Your checkID function will never return a value. You need to add a second parameter (usually called callback) that will be called with true or false.
var qs = require('querystring'); // needed for qs.parse below

function checkID(req, callback) {
    var body = '';
    req.on('data', function (data) {
        body += data;
        if (body.length > 1e6)
            req.connection.destroy();
    });
    req.on('end', function () {
        var post = qs.parse(body);
        if ('loginInputLogin' in post && 'loginInputPassword' in post) {
            console.log('login : ' + post['loginInputLogin']);
            console.log('password : ' + post['loginInputPassword']);
        }
        if (post['loginInputLogin'] === 'fointard' && post['loginInputPassword'] === 'f01n') {
            console.log('ID confirmed');
            return callback(true); // return so callback(false) below is not also called
        }
        callback(false);
    });
}
And use it like so:
checkID(yourRequest, function(success) {
    if (success) {
        // login successful
    } else {
        // login failed
    }
});
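If you prefer a more linear style, the same callback can be wrapped in a Promise. This is only a sketch layered on the checkID function above (checkIDAsync is a name I made up, and yourRequest is still a placeholder for the incoming request):
function checkIDAsync(req) {
    // Wrap the callback-style checkID so callers can use .then()
    return new Promise(function (resolve) {
        checkID(req, resolve);
    });
}

checkIDAsync(yourRequest).then(function (success) {
    if (success) {
        // login successful
    } else {
        // login failed
    }
});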

How can I access PhantomJS WebPage module from inside Mocha tests in NodeJS?

I'm basing this off of the Page Loading section of http://phantomjs.org/quick-start.html
I want to do something like this:
tests.js
var should = require('chai').should();
var page = require('webpage').create();

describe('test website with phantomJS', function() {
    it('should load html from page', function() {
        page.open('myHomePageToTest.html', function(status) {
            if (status === 'success') {
                page.content.should.equal('<!DOCTYPE html>...etc...</html>');
            }
        });
    });
});
If I try to run this with 'mocha-phantomjs test.js', I get the error 'Failed to start mocha: Init timeout'.
If I try to run this with 'mocha test.js', I get the error 'Cannot find module "webpage"'.
I'm sure those are the expected error messages given the code; it's my understanding that's failing. The code is my description of what I want to do. After several hours of treading water last night, I have no idea how to actually do it.
Thank you for any help or nudge in the right direction.
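The 'webpage' module only exists inside PhantomJS itself, so it can't be required from Node. One way around that is to run the tests with plain Mocha and drive PhantomJS through the phantom bridge module, as in the answer below: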
var assert = require('assert');
var phantom = require('phantom');

describe('Mocha and phantom', function () {
    this.timeout(150000);
    it('Tweaking with phantomjs', function (done) {
        phantom.create(function (ph) {
            ph.createPage(function (page) {
                page.open('https://www.facebook.com/', function (status) {
                    page.evaluate(function () {
                        return document.all[0].outerHTML; // can check different elements
                    }, function (result) {
                        console.log('----------->>>>result', result);
                        assert.equal(status, 'success', 'Not appropriate status');
                        ph.exit(); // shut down the PhantomJS process so mocha can exit
                        done();
                    });
                });
            });
        });
    });
});
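With this setup the file runs under plain 'mocha test.js': Mocha stays in Node, and the phantom module spawns and talks to a separate PhantomJS process, so the 'webpage' module is never required directly.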

Nodejs Http request has no response

I'm currently using HTTP GET against an external API. When called individually, the response is fine. When the calls are put in a for loop, some requests don't seem to get a response.
This is the http GET function:
function httpGetChunk(url, callback) {
    http.get(url, function(resp) {
        var body = '';
        resp.on('data', function(chunk) {
            body += chunk; // chunk too large from this response
        });
        resp.on('end', function() {
            var data = JSON.parse(body);
            callback(data);
        });
        resp.on("error", function(e) {
            console.log("Got error: " + e.message);
        });
    });
}
When I call the GET function in a for loop for 5 different URLs, I only get responses for some of them. I ran it a couple of times, and the responses would come from a different combination of the called URLs, but never all of them.
Any insight?
Edit 1: To give more information, my for loop looks something like this.
for (var i = 0; i < 5; i++) {
    httpGetChunk(someUrl, function(data) {
        console.log(data);
    });
}
This would only print out some responses but not all.
Edit 2:
I've taken into account all the advice on this thread. I'm now using the async module and have increased the number of concurrent connections to 20:
http.globalAgent.maxSockets = 20;
The following code is the one I'm currently testing:
getMatchStats() returns a game 'match' object with statistics (e.g. kills, deaths in the match, etc.)
matchIds is the array containing all the id keys of the matches
async.parallel([
    getMatchStats(matchIds[0], function (matchData) {
        console.log('0');
    }),
    getMatchStats(matchIds[1], function (matchData) {
        console.log('1');
    }),
    getMatchStats(matchIds[2], function (matchData) {
        console.log('2');
    }),
    getMatchStats(matchIds[3], function (matchData) {
        console.log('3');
    }),
    getMatchStats(matchIds[4], function (matchData) {
        console.log('4');
    })
], function(err, result) {
    console.log('done');
    callback(result);
});
and getMatchStats
function getMatchStats(matchId, callback) {
    var url = getMatchStatsUrl(matchId); // gets url based on id
    httpGetChunk(url, function(data) {
        callback(data);
    });
}
Again, the async.parallel never finishes, since only some of the requests get responses. Every time I run it, the responses come from a different combination of matches. Sometimes it even completes all of the requests.
Maybe my OS has limitations on the number of connections (I'm testing on localhost)?
Each request is asynchronous. So, if you use a regular for loop, each step is executed one right after the other and won't wait for the callback to be called. What you need is something like the each method from the async module:
async.each(yourArrayOfUrls, function (url, callback) {
    httpGetChunk(url, function(data) {
        console.log(data);
        callback();
    });
}, function (err) {
    // if some step produces an error, you can get it here...
});
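If you suspect connection limits (like the maxSockets setting mentioned in the edit), you can also cap how many requests run at once with eachLimit instead of each. A sketch with an assumed limit of 5 concurrent requests:
// Same as async.each, but at most 5 requests are in flight at a time.
async.eachLimit(yourArrayOfUrls, 5, function (url, callback) {
    httpGetChunk(url, function (data) {
        console.log(data);
        callback(); // signal completion so the next URL can start
    });
}, function (err) {
    // called once every URL has finished (or on the first error)
});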

Zombiejs - fetching contents of links synchronously

I have been playing with nodejs and zombiejs to fetch some personal data from a site. Unfortunately, I am stuck at a point where zombiejs only gets me the data from the first link and then hangs up.
The steps I follow are:
Go to the base URL.
Get the number of pages.
Use the async library to fetch them in series by opening a new browser window every time. Note that I only create a browser window instead of a totally new browser instance, as it is expensive to create one.
This is my code:
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
});
},
function(e) {
console.log(e);
});
});
Results
>node main_zombie.js
..... HTML DUMP
http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=1
>
Any suggestions would be appreciated.
Found the mistake.
As per https://github.com/caolan/async#each,
one needs to call the callback function with no arguments (or null) if there is no error. Without that call, eachSeries keeps waiting for the first task to finish, which is why only the first page was fetched.
So the correct code would be:
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
// Add callback and check if we reached the last page
if (k == 10) {
browser.close();
}
callback();
});
},
function(e) {
console.log(e);
});
});
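A small alternative worth considering (just a sketch, not part of the original answer): browser.close() could instead go in the final eachSeries callback, so it runs once after all pages have been visited, regardless of how many there are:
async.eachSeries(
    arr,
    function(k, callback) {
        browser.open();
        browser.visit(so_url + k, function() {
            console.log(browser.location.href);
            console.log(browser.html());
            callback();
        });
    },
    function(e) {
        // Runs once, after every page has been visited (or on the first error)
        browser.close();
        console.log(e);
    });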
