Batch screenshots with phantom for nodejs - node.js

I am desperately trying to process something like 200 screenshots in a single shot,
my first attempt was to follow the guidelines with a simple script invoked 200 times,
phantom.create()
.then(function(instance) {
console.log("1 - instance")
phInstance = instance;
return instance.createPage();
})
.then(function(page) {
console.log("2 - page")
sitepage = page;
return page.open(url);
})
.then(function(status) {
console.log("3 - render")
sitepage.property('clipRect', {top: 0, left: 0, width:3000,height:890}).then(function() {
sitepage.render(fname).then(function(finished) {
console.log("\t\t\t---> finished");
sitepage.close();
phInstance.exit();
callback({msg: 'ok'})
phantom.exit();
return;
});
});
})
this approach kinda works, but it's really overwhelming for the cpu,
the problem is related to the fact that this way of doings things leads to 200 phantom processes that quickly eats up all the memory.
A more profitable way of doing so, would be to create a single phantom instance and then drive it to open
one page at the time and render it, something that could be done with a phantom script, like so:
var content, counter, f, fs, grab_screen, img, lines, next_screen, page, system, url;
page = require('webpage').create();
system = require('system');
fs = require('fs');
content = '';
lines = [];
url = '';
img = '';
counter = 0;
page.viewportSize = {
width: 1200,
height: 800
};
page.settings.userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36';
f = fs.open("sites.txt", "r");
content = f.read();
lines = content.split("\n");
grab_screen = function() {
var site;
site = lines[counter];
url = 'http://' + site + '/';
img = 'screens/' + site + '.png';
console.log("Grabbing screen for: " + url);
return page.open(url, function(status) {
return window.setTimeout(function() {
page.render(img);
counter++;
return next_screen();
}, 200);
});
};
next_screen = function() {
console.log("On to " + counter + " ...");
if (counter < lines.length) {
return grab_screen();
} else {
return phantom.exit();
}
};
next_screen();
so I was wondering how to achieve that with phantomjs-node.

I finally solved my problem with two things:
realizing that node.js is NOT multithreading.
Using a single instance of phantom, to render multiple urls.
here's how it came out:
var webshot = function(id) {
console.log('makeshot ', shots[id].url);
requestSync("POST", "http://localhost:4041/options/set", { json:{ opts:JSON.stringify(shots[id].options) } });
phInstance.createPage().then(function(_page) {
console.log("2 - page")
sitepage = _page;
return _page.open(shots[id].url);
})
.then(function(status) {
console.log("3 - render %s / %s", id, shots.length);
sitepage.property('clipRect', {top: 0, left: 0, width:1500,height:220}).then(function() {
sitepage.render(shots[id].fname).then(function(finished) {
console.log("\t\t\t---> finished");
sitepage.close();
fnames[Math.ceil(parseInt(shots[id].options.pack_id)/mt_per_snap)-1] = "localhost_" + shots[id].options.pack_id + ".png";
if(id<shots.length-1) {
id += 1;
webshot(id);
} else {
console.log("all done: %s files has been written", shots.length);
// invoke pdf generation for the pdf page
cb("files_written", { });
generatePDF();
}
return;
});
});
})
}
so, long story short: I have put the page I wanted to render in a separate script, which I feed with variables before making the shot, and this solves the "multithreading problem", afterwards I have a single variable named phInstance, that is declared as follows:
var initPhantom = function() {
phantom.create()
.then(function(instance) {
console.log("1 - instance")
phInstance = instance;
})
}
remember to kill the phantom instance once you're done, otherwise it
will stay there and suck your resources for good.

You could try something like webshot. I'm using it with async.js,
however I sometimes get Error: PhantomJS exited with return value 1.
Have not yet found out why.
async.map(
links,
function(link, cb) {
var config = {...}; // your webshot options
var folder = link; // make unique folder name from link?
var file = path.join('./', 'screenshots', folder, 'screenshot.png');
webshot(link, file, config, function(err) {
cb(err, link);
});
},
function(e, links) {
// done
}
);
Resources:
https://www.npmjs.com/package/webshot
https://www.npmjs.com/package/asyncjs

Related

How to get all fonts used on a page using node.js?

I need to crawl all the pages on a site (the crwling part works fine.) and so i need to run THIS script on my server using node.js. I tried implementing the following logic:
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var jsdom = require("jsdom");
var { JSDOM } = jsdom;
var START_URL = "http://balneol.com/";
var SEARCH_FONT = "helvetica";
var MAX_PAGES_TO_VISIT = 100000;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(START_URL);
crawl();
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the window.document body
// var window = jsdom.jsdom(body).defaultView();
var { window } = new JSDOM(body);
//var $ = cheerio.load(body);
var helveticaFound = searchForHelvetica(window, 'font-family');
if(helveticaFound) {
console.log('Word ' + SEARCH_FONT + ' found at page ' + url);
} else {
collectInternalLinks($);
// In this short program, our callback is just calling crawl()
// callback();
}
});
}
function searchForHelvetica( window , css) {
if(typeof getComputedStyle == "undefined")
getComputedStyle= function(elem){
return elem.currentStyle;
}
var who, hoo, values= [], val,
nodes= window.document.body.getElementsByTagName('*'),
L= nodes.length;
for(var i= 0; i<L; i++){
who= nodes[i];
console.log(nodes[i]);
if(who.style){
hoo= '#'+(who.id || who.nodeName+'('+i+')');
console.log(who.style._values);
// return false;
val= who.style.fontFamily || getComputedStyle(who, '')[css];
if(val){
if(verbose) values.push([hoo, val]);
else if(values.indexOf(val)== -1) values.push(val);
// before IE9 you need to shim Array.indexOf (shown below)
}
}
}
// console.log(values);
// return values;
}
function collectInternalLinks($) {
var relativeLinks = $("a[href^='/']");
console.log("Found " + relativeLinks.length + " relative links on page");
relativeLinks.each(function() {
pagesToVisit.push(baseUrl + $(this).attr('href'));
});
}
If you see my visit page function you will see the below two lines of code:
var { window } = new JSDOM(body);
var helveticaFound = searchForHelvetica(window, 'font-family');
as you can see on the 2nd line i am passing the window object to the searchForHelvetic function.
In my searchForHelvetic function , if i console.log(nodes[i]); , i don't get the html element and hence the rest of the script does't quite run as expected. does the jsdom window differ from the window object in the browser ? how do i get the script working ? I.E. basically use the window object to run through all the pages on the page and spit out all the fonts used on the page ?
EDIT::-
To break the problem down to a micro level, if i console.log(who); inside searchForHelvetica function , i get the following result:
HTMLElement {}
HTMLDivElement {}
HTMLDivElement {}
HTMLDivElement {}
HTMLAnchorElement {}
HTMLImageElement {}
HTMLDivElement {}
HTMLFormElement {}
HTMLDivElement {}
HTMLLabelElement {}
HTMLInputElement {}
HTMLButtonElement {}
HTMLButtonElement {}
HTMLSpanElement {}
etc..
But if i were to do the same in a web browser the result world be different Eg.
nodes = window.document.body.getElementsByTagName('*');
console.log(node[1]) // <div id="mobile-menu-box" class="hide">...</div>
How do i get a similar result in node.js ?

phantomjs call from node.js uing spawn does not seem to execute script

I am trying to generate a pdf file from a dynamic handlebar webpage that would display data from the database. Once I get this to work on my local machine, I would like to move the code up to elastic beanstalk. The code seems to run that initially makes the call to spawn, however, call to phantomjs does not seem to kick off the script passed to it. Just to confirm, I added a console.log statement at the very beginning of the file and never the print out. Not to mention the other print statement in the script file. The code is below:
code that performs the spawn:
var childArgs = [
//'--debug=true',
path.join(__dirname, '../phantom/capture.js'),
id,
filename
];
let child = spawn(phantomjs.path, childArgs, {detached: false});
console.log('set up stderr.....');
child.stderr.on('data', (data) => {
console.log('ps stderr: ${data}');
});
console.log('set up stdout.....');
child.stdout.on('data', (data) => {
console.log('ps stdout: ${data}');
});
console.log('set up close.....');
child.on('close', (code) => {
console.log('this is a test 2.....');
console.log(code);
if (code !== 0){
let error = new Error('An error occurred while creating the current report.');
error.status = 500;
reject(error);
}
resolve(fname);
return;
});
capture.js:
console.log('we are inside....');
let os = require('os');
var system = require('system');
console.log('we are inside.... 1');
var page = require('webpage').create();
let phantom = require('phantomjs-prebuilt');
console.log('we are inside....2');
var args = system.args;
console.log('we are inside....3');
if (args.length === 1) {
phantom.exit(0);
} else {
page.viewportSize = { width: 2000, height: 800 };
//page.paperSize = { format: 'Letter', orientation: 'landscape', margin: '1cm' };
page.paperSize = { width: '1280px', height: '800px', margin: '0px' };
page.settings.localToRemoteUrlAccessEnabled = true;
page.settings.loadImages = true;
page.settings.javascriptEnabled = true;
let done = false;
//"http://example.com/order/" + args[1] + '?token=' + args[2]
page.open("http://example.com/order/" + args[1], function start(status) {
if (status === "success"){
page.render(os.tmpdir() + '/' + args[2], { format: 'pdf' });
}
done = true;
});
var intervalId = setInterval(function(){
if (done){
phantom.exit();
}
}, 100);
}
I am at a loss on where the problem can be. The first line of capture.js does not seem to get called. 1) What is the problem that I am missing? 2) How can I fix it?
The problem could be that PhantomJS couldn't run the capture.js with the path with dots due to security reasons. Try to run the capture.js from the current directory, for example.

How to loop through links and execute a jQuery javascript for each site in PhantomJS

Im trying to build a responsive webdesign testing tool. I have managed to capture each link in an array of links in different device specific viewport sizes.
But an additonal requirement for the tool is to be able to inject custom javascripts into the site, wait for that script to finish and then capture the state of the page (For example someone wants to open a bootstrap modal before capturing it).
Ive been using the jQueryGo Node Module to accomplish my work up till now.
What I've accomplished so far
I've managed to make it work but without being able to use jQuery components(and ive tried using injectJs and includeJs provided by phantom). In the following code using asyncjs and some helper function.
var $ = require('../node_modules/jquerygo/lib/jquery.go.js');
var async = require('../node_modules/jquerygo/node_modules/async/lib/async.js');
var jqueryCdn = 'http://ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js';
var links = [{
url: "http://google.com",
sitename: "Google",
jsinject: function() {
var log = document.querySelectorAll('title')[0].innerText;
console.log(log);
}
},
...
];
var sizes = [{
device: "desktop",
userAgent: "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36",
width: 1440,
height: 1080
},
...
];
/*
* This function wraps WebPage.evaluate, and offers the possibility to pass
* parameters into the webpage function. The PhantomJS issue is here:
*
*/
var evaluate = function(page, func) {
var args = [].slice.call(arguments, 2);
var fn = "function() { return (" + func.toString() + ").apply(this, " + JSON.stringify(args) + ");}";
return page.evaluate(fn);
};
var adjustPageSettings = function(page, size) {
page.set(
'viewportSize', {
width: size.width,
height: size.height
}
);
$.config.userAgent = size.userAgent;
return page;
};
$.getPage(function(page) {
/**
* snaps screenshot of page and safes it
* #param siteName
* #param device
* #param done
*/
var snapShoot = function(siteName, device, done) {
console.log('snapshooting: ' + siteName + ' for ' + device);
page.render('screenshot' + siteName + device + '.png', done);
};
async.eachSeries(sizes, function(size, done) {
async.eachSeries(links, function(link, done) {
page = adjustPageSettings(page, size);
page.open(link.url, function(status) {
if (status == 'success') {
async.series([
function(done) {
if (typeof link.jsinject == 'function') {
evaluate(page, link.jsinject, '');
done();
} else {
done();
}
},
function(done) {
snapShoot(link.sitename, size.device, done);
}
], function() {
done()
});
} else {
console.log('couldnt load:' + link.url);
}
});
}, done);
});
});
But as soon as I use jQuery components as following:
...
$.getPage(function (page) {
page.injectJs(jqueryCdn);
....
and...
var links = [
{
url: "http://google.com",
sitename: "Google",
jsinject: function () {
var log = $('title').text();
console.log(log);
}
},
I get an jQuery is not loaded error:
phantom stdout: ReferenceError: Can't find variable: $
phantomjs://webpage.evaluate():2
phantomjs://webpage.evaluate():4
phantomjs://webpage.evaluate():4
phantomjs://webpage.evaluate():4
The error message says that jQuery was not loaded into the DOM.
The variable name jqueryCdn suggests that you use a remote resource (URL). page.injectJs() supports only local paths. You need to use page.includeJs(). Don't forget to use the callback otherwise it may be the case that your script is executed, but jQuery is not yet loaded into the DOM.

Making jquery ajax call in child process - node.js

I am using 'child_process'(fork method) to handle task of saving some records across server. For this I was using jquery ajax call in the child process to save the records. But somehow that code doesn't get executed.
I have already included the jquery.min.js file in the html in which I am including the file forking child process as well.
The file forking child process:
var childProcess = require('child_process');
var syncProcess;
function launchMindwaveDataSync(){
//alert("launchMindwaveDataSync fired");
var iconSync = document.getElementById("iconSync");
iconSync.src = "images/findDevice.gif";
iconDevice.title = "Synchronizing...";
//Launch the device reader script in a new V8 process.
syncProcess = childProcess.fork('./js/mindwaveDataSync.js');
syncProcess.on('message', function(message){
console.log(message);
switch(message.msg)
{
case "connected":
global.HEADSET_CONNECTED = true;
iconDevice.src = "images/icon-power.png";
iconDevice.title = "Connected to Mindwave Mobile device";
startSynchronizing();
break;
case "disconnected":
case "error":
case "close":
case "timeout":
global.HEADSET_CONNECTED = false;
iconDevice.src = "images/error.png";
iconDevice.title = "Mindwave Mobile device is disconnected";
break;
}
});
syncProcess.on('error', function(e){
console.log(e);
});
setTimeout(function(){
console.log('sending command initialize');
syncProcess.send({cmd:'initialize'});
},1000);
};
function startSynchronizing(){
syncProcess.send({cmd: 'synchronize'});
}
The child process which is supposed to make ajax call
var recursive = require('recursive-readdir');
var SecurConf = require('../js/secureConf');
var sconf = new SecurConf();
var filesToSync = [];
var crypto = require('crypto');
var options = {
//prompt : 'File Password : ',
algo : 'aes-128-ecb',
file : {
encoding : 'utf8',
out_text : 'hex'
}
};
process.on('message', function (command){
console.log(command);
switch(command.cmd)
{
case "initialize": initializeConnection();
break;
case "synchronize": checkForFiles();
break;
case "record":
client.resume();
break;
case "pause":
client.pause();
break;
case "stop":
client.destroy();
break;
}
//process.send({msg:"Sync Process: " + command.cmd});
});
function checkForFiles(){
recursive('C:/MindWaveData/Data/', function (err, files) {
// Files is an array of filename
filesToSync = files;
decryptFiles();
//process.send({msg:files});
});
}
function decryptFiles(){
var ajaxSuccess = function(res){
process.send({msg:res});
}
for(var i = 0; i < filesToSync.length; i++){
var ef = ""+filesToSync[i];
sconf.decryptFile(ef, function(err, file, content){
if(err){
process.send({msg:"some error occurred while decrypting..."});
} else {
var parsedContent = JSON.parse(content);
var decryptedContent = JSON.stringify(parsedContent, null,'\t');
for(var j = 0; j<parsedContent.length; j++){
$.ajax({
//async: false,
type: "POST",
url: "http://192.168.14.27:8001/admin/webservice",
contentType: "application/json; charset=utf-8",
dataType: "json",
data: JSON.stringify({
'ncFunction': 'login',
'ncParams': {
'ncUserEmail': "clarity_admin#yopmail.com",
'ncUserPassword': "cl123!##"
}
}),
success: function (res) {
ajaxSuccess(res);
},
error: function (xhr, type, err) {
ajaxSuccess(res);
}
});
}
});
}
}
function initializeConnection(){
//console.log('initializeConnection::function');
//process.send({msg:"initializeConnection called"});
checkConnection()
//process.send({msg:"connected"});
//call function to send ajax request
}
function checkConnection(){
//console.log('checkConnection::function');
//call ajax request 3 times to check the connection.. once in third try we get the response OK, we can send the process message as connected
var ajaxCallCount = 0;
var makeAjaxCall = function(){
//console.log('function makeAjaxCall');
//process.send({msg:"connected"});
if(ajaxCallCount < 2){
ajaxCallCount++;
//console.log('ajaxCallCount:'+ajaxCallCount);
//process.send({msg:'value of ajaxCallCount:'+ajaxCallCount});
connectionSuccess();
}
else{
process.send({msg:"connected"});
}
};
var connectionSuccess = function(){
//console.log('function connectionSuccess');
makeAjaxCall();
};
makeAjaxCall();
}
Can we use jquery ajax call in the child process like this? Plus I have included the file forking child process in one html file and on load of its body I am calling /launchMindwaveDataSync/ from the first file shown below.
Thanks in advance

Secondary tile web url

I have to pin secondary tile in my windows phone 8.1 application.
I followed the msdn tutorial : http://code.msdn.microsoft.com/windowsapps/secondary-tiles-sample-edf2a178/
It does work with internal image (ms-appx://.. ) but not with web url (http://)
working sample:
var logo = new Windows.Foundation.Uri("ms-appx:///Images/square30x30Tile-sdk.png");
var currentTime = new Date();
var TileActivationArguments = data.ad_id + " WasPinnedAt=" + currentTime;
var tile = new Windows.UI.StartScreen.SecondaryTile(data.ad_id,
data.subject,
TileActivationArguments,
logo,
Windows.UI.StartScreen.TileSize.square150x150);
tile.visualElements.foregroundText = Windows.UI.StartScreen.ForegroundText.light;
tile.visualElements.square30x30Logo = logo;
tile.visualElements.showNameOnSquare150x150Logo = true;
var selectionRect = this.element.getBoundingClientRect();
// Now let's try to pin the tile.
// We'll make the same fundamental call as we did in pinByElement, but this time we'll return a promise.
return new WinJS.Promise(function (complete, error, progress) {
tile.requestCreateForSelectionAsync({ x: selectionRect.left, y: selectionRect.top, width: selectionRect.width, height: selectionRect.height }, Windows.UI.Popups.Placement.above).done(function (isCreated) {
if (isCreated) {
complete(true);
} else {
complete(false);
}
});
});
And if I use
var logo = new Windows.Foundation.Uri(data.images[0]);
I got an invalid parameter exception.
You can take a look at the documentation for the SecondaryTile.Logo property. In it you'll see this:
The location of the image. This can be expressed as one of these schemes:
ms-appx:///
ms-appdata:///local/
You can download the image first and then set it using the ms-appdata:///local/ scheme. I'm not sure that changing the logo with something from the Internet is a good idea, though. This should be the app's logo, so it should be in the package.
I found the solution
fileExists: function (fileName) {
var applicationData = Windows.Storage.ApplicationData.current;
var folder = applicationData.localFolder;
return folder.getFileAsync(fileName).then(function (file) {
return file;
}, function (err) {
return null;
});
},
download: function (imgUrl, imgName) {
return WinJS.xhr({ url: imgUrl, responseType: "blob" }).then(function (result) {
var blob = result.response;
var applicationData = Windows.Storage.ApplicationData.current;
var folder = applicationData.localFolder;
return folder.createFileAsync(imgName, Windows.Storage.
CreationCollisionOption.replaceExisting).then(function (file) {
// Open the returned file in order to copy the data
return file.openAsync(Windows.Storage.FileAccessMode.readWrite).
then(function (stream) {
return Windows.Storage.Streams.RandomAccessStream.copyAsync
(blob.msDetachStream(), stream).then(function () {
// Copy the stream from the blob to the File stream
return stream.flushAsync().then(function () {
stream.close();
});
});
});
});
}, function (e) {
//var msg = new Windows.UI.Popups.MessageDialog(e.message);
//msg.showAsync();
});
},
var self = this;
this.download(data.images[0], data.ad_id).then(function () {
self.fileExists(data.ad_id).then(function (file) {
var logo = new Windows.Foundation.Uri("ms-appdata:///Local/" + data.ad_id);
....
I need to download the image, store it and then I can use ms-appdata:///Local

Resources