I'm using PhantomJS with the jquerygo library and am trying to do the following:
Visit a URL
Click on a link and wait for it to load
Grab a particular tag and return it to Node.js for processing.
I realize that in PhantomJS:
The execution is sandboxed; the web page has no access to the phantom object and it can't probe its own settings.
But I should be able to return a simple string from evaluate, right?
But that is not working. My code is as follows:
var photogsScrapeCount = function(url, callback){
    console.log("LOADED PHOTOGSSCRAPE Count");
    url = decodeURIComponent(url);
    //$.config.site = 'https://www.magnumphotos.com/';
    $.config.addJQuery = false;
    $.visit(url, function() {
        $.waitForElement(".7n7np102", function() {
            $.getPage(function(page) {
                var imgCounterMinus = page.evaluate(function(){
                    $(".7n7np102 a").click(); // open the image enlarge
                    var temp = setTimeout(function(){
                        imgCounterMinus1 = $("span[id$='TotalPageCount_Lbl']").html();
                        imgCounterMinus1 = imgCounterMinus1.split(" ");
                        imgCounterMinus1 = imgCounterMinus1[2];
                        imgCounterMinus1 = parseInt(imgCounterMinus1);
                        console.log("imgCounterMinus1" + imgCounterMinus1);
                        return (imgCounterMinus1 - 3);
                    }, 4000);
                    return temp;
                });
                //console.log("After evaluate: " + imgCounterMinus)
            });
        });
    });
};
Can this be achieved in a different way? The basic example from the website works, so I am assuming the setTimeout is what's giving me problems.
Any ideas or suggestions would be very helpful, as I have very little experience writing jQuery/JavaScript.
The docs say (emphasis mine):
For one, this library is not a complete API mirror of jQuery. Every API is asynchronous (due to its interaction with Phantom.js), so there are some differences.
There is also an example of how page.evaluate() must be used: the result is not returned but passed into a second callback. There is no way to return something from an asynchronously executed function except through the callback, so the setTimeout usage is also wrong.
$(".7n7np102 a").click(function(){
    setTimeout(function(){
        $.getPage(function(page) {
            page.evaluate(function(){
                var imgCounterMinus1 = $("span[id$='TotalPageCount_Lbl']").html();
                imgCounterMinus1 = imgCounterMinus1.split(" ");
                imgCounterMinus1 = imgCounterMinus1[2];
                imgCounterMinus1 = parseInt(imgCounterMinus1);
                console.log("imgCounterMinus1" + imgCounterMinus1);
                return (imgCounterMinus1 - 3);
            }, function(err, result){
                console.log("After evaluate: " + result);
                callback();
                $.close();
            });
        });
    }, 4000);
});
Related
A little info: I have an arp.js file which takes a subnet address ("192.168.2"), gets all the strings returned from arp -a, and stores them in an array.
I can't figure out why my arpList function is returning an undefined value in my index.js file.
All the console.logs print the correct values in arp.js when it is called from index.js, but ipObj comes up undefined. Even the console.log right before the return of ipObj works.
Any help would be greatly appreciated.
var { spawn } = require('child_process');
const arpLs = spawn('arp', ['-a']);
var bufferData;

module.exports = {
    arpList: function (subnet) {
        arpLs.stdout.on('data', data => {
            bufferData += data
        })
        arpLs.stderr.on('data', data => {
            console.log('error: ' + data);
        });
        arpLs.on('exit', function (code) {
            if (code != 0) {
                console.log("Error exiting"); //if error occurs
            }
            console.log("exit start 1"); // checking internal processes at stages
            var dataArray = bufferData.split(' ');
            var ipArray = [];
            for (i = 0; i < dataArray.length; i++) {
                if (dataArray[i].includes(subnet)) {
                    ipArray.push(dataArray[i]);
                    console.log("loop working");
                }
            }
            var ipObj = { "lanIps": ipArray };
            console.log("Object is there: " + ipObj)
            return ipObj; // this obj should be returned to the index.js call using
        })
    },
    sayMyName: function () {
        return "Hello";
    }
}
//arpList(ipSubnet);

//INDEX.js
//the index page looks like this
//var arp = require('./arp.js');
//var ipSubnet = "192.168.2";
//var lanIps = arp.arpList(ipSubnet);
//console.log(lanIps);
I ended up adding a callback function to arpList: function (subnet, callback).
Then, instead of returning the value, I pass it into the callback.
Then on the index.js side, instead of
var lanIps = arp.arpList(value)
I used
arp.arpList(value, function(res){ lanIps = res; });
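For reference, a minimal sketch of what the callback version of arp.js might look like; the spawn call is moved inside arpList so each call gets fresh output (this is an illustration, not the exact code I ended up with):

var { spawn } = require('child_process');

module.exports = {
    arpList: function (subnet, callback) {
        var arpLs = spawn('arp', ['-a']);   // spawn per call so repeated calls work
        var bufferData = '';

        arpLs.stdout.on('data', function (data) {
            bufferData += data;             // collect stdout chunks
        });

        arpLs.on('exit', function (code) {
            if (code !== 0) {
                console.log("Error exiting"); // if error occurs
            }
            var ipArray = bufferData.split(' ').filter(function (chunk) {
                return chunk.includes(subnet);
            });
            callback({ lanIps: ipArray });  // hand the result to the caller instead of returning it
        });
    }
};

// index.js side:
// arp.arpList(ipSubnet, function (res) { console.log(res.lanIps); });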
return ipObj; // this obj should be returned to the index.js call using
It won't be returned. The reference says nothing about a return value. Node-style callbacks rarely work like that, because they are potentially asynchronous and the returned value cannot be taken into account.
This is a special case of a well-known problem. The process is asynchronous and finishes after the arp.arpList(ipSubnet) call, so there's nothing to assign to lanIps. This is a use case for promises; there are already third-party promisified counterparts like child-process-promise.
The problem can also be solved by moving to a synchronous API; child_process functions have synchronous counterparts, including spawnSync.
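For completeness, a minimal synchronous sketch using spawnSync (the subnet filtering mirrors the original code; treat it as an illustration rather than a drop-in):

var { spawnSync } = require('child_process');

function arpListSync(subnet) {
    // spawnSync blocks until `arp -a` finishes, so a plain return works
    var result = spawnSync('arp', ['-a'], { encoding: 'utf8' });
    if (result.status !== 0) {
        console.log('Error exiting');
        return { lanIps: [] };
    }
    var ipArray = result.stdout.split(' ').filter(function (chunk) {
        return chunk.includes(subnet);
    });
    return { lanIps: ipArray };
}

// index.js side: var lanIps = arpListSync('192.168.2'); now works as originally expected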
I'm going to be honest. I'm way in over my head here.
I need to scrape data from a dynamic site for my employer. Before the data is visible on the page, there are some clicks and waits necessary. Simple PHP scraping won't do. So I found out about this NodeJS + PhantomJS combo. Quite a pain to set up, but I did manage to load a site, run some code and get a result.
I wrote a piece of jQuery which uses timeout loops to wait for some data to be loaded. Eventually I get a JS object that I want to write to a file (JSON).
The issue I'm facing:
I build up the JS object inside the PhantomJS .evaluate scope, which runs in a headless browser, so not directly in my Node.js server scope. How do I send the variable I build up inside evaluate back to my server so I can write it to my file?
Some example code (I know it's ugly, but it's for illustrative purposes). I use node-phantom-simple as a bridge between Phantom and Node:
var phantom = require('node-phantom-simple'),
    fs = require('fs'),
    webPage = 'https://www.imagemedia.com/printing/business-card-printing/'

phantom.create(function(err, ph) {
    return ph.createPage(function(err, page) {
        return page.open(webPage, function(err, status) {
            page.onConsoleMessage = function(msg) {
                console.log(msg);
            };
            console.log("opened site? ", status);
            page.evaluate(function() {
                setTimeout(function() {
                    $('.price-select-cnt').eq(0).find('select').val('1266').change()
                    timeOutLoop()
                    function timeOutLoop() {
                        console.log('looping')
                        setTimeout(function() {
                            if ($('#ajax_price_tool div').length != 6) {
                                timeOutLoop()
                            } else {
                                $('.price-select-cnt').eq(1).find('select').val('25')
                                $('.price-select-cnt').eq(2).find('select').val('Premium Card Stock')
                                $('.price-select-cnt').eq(3).find('select').val('Standard').change()
                                timeOutLoop2()
                            }
                        }, 100)
                    }
                    function timeOutLoop2() {
                        console.log('looping2')
                        setTimeout(function() {
                            if ($('.pricing-cost-cnt').text() == '$0' || $('.pricing-cost-cnt').text() == '') {
                                timeOutLoop2()
                            } else {
                                var price = $('.pricing-cost-cnt').text()
                                console.log(price)
                            }
                        }, 100)
                    }
                }, 4000)
            });
        });
    });
});

function writeJSON(plsWrite) {
    var key = 'file'
    fs.writeFile('./results/' + key + '.json', plsWrite, 'utf8', function() {
        console.log('The JSON file is saved as');
        console.log('results/' + key + '.json');
    });
}
So how do I take the price this code scrapes from the website, get it out of the evaluate scope, and write it to a file?
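One way to get the price out of the evaluate scope might be to poll it from the Node side instead of computing it inside a setTimeout in the page, similar to the evaluate-callback pattern in the jquerygo answer above. A rough sketch, assuming node-phantom-simple hands the evaluate result to a Node-side callback as (err, result) and that ph.exit() closes the browser; it would go inside the page.open callback, after the original page.evaluate call:

var poll = setInterval(function () {
    page.evaluate(function () {
        // runs inside the page: return the price text, or null if it isn't ready yet
        var txt = $('.pricing-cost-cnt').text();
        return (txt && txt !== '$0') ? txt : null;
    }, function (err, price) {            // assumption: the evaluate result arrives here as (err, result)
        if (err) return console.log(err);
        if (price) {
            clearInterval(poll);          // stop polling once the price shows up
            writeJSON(JSON.stringify({ price: price }));  // back in Node scope, fs is available
            ph.exit();                    // close PhantomJS when done
        }
    });
}, 500);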
I'm trying to use a selenium server grid to run multiple commands in parallel.
Here is my first test code:
var webdriver = require('selenium-webdriver');

for (var u = 0; u < 3; u++) {
    makeScreenshot('foo/test' + u + '.png');
}

function makeScreenshot(path) {
    var driver = new webdriver.Builder().forBrowser('firefox').usingServer('http://someurl:44111/wd/hub/').build();
    console.log('Get');
    driver.get('http://www.somepage.com').then(function() {
        console.log('Screenshot');
        driver.takeScreenshot().then(function(data){
            console.log(path);
            //var decodedImage = new Buffer(data, 'base64')
            driver.quit();
        });
    });
}
That is the result:
Get
Get
Get
Screenshot
foo/test0.png
Screenshot
foo/test1.png
Screenshot
foo/test2.png
[screenshot of the requests]
The "Get" lines appear immediately, in sequence; driver.get creates a promise. My idea here was that the three requests would be made asynchronously and thus appear almost simultaneously, but as you can see in the screenshot they are made one after the other.
The grid definitely has enough Selenium instances, so why isn't the driver working in parallel?
It seems to me that new webdriver.Builder() creates some kind of singleton that doesn't work asynchronously but waits for the previous request to finish!?
Thanks for any help!
The answer may be multiple control flows:
WebDriverJS supports defining "parallel" flows using webdriver.promise.createFlow(). This function accepts a callback which will be passed the newly created flow. Tasks scheduled within this flow will be synchronized with each other, but will remain independent of any other control flows. Each call to createFlow() returns a promise that will resolve when the flow has completed.
The example at the end of the chapter (which I'll quote verbatim) shows multiple Google search terms being tested concurrently:
var terms = [
    'javascript',
    'selenium',
    'webdriver'
];

var flows = terms.map(function(term) {
    return webdriver.promise.createFlow(function() {
        var driver = new webdriver.Builder().build();

        driver.get('http://www.google.com');
        driver.findElement(webdriver.By.name('q')).sendKeys(term);
        driver.findElement(webdriver.By.name('btnG')).click();
        driver.getTitle().then(function(title) {
            if (title !== (term + ' - Google Search')) {
                throw Error('Unexpected title: ' + title);
            }
        });
    });
});

webdriver.promise.fullyResolved(flows).then(function() {
    console.log('All tests passed!');
});
It should be easy enough to add your custom driver build and lookups into that example. Perhaps the following:
var flows = [0, 1, 2, 3].map(function(index) {
    return webdriver.promise.createFlow(function() {
        var driver = new webdriver.Builder().forBrowser('firefox').usingServer('http://someurl:44111/wd/hub/').build();
        console.log('Get');
        driver.get('http://www.somepage.com').then(function() {
            console.log('Screenshot');
            driver.takeScreenshot().then(function(data){
                console.log('foo/test' + index + '.png');
                //var decodedImage = new Buffer(data, 'base64')
                driver.quit();
            });
        });
    });
});
I'm sorry if this is a basic question, but I am trying to implement a program in Node.js that should wait for the value of a variable, available through a request to a cloud API (photon.variable()), to be 1. This variable should not be requested more than once per second. My first attempt is included in the sample code below. Despite knowing it does not work at all, I think it is useful to show the functionality I would like to implement.
var photondata = 0;

while (photondata < 1) {
    setTimeout(function () {
        photon.variable("witok", function(err, data) {
            if (!err) {
                console.log("data: ", data.result);
                photondata = data.result;
            }
            else console.log(err);
        });
    }, 1000);
}
Since you couldn't do async stuff in loops before, the traditional approach is to create a function that re-schedules itself with setTimeout for as long as needed and then calls some other function when it's done, as shown below. You still need to do this in the browser if you're not using Babel.
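As a concrete illustration of that traditional approach, here is a minimal sketch (the onReady callback name is just for illustration):

function waitForPhoton(onReady) {
    photon.variable("witok", function (err, data) {
        if (err) {
            console.log(err);
        } else {
            console.log("data: ", data.result);
            if (data.result >= 1) {
                return onReady(data.result);   // done: hand the value to the continuation
            }
        }
        setTimeout(function () { waitForPhoton(onReady); }, 1000);  // not ready: ask again in a second
    });
}

waitForPhoton(function (photondata) {
    // do whatever you need here
});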
These days, you can stop execution and wait for things to happen by using a generator function (which the latest versions of Node now support). There are many libraries that will let you do this, and I will advertise ours :)
CL.run(function* () {
    var photondata = 0;
    while (true) {
        yield CL.try(function* () {
            var data = yield photon.variable("witok", CL.cb());
            console.log("data: ", data.result);
            photondata = data.result;
        }, function* (err) {
            console.log(err.message);
        });
        if (photondata >= 1) break;
        yield CL.sleep(1000);
    }
    // do whatever you need here
});
I have been trying to retrieve data using the MongoJS driver for Node.js. The code I am using is as follows:
req.on('end', function(){
    var decodedBody = querystring.parse(fullBody);
    story = decodedBody.name;
    var z = new Array();
    console.log(story);
    res.writeHead(200, {'Content-Type': 'text/html'});
    res.write('<html><body>');
    db.frames.find({str_id: story}).toArray(function(err, doc){
        console.log(doc);
        for(var t = 0; t < doc.length; t++)
        {
            var picid = doc[t].pic_id;
            console.log(picid);
            db.pictures.find({_id: picid}).toArray(function(err, pic){
                res.write('<img src="'+pic[0].name+'"/>');
            });
        }
    })
    res.end('</body></html>');
});
The problem here is that, because of the asynchronous nature of the code, the response is ended first and the code inside the database block is executed afterwards, so nothing gets displayed in the browser (an image, in this case). Thanks in advance.
Don't fight the asynchronous nature of node.js, embrace it!
So you should fire off all your requests, marking each one as completed when the response arrives. When all requests are completed, render your images and body/html closing tags.
I don't usually work with node.js, so I may make some mistakes, but it could look like this:
res.write('<html><body>');
db.frames.find({str_id: story}).toArray(function(err, doc){
    console.log(doc);
    var completed = {};
    doc.forEach(function(frame) {
        var picid = frame.pic_id;        // each iteration gets its own picid
        completed[picid] = false;        // mark this request as pending
        console.log(picid);
        db.pictures.find({_id: picid}).toArray(function(err, pic) {
            // mark request as completed
            completed[picid] = pic;
            // check if all requests completed
            var all_finished = true;
            for(var k in completed) {
                if(completed[k] === false) {
                    all_finished = false;
                    break;
                }
            }
            // render final markup
            if(all_finished) {
                for(var k in completed) {
                    var pic = completed[k];
                    res.write('<img src="'+pic[0].name+'"/>');
                }
                res.end('</body></html>');
            }
        });
    });
})
Just put the res.end('</body></html>'); inside your db.frames.find callback: check when you've reached doc.length - 1 and then send the end command.
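A minimal sketch of that counting idea, keeping the rest of the structure as it is (illustrative only):

db.frames.find({str_id: story}).toArray(function(err, doc){
    var finished = 0;
    doc.forEach(function(frame) {
        db.pictures.find({_id: frame.pic_id}).toArray(function(err, pic){
            res.write('<img src="'+pic[0].name+'"/>');
            finished++;
            if (finished === doc.length) {       // the last callback to finish closes the response
                res.end('</body></html>');
            }
        });
    });
    if (doc.length === 0) res.end('</body></html>');  // nothing to fetch
});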