Calling external function from within Phantomjs+node.js - node.js

I'm going to be honest. I'm way in over my head here.
I need to scrape data from a dynamic site for my employer. Before the data is visible on the page, there are some clicks and waits necessary. Simple PHP scraping won't do. So I found out about this NodeJS + PhantomJS combo. Quite a pain to set up, but I did manage to load a site, run some code and get a result.
I wrote a piece of jQuery which uses timeout loops to wait for some data to be loaded. Eventually I get a js object that I want to write to a file (JSON).
The issue I'm facing.
I build up the the js object inside the PhantomJS .evaluate scope, which runs in a headerless browser, so not directly in my Node.JS server scope. How do I send the variable I built up inside evaluate back to my server so I can write it to my file?
Some example code (I know it's ugly, but it's for illustrative purposes). I use node-phantom-simple as a bridge between Phantom and Node
var phantom = require('node-phantom-simple'),
fs = require('fs'),
webPage = 'https://www.imagemedia.com/printing/business-card-printing/'
phantom.create(function(err, ph) {
return ph.createPage(function(err, page) {
return page.open(webPage, function(err, status) {
page.onConsoleMessage = function(msg) {
console.log(msg);
};
console.log("opened site? ", status);
page.evaluate(function() {
setTimeout(function() {
$('.price-select-cnt').eq(0).find('select').val('1266').change()
timeOutLoop()
function timeOutLoop() {
console.log('looping')
setTimeout(function() {
if ($('#ajax_price_tool div').length != 6) {
timeOutLoop()
} else {
$('.price-select-cnt').eq(1).find('select').val('25')
$('.price-select-cnt').eq(2).find('select').val('Premium Card Stock')
$('.price-select-cnt').eq(3).find('select').val('Standard').change()
timeOutLoop2()
}
}, 100)
}
function timeOutLoop2() {
console.log('looping2')
setTimeout(function() {
if ($('.pricing-cost-cnt').text() == '$0' || $('.pricing-cost-cnt').text() == '') {
timeOutLoop2()
} else {
var price = $('.pricing-cost-cnt').text()
console.log(price)
}
}, 100)
}
}, 4000)
});
});
});
});
function writeJSON(plsWrite) {
var key = 'file'
fs.writeFile('./results/' + key + '.json', plsWrite, 'utf8', function() {
console.log('The JSON file is saved as');
console.log('results/' + key + '.json');
});
}
So do do I write the price this code takes from the website, get it out of the evaluate scope and write it to a file?

Related

Chrome Extension API Calls order and DOM Information

I'm working on an extension that is supposed to extract information from the DOM based specific classes/tags,etc, then allow the user to save the information as a CSV file.
I'm getting stuck on a couple of places and haven't been able to find answers to questions similar enough.
Where I am tripped up at is:
1) Making sure that the page has completely loaded so the chrome.tabs.query doesn't return null a couple of times before the promise actually succeeds and allows the blocksF to successfully inject. I have tried placing it within a settimeout function but the chrome api doesn't seem to work within such the function.
2) Saving the extracted information so when the user moves onto a new page, the information is still there. I'm not sure if I should use the chrome.storage api call or simply save the information as an array and keep passing it through. It's just text, so I don't believe that it should take up too much space.
Then main function of the background.js is below.
let mainfunc = chrome.tabs.onUpdated.addListener(
async(id, tab) => {
if (buttonOn == true) {
let actTab = await chrome.tabs.query({
active: true,
currentWindow: true,
status: "complete"
}).catch(console.log(console.error()));
if (!actTab) {
console.log("Could not get URL. Turn extension off and on again.");
} else {
console.log("Tab information recieved.")
};
console.log(actTab);
let blocksF = chrome.scripting.executeScript({
target: { tabId: actTab[0]['id'] },
func: createBlocks
})
.catch(console.error)
if (!blocksF) {
console.log("Something went wrong.")
} else {
console.log("Buttons have been created.")
};
/*
Adds listeners and should return value of the works array if the user chose to get the information
*/
let listenersF = chrome.scripting.executeScript({
target: { tabId: actTab[0]['id'] },
func: loadListeners
})
.catch(console.error)
if (!listenersF) {
console.log("Listeners failed to load.")
} else {
console.log("Listeners loaded successfully.")
};
console.log(listenersF)
};
});
Information from the DOM is extracted through an event listener on a div/button that is added. The event listener is added within the loadListeners function.
let workArr = document.getElementById("getInfo").addEventListener("click", () => {
let domAr = Array.from(
document.querySelectorAll(<class 1>, <class 2>),
el => {
return el.textContent
}
);
let newAr = []
for (let i = 0; i < domAr.length; i++) {
if (i % 2 == 0) {
newAr.push([domAr[i], domAr[i + 1]])
}
}
newAr.forEach((work, i) => {
let table = document.getElementById('extTable');
let row = document.createElement("tr");
row.appendChild(document.createElement("td")).textContent = work[0];
row.appendChild(document.createElement("td")).textContent = work[1];
table.appendChild(row);
});
return newAr
I've been stuck on this for a couple of weeks now. Any help would be appreciated. Thank you!
There are several issues.
chrome methods return a Promise in MV3 so you need to await it or chain on it via then.
tabs.onUpdated listener's parameters are different. The second one is a change info which you can check for status instead of polling the active tab, moreover the update may happen while the tab is inactive.
catch(console.log(console.error())) doesn't do anything useful because it immediately calls these two functions so it's equivalent to catch(undefined)
Using return newArr inside a DOM event listener doesn't do anything useful because the caller of this listener is the internal DOM event dispatcher which doesn't use the returned value. Instead, your injected func should return a Promise and call resolve inside the listener when done. This requires Chrome 98 which added support for resolving Promise returned by the injected function.
chrome.tabs.onUpdated.addListener(onTabUpdated);
async function onTabUpdated(tabId, info, tab) {
if (info.status === 'complete' &&
/^https?:\/\/(www\.)?example\.com\//.test(tab.url) &&
await exec(tabId, createBlocks)) {
const [{result}] = await exec(tabId, loadListeners);
console.log(result);
// here you can save it in chrome.storage if necessary
}
}
function exec(tabId, func) {
// console.error returns `undefined` so we don't need try/catch,
// because executeScript is always an array of objects on success
return chrome.scripting.executeScript({target: {tabId}, func})
.catch(console.error);
}
function loadListeners() {
return new Promise(resolve => {
document.getElementById('getInfo').addEventListener('click', () => {
const result = [];
// ...add items to result
resolve(result);
});
});
}

How to wait in node.js for a variable available on the cloud to have a specific value

I'm sorry if this is a basic question, but I am trying to implement a program in node.js that should wait for the value of a variable available trough a request to a cloud api (photon.variable()) to be 1. This variable should not be requested more than once per second. My first attempt is included in the sample code below. Despite knowing it does not work at all, I think it could be useful to show the functionality I would like to implement.
var photondata = 0;
while (photondata < 1)
{
setTimeout(function () {
photon.variable("witok", function(err, data) {
if (!err) {
console.log("data: ", data.result);
photondata = data.result;
}
else console.log(err);
})}, 1000);
}
Since you couldn't do async stuff in loops before, the traditional approach would be to create a function that adds itself to setTimeout for as long as needed, then calls some other function when it's done. You still need to do this in the browser if not using Babel.
These days, you can stop execution and wait for things to happen when using a generator function (which latest versions of Node now support). There are many libraries that will let you do this and I will advertise ours :)
CL.run(function* () {
var photondata = 0;
while (true) {
yield CL.try(function* () {
var data = yield photon.variable("witok", CL.cb());
console.log("data: ", data.result);
photondata = data.result;
}, function* (err) {
console.log(err.message);
});
if (photondata >= 1) break;
yield CL.sleep(1000);
}
// do whatever you need here
});

rate limit of github using node.js

I am writing an app using node.js which sends a request to github and fetches the html page of github project issues. when I send more request for 40th page I am getting 429 response for request. how can i overcome this RATE limit of github?
function requestPage(pageNo){
var changedUrl = url+"?page="+pageNo+"&q=is%3Aissue+is%3Aopen"; //URL for requesting all the pages individually
request(changedUrl, function(error, response, html){ //requesting thee web page
if(error){
return error;
}
else{
var $ = cheerio.load(html);
if(pageNo == 40){
console.log(response.statusCode);
fs.writeFile("page.html", html ,'utf8',function(err){
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
}
//functions
}
});
}
for (var i = 1; i <= noOfPages; i++) {
requestPage(i);
}
using the right tool for the right job
First, please note that you should better use github API instead of requesting the HTML pages. It should make your work easier, as you'll directly get the data you are interested in in JSON format, without having to parse HTML.
Issues API endpoints are documented here.
You also should check whether github allows you to crawl their page, and make sure you don't overload their servers with your requests.
fixing your code
However, in case you want to continue the way you're doing, you arrange your code this way:
handle rate limiting
Inside request function callback, you should add a condition to check the response from github:
request(changedUrl, function(error, response, html){
if(error){
return error;
}
else{
if (response.headers.status === '429 Too Many Requests') {
setTimeout(function() {requestPage(pageNo)}, 54000000}) // retry in some delay (find out the appropriate timeout value)
}
else {
... //continue the processing
behaving well
Another change required in your code may avoid you from being rate-limited. It should also spare your CPU and github servers' too.
This is bad:
for (var i = 1; i <= noOfPages; i++) {
requestPage(i);
}
Reason: your are sending noOfPages requests to github quasi-simultaneously.
How to fix: use a recursive function instead of a for loop, and set a delay between the calls.
Fixed code:
function requestPage(pageNo){
var changedUrl = url+"?page="+pageNo+"&q=is%3Aissue+is%3Aopen"; //URL for requesting all the pages individually
request(changedUrl, function(error, response, html){ //requesting thee web page
if(error){
return error;
}
else{
if (response.headers.status === '429 Too Many Requests') {
// retry to request the same page after some delay
setTimeout(function() {requestPage(pageNo)}, 54000000})
}
else {
var $ = cheerio.load(html);
if(pageNo == 40){
console.log(response.statusCode);
fs.writeFile("page.html", html ,'utf8',function(err){
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
if (i < noOfPages) {
// request next page after a small delay
setTimeout(function() {requestPage(pageNo+1)}, 15000})
}
}
}
});
}
// request the first page immediately
requestPage(1);
I leave it up to you to cleanup the hard-coded if(pageNo == 40){, which probably should be if(pageNo == noOfPages){ and could be arranged with the following if clause.
In ES7 with babel, just one page at a time:
import req from 'request-promise';
async function getPages(urls) {
for (let url of urls) {
let html = await req(url);
console.log(html);
}

Return a value from phantomjs to nodejs

I'm using phantomjs using jquerygo library and am trying to this.
Visit a url
Click on a link and wait for it to load
Grab a particular tag and return it to nodejs for processing.
I realize that in phantomjs:
The execution is sandboxed, the web page has no access to the phantom object and it can't probe its own setting
But I should be able to return a simple string from the evaluate right?
But that is not working. My code is as follows:
var photogsScrapeCount = function(url, callback){
console.log("LOADED PHOTOGSSCRAPE Count");
url = decodeURIComponent(url);
//$.config.site = 'https://www.magnumphotos.com/';
$.config.addJQuery = false;
$.visit(url, function() {
$.waitForElement(".7n7np102",function() {
$.getPage(function(page) {
var imgCounterMinus = page.evaluate(function(){
$(".7n7np102 a").click(); // open the image enlarge
var temp = setTimeout(function(){
imgCounterMinus1 = $("span[id$='TotalPageCount_Lbl']").html();
imgCounterMinus1 = imgCounterMinus1.split(" ");
imgCounterMinus1 = imgCounterMinus1[2];
imgCounterMinus1 = parseInt(imgCounterMinus1);
console.log("imgCounterMinus1" + imgCounterMinus1);
return (imgCounterMinus1 - 3);
}, 4000);
return temp;
});
//console.log("After evaluate: " + imgCounterMinus)
});
});
});
};
Can this be achieved in any different way? The basic example from website is working so I am assuming that the setTimeout is giving me problems.
Any ideas or suggestions would be very helpful as I have very little experience in writing jquery, Js.
The docs say (emphasis mine):
For one, this library is not a complete API mirror of jQuery. Every API is asynchronous (due to its interaction with Phantom.js), so there are some differences.
There is also an example how page.evaluate() must be used. The result is not returned, but passed into a second callback. There is no way to return something from an asynchronous execution of a function except by using the callback. So the setTimeout syntax is also wrong.
$(".7n7np102 a").click(function(){
setTimeout(function(){
$.getPage(function(page) {
page.evaluate(function(){
var imgCounterMinus1 = $("span[id$='TotalPageCount_Lbl']").html();
imgCounterMinus1 = imgCounterMinus1.split(" ");
imgCounterMinus1 = imgCounterMinus1[2];
imgCounterMinus1 = parseInt(imgCounterMinus1);
console.log("imgCounterMinus1" + imgCounterMinus1);
return (imgCounterMinus1 - 3);
}, function(err, result){
console.log("After evaluate: " + result);
callback();
$.close();
});
});
}, 4000);
});

node.js redis async query

Hope someone can assist with a (simple) async question on node-redis. I'm trying to load a set from a hash in the redis db and then use that populated set further on. Here's the code snippet :-
var redis_client = redis.createClient(REDIS_PORT, REDIS_URL);
redis_client.hgetall(target_hash,function(e,o){
Object.keys(o).forEach(function(target){
// get the "name" from the hash
redis_client.hget(o[target],"name",function(e,o){
if (e){
console.log("Error occurred getting key: " + e);
}
else {
redis_client.sadd("newset",o);
}
});
});
// the following line prints nothing - why ??
redis_client.smembers("newset",redis.print);
When I examine the contents of "newset" in redis it is populated as expected, but at runtime it displayed as empty. I'm sure it's an async issue - any help much appreciated !
hgetall is an asynchronous call: when it receives a reply from the redis server, it will eventually call your callback function (target) { ... }. But within your script, it actually returns immediately. Since hgetall returns very fast, Node will immediately run the next statement, smembers. But at this point the sadd statements haven’t run yet (even if your system is very fast because there hasn’t been a context switch yet).
What you need to do is to make sure smembers isn’t called before all the possible sadd calls have executed. redis_client provides the multi function to allow you to queue up all the sadd calls and run a callback when they’re all done. I haven’t tested this code, but you could try this:
var redis_client = redis.createClient(REDIS_PORT, REDIS_URL);
redis_client.hgetall(target_hash, function(e, o) {
var multi = redis_client.multi();
var keys = Object.keys(o);
var i = 0;
keys.forEach(function (target) {
// get the "name" from the hash
redis_client.hget(o[target], "name", function(e, o) {
i++;
if (e) {
console.log("Error occurred getting key: " + e);
} else {
multi.sadd("newset", o);
}
if (i == keys.length) {
multi.exec(function (err, replies) {
console.log("MULTI got " + replies.length + "replies");
redis_client.smembers("newset", redis.print);
});
}
});
});
});
Some libraries have a built-in equivalent of forEach that allows you to specify a function to be called when the loop is all done. If not, you have to manually keep track of how many callbacks there have been and call smembers after the last one.
You shouldn't use multi unless you need actually need a transaction.
just keep a counter of the transactions and call smembers in the final callback
var redis_client = redis.createClient(REDIS_PORT, REDIS_URL);
var keys = Object.keys(o);
var i = 0;
redis_client.hgetall(target_hash,function(e,o){
Object.keys(o).forEach(function(target){
// get the "name" from the hash
redis_client.hget(o[target],"name",function(e,o){
i++
if (e){
console.log("Error occurred getting key: " + e);
}
else {
redis_client.sadd("newset",o);
if (i == keys.length) {
redis_client.smembers("newset", redis.print);
}
}});

Resources