PhantomJS includeJS not working - node.js

I'm trying to use PhantomJS in order to get data from a website like this:
app.get('/scrape', function(req, res) {
var _ph, _page;
phantom.create()
.then(function (ph) {
_ph = ph;
return ph.createPage();
})
.then(function (page) {
_page = page;
var url = "https://banc.nl/";
return page.open(url);
})
.then(function(page) {
page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js', function() {
page.evaluate(function() {
$('.sc-dnqmqq kaMmif').filter(function () {
var data = $(this);
price = data.children().first().text();
console.log("Price: " + price);
});
});
});
})
.catch(function(err) {
console.log("Error: " , err);
_page.close();
//_ph.exit();
});
});
The problem is that I get the following error: Error: TypeError: page.includeJs is not a function
If I uncomment the ph.exit() I will get a warning also: warn: exit() was called before waiting for commands to finish. Make sure you are not calling exit() prematurely.
I found several questions about this in SO but non of the answers solved my problem.

The latest version of phantomjs-node module uses Promises so we can rewrite scripts using async functions with await operator — much more readable:
var phantom = require('phantom');
var url = 'https://www.huobipro.com';
(async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onConsoleMessage', function(msg) {
console.info(msg);
});
await page.on('onError', function(msg) {
console.info(msg);
});
const status = await page.open(url);
await console.log('STATUS:', status);
// Wait a bit for javascript to load and run
await new Promise(resolve => setTimeout(resolve, 3000))
await page.includeJs('https://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js');
await page.evaluate(function() {
$('span[price]').filter(function () {
var data = $(this);
console.log("Price: " + data.text());
});
});
await instance.exit();
})();

Related

Cannot conect to redis in Node

I'm trying to create a basic caching app just to test redis. Im using Redis Version: 4.0.6.
First I was getting error clientclosederror: the client is closed.
Then, after reading the docs, I added
let client;
(async ()=> {
client = redis.createClient()
await client.connect()
})();
But now, when trying on Postman, it just hangs, no response is returned
Full Code:
const express = require("express");
const redis = require("redis");
const axios = require('axios')
const app = express();
let client;
(async ()=> {
client = redis.createClient()
await client.connect()
})();
app.get('/result', async (req, res) => {
const searchTerm = req.query.name;
try {
await client.get(searchTerm, async (err, result) => {
console.log('cached called')
if (err) throw err;
if (result) {
res.status(200).send({
result: JSON.parse(result),
message: "data retrieved from the cache"
});
}
else {
const result = await axios.get(`https://api.agify.io/?name=${searchTerm}`);
await client.set(searchTerm, JSON.stringify(result.data));
return res.status(200).send({
result: result.data,
message: "cache miss"
});
}
})
} catch (error) {
console.log('get error', error)
return res.status(500).send({ message: error.message })
}
})
app.listen(process.env.PORT || 3000, () => {
console.log("Node server started");
});
client.get doesn't need a callback function. It's async. My guess is that it's never getting called and thus Express is not returning anything.
Try this instead:
const result = await client.get('foo')
if (result !== null) {
// it's a hit
} else {
// it's a miss
}

How to use a variable outside of its scope in node.js and nightmare.js (web scraping)

How to use a variable outside of its scope in node.js and nightmare.js (web scraping)
When i try to use the variable 'downloadLink' out of the await scope, it returns as undefined.
app.post('/search', function(req, res){
const val = req.body.searchText;
const nightmare = new Nightmare({
show: true
});
(async function() {
const downloadLink = await nightmare
.viewport(1200, 700)
.goto('https://google.com/')
.insert('#selector0')
.click('#selector1')
.click('#selector2')
.evaluate(() => document.querySelector('#selector3').href)
.end()
.catch((err) => {
console.log(err)
})
console.log('download link ' + downloadLink) //this line prints a string
})();
console.log('download link ' + downloadLink) //this line returns undefined
})
can i use 'downloadLink' outside of its scope and print it using the latter line of code??
Express supports async handlers, so you can refactor your method like this. No need to put the code under an async IIFE.
app.post('/search', async function(req, res){
try {
const val = req.body.searchText;
const nightmare = new Nightmare({
show: true
});
const downloadLink = await nightmare
.viewport(1200, 700)
.goto('https://google.com/')
.insert('#selector0')
.click('#selector1')
.click('#selector2')
.evaluate(() => document.querySelector('#selector3').href)
.end()
console.log('download link ' + downloadLink);
} catch (err) {
console.error(err.message);
}
});

NodeJS https module create reusable function to send request and handle response dynamically

This question is similar to this older question but I was not able to get the accepted answer to work correctly.
I am using the built-in NodeJS 'https' module to make requests to an external API. NodeJS version 12.
node: 12.16
express: 4.16.1
I was able to get it working with the example code from the documentation.
router.get('/', (req, res, next) => {
const requestOptions = httpCtrl.getReqOptions();
// Working example
// How to separate this logic into reusable function?
const request = https.request(requestOptions, (response) => {
let result = {
status: null,
data: {}
};
let rawData = '';
response.on('data', (chunk) => {
rawData += chunk;
});
response.on('end', () => {
console.log('No more data in response.');
try {
parsedData = JSON.parse(rawData);
result.status = parsedData.status || 200;
result.data = parsedData;
return res.status(result.status).json(result);
} catch (e) {
result.status = 500;
result.data.message = `ERROR: Unable to parse API response`;
result.data.exception = e;
return res.status(result.status).send(result);
}
});
});
request.on('error', (e) => {
result.status = 500;
result.data.message = `ERROR: API response`;
result.data.exception = e;
return res.status(result.status).send(result);
});
request.end();
});
However, I want to break out this logic into a reusable function, and just pass it the request options dynamically.
I tried just creating a synchronous function wrapper and returning the results, but obviously that didn't work because the sync function does not wait for the completion of the async request.
httpCtrl = {};
httpCtrl.createRequest = (requestOptions) => {
// Does not work due to being synchronous, also tried with async await to no avail
const request = https.request(requestOptions, (response) => {
let result = {
status: null,
data: {}
};
let rawData = '';
response.on('data', (chunk) => {
rawData += chunk;
});
response.on('end', () => {
console.log('No more data in response.');
try {
parsedData = JSON.parse(rawData);
result.status = parsedData.status || 200;
result.data = parsedData;
return result;
} catch (e) {
result.status = 500;
result.data.message = `ERROR: Unable to parse NRS Admin API response`;
result.data.exception = e;
return result;
}
});
});
request.on('error', (e) => {
result.status = 500;
result.data.message = `ERROR: API response`;
result.data.exception = e;
return result;
});
request.end();
});
}
router.get('/', (req, res, next) => {
const requestOptions = httpCtrl.setRequestOptions();
const result = httpCtrl.createRequest(requestOptions);
return res.status(result.status).send(result);
});
How can I update this code to be more re-usable?
Transform createRequest function to a promise, promises work like callbacks except they are much better to read.
// *** createReuqest function is a Promise ***
httpCtrl.createRequest = (requestOptions) => {
return new Promise((resolve, reject) => {
const result = {};
// *** http.request function is a Callback ***
const request = http.request(requestOptions, response => {
let rawData = '';
response.on('data', chunk => rawData += chunk);
// resolve the promise when response ends
response.on('end', () => {
result.status = response.status || 200;
result.data = rawData;
resolve(result);
});
});
// or reject on error
request.on('error', e => {
result.status = 500;
result.data = {
message: 'ERROR: API response',
exception: e
};
reject(result);
});
request.end();
});
};
Now we simply call the function and we chain it with then and catch, however, I choose to use async/await to include all asynchronous JavaScript in this example :) async/await is based on promises but with even cleaner markup.
// *** Finally async/await ***
router.get('/', async (req, res) => {
// initial options for testing
const requestOptions = {
hostname: 'www.google.com',
port: 443,
method: 'GET'
};
// await must be in try/catch to properly handle promise's resolve/reject
try {
const response = await httpCtrl.createRequest(requestOptions);
res.status(response.status).send(response.data);
} catch (error) {
res.status(error.status).send(error.data);
}
});
Hope I've helped.

Using async await with setImmediate

I want to know the how to use setImmediate with async await and handle errors properly. I have written following code. But I am not sure it is adhering to the best practices.
There is a route in my express app
router.get('/parseinvoice', async (req, res, next) => {
try {
const parsedInvoiceResponse = await userhelper.getParseInVoiceList();
res.json({parsedInvoiceResponse})
} catch (error) {
res.json({});
}
});
The userhelper class code
var userhelper = {};
const fs = require('fs'),
path = require('path'),
filePath = path.join(__dirname, './input_user_story_12.txt');
const { promisify } = require('util')
const readFile = promisify(fs.readFile);
userhelper.getParseInVoiceList = async function() {
return new Promise( async (resolve, reject) => {
try {
setImmediate(async function() {
try {
const contents = await readFile(filePath, 'UTF-8');
resolve(contents);
} catch (error) {
reject(error);
}
});
} catch (error) {
reject(error);
}
});
}
module.exports = userhelper;
Although I am getting the response. I am not sure about the setImmediate part, whether the multiple try catch are required. Is there any neat way to write the below code?.
try {
setImmediate(async ()=>{
var res = await readFile(filePath, 'UTF-8');
})
} catch(err) {
}
2.
await setImmediate(()=>{
var res = await readFile(filePath, 'UTF-8');
}).catch(){}
3.
try {
await setImmediate(()=>{
await readFile(filePath, 'UTF-8');
}).catch(){}
} catch() {
}
should return result into res
const res = await setImmediate(()=>{
return readFile(filePath, 'UTF-8');
})
Why are you not just using?
userhelper.getParseInVoiceList = async function() {
return await readFile(filePath, 'UTF-8');
}
Expanding on #Dan D.'s answer, you can await the resolution of an asynchronous setImmediate prior to calling the asynchronous promisified readFile, but I am not sure why you would need to do this without more context.
userhelper.getParseInVoiceList = async function() {
await new Promise((resolve) => setImmediate(() => resolve()));
return await readFile(filePath, 'UTF-8');
}

Returning wrong URL

I am using phantomejs-node for facebook login . Here is my nodejs code :
var phantom = require('phantom');
phantom.create(function(ph) {
ph.createPage(function(page) {
page.open("https://facebook.com", function(status) {
setTimeout(function () {
page.evaluate((function(URL) {
document.getElementById("email").value = "wrong username";
document.getElementById("pass").value = "wrong password";
document.getElementById("u_0_1").click();
return document.URL;
}), function(result) {
console.log('Page url is ' + result);
ph.exit();
}, 5000);
});
});
//page.render("page2.png");
});
});
Instead of returning https://www.facebook.com/login.php?login_attempt=1 , its returning https://www.facebook.com/ . By the way here is Phantomjs code that I am following :
var page = require('webpage').create();
page.open("http://www.facebook.com/login.php", function(status) {
if (status === "success") {
page.evaluate(function() {
document.getElementById("email").value = "#gmail.com";
document.getElementById("pass").value = "";
document.getElementById("u_0_1").click();
});
window.setTimeout(function() {
var url = page.evaluate(
function () {
return document.URL;
}
);
console.log( "- current url is " + url );
page.render("page.png");
phantom.exit();
}, 5000);
}
});
Try this code:
var phantom = require('phantom');
phantom.create(function(ph) {
ph.createPage(function(page) {
page.open("https://facebook.com", function(status) {
page.evaluate((function() {
document.getElementById("email").value = "#gmail.com";
document.getElementById("pass").value = "password";
document.getElementById("login_form").submit();
return;
}), function() {
console.log("loaded");
setTimeout(function(){
page.evaluate(function () {
return document.URL;
},function(result){
page.render("page2.png",function(){
console.log("done rendering");
});
console.log("Page url is "+result);
ph.exit();
});
},6000)
});
});
});
});
Hope this is helpfull :)
If you're tired of callback hell you could also give phridge a try. I've written this bridge because I didn't want to wrap all assignments and function calls with callbacks. It stringifies the given function and runs it inside PhantomJS.
A-0-'s solution would look like:
var page;
// creates a new PhantomJS process
phridge.spawn()
.then(function (phantom) {
return phantom.openPage("https://facebook.com");
})
.then(function (p) {
page = p;
return page.run(function (resolve) {
// this function runs inside PhantomJS
var page = this;
page.evaluate(function () {
document.getElementById("email").value = "#gmail.com";
document.getElementById("pass").value = "password";
document.getElementById("login_form").submit();
});
setTimeout(function () {
page.render("page2.png");
resolve();
}, 6000);
});
})
.then(function () {
// page2.png rendered
});

Resources