I've just started using Puppeteer. I'm trying to parse a page, but the evaluate method doesn't seem to work.
var Browser
var Page
var Result

puppeteer.launch()
    .then(function (browser) {
        console.log('Browser Created\nCreating Blank Page')
        Browser = browser
        return Browser.newPage()
    })
    .then(function (page) {
        console.log('Page Created\nVisiting URL')
        Page = page
        return Page.goto(URL)
    })
    .then(function (resp) {
        console.log('Website Loaded')
        return Page.evaluate(function () {
            // Completely Sync Stuff
            console.log('Evaluating Selectors')
            var myElems = document.getElementsByClassName('challenge-type light')
            Result = myElems
        })
    })
    .then(function (val) {
        console.log(Result)
        console.log('Done! Exiting')
        Browser.close()
        process.exit()
    })
    .catch(function (err) {
        Browser.close()
        console.log(err)
        process.exit(1)
    })
Output:
Browser Created
Creating Blank Page
Page Created
Visiting URL
Website Loaded
undefined
Done! Exiting
What could possibly be the error? I'd prefer a solution without async/await.
EDIT: "Evaluating Selectors" is not logged to the console either, so my concern is that the code never reaches that point.
I would double check that
document.getElementsByClassName('challenge-type light')
returns a result.
I believe you're using a headless browser, so sometimes elements may not load as you might expect.
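If that's the issue, one way to rule it out while keeping the promise style from the question is to wait for the selector before evaluating, and return something serializable (here just a count) so it survives the trip back to Node. This is only a sketch; '.challenge-type.light' is just the CSS-selector form of the class list above:
.then(function (resp) {
    console.log('Website Loaded')
    // Wait until at least one matching element exists in the page
    return Page.waitForSelector('.challenge-type.light')
})
.then(function () {
    return Page.evaluate(function () {
        // Return a serializable value, not the elements themselves
        return document.getElementsByClassName('challenge-type light').length
    })
})
.then(function (count) {
    console.log('Matching elements:', count)
})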
Got things working finally.
console inside evaluate runs in the page context, so its output goes to the Chromium page's console, not to Node.
We also need to return something from the evaluate function; DOM elements won't be returned as-is because they lose their context outside evaluate.
This worked:
.then(function (resp) {
    console.log('Website Loaded')
    return Page.evaluate(function () {
        return document.querySelector('.cover-heading').innerText
    })
})
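As a side note, if you do want to see the console.log calls from inside evaluate in your Node terminal, recent Puppeteer versions let you forward the page's console (a small optional sketch, not part of the fix above):
// Forward the Chromium page's console output to the Node process
Page.on('console', function (msg) {
    console.log('PAGE LOG:', msg.text())
})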
OK, you are on the right path, but you have a few problems.
From your own answer: you noted that console logs inside the evaluate method execute in the page context. You are correct about that, but you are incorrect in saying that you can't return DOM elements from the evaluate method. You can; your code just isn't quite correct.
So what you have is this:
.then(function (resp) {
    console.log('Website Loaded')
    return Page.evaluate(function () {
        // Completely Sync Stuff
        console.log('Evaluating Selectors')
        var myElems = document.getElementsByClassName('challenge-type light')
        Result = myElems
    })
})
.then(function (val) {
    console.log(Result)
    console.log('Done! Exiting')
});
This won't work since you're trying to assign myElems to the Result variable inside the evaluate method. The evaluate method is executed in the browser. It has no idea that a Result variable exists in your puppeteer script. This is why your variable outputs as undefined at the end.
How to resolve this is as follows:
.then(function () {
    return Page.evaluate(function () {
        // Return the array of elements from inside the evaluate method
        return document.getElementsByClassName('challenge-type light')
    });
})
.then(function (elements) {
    console.log(elements) // Will be your array of elements
});
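If the collection doesn't come through intact (evaluate can only transfer serializable values between the page and Node), a variant of the same approach is to map the elements to plain data inside evaluate before returning. A sketch, assuming the element text is all you need:
.then(function () {
    return Page.evaluate(function () {
        // Map the live HTMLCollection to plain, serializable values before returning
        var elems = document.getElementsByClassName('challenge-type light')
        return Array.prototype.map.call(elems, function (el) {
            return el.textContent
        })
    })
})
.then(function (texts) {
    console.log(texts) // plain array of strings, available in Node
});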
Hopefully this helps!
Related
Ultimately, I'm trying to get the results of two different data calls into res.render for one page. Along the way, I use console.log to check whether the data could eventually be sent to the page from res.render. 1 out of the 5 console.logs does not work (#4), and I don't understand why.
Here's what I have...
server.js code
const uri = 'neo4j address';
const user = 'uname';
const password = 'pword';
const driver = neo4j.driver(uri, neo4j.auth.basic(user, password));
const session = driver.session({ database: 'dbname' });

app.get('/page', async (req, res) => {
    try {
        const dbTitleResult = await session.run(`MATCH (db:Database)
            RETURN db.wikiPageId as dbWikiPageId, db.Title as dbTitle Order By db.Title ASC`);
        const dbTitleArr = dbTitleResult.records.map(({_fields}) => {
            return {dbWikiPageId: _fields[0], dbTitle: _fields[1]};
        });
        const wikiPage = request('https://uname:pword!#wiki.com/api/content/id/child?expand=page', function (error, response, body) {
            if (error) throw new Error(error);
            const wikiAPI = JSON.parse(body);
            //console.log(wikiAPI.page.results); //#1 THIS WORKS AND RETURNS THE EXPECTED DATA
            const wikiData = wikiAPI.page.results.map(item => item);
            //console.log(wikiData); //#2 THIS WORKS AND RETURNS THE EXPECTED DATA
        });
        //console.log(dbTitleArr); //#3 THIS WORKS AND RETURNS THE EXPECTED DATA
        //console.log(wikiData); //#4 THIS DOES NOT WORK
        res.render('page.ejs', {dbTitle: dbTitleArr});
        //console.log(dbTitleArr); //#5 THIS WORKS AND RETURNS THE EXPECTED DATA
    } catch (e) {
        console.log("Something went wrong", e);
    }
});
This is what seems inconsistent and what I need help with: console.log #3 succeeds but console.log #4 does not. #4 gives the error Something went wrong ReferenceError: wikiData is not defined, yet wikiData would seem to be defined in the same way that dbTitleArr is defined.
How do I make #4 console.log-able? Or why is it not console.log-able like #3?
For actually using the returned data, I can res.render the dbTitleArr data (and I can view it on page.ejs), but I cannot res.render the wikiData data.
What do I need to do to see #4 console.log wikiData and ultimately res.render it?
I was able to ultimately res.render both sets of data, but the #4 console.log still does not work. I realize that was the initial question, so I'll give the answer points to someone else if they can explain why #3 works but #4 doesn't.
...
What I needed to do was move my res.render into the request code (and I also removed the const wikiPage in front of the request).
app.get('/page', async (req, res) => {
    try {
        const dbTitleResult = await session.run(`MATCH (db:Database)
            RETURN db.wikiPageId as dbWikiPageId, db.Title as dbTitle Order By db.Title ASC`);
        const dbTitleArr = dbTitleResult.records.map(({_fields}) => {
            return {dbWikiPageId: _fields[0], dbTitle: _fields[1]};
        });
        request('https://uname:pword!#wiki.com/api/content/id/child?expand=page', function (error, response, body) {
            if (error) throw new Error(error);
            const confluenceAPI = JSON.parse(body);
            //console.log(confluenceAPI.page.results); //#1 WORKS
            const wikiData = confluenceAPI.page.results.map(item => item);
            //console.log(wikiData); //#2 WORKS
            res.render('page.ejs', {dbTitle: dbTitleArr, wikiData});
        });
        //console.log(dbTitleArr); //#3 WORKS
        //console.log(wikiData); //#4 STILL DOES NOT WORK, BUT OK SINCE IT RENDERS
    } catch (e) {
        console.log("Something went wrong", e);
    }
});
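For what it's worth, the likely reason #3 works and #4 doesn't: wikiData is declared with const inside the request callback, so it simply doesn't exist in the route handler's scope where #4 runs (and the callback hasn't fired yet at that point anyway), whereas dbTitleArr is declared directly in the handler. A hedged sketch of an alternative that keeps everything in one scope, using the same request library (getWikiData is a hypothetical helper name, not from the original code):
function getWikiData(url) {
    return new Promise((resolve, reject) => {
        request(url, (error, response, body) => {
            if (error) return reject(error);
            resolve(JSON.parse(body).page.results);
        });
    });
}

// ...inside the async route handler, after building dbTitleArr:
const wikiData = await getWikiData('https://uname:pword!#wiki.com/api/content/id/child?expand=page');
console.log(wikiData); // now defined here, in the same scope as dbTitleArr
res.render('page.ejs', {dbTitle: dbTitleArr, wikiData});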
I am trying to create a script to download pages from multiple URLs using Node.js, but the loop doesn't wait for the request to finish and just keeps going. I got a hint to use an async for loop, but it still didn't work.
Here's my code:
function GetPage(url) {
    console.log(` Downloading page ${url}`);
    request({
        url: `${url}`
    }, (err, res, body) => {
        if (err) throw err;
        console.log(` Writing html to file`);
        fs.writeFile(`${url.split('/').slice(-1)[0]}`, `${body}`, (err) => {
            if (err) throw err;
            console.log('saved');
        });
    });
}

var list = ['https://www.someurl.com/page1.html', 'https://www.someurl.com/page2.html', 'https://www.someurl.com/page3.html']

const main = async () => {
    for (let i = 0; i < list.length; i++) {
        console.log(` processing ${list[i]}`);
        await GetPage(list[i]);
    }
};

main().catch(console.error);
Output:
processing https://www.someurl.com/page1.html
Downloading page https://www.someurl.com/page1.html
processing https://www.someurl.com/page2.html
Downloading page https://www.someurl.com/page2.html
processing https://www.someurl.com/page3.html
Downloading page https://www.someurl.com/page3.html
Writing html to file
Writing html to file
saved
saved
Writing html to file
saved
There are a couple of problems with your code.
You are mixing callback-style code with code that should be using promises. Also, your getPage function is not async (it doesn't return a promise), so you cannot await it.
You just have to return a promise from your getPage() function and resolve or reject it correctly.
function getPage(url) {
    return new Promise((resolve, reject) => {
        console.log(` Downloading page ${url}`);
        request({ url: `${url}` }, (err, res, body) => {
            if (err) return reject(err);
            console.log(` Writing html to file`);
            fs.writeFile(`${url.replace(/\//g, '-')}.html`, `${body}`, (writeErr) => {
                if (writeErr) return reject(writeErr);
                console.log("saved");
                resolve();
            });
        });
    });
}
You don't have to change your main() function; the loop will now await the promise returned by getPage().
A for loop doesn't wait for a callback to finish; it just keeps executing. You need to either turn the getPage function into a promise or use Promise.all, as shown below.
var list = [
    "https://www.someurl.com/page1.html",
    "https://www.someurl.com/page2.html",
    "https://www.someurl.com/page3.html",
];

function getPage(url) {
    return new Promise((resolve, reject) => {
        console.log(` Downloading page ${url}`);
        request({ url: `${url}` }, (err, res, body) => {
            if (err) return reject(err);
            console.log(` Writing html to file`);
            fs.writeFile(`${url}.html`, `${body}`, (writeErr) => {
                if (writeErr) return reject(writeErr);
                console.log("saved");
                resolve();
            });
        });
    });
}

const main = async () => {
    return new Promise((resolve, reject) => {
        let promises = [];
        list.map((path) => promises.push(getPage(path)));
        Promise.all(promises).then(resolve).catch(reject);
    });
};
main().catch(console.error);
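A small follow-up on the design: the extra new Promise wrapper around Promise.all isn't strictly needed, since Promise.all already returns a promise. The main function can shrink to this equivalent sketch:
const main = () => Promise.all(list.map(getPage));

main().catch(console.error);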
GetPage() is not built around promises and doesn't even return a promise, so awaiting its result does NOTHING. await has no magic powers. It awaits a promise. If you don't give it a promise that properly resolves/rejects when your async operation is done, then the await does nothing. Your GetPage() function returns nothing, so the await has nothing to do.
What you need is to fix GetPage() so it returns a promise that is properly tied to your asynchronous result. Because the request() library has been deprecated and is no longer recommended for new projects and because you need a promise-based solution anyway so you can use await with it, I'd suggest you switch to one of the alternative promise-based libraries recommended here. My favorite from that list is got(), but you can choose whichever one you like best. In addition, you can use fs.promises.writeFile() for promise-based file writing.
Here's how that code would look using got():
const got = require('got');
const { URL } = require('url');
const path = require('path');
const fs = require('fs');

function getPage(url) {
    console.log(` Downloading page ${url}`);
    return got(url).text().then(data => {
        // can't just use a URL for your filename as it contains potentially illegal
        // characters for the file system
        // so, add some code to create a sanitized filename here

        // find just the root filename in the URL
        let urlObj = new URL(url);
        let filename = path.basename(urlObj.pathname);
        if (!filename) {
            filename = "index.html";
        }
        let extension = path.extname(filename);
        if (!extension) {
            filename += ".html";
        } else if (extension === ".") {
            filename += "html";
        }
        console.log(` Writing file ${filename}`);
        return fs.promises.writeFile(filename, data);
    });
}

const list = ['https://www.someurl.com/page1.html', 'https://www.someurl.com/page2.html', 'https://www.someurl.com/page3.html'];

async function main() {
    for (let url of list) {
        console.log(` processing ${url}`);
        await getPage(url);
    }
}

main().then(() => {
    console.log("all done");
}).catch(console.error);
If you put real URLs in the array, this is directly runnable in nodejs. I ran it myself with my own URLs.
Summary of Changes and Improvements:
Switched from request() to got() because it's promise-based and not deprecated.
Modified getPage() to return a promise that represents the asynchronous operations in the function.
Switched to fs.promises.writeFile() so we are using only promises for asynchronous control-flow.
Added legal filename generation from the base path of the URL since you can't just use a full URL as a filename (at least in some file systems).
Switched to a simpler for/of loop.
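One possible follow-up on the design choice: the for/of loop downloads one page at a time. If sequential pacing isn't required, the same getPage() can be run concurrently with Promise.all (a sketch):
async function main() {
    // Start all downloads at once and wait for every one to finish
    await Promise.all(list.map(getPage));
}

main().then(() => {
    console.log("all done");
}).catch(console.error);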
I am trying to access what a function returns in node.js
I have the following function:
function getImg(callback) {
    https.get('https://api.nasa.gov/planetary/apod?api_key=api-key', response => {
        let data = "";
        response.on('data', chunk => {
            data += chunk;
        });
        response.on('end', () => {
            let img = JSON.parse(data).hdurl;
            callback(null, img);
        })
    }).end();
}

let image = getImg(function (err, image) {
    console.log(image);
})

res.render('index', {
    indexCSS: true,
    image
})
It can log the value to the console correctly, but if I try to access the value of the variable like I do in the last lines of my code, or if I console.log(image), I get undefined.
What have I done wrong? How can I access what the function produces?
It is a callback-style function, which won't return anything. It's better to convert it to a promise style and use async/await to get the value into a variable:
function getImg() {
    return new Promise((resolve, reject) => {
        https.get('https://api.nasa.gov/planetary/apod?api_key=api-key', response => {
            let data = "";
            response.on('data', chunk => {
                data += chunk;
            });
            response.on('end', () => {
                let img = JSON.parse(data).hdurl;
                resolve(img);
            })
        }).end();
    });
}

(async () => {
    let image = await getImg();
    res.render('index', {
        indexCSS: true,
        image
    });
})();
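Since res only exists inside a route handler, in practice the async IIFE above would more naturally be the handler itself. A sketch, assuming an Express app and a made-up route path:
app.get('/', async (req, res) => {
    const image = await getImg();
    res.render('index', {
        indexCSS: true,
        image
    });
});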
You can't really store the return value of your function like that. JS is non-blocking, so execution continues past that line before the https request has had a chance to produce a value. I'm not sure exactly where you call this function, but you could call res.render inside the callback after calling getImg(), without assigning its return value to anything. You can use promises; otherwise, it's better to handle the response you need inside the callback when it arrives. That would just be a simple call like:
getImg(function (err, image) {
    res.render('index', {
        indexCSS: true,
        image
    });
})
This goes inside whatever route is calling this function. You just cannot assign the value produced by a callback to a variable in the normal way (or at least, it's really not recommended).
I'm using PhantomJS with phridge to render the pages of a single-page app correctly for sharing.
Here's a code example:
if (isbot(req.headers['user-agent'])) {
    var url = req.protocol + '://' + req.get('host') + req.originalUrl
    phridge.spawn()
        .then(function (phantom) {
            var page = phantom.createPage();
            return page.run(url, function (url, resolve, reject) {
                var page = this;
                page.open(url, function (status) {
                    // handle page after load
                });
            })
            .then(function (contnt) {
                res.send(contnt);
            })
            .then(phridge.disposeAll());
            .catch(function (err) {
                console.error(err.stack);
            })
}
else {
    next();
}
The question is: how does res.send() behave within a promise chain? Will phridge.disposeAll() be executed?
You are making numerous mistakes. You should make sure you are familiar with promise-style programming before writing this kind of code. See the last section.
In this case, no, because the
.then(function (contnt) {
    res.send(contnt);
})
part is not returning a Promise.
In this part, if you are sure res.send will not raise any exception, you could write:
.then(function (contnt) {
    res.send(contnt);
    return Promise.resolve()
})
And the later part,
.then(phridge.disposeAll())
is also problematic; you should modify it to
.then(() => phridge.disposeAll())
even if it is the end of the chain and there is no need to create a new Promise. You should write it this way because then() takes a function, not a result, as its argument.
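To illustrate the difference with something standalone (a tiny sketch, not phridge-specific):
function cleanup() { console.log('cleanup ran'); }

Promise.resolve('value')
    .then(cleanup())          // cleanup() is called immediately; .then() gets undefined and ignores it
    .then(() => cleanup());   // .then() gets a function and calls it only after the previous step settles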
And you need to make sure each .then() callback returns a Promise-like object while you are chaining them. (I did not check the others since I don't know what they return.)
OK, there are more errors; I can see a redundant ; after a then() branch, and I am not sure whether there are further problems.
I think the underlying problem is more serious: you are not yet comfortable with promise-style programming. You should first closely read the ES6 Promise docs, or the docs of the promise library involved (such as Bluebird, depending on what your library depends on).
I would combine the res.send and disposeAll(). No need to overcomplicate the code.
res.send is synchronous and returns a boolean.
phridge.spawn()
    .then(function (phantom) {
        var page = phantom.createPage();
        return page.run(url, function (url, resolve, reject) {
            var page = this;
            page.open(url, function (status) {
                // handle page after load
            });
        })
        .then(function (contnt) {
            res.send(contnt);
            phridge.disposeAll()
        })
        .catch(function (err) {
            console.error(err.stack);
        })
    })
I'm wondering how to execute multiple "describe" blocks in sequence, one after another.
describe('Go To Home Page', function () {
    browser.ignoreSynchronization = true;
    it("Url should be on Login Page", function () {
        browser.get(HomePageUrl).then(function () {
            browser.wait(urlChanged(loginPageUrl), 2000).then(
                function (newurl) {
                    var url = browser.getCurrentUrl().then(function (url) {
                        expect(url).toBe(loginPageUrl);
                        //I know I'm at Login page url... how do I kick off next test?
                    });
                }
            )
        });
    });
});
This test goes to the Home page; if it is redirected to the Login page, I then want to execute all my Login tests using new describe blocks. The problem is that if I put the next describe block at the same level as the first, Node executes all of them in parallel.
I'd prefer not to get into callback hell... the code above is already nested too deeply in my opinion.
Jasmine has asynchronous support exactly for that.
describe('Go To Home Page', function () {
    browser.ignoreSynchronization = true;

    it("Url should be on Login Page", function (done) {
        browser.get(HomePageUrl).then(function () {
            browser.wait(urlChanged(loginPageUrl), 2000).then(
                function (newurl) {
                    var url = browser.getCurrentUrl().then(function (url) {
                        expect(url).toBe(loginPageUrl);
                        done();
                    });
                }
            )
        });
    });

    it("next", function () {
        //will start this block after previous done callback has been called
    });
});
P.S. Promises can and should be chained in a single flat chain to avoid nesting:
it("Url should be on Login Page", function (done) {
browser
.get(HomePageUrl)
.then(function () {
return browser.wait(urlChanged(loginPageUrl), 2000);
})
.then(function () {
return browser.getCurrentUrl();
})
.then(function (currentUrl){
expect(currentUrl).tobe(loginPageUrl);
done();
});
});
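One more hedged note: if any step in that chain rejects, done is never called and the spec only fails once Jasmine's timeout expires. In Jasmine 2.1+ you can route rejections to done.fail, e.g. by ending the chain like this (sketch of just the tail):
        .then(function (currentUrl) {
            expect(currentUrl).toBe(loginPageUrl);
        })
        .then(done)
        .catch(done.fail); // fail the spec immediately instead of waiting for the timeout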