document.querySelectorAll('.summary').innerText;
This throws an error in the below snippet saying "document.querySelector is not a function" in my Puppeteer page's exposed fucntion docTest.
I want to pass a specific node to each method and get the result inside evaluate.
Same with document.getElemenetbyId.
const puppeteer = require('puppeteer');
//var querySelectorAll = require('query-selector');
let docTest = (document) => {
var summary = document.querySelectorAll(.summary).innerText;
console.log(summary);
return summary;
}
let scrape = async () => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('http://localhost.com:80/static.html');
await page.waitFor(5000)
await page.exposeFunction('docTest', docTest);
var result = await page.evaluate(() => {
var resultworking = document.querySelector("tr");
console.log(resultworking);
var summary = docTest(document);
console.log(resultworking);
return summary;
});
console.log(result);
await page.waitFor(7000);
browser.close();
return {
result
}
};
scrape().then((value) => {
console.log(value); // Success!
});
I just had the same question. The problem is that the page.evaluate() function callback has to be an async function and your function docTest() will return a Promise when called inside the page.evaluate(). To fix it, just add the async and await keywords to your code:
await page.exposeFunction('docTest', docTest);
var result = await page.evaluate(async () => {
var summary = await docTest(document);
console.log(summary);
return summary;
});
Just remember that page.exposeFunction() will make your function return a Promise, then, you need to use async and await. This happens because your function will not be running inside your browser, but inside your nodejs application.
exposeFunction() does not work after goto()
Why can't I access 'window' in an exposeFunction() function with Puppeteer?
How to use evaluateOnNewDocument and exposeFunction?
exposeFunction remains in memory?
Puppeteer: pass variable in .evaluate()
Puppeteer evaluate function
allow to pass a parameterized funciton as a string to page.evaluate
Functions bound with page.exposeFunction() produce unhandled promise rejections
How to pass a function in Puppeteers .evaluate() method?
How can I dynamically inject functions to evaluate using Puppeteer?
Related
I am trying to scrape https://www.premierleague.com/clubs/38/Wolverhampton-Wanderers/stats?se=274
The results being returned are for the page minus the ?se=274
This is applied by using the filter dropdown on the page and selecting 2019/20 season. I can navigate directly to the page and it works fine, but through code it does not work.
I have tried in cheerio and puppeteer. I was going to try nightmare too but this seems overkill I think. I am clearly not an expert! ;)
function getStats(callback){
var url = "https://www.premierleague.com/clubs/38/Wolverhampton-Wanderers/stats?se=274";
request(url, function (error, response, html) {
//console.log(html);
var $ = cheerio.load(html);
if(!error){
$('.allStatContainer.statontarget_scoring_att').filter(function(){
var data = $(this);
var vSOT = data.text();
//console.log(data);
console.log(vSOT);
});
}
});
callback;
}
This will return 564 instead of 2
It seems like you're calling callback before request returns. Move the callback call into the internal block, where the task you need is completed (in your case, it looks like the filter block).
It also looks like you're missing the () on the callback call.
Also, a recommendation: return the value you need through the callback.
So this code works....$10 from a rent-a-coder did the trick. Easy when you know how!
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto('https://www.premierleague.com/clubs/4/Chelsea/stats?se=274')
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))
await sleep(4000)
const element = await page.$(".allStatContainer.statontarget_scoring_att");
const text = await page.evaluate(element => element.textContent, element);
console.log("Shots on Target:"+text)
browser.close()
})()
I'm using tesseract JS to convert an image into text format. The conversion is successful and I'm able to print it out in the console. But I am unable to get this text outside the scope of the function.
I have tried assigning the text to a global variable and then printing it but nothing happens.
(async () => {
tesseract.process('new.png', (err, text) => {
if(err){return console.log("An error occured: ", err); }
console.log("Recognized text:",text);
});
})();
Need to be able to get the value of text outside the function and use it again in another asynchronous call.
If you use asynchronous operations, like Promise, callback, async-await you cannot use synchronous flow anymore.
Think it like this, Asynchronous functions are operations that will be completed in future, you want some value out of it then you CANNOT get the value untill the first asynchronous function is completed.
That being said, you CAN use Promises (seem) like synchronous functions if you use aysnc-await, IF you don't want to use Promise chain. So you need to promisify the tesseract.process function:
const utils = require('util');
(async () => {
const tessProcess = utils.promisify(tesseract.process);
try {
const text = await tessProcess('new.png');
console.log("Recognized text:", text);
} catch (err) {
console.log("An error occured: ", err);
}
})();
EDIT: After checking the code snippet:
const utils = require('util');
(async () => {
const browser = await puppeteer.launch({headless: false})
const page = await browser.newPage()
const tessProcess = utils.promisify(tesseract.process);
await page.setViewport(viewPort)
await page.goto('example.com')
await page.screenshot(options)
const text = await tessProcess('new.png');
//YOU CAN USE text HERE/////////////
await page.$eval('input[id=companyID]', (el, value) => el.value = value, text);//here too
await browser.close()
})()
I'm trying to get the src values for all images on Bing image search for a search term. I am using puppeteer for it. I wrote a selector to grab each image tag and it works in the Chrome DevTools. It, however, isn't working when I write it in the code-
const puppeteer = require("puppeteer");
(async () => {
try{
let url = `https://www.bing.com/images/search?q=cannabis`
const browser = await puppeteer.launch({headless: false})
const page = await browser.newPage()
await page.goto(url)
await page.waitForSelector("ul.dgControl_list li img.mimg")
console.log(await page.evaluate(() => {
Array.from(document.querySelectorAll("ul.dgControl_list>li img.mimg"), img => img.src)
}))
} catch(err){
console.log("error - " + err)
}
})()
I get the output as an object containing arrays of 10 items each in the devTools, but when I run it in the console through my code, it is undefined. How do I read this object?
You are not returning any data from the page.evaluate call. To return the data you have to use the return statement or use the short syntax (as explained below):
console.log(await page.evaluate(() => {
return Array.from(document.querySelectorAll("ul.dgControl_list>li img.mimg"), img => img.src)
}))
Explanation: Arrow function
The arrow function has two ways to write them. One is the short syntax, you can use it like this:
const func = () => 1; // func() will simply return 1
You can only put in one statement in there (which might call other statements though). Alternatively, you can use the long form:
const func = () => { return 1; }; // Same function as above
You can use variable declarations and any kind of code inside this function (just as in a normal function() { ... }, but this time you have to use return to return a value.
Therefore, as an alternative, you could also write this (short syntax):
console.log(await page.evaluate(
() => Array.from(document.querySelectorAll("ul.dgControl_list>li img.mimg"), img => img.src)
))
I have written the following code to retrieve song lyrics from the apiseeds lyric api.
const apiseeds = require("apiseeds-lyrics");
const apiseedskey = "MY_API_KEY";
async function getLyrics(artistName, songName)
{
return await apiseeds.getLyric(apiseedskey, artistname, songName,
(response) => {
return response;
});
}
var artist = "Darius Rucker";
var title = "Wagon Wheel";
var lyrics = await getLyrics(artist, title)
console.log(lyrics);
I should also mention that the second block of code there is enclosed within an eventEmitter.on event with an asynchronous callback function.
Whenever the code runs, I get undefined in the console.
async and await can only be used to treat asynchronous functions that returns Promises, not callbacks. You should be able to transform your call to use Promises, or use another library.
The main reason we use await is to wait for the promise to resolve before continuing the code execution:
const result = await codeThatReturnsPromise()
console.log(result)
We could transform your code to this:
// async here means it returns a promise
async function getLyrics(artistName, songName)
{
return new Promise((resolve, reject) => {
apiseeds.getLyric(apiseedskey, artistname, songName, (response) => resolve(response))
})
}
var artist = "Darius Rucker";
var title = "Wagon Wheel";
var lyrics = await getLyrics(artist, title)
console.log(lyrics);
I have a very simple Puppeteer script that uses exposeFunction() to run something inside headless Chrome.
(async function(){
var log = console.log.bind(console),
puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
const page = await browser.newPage();
var functionToInject = function(){
return window.navigator.appName;
}
await page.exposeFunction('functionToInject', functionToInject);
var data = await page.evaluate(async function(){
console.log('woo I run inside a browser')
return await functionToInject();
});
console.log(data);
await browser.close();
})()
This fails with:
ReferenceError: window is not defined
Which refers to the injected function. How can I access window inside the headless Chrome?
I know I can do evaluate() instead, but this doesn't work with a function I pass dynamically:
(async function(){
var log = console.log.bind(console),
puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
const page = await browser.newPage();
var data = await page.evaluate(async function(){
console.log('woo I run inside a browser')
return window.navigator.appName;
});
console.log(data);
await browser.close();
})()
evaluate the function
You can pass the dynamic script using evaluate.
(async function(){
var puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
const page = await browser.newPage();
var functionToInject = function(){
return window.navigator.appName;
}
var data = await page.evaluate(functionToInject); // <-- Just pass the function
console.log(data); // outputs: Netscape
await browser.close();
})()
addScriptTag and readFileSync
You can save the function to a seperate file and use the function using addScriptTag.
await page.addScriptTag({path: 'my-script.js'});
or evaluate with readFileSync.
await page.evaluate(fs.readFileSync(filePath, 'utf8'));
or, pass a parameterized funciton as a string to page.evaluate.
await page.evaluate(new Function('foo', 'console.log(foo);'), {foo: 'bar'});
Make a new function dynamically
How about making it into a runnable function :D ?
function runnable(fn) {
return new Function("arguments", `return ${fn.toString()}(arguments)`);
}
The above will create a new function with provided arguments. We can pass any function we want.
Such as the following function with window, along with arguments,
function functionToInject() {
return window.location.href;
};
works flawlessly with promises too,
function functionToInject() {
return new Promise((resolve, reject) => {
setTimeout(() => {
resolve(window.location.href);
}, 5000);
});
}
and with arguments,
async function functionToInject(someargs) {
return someargs; // {bar: 'foo'}
};
Call the desired function with evaluate,
var data = await page.evaluate(runnable(functionToInject), {bar: "foo"});
console.log(data); // shows the location
exposeFunction() isn't the right tool for this job.
From the Puppeteer docs
page.exposeFunction(name, puppeteerFunction)
puppeteerFunction Callback function which will be called in Puppeteer's context.
'In puppeteer's context' is a little vague, but check out the docs for evaluate():
page.evaluateHandle(pageFunction, ...args)
pageFunction Function to be evaluated in the page context
exposeFunction() doesn't expose a function to run inside the page, but exposes a function to be be run in node to be called from the page.
I have to use evaluate():
You problem could be related to the fact that page.exposeFunction() will make your function return a Promise (requiring the use of async and await). This happens because your function will not be running inside your browser, but inside your nodejs application and its results are being send back and forth into/to the browser code. This is why you function passed to page.exposeFunction() is now returning a promise instead of the actual result. And it explains why the window function is not defined, because your function is running inside nodejs (not your browser) and inside nodejs there is no window definition available.
Related questions:
exposeFunction() does not work after goto()
exposed function queryseldtcor not working in puppeteer
How to use evaluateOnNewDocument and exposeFunction?
exposeFunction remains in memory?
Puppeteer: pass variable in .evaluate()
Puppeteer evaluate function
allow to pass a parameterized funciton as a string to page.evaluate
Functions bound with page.exposeFunction() produce unhandled promise rejections
How to pass a function in Puppeteers .evaluate() method?
How can I dynamically inject functions to evaluate using Puppeteer?