Simple web scraping with puppeteer / cheerio not working with params - node.js

I am trying to scrape https://www.premierleague.com/clubs/38/Wolverhampton-Wanderers/stats?se=274
The results being returned are for the page minus the ?se=274
This is applied by using the filter dropdown on the page and selecting 2019/20 season. I can navigate directly to the page and it works fine, but through code it does not work.
I have tried in cheerio and puppeteer. I was going to try nightmare too but this seems overkill I think. I am clearly not an expert! ;)
function getStats(callback){
var url = "https://www.premierleague.com/clubs/38/Wolverhampton-Wanderers/stats?se=274";
request(url, function (error, response, html) {
//console.log(html);
var $ = cheerio.load(html);
if(!error){
$('.allStatContainer.statontarget_scoring_att').filter(function(){
var data = $(this);
var vSOT = data.text();
//console.log(data);
console.log(vSOT);
});
}
});
callback;
}
This will return 564 instead of 2

It seems like you're calling callback before request returns. Move the callback call into the internal block, where the task you need is completed (in your case, it looks like the filter block).
It also looks like you're missing the () on the callback call.
Also, a recommendation: return the value you need through the callback.

So this code works....$10 from a rent-a-coder did the trick. Easy when you know how!
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto('https://www.premierleague.com/clubs/4/Chelsea/stats?se=274')
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))
await sleep(4000)
const element = await page.$(".allStatContainer.statontarget_scoring_att");
const text = await page.evaluate(element => element.textContent, element);
console.log("Shots on Target:"+text)
browser.close()
})()

Related

Puppeteer - ExpressJS infinite loop a function

I have a problem with my Express JS app : When I'm trying to call a function, this function is endlessly called ... It opens a lot of chromium browser and cause performance issues ...
I just want to call this function one time.
I've found a solution to make it work (And called just one time), but in this situation I can't pass any parameters ...
const farm = (async () => {
const browser = await puppeteer.launch({headless: true});
const page = await browser.newPage();
await page.goto("https://www.example.com/?s=" + term);
await page.waitForSelector("div");
const postLinks = await page.evaluate(() => {
let postLinks = [];
let elements = document.querySelectorAll('div.article');
for (element of elements) {
postLinks.push({
title: element.querySelector('div.meta-info > h3 > a')?.textContent,
url: element.querySelector('div.meta-info > h3 > a')?.href
})
}
return postLinks;
});
console.log(postLinks);
await browser.close();
})();
app.get('/', (req, res) => {
var term = "Drake";
res.send(farm);
});
With the code below, I can pass parameters but I can't return the result in "res.send", and the function is called endlessly :
const farm = async (term) => {
const browser = await puppeteer.launch({headless: true});
const page = await browser.newPage();
await page.goto("https://www.example.com/?s=" + term);
await page.waitForSelector("div");
const postLinks = await page.evaluate(() => {
let postLinks = [];
let elements = document.querySelectorAll('div.article');
for (element of elements) {
postLinks.push({
title: element.querySelector('div.meta-info > h3 > a')?.textContent,
url: element.querySelector('div.meta-info > h3 > a')?.href
})
}
return postLinks;
});
console.log(postLinks);
await browser.close();
}
app.get('/', (req, res) => {
var term = "Drake";
var results = farm(term);
res.send(results);
});
Did I miss something ?
Thanks !
It's not an infinite loop, but unresolved promise. The farm returns a promise, which you're not waiting for, but instead send the pending promise before it resolves, i.e. before the puppeteer is done.
You need to wait for farm's promise to resolve, make middleware function async and add await to the farm call:
app.get('/', async(req, res) => {
var term = "Drake";
// farm returns a promise, so you need to wait for it to resolve, i.e. block execution
// otherwise it just sends pending promise, because node.js runs in non-blocking fashion
var results = await farm(term);
res.send(results);
});

How do you call an external API from Selenium script to populate a field using Node JS?

I have a use case where I need to call an external API, parse the JSON that is returned and populate a form field in a web page all within a Selenium script written using Node JS.
Something like this:
// in Selenium script get the form field
let inputElement = await getElementById(driver, "my-id");
// then call an API including callback function
// in the callback function with the JSON response from the API
const myText = response.data.text;
await inputElement.sendKeys(myText,Key.ENTER);
I actually not even sure where to start with this - because I would be adding asynchronous code (the API call and waiting for the response in the callback) to the existing asynchronous code that is running as part of the Selenium script. And I need to not lose references to the web driver and the input element.
Some advice and recommendations to get me going would be very helpful.
If you are using node's inbuild https module the you can do something like this..
const { Builder, By, Key, until } = require("selenium-webdriver");
const https = require("https");
(async function example() {
let driver = await new Builder().forBrowser("chrome").build();
try {
await driver.get("http://www.google.com/ncr");
await https.get("https://jsonplaceholder.typicode.com/users/1", (resp) => {
let data = "";
resp.on("data", (chunk) => {
data += chunk;
});
resp.on("end", async () => {
// console.log(JSON.parse(data)["name"]);
await driver
.findElement(By.name("q"))
.sendKeys(JSON.parse(data)["name"], Key.RETURN);
});
});
await driver.wait(until.titleContains("- Google Search"), 1000);
} finally {
await driver.quit();
}
})();
Or if you are already using library like axios, then you can do something like this
const { Builder, By, Key, until } = require("selenium-webdriver");
const axios = require("axios");
(async function example() {
let driver = await new Builder().forBrowser("chrome").build();
try {
await driver.get("http://www.google.com/ncr");
const { data } = await axios.get(
"https://jsonplaceholder.typicode.com/users/1"
);
await driver.findElement(By.name("q")).sendKeys(data["name"], Key.RETURN);
await driver.wait(until.titleContains("- Google Search"), 1000);
} finally {
await driver.quit();
}
})();
Hope this is what you are looking for..

Exposed function querySelector not working in Puppeteer

document.querySelectorAll('.summary').innerText;
This throws an error in the below snippet saying "document.querySelector is not a function" in my Puppeteer page's exposed fucntion docTest.
I want to pass a specific node to each method and get the result inside evaluate.
Same with document.getElemenetbyId.
const puppeteer = require('puppeteer');
//var querySelectorAll = require('query-selector');
let docTest = (document) => {
var summary = document.querySelectorAll(.summary).innerText;
console.log(summary);
return summary;
}
let scrape = async () => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('http://localhost.com:80/static.html');
await page.waitFor(5000)
await page.exposeFunction('docTest', docTest);
var result = await page.evaluate(() => {
var resultworking = document.querySelector("tr");
console.log(resultworking);
var summary = docTest(document);
console.log(resultworking);
return summary;
});
console.log(result);
await page.waitFor(7000);
browser.close();
return {
result
}
};
scrape().then((value) => {
console.log(value); // Success!
});
I just had the same question. The problem is that the page.evaluate() function callback has to be an async function and your function docTest() will return a Promise when called inside the page.evaluate(). To fix it, just add the async and await keywords to your code:
await page.exposeFunction('docTest', docTest);
var result = await page.evaluate(async () => {
var summary = await docTest(document);
console.log(summary);
return summary;
});
Just remember that page.exposeFunction() will make your function return a Promise, then, you need to use async and await. This happens because your function will not be running inside your browser, but inside your nodejs application.
exposeFunction() does not work after goto()
Why can't I access 'window' in an exposeFunction() function with Puppeteer?
How to use evaluateOnNewDocument and exposeFunction?
exposeFunction remains in memory?
Puppeteer: pass variable in .evaluate()
Puppeteer evaluate function
allow to pass a parameterized funciton as a string to page.evaluate
Functions bound with page.exposeFunction() produce unhandled promise rejections
How to pass a function in Puppeteers .evaluate() method?
How can I dynamically inject functions to evaluate using Puppeteer?

Why can't I access 'window' in an exposeFunction() function with Puppeteer?

I have a very simple Puppeteer script that uses exposeFunction() to run something inside headless Chrome.
(async function(){
var log = console.log.bind(console),
puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
const page = await browser.newPage();
var functionToInject = function(){
return window.navigator.appName;
}
await page.exposeFunction('functionToInject', functionToInject);
var data = await page.evaluate(async function(){
console.log('woo I run inside a browser')
return await functionToInject();
});
console.log(data);
await browser.close();
})()
This fails with:
ReferenceError: window is not defined
Which refers to the injected function. How can I access window inside the headless Chrome?
I know I can do evaluate() instead, but this doesn't work with a function I pass dynamically:
(async function(){
var log = console.log.bind(console),
puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
const page = await browser.newPage();
var data = await page.evaluate(async function(){
console.log('woo I run inside a browser')
return window.navigator.appName;
});
console.log(data);
await browser.close();
})()
evaluate the function
You can pass the dynamic script using evaluate.
(async function(){
var puppeteer = require('puppeteer');
const browser = await puppeteer.launch();
const page = await browser.newPage();
var functionToInject = function(){
return window.navigator.appName;
}
var data = await page.evaluate(functionToInject); // <-- Just pass the function
console.log(data); // outputs: Netscape
await browser.close();
})()
addScriptTag and readFileSync
You can save the function to a seperate file and use the function using addScriptTag.
await page.addScriptTag({path: 'my-script.js'});
or evaluate with readFileSync.
await page.evaluate(fs.readFileSync(filePath, 'utf8'));
or, pass a parameterized funciton as a string to page.evaluate.
await page.evaluate(new Function('foo', 'console.log(foo);'), {foo: 'bar'});
Make a new function dynamically
How about making it into a runnable function :D ?
function runnable(fn) {
return new Function("arguments", `return ${fn.toString()}(arguments)`);
}
The above will create a new function with provided arguments. We can pass any function we want.
Such as the following function with window, along with arguments,
function functionToInject() {
return window.location.href;
};
works flawlessly with promises too,
function functionToInject() {
return new Promise((resolve, reject) => {
setTimeout(() => {
resolve(window.location.href);
}, 5000);
});
}
and with arguments,
async function functionToInject(someargs) {
return someargs; // {bar: 'foo'}
};
Call the desired function with evaluate,
var data = await page.evaluate(runnable(functionToInject), {bar: "foo"});
console.log(data); // shows the location
exposeFunction() isn't the right tool for this job.
From the Puppeteer docs
page.exposeFunction(name, puppeteerFunction)
puppeteerFunction Callback function which will be called in Puppeteer's context.
'In puppeteer's context' is a little vague, but check out the docs for evaluate():
page.evaluateHandle(pageFunction, ...args)
pageFunction Function to be evaluated in the page context
exposeFunction() doesn't expose a function to run inside the page, but exposes a function to be be run in node to be called from the page.
I have to use evaluate():
You problem could be related to the fact that page.exposeFunction() will make your function return a Promise (requiring the use of async and await). This happens because your function will not be running inside your browser, but inside your nodejs application and its results are being send back and forth into/to the browser code. This is why you function passed to page.exposeFunction() is now returning a promise instead of the actual result. And it explains why the window function is not defined, because your function is running inside nodejs (not your browser) and inside nodejs there is no window definition available.
Related questions:
exposeFunction() does not work after goto()
exposed function queryseldtcor not working in puppeteer
How to use evaluateOnNewDocument and exposeFunction?
exposeFunction remains in memory?
Puppeteer: pass variable in .evaluate()
Puppeteer evaluate function
allow to pass a parameterized funciton as a string to page.evaluate
Functions bound with page.exposeFunction() produce unhandled promise rejections
How to pass a function in Puppeteers .evaluate() method?
How can I dynamically inject functions to evaluate using Puppeteer?

How to pass dynamic page automation commands to puppeteer from external file?

I'm trying to pass dynamic page automation commands to puppeteer from an external file. I'm new to puppeteer and node so I apologize in advance.
// app.js
// ========
app.get('/test', (req, res) =>
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('http://testurl.com');
var events = require('./events.json');
for(var i=0;i<events.length;i++){
var tmp = events[i];
await page.evaluate((tmp) => { return Promise.resolve(tmp.event); }, tmp);
}
await browser.close();
})());
My events json file looks like:
// events.json
// ========
[
{
"event":"page.waitFor(4000)"
},
{
"event":"page.click('#aLogin')"
},
{
"event":"page.waitFor(1000)"
}
]
I've tried several variations of the above as well as importing a module that passes the page object to one of the module function, but nothing has worked. Can anyone tell me if this is possible and, if so, how to better achieve this?
The solution is actually very simple and straightforward. You just have to understand how this works.
First of all, you cannot pass page elements like that to evaluate. Instead you can do the following,
On a seperate file,
module.exports = async function getCommands(page) {
return Promise.all([
await page.waitFor(4000),
await page.click("#aLogin"),
await page.waitFor(1000)
]);
};
Now on your main file,
await require('./events.js').getCommands(page);
There, it's done! It'll execute all commands for you one by one just as you wanted.
Here is a complete code with some adjustments,
const puppeteer = require("puppeteer");
async function getCommands(page) {
return Promise.all([
await page.title(),
await page.waitFor(1000)
]);
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://example.com");
let data = await getCommands(page);
console.log(data);
await page.close();
await browser.close();
})();

Resources