How to save a canvas as an image using puppeteer? - node.js

I'm trying to load a page with a canvas and then save it as an image.
For example, this page. On Chrome, I can right click the canvas with a circle on the upper right side of the page and click save image. I want to do this exact same thing but through NodeJS and Puppeteer. Is this possible?
So far I'm trying to select it via
const express = require('express')
const router = express.Router()
const puppeteer = require('puppeteer')
const { Cluster } = require('puppeteer-cluster')
(async () => {
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: 2,
})
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
await cluster.task(async({ page, data: url }) => {
// let starmapId = 'celestial-canvas'
await page.goto(url)
const canvas = await page.evaluate(() => document.querySelector('#myCanvas'))
return canvas // .toDataURL()
})
router.get('/export/canvas', function(req, res) {
// Get URL
var url = 'https://www.w3schools.com/html/tryit.asp?filename=tryhtml5_canvas_tut_path2'
cluster.execute(url).then( canvas => {
console.log(canvas)
res.send(canvas)
})
})
})();
module.exports = router
But canvas is returning null.

In your example, the canvas is inside an iframe. So you need to get the frame first, then you will able to transfer the string with the data URL:
import puppeteer from 'puppeteer';
const browser = await puppeteer.launch(/* { headless: false, defaultViewport: null } */);
try {
const [page] = await browser.pages();
await page.goto('https://www.w3schools.com/html/tryit.asp?filename=tryhtml5_canvas_tut_path2');
const frame = await (await page.$('#iframeResult')).contentFrame();
const data = await frame.evaluate(() => {
return document.querySelector('#myCanvas').toDataURL();
});
console.log(data); // data:image/png;base64,iVBORw0K...
} catch(err) { console.error(err); } finally { await browser.close(); }

Related

How to get the quantity of children inside the element using xpath query on the puppeter?

I need to get the number of children inside an element with the page.$x(xpath) function in Puppeteer, mad I was not successful. In the browser console I can do it using $x().children.length. What would be the best way to do this in the puppeteer with node?
You can use either element/JS handle API or pure Web API in page.evaluate(). These are both wways:
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://example.org/');
const [element] = await page.$x('//body/div');
const children = await element.getProperty('children');
const length1 = await (await children.getProperty('length')).jsonValue();
console.log(length1); // 3
const length2 = await page.evaluate(() => {
return document.evaluate(
'//body/div', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE
).singleNodeValue.children.length;
});
console.log(length2); // 3
await browser.close();
} catch (err) {
console.error(err);
}
})();

Scraper with Puppeteer login returns just one element of the array

This code is supposed to loop through the urls that get scraped from the scrapeProductPage function. But before looping, it needs to log in so that it can obtain the prices. The prices are only displayed to logged in users. Instead of looping through the urls it just returns the scraped data from one page. The error I get is "MaxListenersExceededWarning: Possible EventEmitter memory leak detected".
const request = require("request-promise");
const cheerio = require("cheerio");
const ObjectsToCsv = require("objects-to-csv");
const puppeteer = require('puppeteer');
const url = "https://www.example.com";
const scrapeResults = [];
async function scrapeProductPage() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
$("td.productListing-data > a[style='position:relative;float:left;']").each((index, element) => {
let url = $(element).attr("href");
url = "https\://www.example.com/" + url;
const scrapeResult = { url };
scrapeResults.push(scrapeResult);
});
return scrapeResults;
} catch (err) {
console.error(err);
}
}
async function scrapeDescription(productsWithImages) {
process.setMaxListeners(0);
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.example.com/login');
await page.waitFor(500);
await page.waitFor('input[name="email_address"]');
await page.type('input[name="email_address"]', 'example#gmail.com');
await page.type('input[name="password"]', '123test');
await page.click('#btnLogin');
return await Promise.all(
productsWithImages.map(async job => {
try {
await page.goto(job.url, { waitUntil: "load" });
const content = await page.content();
const $ = await cheerio.load(content);
job.main_img = $('img#main_img').attr('src');
job.name = $('h2').text();
job.price = $("td.products_info_price").text();
return job;
} catch (error) {
console.error(error);
}
})
);
}
async function saveDataToCsv(data) {
const csv = new ObjectsToCsv(data);
console.log(csv);
}
async function scrapeWona() {
const productsWithImages = await scrapeProductPage();
const wonaFullData = await scrapeDescription(productsWithImages);
await saveDataToCsv(productsWithImages);
}
scrapeWona();
The reason you're getting the warning is because of process.setMaxListeners(0)
Indicates you have a memory leak somewhere in the code.
You can take a look at the documentation here also: https://nodejs.org/docs/latest/api/events.html#events_emitter_setmaxlisteners_n
Take a look at the answer from here: node.js - request - How to "emitter.setMaxListeners()"?

Nodejs console.log within an async function

Can someone help me understand why doesn't the product data get printed out? I'm currently using puppeteer to scrape a website for product data.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
//link to page that i want to scrape
await page.goto(
"link link",
{ waitUntil: "networkidle2" }
);
var data = await page
.evaluate(() => {
var productData = {};
productData["brand"] = document.querySelector(
"a.designer-name > span"
).textContent;
console.log("mimo");
return productData;
})
.catch(err => {
console.log(err);
});
console.log(data);
await browser.close();
})();
you are using promise and callback together. If you instead return a promise from the page.evaluate, it should work.
thanks to #tehhowch.
var data = await page
.evaluate(async () => {
return await new Promise(resolve => { // <-- return the data to node.js from browser
var productData = {};
productData["brand"] = document.querySelector(
"a.designer-name > span"
).textContent;
console.log("mimo");
resolve(productData);
});
})
.catch(err => {
console.log(err);
});
console.log(data);

Puppeteer wrong result with evaluate() & exposeFunction()

I ran the following and it appears to gather a large number of links, however on actual inspection of the site with collectLinks1 I get all valid links, but with collectLinks2 I got 59 iterations of http://pieroxy.net/blog/2014/11/18/[
I'm new to Puppeteer and I can't find out why with collectLinks2 I don't get the links.
const { parse, resolve } = require('url');
const trim = require('lodash/trim');
const startsWith = require('lodash/startsWith');
const includes = require('lodash/includes');
// https://github.com/GoogleChrome/puppeteer
const puppeteer = require('puppeteer');
// https://github.com/gwuhaolin/chrome-finder
const findChrome = require('chrome-finder');
function resolveUrl(url, baseUrl) {
url = trim(url);
if (!url) return null;
if (startsWith(url, '#')) return null;
const { protocol } = parse(url);
if (includes(['http:', 'https:'], protocol)) {
return url.split('#')[0];
} if (!protocol) {
return resolve(baseUrl, url).split('#')[0];
}
return null;
}
async function collectLinks1(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
const assetUrls = await htmlPage.$$eval('a[href]', assetLinks => assetLinks.map(link => link.href));
assetUrls.forEach(link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
return links;
}
async function collectLinks2(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
await htmlPage.exposeFunction('pushToLinks', link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
await htmlPage.evaluate(() => {
function findLinks(document) {
document.querySelectorAll('a[href]')
.forEach(link => {
window.pushToLinks(link.href);
});
}
findLinks(window.document);
});
return links;
}
const crawl = async url => {
try {
console.log(`Crawling ${url}`);
const browser = await puppeteer.launch({
headless: false,
executablePath: findChrome(),
});
const page = await browser.newPage();
await page.goto(url);
// OK
const links1 = await collectLinks1(page);
links1.forEach(link => { console.log(link); });
// KO
const links2 = await collectLinks2(page);
links2.forEach(link => { console.log(link); });
await browser.close();
} catch (err) {
console.log(err);
}
};
crawl('http://pieroxy.net/blog/2014/11/18/user_agent_detection_in_java.html');
You need to await the function defined via page.exposeFunction as it returns a Promise. As you are only calling the function but not awaiting its result, your page.evaluate call will resolve before your script finished executing.
Solution
Instead of the forEach, you should use a loop to iterate over all the items and communicate them to the page one after another.
async function collectLinks2(htmlPage) {
// ...
await htmlPage.evaluate(async () => {
async function findLinks(document) {
for (const link of document.querySelectorAll('a[href]')) {
await window.pushToLinks(link.href);
}
}
await findLinks(window.document);
});
return links;
}

puppeteer : cant log in and loop through urls

Hi Guys I want to log in a website and once authenticated want to loop through a given set of URLS and scrape data. What I intend to do can be described by this example,however I get Unhandled promise rejection.
const puppeteer = require("puppeteer");
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
console.log(title);
}
catch(e) {
console.log(e)
}
return title
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
getTitle(page, url)
}
await browser.close();
})();
There are multiple issues in this example.
You should await the call to function getTitle, you re awaiting inside the function but you have to await the call to the function too.
You should surround getTitle with a try and catch block and check inside the function if theres a title to return (ex. the title for google is null)
const puppeteer = require("puppeteer");
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
if(title){
return title
}
}
catch(e) {
throw(e)
console.log(e)
}
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
try{
console.log(await getTitle(page, url))
}
catch(e ){
console.log('No title')
}
}
await browser.close();
})();

Resources