Print all PDF pages except the last one using Puppeteer - Node.js

I need to generate a different footer for the last PDF page. After some investigation I realised that the best way is to generate two different PDFs and combine them. This works fine when I need to change the footer or use a different template for the first page, or in cases where I know which pages should look different (using the pageRanges option), but I can't find a way to get only the last (or last n) pages when the total page count is unknown. Any ideas how I can generate a PDF for only the last (or last n) pages?
I'd appreciate any answers.
I'm using Puppeteer v2.1.0 with Node.js v8.16.0.
This is the script I'm currently using to generate PDF files:
const puppeteer = require('puppeteer');
const fs = require('fs');

const DEFAULT_HEADER = '<span></span>';
const DEFAULT_FOOTER_HEIGHT = 90;
const DEFAULT_PAGE_PADDING = 36;

const createPdf = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const [, , bodyFilePath, outputFilePath, footerFilePath] = process.argv;
    await page.goto(`file:${bodyFilePath}`, { waitUntil: 'networkidle0' });
    let footerTemplate = DEFAULT_HEADER;
    if (footerFilePath) {
      footerTemplate = fs.readFileSync(footerFilePath, 'utf8');
    }
    await page.pdf({
      path: outputFilePath,
      format: 'A4',
      margin: {
        top: DEFAULT_PAGE_PADDING,
        right: DEFAULT_PAGE_PADDING,
        bottom: DEFAULT_FOOTER_HEIGHT,
        left: DEFAULT_PAGE_PADDING,
      },
      printBackground: true,
      displayHeaderFooter: true,
      headerTemplate: DEFAULT_HEADER,
      footerTemplate,
    });
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) {
      await browser.close();
    }
    process.exit();
  }
};

createPdf();
The templates I'm converting to PDF are .html.erb files.

Maybe there are better ways to solve this problem, but for now I've used this approach: I generate the export using the same script as above, and then I run one more script which opens the previous PDF file, counts its pages and generates two new files (which I combine into one file on the backend; see the merge sketch below): all pages except the last one, and the last page alone with a different footer.
const puppeteer = require('puppeteer');
const fs = require('fs');
const pdf = require('pdf-parse');

const DEFAULT_HEADER = '<span></span>';
const DEFAULT_FOOTER_HEIGHT = 90;
const DEFAULT_PAGE_PADDING = 36;

const createPdf = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const [
      ,
      ,
      bodyFilePath,
      outputFilePath,
      footerFilePath,
      lastPagePath,
      lastPageFooterPath,
    ] = process.argv;
    await page.goto(`file:${bodyFilePath}`, { waitUntil: 'networkidle0' });
    let footerTemplate = DEFAULT_HEADER;
    let lastPageFooterTemplate = DEFAULT_HEADER;
    if (footerFilePath) {
      footerTemplate = fs.readFileSync(footerFilePath, 'utf8');
    }
    if (lastPageFooterPath) {
      lastPageFooterTemplate = fs.readFileSync(lastPageFooterPath, 'utf8');
    }
    // Count the pages of the previously generated PDF.
    const dataBuffer = fs.readFileSync(outputFilePath);
    const pdfInfo = await pdf(dataBuffer);
    const numPages = pdfInfo.numpages;
    const baseOptions = {
      path: outputFilePath,
      format: 'A4',
      margin: {
        top: DEFAULT_PAGE_PADDING,
        right: DEFAULT_PAGE_PADDING,
        bottom: DEFAULT_FOOTER_HEIGHT,
        left: DEFAULT_PAGE_PADDING,
      },
      printBackground: true,
      displayHeaderFooter: true,
      headerTemplate: DEFAULT_HEADER,
      pageRanges: `${numPages}`,
      footerTemplate: lastPageFooterTemplate,
    };
    if (numPages === 1) {
      // Single page: only the last-page footer is needed.
      await page.pdf(baseOptions);
    } else {
      // All pages except the last one, with the regular footer.
      await page.pdf({
        ...baseOptions,
        footerTemplate,
        pageRanges: `-${numPages - 1}`,
      });
      // Only the last page, with its own footer.
      await page.pdf({
        ...baseOptions,
        path: lastPagePath,
        footerTemplate: lastPageFooterTemplate,
      });
    }
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) {
      await browser.close();
    }
    process.exit();
  }
};

createPdf();
Hope this is helpful for someone with the same issue.
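The backend merge step isn't shown above; as a rough sketch of one way to do it in Node (my own suggestion, using the pdf-lib package and hypothetical file paths, not part of the original setup):

const fs = require('fs');
const { PDFDocument } = require('pdf-lib');

const mergePdfs = async (bodyPath, lastPagePath, mergedPath) => {
  // Load the two partial PDFs produced by the script above.
  const bodyDoc = await PDFDocument.load(fs.readFileSync(bodyPath));
  const lastPageDoc = await PDFDocument.load(fs.readFileSync(lastPagePath));

  // Copy every page of the "all pages except the last" file,
  // then append the single last page that has its own footer.
  const merged = await PDFDocument.create();
  const bodyPages = await merged.copyPages(bodyDoc, bodyDoc.getPageIndices());
  bodyPages.forEach((p) => merged.addPage(p));
  const [lastPage] = await merged.copyPages(lastPageDoc, [0]);
  merged.addPage(lastPage);

  fs.writeFileSync(mergedPath, await merged.save());
};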

Related

How to scrape image src the right way using puppeteer?

I'm trying to create a function that can capture the src attribute of images on a website, but none of the most common ways of doing so are working.
This was my original attempt:
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    await page.setDefaultNavigationTimeout(0);
    await page.waitForTimeout(500);
    await page.goto(
      `https://www.sirved.com/restaurant/essex-ontario-canada/dairy-freez/1/menus/3413654`,
      {
        waitUntil: "domcontentloaded",
      }
    );
    const fetchImgSrc = await page.evaluate(() => {
      const img = document.querySelectorAll(
        "#menus > div.tab-content > div > div > div.swiper-wrapper > div.swiper-slide > img"
      );
      let src = [];
      for (let i = 0; i < img.length; i++) {
        src.push(img[i].getAttribute("src"));
      }
      return src;
    });
    console.log(fetchImgSrc);
  } catch (err) {
    console.log(err);
  }
  await browser.close();
})();
This logged an empty array: [].
In my next attempt I tried a suggestion and got back an empty string:
await page.setViewport({ width: 1024, height: 768 });
const imgs = await page.$$eval("#menus img", (images) =>
  images.map((i) => i.src)
);
console.log(imgs);
In my final attempt I followed another suggestion and got back an array with two empty strings in it:
const fetchImgSrc = await page.evaluate(() => {
  const img = document.querySelectorAll(".swiper-lazy-loaded");
  let src = [];
  for (let i = 0; i < img.length; i++) {
    src.push(img[i].getAttribute("src"));
  }
  return src;
});
console.log(fetchImgSrc);
In each attempt I only replaced the function and console.log portion of the code. I've done a lot of digging and found that these are the most common ways of scraping an image src with Puppeteer, and I've used them successfully elsewhere, but for some reason they aren't working for me right now. I'm not sure whether I have a bug in my code or why it won't work.
To return the src links for the two menu images on this page you can use:
const fetchImgSrc = await page.evaluate(() => {
  const img = document.querySelectorAll('.swiper-lazy-loaded');
  let src = [];
  for (let i = 0; i < img.length; i++) {
    src.push(img[i].getAttribute("src"));
  }
  return src;
});
This gives us the expected output:
['https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3b9eabc40.jpg', 'https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3bbe93cc6.jpg']
You have two issues here:
Puppeteer opens the page in a smaller viewport by default, and the images to be scraped are lazy loaded: while they are outside the viewport they won't be loaded (they won't even have src attributes). You need to set your Puppeteer browser to a bigger size with page.setViewport.
Element.getAttribute is not advised if you are working with dynamically changing websites: it always returns the original attribute value, which is an empty string for a lazy-loaded image. What you need is the src property, which is always up to date in the DOM. This is the attribute vs. property distinction in JavaScript.
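To make the attribute vs. property difference concrete, here is a small comparison (a sketch of my own, using the same #menus img selector as the shortened script below):

// Attribute value as written in the HTML source; for a lazy-loaded image this can still be empty.
const attrValues = await page.$$eval('#menus img', imgs =>
  imgs.map(img => img.getAttribute('src')));

// Live DOM property; once the image has actually been loaded this holds the absolute URL.
const propValues = await page.$$eval('#menus img', imgs =>
  imgs.map(img => img.src));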
By the way: you can shorten your script with page.$$eval like this:
await page.setViewport({ width: 1024, height: 768 })
const imgs = await page.$$eval('#menus img', images => images.map(i => i.src))
console.log(imgs)
Output:
[
'https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3b9eabc40.jpg',
'https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3bbe93cc6.jpg'
]

Puppeteer Cluster Inconsistent Results

I've recently converted a script over from Puppeteer to Puppeteer Cluster and during testing I've observed some odd results when testing multiple pages concurrently.
Effectively I'm loading a single page and then iterating over the product options on the page and gathering the price for any product variants.
One particular product has around 9 product variants; sometimes I accurately capture all 9 variants, whereas on the next testing cycle it may only return 2 or 3.
Any help would be greatly appreciated!
const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster');
const Product = require('../utils/product')
const config = require('../config/config.json')
const selectors = config.productData;

(async () => {
  const urls = [
    {link: ...},
    {link: ...},
    {link: ...}
  ]
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 5,
    puppeteerOptions: {
      headless: false
    },
  });
  await cluster.task(async ({ page, data: url }) => {
    // instantiate a new product object
    const product = new Product();
    await page.goto(url, { waitUntil: 'load' });
    const skuprice = await page.$eval(selectors.price, element => element.innerText);
    console.log('Sku Price:' + skuprice)
    // deal with variants
    const options = await page.$$eval(selectors.variant, elements => elements.map(element => element.id))
    if (options.length > 0) {
      // set up a variants array
      for (let index = 0; index < options.length; index++) {
        const element = options[index];
        await page.waitForSelector(`#${element}`);
        await page.$eval(`#${element}`, radio => radio.click());
        await page.waitForTimeout(500);
        const variantprice = await page.$eval(selectors.price, element => element.innerText);
        console.log('Variant Price:' + variantprice)
      }
    }
  });
  urls.forEach(url => {
    cluster.queue(url.link);
  })
  // many more pages
  await cluster.idle();
  await cluster.close();
})();
A dynamic JavaScript page should only be scraped once all of its elements are visible.
You can use the following tricks:
[1] Wait until the selector is visible, with await page.waitForSelector(selector, {visible: true, timeout: 0}).
[2] Wait for a fixed amount of time, but this is flakier and more error-prone.
You can simplify and rewrite your code like this:
await page.waitForSelector(`#${element}`, {visible: true, timeout: 0})
await page.click(`#${element}`)
/* await page.waitForTimeout(500) <= prone to error, use line below */
await page.waitForSelector(selectors.price, {visible: true, timeout: 0})
const variantprice = await page.$eval(selectors.price, element => element.innerText)
For anyone else searching for an answer: it looked like some of my CSS selectors weren't working after a page refresh.
Re-reading the project documentation, it's worth including the following:
// Event handler to be called in case of problems
cluster.on('taskerror', (err, data) => {
  console.log(`Error crawling ${data}: ${err.message}`);
});

How to download pdf file that opens in new tab with puppeteer?

I am trying to download an invoice from a website using Puppeteer; I just started learning it. I am using Node to create and execute the code. I have managed to log in and navigate to the invoice page, but it opens in a new tab, so my code does not detect it since it's not the active tab. This is the code I used:
const puppeteer = require('puppeteer')

const SECRET_EMAIL = 'emailid'
const SECRET_PASSWORD = 'password'

const main = async () => {
  const browser = await puppeteer.launch({
    headless: false,
  })
  const page = await browser.newPage()
  await page.goto('https://my.apify.com/sign-in', { waitUntil: 'networkidle2' })
  await page.waitForSelector('div.sign_shared__SignForm-sc-1jf30gt-2.kFKpB')
  await page.type('input#email', SECRET_EMAIL)
  await page.type('input#password', SECRET_PASSWORD)
  await page.click('input[type="submit"]')
  await page.waitForSelector('#logged-user')
  await page.goto('https://my.apify.com/billing#/invoices', { waitUntil: 'networkidle2' })
  await page.waitForSelector('#reactive-table-1')
  await page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a')
  const newPagePromise = new Promise(x => browser.once('targetcreated', target => x(target.page())))
  const page2 = await newPagePromise
  await page2.bringToFront()
  await page2.screenshot({ path: 'apify1.png' })
  //await browser.close()
}

main()
In the above code I am just trying to take a screenshot. Can anyone help me?
Here is an example of a work-around for the Chromium issue mentioned in the comments above; adapt it to your specific needs and use case. Basically, you need to capture the new page (target) and then do whatever is required to download the file, possibly passing it to Node as a buffer as in the example below, if no other means work for you (such as a direct request to the download location via fetch, or ideally some request library on the back-end).
// ATT_page below is the page that triggers the download in the answerer's own code; substitute your page variable.
const [PDF_page] = await Promise.all([
  browser
    .waitForTarget(target => target.url().includes('my.apify.com/account/invoices/'))
    .then(target => target.page()),
  ATT_page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a'),
]);

const asyncRes = PDF_page.waitForResponse(response =>
  response
    .request()
    .url()
    .includes('my.apify.com/account/invoices'));

await PDF_page.reload();
const res = await asyncRes;
const url = res.url();
const headers = res.headers();
if (!headers['content-type'].includes('application/pdf')) {
  await PDF_page.close();
  return null;
}

const options = {
  // target request options
};

const pdfAb = await PDF_page.evaluate(
  async (url, options) => {
    function bufferToBase64(buffer) {
      return btoa(
        new Uint8Array(buffer).reduce((data, byte) => {
          return data + String.fromCharCode(byte);
        }, ''),
      );
    }
    return await fetch(url, options)
      .then(response => response.arrayBuffer())
      .then(arrayBuffer => bufferToBase64(arrayBuffer));
  },
  url,
  options,
);

const pdf = Buffer.from(pdfAb, 'base64');
await PDF_page.close();
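At this point pdf is a regular Node Buffer. A small follow-up sketch, assuming you simply want the invoice written to disk under a hypothetical file name:

const fs = require('fs');
fs.writeFileSync('invoice.pdf', pdf);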

Why am I not able to navigate through iFrames using Apify/Puppeteer?

I'm trying to manipulate forms on sites that have iframes in them using Puppeteer. I've tried different ways to reach a specific iframe, or even just to count the iframes on a website, with no success.
Why isn't Puppeteer's page object recognizing the iframes / child frames of the page I'm trying to navigate through?
It's happening with other pages as well, such as https://www.veiculos.itau.com.br/simulacao
const Apify = require('apify');
const sleep = require('sleep-promise');

Apify.main(async () => {
  // Launch the web browser.
  const browser = await Apify.launchPuppeteer();

  // Create and navigate new page
  console.log('Open target page');
  const page = await browser.newPage();
  await page.goto('https://www.credlineitau.com.br/');
  await sleep(15 * 1000);

  for (const frame in page.mainFrame().childFrames()) {
    console.log('test');
  }

  await browser.close();
});
Perhaps you'll find some helpful inspiration below.
const waitForIframeContent = async (page, frameSelector, contentSelector) => {
  await page.waitForFunction((frameSelector, contentSelector) => {
    const frame = document.querySelector(frameSelector);
    const node = frame.contentDocument.querySelector(contentSelector);
    return node && node.innerText;
  }, {
    timeout: TIMEOUTS.ten,
  }, frameSelector, contentSelector);
};

const $frame = await waitForSelector(page, SELECTORS.frame.iframeNode).catch(() => null);
if ($frame) {
  const frame = page.frames().find(frame => frame.name() === 'content-iframe');
  const $cancelStatus = await waitForSelector(frame, SELECTORS.frame.membership.cancelStatus).catch(() => null);
  await waitForIframeContent(page, SELECTORS.frame.iframeNode, SELECTORS.frame.membership.cancelStatus);
}
Give it a shot.
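As an additional minimal sketch (my own, assuming the target iframe is same-origin and eventually attached to the page), you can also wait for the iframe element itself and then work with its content frame directly:

// Wait until the <iframe> element exists in the top document.
const iframeHandle = await page.waitForSelector('iframe');

// Resolve the element handle to Puppeteer's Frame object (null if detached).
const frame = await iframeHandle.contentFrame();

// For debugging: list every frame Puppeteer currently knows about.
console.log(page.frames().map(f => f.url()));

if (frame) {
  // 'input[name=cpf]' is a hypothetical selector, purely for illustration.
  await frame.waitForSelector('input[name=cpf]');
  await frame.type('input[name=cpf]', '00000000000');
}

Note that cross-origin iframes are generally only reachable this way when Chromium is launched without site isolation (e.g. with --disable-features=site-per-process), which may be relevant for the pages above.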

Controlling margins when making a PDF using Playwright

When making a PDF from a headless Chrome Playwright session (code below) I get the PDF on the left, whereas if I create it through the save-to-PDF option in Chrome I get the second output.
I have set the margins explicitly and used preferCSSPageSize: true, and both give the same (left) outcome.
How would I get Playwright to give me the same output as chrome does from the print dialog?
An example of the files being printed is here. (In real life, they're all slightly different to account for the spine width.)
const fs = require("fs");
const path = require("path");
const { chromium } = require("playwright");

const directoryPath = "outside";

(async () => {
  const filesToPrint = getFilesToPrintList(directoryPath);
  filesToPrint.forEach((f, i) => console.log(i, f));

  const browser = await chromium.launch({ headless: true });
  const context = await browser.newContext();

  for (let i = 0; i < filesToPrint.length; i++) {
    const htmlFilename = path.join(directoryPath, filesToPrint[i]);
    const pdfFilename = makePDFfilename(htmlFilename);
    console.log("[html file]", htmlFilename);
    console.log("[pdf file]", pdfFilename);

    const page = await context.newPage();
    await page.goto(
      "file:///" + path.resolve(htmlFilename),
      (waitUntil = "networkidle")
    );
    const options = {
      path: pdfFilename,
      pageRanges: "1",
      preferCSSPageSize: true,
    };
    console.log(
      `\n\nAbout to print:\n ${pdfFilename}\n`,
      `with:\n${JSON.stringify(options, null, 2)}`
    );
    await page.pdf(options);
  }

  await browser.close();
  console.log("done");
})();

function makePDFfilename(htmlFilename) {
  const parts = htmlFilename.split(/[\\\._]/);
  const pdfFilename = path.join(
    directoryPath,
    `Experimental_${parts[1]}_${parts[2]}.pdf`
  );
  return pdfFilename;
}

function getFilesToPrintList(directoryPath) {
  let theFiles = fs
    .readdirSync(directoryPath)
    .filter((f) => f.includes("fp_") && f.includes(".html"));
  return theFiles;
}
I tried to reproduce your issue:
I used a different background image URL (online) with your HTML page, see test.html gist.
Apparently, a number of your options are already the defaults; see page.pdf().
Also, I added browser.newContext() and browser.close().
Here's the minimal working NodeJS code:
generate-pdf.js
const { chromium } = require("playwright");

(async () => {
  const htmlFilename = "file:///<file-path>/test.html";
  console.log("[webpage]", htmlFilename);

  const browser = await chromium.launch();
  const context = await browser.newContext();
  const page = await context.newPage();
  await page.goto(htmlFilename);

  const pdfFilename = "./test.pdf";
  console.log("[pdf file]", pdfFilename);
  const options = {
    path: pdfFilename,
    pageRanges: "1",
    preferCSSPageSize: true
  };
  console.log(
    `\n\nAbout to print:\n ${pdfFilename}\n`,
    `with:\n${JSON.stringify(options, null, 2)}`
  );
  await page.pdf(options);

  await browser.close();
  console.log("done");
})();
Console Output:
$ node generate-pdf.js
[webpage] file:///<file-path>/test.html
[pdf file] ./test.pdf
About to print:
./test.pdf
with:
{
  "path": "./test.pdf",
  "pageRanges": "1",
  "preferCSSPageSize": true
}
done
Snapshot of PDF (test.pdf):
You might want to test this separately and see if it works for your use-case.
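If you want to control the margins from the script itself rather than through CSS, Playwright's page.pdf also accepts an explicit margin object. A minimal sketch with assumed values (not taken from the thread):

await page.pdf({
  path: pdfFilename,
  format: "A4",
  pageRanges: "1",
  printBackground: true,
  // Explicit margins; "0" hands all spacing control to the page's own CSS.
  margin: { top: "0", right: "0", bottom: "0", left: "0" },
});

Comparing such an explicit-margin run against Chrome's print-dialog output can help isolate whether the difference comes from the margins or from the page size.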
