Controlling margins when making a PDF using Playwright - node.js

When making a PDF from a headless chrome Playwright session (code below) I get the PDF on the left, whereas if I make it through the save to pdf in chrome, I get the second output.
I have set the margins explicitly and used preferCSSPageSize: true and both give the same (left) outcome.
How would I get Playwright to give me the same output as chrome does from the print dialog?
An example of the files being printed is here. (In real life, they're all slightly different to account for the spine width.)
const fs = require("fs");
const path = require("path");
const { chromium } = require("playwright");

// Directory holding the fp_*.html files to print.
const directoryPath = "outside";

(async () => {
  const filesToPrint = getFilesToPrintList(directoryPath);
  filesToPrint.forEach((f, i) => console.log(i, f));

  const browser = await chromium.launch({ headless: true });
  const context = await browser.newContext();

  for (let i = 0; i < filesToPrint.length; i++) {
    const htmlFilename = path.join(directoryPath, filesToPrint[i]);
    const pdfFilename = makePDFfilename(htmlFilename);
    console.log("[html file]", htmlFilename);
    console.log("[pdf file]", pdfFilename);

    const page = await context.newPage();
    // BUG FIX: the original `(waitUntil = "networkidle")` assigned an implicit
    // global and passed a bare string; page.goto() takes an options OBJECT as
    // its second argument, so the waitUntil setting was silently ignored.
    await page.goto("file:///" + path.resolve(htmlFilename), {
      waitUntil: "networkidle",
    });

    // preferCSSPageSize lets the page's own @page CSS control size/margins.
    const options = {
      path: pdfFilename,
      pageRanges: "1",
      preferCSSPageSize: true,
    };
    console.log(
      `\n\nAbout to print:\n ${pdfFilename}\n`,
      `with:\n${JSON.stringify(options, null, 2)}`
    );
    await page.pdf(options);
  }

  await browser.close();
  console.log("done");
})();
/**
 * Derive the output PDF path from an input HTML path, e.g.
 * "outside\fp_123_456.html" -> "outside/Experimental_123_456.pdf".
 */
function makePDFfilename(htmlFilename) {
  // Segments come from splitting on backslash, dot, or underscore;
  // indexes 1 and 2 carry the two numeric parts of the source name.
  const segments = htmlFilename.split(/[\\\._]/);
  return path.join(directoryPath, `Experimental_${segments[1]}_${segments[2]}.pdf`);
}
/**
 * List the printable HTML files in a directory: a file qualifies when its
 * name contains both "fp_" and ".html".
 */
function getFilesToPrintList(directoryPath) {
  const entries = fs.readdirSync(directoryPath);
  return entries.filter((name) => name.includes("fp_") && name.includes(".html"));
}

I tried to reproduce your issue:
I used a different background image URL (online) with your HTML page, see test.html gist.
Apparently, a number of options are the defaults. See page.pdf().
Also, added browser.newContext() and browser.close().
Here's the minimal working NodeJS code:
generate-pdf.js
const { chromium } = require("playwright");

// Minimal repro: render one local HTML file to a one-page PDF, letting the
// page's @page CSS drive the paper size (preferCSSPageSize).
(async function main() {
  const htmlFilename = "file:///<file-path>/test.html";
  console.log("[webpage]", htmlFilename);

  const browser = await chromium.launch();
  const context = await browser.newContext();
  const page = await context.newPage();
  await page.goto(htmlFilename);

  const pdfFilename = "./test.pdf";
  console.log("[pdf file]", pdfFilename);

  const pdfOptions = {
    path: pdfFilename,
    pageRanges: "1",
    preferCSSPageSize: true
  };
  console.log(
    `\n\nAbout to print:\n ${pdfFilename}\n`,
    `with:\n${JSON.stringify(pdfOptions, null, 2)}`
  );
  await page.pdf(pdfOptions);

  await browser.close();
  console.log("done");
})();
Console Output:
$ node generate-pdf.js
[webpage] file:///<file-path>/test.html
[pdf file] ./test.pdf
About to print:
./test.pdf
with:
{
"path": "./test.pdf",
"pageRanges": "1",
"preferCSSPageSize": true
}
done
Snapshot of PDF (test.pdf):
You might want to test this separately and see if it works for your use-case.

Related

Upload an image to Instagram on Windows without API

This is a knowledge-sharing post inspired by: Posting (Uploading) an image to Instagram using Selenium not using an API
Most of the answers on the internet only have a Python version, so I am publishing a JavaScript version in Node.js.
This solution only works under Windows OS.
Below code reads photos with .jpg format from upload folder and post one by one with caption.
Download the autoit module from :
https://github.com/xhawk18/node-autoit/issues/15
Update package.json with below dependencies:
"dependencies": {
"selenium-webdriver": "^4.0.0-beta.4",
"autoit": "file:autoit-2.0.1.tgz"
},
Run npm install
Run the actual code :
const fs = require('fs');
const { Builder, By, Key } = require('selenium-webdriver');
const au = require('autoit');
const chrome = require('selenium-webdriver/chrome');

// Instagram endpoints and credentials.
const instagramUrl = 'https://www.instagram.com/';
const account = 'YOUR_INSTAGRAM_ACCOUNT';
const password = 'YOUR_INSTAGRAM_PASSWORD';
const instagramLoginUrl = instagramUrl + 'accounts/login/';
const instagramAccountUrl = instagramUrl + account;
const waitTime = 2000; // ms pause between UI steps
const uploadFolder = 'uploadFolderName';
// Mobile emulation is required: Instagram only shows the upload UI on mobile.
const mobileDeviceName = 'iPhone X';

// XPath selectors.
// BUG FIX: XPath attribute tests use '@', not '#' — the original
// '//*[#id=...]' style selectors are invalid XPath and would throw.
const loginAlert = '//*[@id="slfErrorAlert"]';
const newPostButton = '//div[@data-testid="new-post-button"]';
const nextButton = '//button[text()="Next"]';
const captionTextArea = '//textarea[@aria-label="Write a caption…"]';
const shareButton = '//button[text()="Share"]';
/**
 * Collect the .jpg filenames found in the upload folder.
 * @returns {string[]} bare filenames (no directory prefix)
 */
let readFileNames = () => {
  const filenames = [];
  fs.readdirSync(uploadFolder).forEach((file) => {
    // BUG FIX: the original `file.split('.')[1] == 'jpg'` used loose equality
    // and rejected names containing extra dots (e.g. "a.b.jpg");
    // endsWith handles both cases correctly.
    if (file.endsWith('.jpg')) {
      filenames.push(file);
    }
  });
  return filenames;
};
// Filenames gathered above; consumed one-by-one by the upload loop below.
let filenames = readFileNames();
// Drives a real Chrome (mobile-emulated) through Instagram's web UI and uses
// AutoIt to operate the native Windows "Open" file dialog, which Selenium
// cannot reach. Step order matters; each step is followed by a fixed sleep.
(async function example() {
let driver = await new Builder()
.forBrowser('chrome')
.setChromeOptions(new chrome.Options()
.setMobileEmulation({ deviceName: mobileDeviceName }))
.build();
try {
// Go to Login page
await driver.get(instagramLoginUrl);
await driver.sleep(waitTime);
// Login
await driver.findElement(By.name('username')).sendKeys(account);
await driver.findElement(By.name('password')).sendKeys(password, Key.RETURN);
await driver.sleep(waitTime);
// Redirect to Profile Home Page
await driver.get(instagramAccountUrl);
await driver.sleep(waitTime);
//Read Local Photos
for (let file of filenames) {
//Create New Post
await driver.findElement(By.xpath(newPostButton)).click();
await driver.sleep(waitTime);
// Absolute Windows path of the photo to upload.
let imagePath = `${__dirname}\\${uploadFolder}\\${file}`;
console.log(imagePath);
// Handle Windows Upload Photo Dialog — a native dialog, driven via AutoIt:
// wait for the "Open" window, type the path, click its Open button.
let handle = "[TITLE:Open]";
au.Init();
au.WinWait(handle);
au.Send(imagePath);
au.ControlClick(handle, "", "Button1");
await driver.sleep(waitTime);
// Hit Next
await driver.findElement(By.xpath(nextButton)).click();
await driver.sleep(waitTime);
// Fill The Captions
await driver.findElement(By.xpath(captionTextArea)).sendKeys("Your caption");
await driver.sleep(waitTime);
// Share New Post
await driver.findElement(By.xpath(shareButton)).click();
await driver.sleep(waitTime);
// Back To Home Page
await driver.get(instagramAccountUrl);
}
}
catch (err) {
console.error(err);
}
finally {
// Always quit the browser, even when a step failed.
await driver.quit();
}
})();

How to intercept downloads of blob generated in client side of website through puppeteer?

I have a page on this link (https://master.d3tei1upkyr9mb.amplifyapp.com/report) with 3 export buttons.
These export buttons generate XLSX, CSV, PDF on the frontend, and hence there are no URLs for XLSX, CSV, PDF.
I need puppeteer to be able to download or get or intercept the blobs or buffers of these files in my node backend.
I tried different ways to achieve this but still haven't figured it out.
It was possible through playwright library through the code written below. But I need to be able to do it with Puppeteer.
const { chromium } = require('playwright');
const fs = require('fs');

// Click the export button, capture the resulting client-side blob download
// via Playwright's download event, and persist it as test.xlsx.
(async () => {
  const browser = await chromium.launch();
  // acceptDownloads makes Playwright save the blob to a temp file we can read.
  const context = await browser.newContext({ acceptDownloads: true });
  const page = await context.newPage();
  await page.goto('http://localhost:3000/');

  const [download] = await Promise.all([
    page.waitForEvent('download'), // <-- start waiting for the download
    page.click('button#expoXLSX'), // <-- perform the action that directly or indirectly initiates it.
  ]);

  const path = await download.path();
  console.log(path);

  // BUG FIX: readFileSync is synchronous — the original `await` on it was a no-op.
  const newFile = fs.readFileSync(path);
  console.log(newFile);

  fs.writeFile("test.xlsx", newFile, "binary", function (err) {
    if (err) {
      console.log(err);
    } else {
      console.log("The file was saved!");
    }
  });

  await browser.close()
})();
Is there any way?
Any reason not to simulate the click on the frontend and allow puppeteer download the file to the location of your choice? You can easily download the file this way with the following:
Edit: You can determine when the file download completes by listening to the Page.downloadProgress event and checking for the completed state. Getting the actual filename saved to disk isn't 100% guaranteed with this method, but you are able to get what is termed the suggestedFileName from the Page.downloadWillBegin event, which in my tests thus far (at least on the example page in the question) does match the filename persisted to disk.
const puppeteer = require('puppeteer');
const path = require('path');

const downloadPath = path.resolve('./download');

// Allow the browser to download into ./download via CDP, then watch the
// Page.download* events to learn the filename and completion state.
(async () => {
  let fileName;
  const browser = await puppeteer.launch({
    headless: false
  });
  const page = await browser.newPage();
  await page.goto(
    'https://master.d3tei1upkyr9mb.amplifyapp.com/report',
    { waitUntil: 'networkidle2' }
  );

  // NOTE(review): page._client is a private CDP session; newer Puppeteer
  // versions expose page.createCDPSession() instead — confirm your version.
  await page._client.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: downloadPath
  });

  // BUG FIX: .on() registers a listener synchronously and returns the emitter;
  // the original `await`s on these registrations were misleading no-ops.
  // Listeners must be attached BEFORE the click that triggers the download.
  page._client.on('Page.downloadWillBegin', ({ url, suggestedFilename }) => {
    console.log('download beginning,', url, suggestedFilename);
    fileName = suggestedFilename;
  });
  page._client.on('Page.downloadProgress', ({ state }) => {
    if (state === 'completed') {
      console.log('download completed. File location: ', downloadPath + '/' + fileName);
    }
  });

  await page.click('button#expoPDF');
})();

How to download pdf file that opens in new tab with puppeteer?

I am trying to download an invoice from a website using Puppeteer; I just started to learn Puppeteer. I am using Node to create and execute the code. I have managed to log in and navigate to the invoice page, but it opens in a new tab, so the code does not detect it since it's not the active tab. This is the code I used:
const puppeteer = require('puppeteer');

const SECRET_EMAIL = 'emailid';
const SECRET_PASSWORD = 'password';

/**
 * Log in to Apify, open the newest invoice (it opens in a NEW tab),
 * bring that tab to the front, and screenshot it.
 */
const main = async () => {
  const browser = await puppeteer.launch({
    headless: false,
  });
  const page = await browser.newPage();
  await page.goto('https://my.apify.com/sign-in', { waitUntil: 'networkidle2' });
  await page.waitForSelector('div.sign_shared__SignForm-sc-1jf30gt-2.kFKpB');
  await page.type('input#email', SECRET_EMAIL);
  await page.type('input#password', SECRET_PASSWORD);
  await page.click('input[type="submit"]');
  await page.waitForSelector('#logged-user');
  await page.goto('https://my.apify.com/billing#/invoices', { waitUntil: 'networkidle2' });
  await page.waitForSelector('#reactive-table-1');

  // BUG FIX: subscribe to 'targetcreated' BEFORE clicking the invoice link;
  // the original subscribed after the click and could miss the event.
  const newPagePromise = new Promise((resolve) =>
    browser.once('targetcreated', (target) => resolve(target.page()))
  );
  await page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a');
  const page2 = await newPagePromise;

  await page2.bringToFront();
  await page2.screenshot({ path: 'apify1.png' });
  //await browser.close()
};

// BUG FIX: don't leave main()'s promise floating — surface any rejection.
main().catch(console.error);
In the above code I am just trying to take screenshot. Can anyone help me?
Here is an example of a work-around for the chromium issue mentioned in the comments above. Adapt to fit your specific needs and use-case. Basically, you need to capture the new page (target) and then do whatever you need to do to download the file, possibly pass it as a buffer to Node as per the example below if no other means work for you (including a direct request to the download location via fetch or ideally some request library on the back-end)
const [PDF_page] = await Promise.all([
browser
.waitForTarget(target => target.url().includes('my.apify.com/account/invoices/' && target).then(target => target.page()),
ATT_page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a'),
]);
const asyncRes = PDF_page.waitForResponse(response =>
response
.request()
.url()
.includes('my.apify.com/account/invoices'));
await PDF_page.reload();
const res = await asyncRes;
const url = res.url();
const headers = res.headers();
if (!headers['content-type'].includes('application/pdf')) {
await PDF_page.close();
return null;
}
const options = {
// target request options
};
const pdfAb = await PDF_page.evaluate(
async (url, options) => {
function bufferToBase64(buffer) {
return btoa(
new Uint8Array(buffer).reduce((data, byte) => {
return data + String.fromCharCode(byte);
}, ''),
);
}
return await fetch(url, options)
.then(response => response.arrayBuffer())
.then(arrayBuffer => bufferToBase64(arrayBuffer));
},
url,
options,
);
const pdf = Buffer.from(pdfAb, 'base64');
await PDF_page.close();

Print all pdf pages except the last one using Puppeteer

I need to generate different footer for the last pdf page. After investigation I realised that the best way is to generate two different pdfs and combine them. It works fine when I need to change footer or use different templates for first page, or in cases when I know which pages should looks different (using pageRanges option), but I can't find a way to get only last (last n) pages in case when total page number is unknown. Any ideas how I can generate pdf for only last (last n) pages?
Any answers will be appreciated.
I'm using Puppeteer v 2.1.0 with node.js v 8.16.0
This is a script which I'm using for generating pdf files now.
const puppeteer = require('puppeteer');
const fs = require('fs');
// Empty template used when no header/footer markup is supplied.
const DEFAULT_HEADER = '<span></span>';
// Page margins in px; the larger bottom margin leaves room for the footer.
const DEFAULT_FOOTER_HEIGHT = 90;
const DEFAULT_PAGE_PADDING = 36;
/**
 * Render the argv-supplied HTML file to an A4 PDF with an optional footer.
 * Usage: node script.js <bodyFilePath> <outputFilePath> [footerFilePath]
 */
const createPdf = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const [, , bodyFilePath, outputFilePath, footerFilePath] = process.argv;
    await page.goto(`file:${bodyFilePath}`, { waitUntil: 'networkidle0' });
    let footerTemplate = DEFAULT_HEADER;
    if (footerFilePath) {
      footerTemplate = fs.readFileSync(footerFilePath, 'utf8');
    }
    await page.pdf({
      path: outputFilePath,
      format: 'A4',
      margin: {
        top: DEFAULT_PAGE_PADDING,
        right: DEFAULT_PAGE_PADDING,
        bottom: DEFAULT_FOOTER_HEIGHT,
        left: DEFAULT_PAGE_PADDING,
      },
      printBackground: true,
      displayHeaderFooter: true,
      headerTemplate: DEFAULT_HEADER,
      footerTemplate,
    });
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) {
      // BUG FIX: close() returns a promise; await it so Chromium shuts down
      // cleanly before process.exit() tears the process down.
      await browser.close();
    }
    process.exit();
  }
};
createPdf();
Templates which I'm converting to pdf are .html.erb files
Maybe there is better ways to solve this problem but at this point I've used this approach - I'm generating export using same script as above, and than I'm using one more script which opens previous pdf file, count pages and generates two new files (which I'm combining to one file on the backend) - All pages except last one, and only last page with different footer.
const puppeteer = require('puppeteer');
const fs = require('fs');
const pdf = require('pdf-parse');
// Empty template used when no header/footer markup is supplied.
const DEFAULT_HEADER = '<span></span>';
// Page margins in px; the larger bottom margin leaves room for the footer.
const DEFAULT_FOOTER_HEIGHT = 90;
const DEFAULT_PAGE_PADDING = 36;
/**
 * Second pass: re-render the same HTML, giving the LAST page a different
 * footer. Reads the PDF produced by the first pass (outputFilePath) to count
 * its pages, then writes "all but last" and "last page only" PDFs which the
 * backend merges into one document.
 * Usage: node script.js <bodyFilePath> <outputFilePath> <footerFilePath>
 *                       <lastPagePath> <lastPageFooterPath>
 */
const createPdf = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const [
      ,
      ,
      bodyFilePath,
      outputFilePath,
      footerFilePath,
      lastPagePath,
      lastPageFooterPath,
    ] = process.argv;
    await page.goto(`file:${bodyFilePath}`, { waitUntil: 'networkidle0' });

    let footerTemplate = DEFAULT_HEADER;
    let lastPageFooterTemplate = DEFAULT_HEADER;
    if (footerFilePath) {
      footerTemplate = fs.readFileSync(footerFilePath, 'utf8');
    }
    if (lastPageFooterPath) {
      lastPageFooterTemplate = fs.readFileSync(lastPageFooterPath, 'utf8');
    }

    // Count the pages of the previously generated PDF.
    const dataBuffer = fs.readFileSync(outputFilePath);
    const pdfInfo = await pdf(dataBuffer);
    const numPages = pdfInfo.numpages;

    const baseOptions = {
      path: outputFilePath,
      format: 'A4',
      margin: {
        top: DEFAULT_PAGE_PADDING,
        right: DEFAULT_PAGE_PADDING,
        bottom: DEFAULT_FOOTER_HEIGHT,
        left: DEFAULT_PAGE_PADDING,
      },
      printBackground: true,
      displayHeaderFooter: true,
      headerTemplate: DEFAULT_HEADER,
      pageRanges: `${numPages}`,
      footerTemplate: lastPageFooterTemplate,
    };

    if (numPages === 1) {
      // A single page IS the last page — use the last-page footer directly.
      await page.pdf(baseOptions);
    } else {
      // Pages 1..n-1 with the regular footer.
      await page.pdf({
        ...baseOptions,
        footerTemplate,
        pageRanges: `-${numPages - 1}`,
      });
      // The last page alone, with its special footer.
      await page.pdf({
        ...baseOptions,
        path: lastPagePath,
        footerTemplate: lastPageFooterTemplate,
      });
    }
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) {
      // BUG FIX: await close() so Chromium exits before process.exit().
      await browser.close();
    }
    process.exit();
  }
};
createPdf();
Hope this will be helpful for someone with same issue.

Puppeteer chrome get active/visible tab

In a chrome extension you can use below to find the active tab in a window
// Extension API: query the active tab in the current window.
// BUG FIX: the original snippet was missing the call's closing parenthesis.
chrome.tabs.query({
  currentWindow: true,
  active: true,
});
I have a below code which connects to existing browser and get all the pages. I am not able to make out if there is a way for me to know which tab/page is currently the active one and get its url (page.url(), but which one from the the array to use?)
const puppeteer = require('puppeteer');
const request = require('request');

// CDP endpoint of an already-running Chrome started with
// --remote-debugging-port=9999.
// BUG FIX: the original created implicit globals (debuggerUrl, data,
// webSocketDebuggerUrl), ignored the request error, and floated the
// connect() promise.
const debuggerUrl = "http://127.0.0.1:9999/json/version";

request(debuggerUrl, function (error, response, body) {
  if (error) {
    console.error(error);
    return;
  }
  const data = JSON.parse(body);
  const webSocketDebuggerUrl = data["webSocketDebuggerUrl"];
  console.log("Connecting to ", webSocketDebuggerUrl);
  puppeteer.connect({ browserWSEndpoint: webSocketDebuggerUrl }).then(async (browser) => {
    const pages = await browser.pages();
    console.log(pages);
    console.log(await browser.targets());
    await browser.disconnect();
  }).catch(console.error);
});
document.hidden is now deprecated. But we can use document.visibilityState
Note that `page` will always refer to the same tab even if you have changed to a different tab, so you have to switch `page` to the active tab manually.
const pages = await browser.pages();
// BUG FIX: Array.prototype.filter ignores the Promise an async callback
// returns — a Promise is always truthy, so the original kept EVERY page.
// Evaluate visibility for all pages first, then filter on the results.
const visibility = await Promise.all(
  pages.map((p) => p.evaluate(() => document.visibilityState))
);
const visiblePages = pages.filter((_p, i) => visibility[i] === 'visible');
const activeTab = visiblePages[0]; // since there should be only 1 active tab per window
Using document.hidden
// Walk the open tabs and keep the first one whose document is not hidden;
// stop as soon as one is found.
const pages = await browser.pages()
let page
for (const candidate of pages) {
  const isHidden = await candidate.evaluate(() => document.hidden)
  if (!isHidden) {
    page = candidate
    break
  }
}
// Evaluate document.webkitHidden in every tab in parallel, then pick the
// first page whose document is visible (flag true = visible).
const pages = await browser.pages();
const visibilityFlags = await Promise.all(
  pages.map((p) => p.evaluate(() => document.webkitHidden).then((hidden) => !hidden))
);
let visiblePage = pages.filter((_v, index) => visibilityFlags[index])[0];

Resources