How to download PDF blob using puppeteer? - node.js

When the download button is clicked, a new tab is opened where the user can view a PDF statement.
This new tab has a URL starting with blob:, e.g.: blob:https://some-domain.com/statement-id.
How could I download this PDF statement to the file system?
Note: I'm using { headless: false } mode.

Trying to simulate the case:
import puppeteer from 'puppeteer';
import { writeFileSync } from 'fs';
// Minimal PDF from https://github.com/mathiasbynens/small#documents
const minimalPdf = [
  '%PDF-1.',
  '1 0 obj<</Pages 2 0 R>>endobj',
  '2 0 obj<</Kids[3 0 R]/Count 1>>endobj',
  '3 0 obj<</Parent 2 0 R>>endobj',
  'trailer <</Root 1 0 R>>',
].join('\n');

// Runs inside the page: wraps the PDF text in a Blob and opens its object URL
// in a new tab, mimicking the site's "download" button.
const openPdfInNewTab = (pdf) => {
  const url = URL.createObjectURL(new Blob([pdf], {type: 'application/pdf'}));
  window.open(url);
};

// Handles the first response seen on the opener page: logs it, compares it with
// the PDF that was sent in, and writes the bytes to disk.
const saveFirstResponse = async (response) => {
  console.log(response.url());
  const pdfBuffer = await response.buffer();
  const pdfText = pdfBuffer.toString();
  console.log(pdfText);
  console.log('same:', pdfText === minimalPdf);
  writeFileSync('minimal.pdf', pdfBuffer);
};

const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
try {
  const [page] = await browser.pages();
  await page.goto('http://example.com/');
  await page.evaluate(openPdfInNewTab, minimalPdf);

  // The new tab is identified by its blob: URL.
  const isBlobTarget = (target) => target.url().startsWith('blob:');
  const newTarget = await page.browserContext().waitForTarget(isBlobTarget);
  const newPage = await newTarget.page();
  const blobUrl = newPage.url();

  // The listener is registered before the fetch is triggered so the response
  // for the blob URL is not missed.
  page.once('response', saveFirstResponse);
  await page.evaluate((url) => { fetch(url); }, blobUrl);
} catch (err) {
  console.error(err);
} finally {
  /* await browser.close(); */
}

Related

puppeteer-cluster error page.solveRecaptchas is not a function

I'm converting my Puppeteer code to puppeteer-cluster. It was working fine before, but now I get the error "page.solveRecaptchas is not a function" when trying to use 2captcha to solve an hCaptcha.
This is the complete code I wrote; it just takes data from an Excel file and fills it in on the website.
The number of pages varies.
`
const xlsx = require('xlsx')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha')
puppeteer.use(StealthPlugin())
puppeteer.use(
RecaptchaPlugin({
provider: {
id: '2captcha',
token: 'xxxxxxxxxxxx'
},
visualFeedback: true
})
)
const {executablePath} = require('puppeteer')
const { Cluster } = require('puppeteer-cluster');
/**
 * Entry point: launches a puppeteer-cluster, reads client rows from the Excel
 * workbook and queues one form-filling task per row.
 */
(async () => {
  const cluster = await Cluster.launch({
    // Hand the plugin-enabled puppeteer-extra instance to the cluster. Without
    // this option puppeteer-cluster launches browsers through its own plain
    // `puppeteer`, so pages never receive the recaptcha plugin's
    // `page.solveRecaptchas()` method.
    puppeteer,
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 10,
    timeout: 150 * 1000,
    puppeteerOptions: {
      headless: false,
      args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-web-security"],
      defaultViewport: null,
      executablePath: executablePath(),
    },
  });

  // Errors thrown by queued tasks are reported here; the second argument is the
  // data the failing task was queued with.
  cluster.on('taskerror', (err, data) => {
    console.error((new Date()).toJSON() + ` Error crawling ${data}: ${err.message}`);
  });

  // get Excel data
  const fileURL = 'C:/xxxx/xxxx/xxxxx/clients2.xlsx';
  const workbook = xlsx.readFile(fileURL);
  const [firstSheetName] = workbook.SheetNames;
  const clientsArr = xlsx.utils.sheet_to_json(workbook.Sheets[firstSheetName]);
  console.log(clientsArr);

  // email, password, firstName and lastName are unpacked to keep the tuple
  // positions explicit; the steps that use them are not part of this excerpt.
  // `region` was read below without ever being declared (a ReferenceError);
  // it is now taken from the task data.
  await cluster.task(async ({ page, data: [email, password, appiontment, firstName, lastName, region] }) => {
    await page.goto('https://website.com/');
    await page.waitForTimeout(1000);

    // close popup 1
    await page.waitForSelector('#IDBodyPanelapp > div.popup-appCloseIcon');
    await page.click('#IDBodyPanelapp > div.popup-appCloseIcon');

    // choose region: anything other than 'ALGER' falls back to the second option
    await page.waitForSelector('#juridiction');
    const jurisdictionValue = region === 'ALGER' ? "15#Al#10" : "14#Ora#9";
    await page.select('#juridiction', jurisdictionValue);

    // click to get 2nd otp (awaited so a missing element fails the task
    // instead of producing an unhandled rejection)
    await page.$eval(`#verification_code`, (element) => element.click());

    // close popup 2
    await page.waitForTimeout(1500);
    await page.waitForSelector('#IDBodyPanelapp > div.popup-appCloseIcon');
    await page.click('#IDBodyPanelapp > div.popup-appCloseIcon');

    // choose the appointment category
    await page.waitForTimeout(2000);
    await page.waitForSelector('#category');
    const categoryValue = appiontment === 'Normal' ? "Normal" : "Premuim";
    await page.select('#category', categoryValue);

    // solve hcaptcha and submit form
    await page.waitForTimeout(15000);
    await page.solveRecaptchas();
    await Promise.all([
      page.waitForNavigation(),
      // click submit
      page.click(`#em_tr > div.col-sm-6 > input`),
    ]);
    // NOTE(review): every task writes to the same file, so concurrent tasks
    // overwrite each other's screenshot.
    await page.screenshot({ path: 'screenshot.png', fullPage: true });
  });

  // Queue one task per spreadsheet row. `queue` (rather than `execute`) routes
  // task failures to the 'taskerror' handler above instead of leaving rejected
  // promises unhandled.
  for (const data of clientsArr) {
    // assumes the sheet has a `region` column — TODO confirm against the workbook
    await cluster.queue([data.email, data.password, data.appiontment, data.firstname, data.lastPrenom, data.region]);
  }
  // await cluster.idle();
  // await cluster.close();
})();
`
I have already searched but found no solution.
I need help, thank you.

Puppeteer is calling in a loop in nextjs

I have a Next.js page which contains a React video player that plays a YouTube video based on an id passed in the URL. The YouTube video is fetched in getServerSideProps based on that id. Then, on the client side, I call /api/some-route to take a screenshot of the video player div using Puppeteer. The problem is that when the API route opens a browser with Puppeteer at that same URL, getServerSideProps is called again and the page calls /api/some-route again. This creates a loop that never finishes. How do I stop this?
My page:
export default function Home() {
useEffect(() => {
if (typeof window === undefined) {
return;
}
const url = window.location.href;
setTimeout(() => {
fetch(`/api/scrapper?url=${url}`)
.then((res) => {
res.json();
})
.then((data) => {
console.log(data);
});
}, 10000);
}, [params.slug[0]);
return (
<>
<Layout>
<Frame id="capture" />
</Layout>
</>
);
}
/**
 * Server-side loader: forwards the route params and `serverData` to the page
 * as props. `serverData` is not defined within this excerpt.
 */
export async function getServerSideProps({ params }) {
  const props = { params, serverData };
  return { props };
}
/api/scrapper.js
import puppeteer from "puppeteer";
// Returns true only for a single string that parses as an http(s) URL.
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * API route: opens the URL given in the `url` query parameter with Puppeteer
 * and saves a screenshot of it to output.png.
 *
 * @param req - Incoming request; reads `req.query.url`.
 * @param res - Response; answers 400 for a missing/invalid URL, "done" on success.
 */
export default async function My(req, res) {
  const url = req.query.url;
  // The URL comes straight from the client, so reject anything that is not a
  // plain http(s) URL (e.g. file:// or a repeated parameter) before a
  // server-side browser is pointed at it.
  if (!isHttpUrl(url)) {
    return res.status(400).json({ error: "A valid http(s) url query parameter is required" });
  }
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);
    const img = await page.screenshot({ path: "output.png" });
    console.log("img", img);
    await page.close();
  } finally {
    // Always release the browser process, even when navigation or the
    // screenshot throws.
    await browser.close();
  }
  return res.json("done");
}

How to run latest chromium/puppeteer on firebase

I have a working application on Firebase that renders code and delivers a PNG image as output using Puppeteer. However, I would like to run a newer Chromium version (for new features).
There are newer Puppeteer versions, but no version of chrome-aws-lambda newer than 10.1.0,
which uses Chromium 92.0.4512.0.
Is this project dead (last update 9 months ago), or is there another package that I can use?
For reference, this is the code that runs on firebase.
const functions = require("firebase-functions");
const chromium = require("chrome-aws-lambda");
const puppeteer = chromium.puppeteer;
const admin = require("firebase-admin");
admin.initializeApp();
exports.preview = functions
.runWith({ memory: "512MB", timeoutSeconds: 10 })
.https.onRequest(async (req, res) => {
const browser = await puppeteer.launch({
args: chromium.args,
defaultViewport: { width: 400, height: 300 },
executablePath: await chromium.executablePath,
headless: true,
});
const {
query: { q = "" },
} = req;
const page = await browser.newPage();
await page.setContent(q);
const screenshot = await page.screenshot();
await browser.close();
res.header({ "Content-Type": "image/png" });
res.end(screenshot, "binary");
});
I have worked around that by using puppeteer directly without -core. So there is no chrome-aws-lambda dependency needed anymore and I can use the latest puppeteer version.
So this is how it works now if anyone else needs that:
const functions = require("firebase-functions");
const puppeteer = require('puppeteer');
// admin seems to be necessary in order
// to run the function via `firebase emulators`
const admin = require("firebase-admin");
admin.initializeApp();
exports.preview = functions
.runWith({ memory: "512MB", timeoutSeconds: 10 })
.https.onRequest(async (req, res) => {
const browser = await puppeteer.launch({
defaultViewport: { width: 400, height: 300 },
headless: true
});
const {
query: { q = "" },
} = req;
const page = await browser.newPage();
await page.setContent(q);
const screenshot = await page.screenshot();
await browser.close();
res.header({ "Content-Type": "image/png" });
res.end(screenshot, "binary");
});

How to send data from node/express to preview pdf in front without saving file?

My aim is to generate a PDF contract with Puppeteer from an HTML page that I built.
I successfully generate this PDF on my backend, but I have a problem sending the data to my frontend. I tried many things: once I got an ArrayBuffer, once a Blob, and now a ReadableStream, and my frontend can read none of these.
Is there a way to easily send the PDF and preview it in the browser (in a modal)?
Here is my back :
// Renders the reservation's contract page with Puppeteer and prints it to an
// A4 PDF. `req` and `res` come from an enclosing request handler that is not
// part of this excerpt.
const date = Date.now();
// Timestamped file name, so each generated contract gets its own file.
const pathPDF = `contract-${date}.pdf`;
// NOTE(review): this async IIFE is opened here but never closed within the
// excerpt, and nothing inside it catches a failed launch/goto/pdf call.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// Load the contract page served by the local app for the requested reservation.
await page.goto(
`http://localhost:3000/admin/reservation/${req.params.reservation}/contract`,
{
waitUntil: 'networkidle2',
}
);
// `path` makes Puppeteer write the PDF to disk at pathPDF; the call's return
// value is also kept in `pdf` (presumably the PDF bytes — confirm against the
// installed Puppeteer version).
const pdf = await page.pdf({
path: pathPDF,
format: 'A4',
printBackground: true,
});
await browser.close();
// Every attempt below is commented out, so as written no response is sent.
// res.json(pdf) <-- I tried this first, don't work
// res.contentType('application/pdf');
// res.sendFile(pathPDF); <-- Then this, not working...
// NOTE(review): pathPDF is a relative path; res.sendFile likely needs an
// absolute path or a `root` option — confirm against the Express docs.
// const data = fs.readFileSync(`./${pathPDF}`);
// res.contentType('application/pdf');
// res.send(data); <-- I tryed this too, same...
Here action :
/**
 * Requests the generated contract for a reservation.
 *
 * @param reservation - Reservation id placed in the request path.
 * @returns {Promise<Blob|object>} The PDF as a Blob; the parsed body when the
 *   server answers with JSON (e.g. `{ error }`); or `{ error }` when the
 *   request fails, so callers can always read `result.error`.
 */
export const createContract = async (reservation) => {
  try {
    const response = await fetch(`${API}/reservation/contract/${reservation}`, {
      method: 'GET',
      // A GET has no body, so describe what we accept instead of a Content-Type.
      headers: {
        Accept: 'application/pdf, application/json',
      },
    });
    const contentType = response.headers.get('Content-Type') ?? '';
    // JSON answers are passed through unchanged, as before.
    if (contentType.includes('application/json')) {
      return await response.json();
    }
    if (!response.ok) {
      return { error: `Contract request failed with status ${response.status}` };
    }
    // Anything else is the PDF itself: keep it binary instead of parsing it as JSON.
    return await response.blob();
  } catch (err) {
    console.log(err);
    // Previously this resolved to undefined, which made `result.error` throw in callers.
    return { error: err.message };
  }
};
Here my call in page :
/**
 * Requests the contract for the current reservation and opens the preview.
 * Shows the error in the snackbar when the request fails.
 */
const generateContract = () => {
  setLoading(true);
  createContract(reservation._id).then((result) => {
    setLoading(false);
    // createContract may resolve to nothing when the request itself failed,
    // so guard before reading `result.error`.
    if (!result || result.error) {
      snackbarShowMessage(`${result ? result.error : 'Unable to generate the contract'}`);
      return;
    }
    // A Blob must be turned into an object URL before it can be previewed;
    // any other value is stored as-is.
    setPdf(result instanceof Blob ? URL.createObjectURL(result) : result);
    // A binary result carries no `message`; avoid showing "undefined".
    if (result.message) {
      snackbarShowMessage(`${result.message}`, 'success');
    }
    setOpen(true);
  });
};
Do you have any idea what I am doing wrong?

How to login to google account with playwright?

I have following source code and run it in headful mode.
I can input email address.
But after that, a message appears saying "Couldn't sign you in. For your protection, you can't sign in from this device. Try again later, or sign in from another device."
Do I need to set additional header or something else?
Here is my source code.
const playwright = require('playwright');
// Directory passed to launchPersistentContext as the persistent profile location.
const cookiePath = '/home/ubuntu/.config/chromium/Default';

// Only the extension at pathToExtension is loaded into the headful browser.
const extensionArgs = [
  `--disable-extensions-except=${pathToExtension}`,
  `--load-extension=${pathToExtension}`,
];
browser.chromium = await playwright.chromium.launchPersistentContext(cookiePath, {
  headless: false,
  args: extensionArgs,
});

const page = await browser.chromium.newPage();
const login_url = "https://accounts.google.com/signin/v2/identifier?hl=ja&flowName=GlifWebSignIn&flowEntry=ServiceLogin";
await page.goto(login_url);

// Step 1: account identifier.
await page.fill('#identifierId', userinfo.id);
await page.click("#identifierNext");

// Step 2: password.
await page.fill('[name=password]', userinfo.password);
await page.click("#passwordNext");
My solution:
const { chromium } = require("playwright");

/**
 * Opens a headful Chromium, walks through the Google sign-in form (email, then
 * password) and waits for the navigation that follows the password step.
 */
(async () => {
  const browser = await chromium.launch({
    headless: false,
    args: ["--disable-dev-shm-usage"],
  });
  const context = await browser.newContext({});
  const page = await context.newPage();

  // Disable the navigation timeout up front so it applies to every navigation
  // below (the call is synchronous; there is nothing to await).
  page.setDefaultNavigationTimeout(0);
  await page.goto(
    "https://accounts.google.com/signin/v2/identifier?hl=en&flowName=GlifWebSignIn&flowEntry=ServiceLogin",
    { waitUntil: "domcontentloaded" }
  );

  await page.waitForSelector('input[type="email"]');
  await page.type('input[type="email"]', "youremail");
  await page.click("#identifierNext");

  // Playwright expresses visibility through `state`; `{ visible: true }` is the
  // Puppeteer spelling.
  await page.waitForSelector('input[type="password"]', { state: "visible" });
  await page.type('input[type="password"]', "yourpassword");
  await page.waitForSelector("#passwordNext", { state: "visible" });

  // Start waiting before clicking so the post-login navigation is not missed.
  // Re-awaiting the promise created for the first page load would resolve
  // immediately and wait for nothing.
  await Promise.all([
    page.waitForNavigation({ waitUntil: "domcontentloaded" }),
    page.click("#passwordNext"),
  ]);
  //you are in
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
I think you can search for login to google with Puppeteer also.
This works for me:
add --disable-blink-features=AutomationControlled to your args.
This works for me:
// Launch Chromium while dropping one of the arguments Playwright passes by default.
const launchOptions = {
  ignoreDefaultArgs: ['--disable-component-extensions-with-background-pages'],
};
const browser = await playwright.chromium.launch(launchOptions);

Resources