Queueing Asynchronous tasks - node.js

I'm trying to create a script that takes a list of URLs, visits each site and takes a screenshot.
I have managed to get this to work with puppeteer. However the problem I've had is when I have say 50 URLs in the list, it will attempt to launch puppet sessions for all of them at once, which means that most time out before the site loads and it can take a screenshot.
I've found I can successfully run 10 at once, so I'm wanting to set up a queueing system to do this.
// Original approach from the question: every record fires an async IIFE
// immediately, so with 50 URLs 50 Chromium instances launch at once and
// most navigations time out before the screenshot is taken.
parser.on('readable', function(){
while(record = parser.read()){
counter +=1;
console.log(record.URL);
// NOTE(review): the IIFE's promise is never awaited or captured, so all
// sessions run concurrently; `record`/`counter` and the variables below
// (`title`, `domainRegex`, `match`, `width`, `height`) are implicit
// globals shared by every in-flight session.
(async (url = record.URL, name = record.shortURL, counter1 = counter) => {
const browser = await puppeteer.launch( {defaultViewport: {width: 1024, height:768} } );
const page = await browser.newPage();
await page.goto(url);
title = await page.title();
domainRegex = /^(?:https?:\/\/)?(?:[^#\n]+#)?(?:www\.)?([^:\/\n?]+)/img;
match = domainRegex.exec(url);
width = 1024;//await page.viewport().width;
height = 1000;//await page.viewport.height();
await page.screenshot({path: "Screenshots/"+counter1+". "+match[1] + "- " +title.replace(/[\W_]+/g,"")+".jpg", clip : {x:0, y:0, width: width, height: height}});
await browser.close();
})();
}
});

The code below will initially launch 10 sessions. Once each session finishes, it will dequeue the next record and launch another one, until there are no more records remaining. This will ensure that a max of 10 will be running at the same time.
// Processes parser records with at most `maxNumberOfSessions` puppeteer
// sessions running concurrently. Each finished session dequeues the next
// record until parser.read() returns null.
parser.on('readable', async () => {
  const maxNumberOfSessions = 10;
  let counter = 0;
  await Promise.all(Array.from({length: maxNumberOfSessions}, dequeueRecord));
  console.log("All records have been processed.");

  // One "lane": reads the next record, processes it, then chains itself,
  // keeping exactly one session busy until the parser is drained.
  function dequeueRecord() {
    const nextRecord = parser.read();
    if (nextRecord) return processRecord(nextRecord).then(dequeueRecord);
  }

  // Launches a browser, screenshots record.URL, and ALWAYS closes the
  // browser — even when navigation or the screenshot fails — so a bad URL
  // neither leaks a Chromium process nor rejects Promise.all and stalls
  // the remaining lanes.
  async function processRecord(record) {
    const number = ++counter;
    console.log("Processing record #" + number + ": " + record.URL);
    const browser = await puppeteer.launch({defaultViewport: {width: 1024, height: 768}});
    try {
      const page = await browser.newPage();
      await page.goto(record.URL);
      const title = await page.title();
      const domainRegex = /^(?:https?:\/\/)?(?:[^#\n]+#)?(?:www\.)?([^:\/\n?]+)/img;
      const match = domainRegex.exec(record.URL);
      const width = 1024; // await page.viewport().width;
      const height = 1000; // await page.viewport().height;
      await page.screenshot({path: "Screenshots/" + number + ". " + match[1] + "- " + title.replace(/[\W_]+/g, "") + ".jpg", clip: {x: 0, y: 0, width, height}});
    } catch (err) {
      // Keep the queue alive: log the failure and move on to the next record.
      console.error("Failed to capture " + record.URL + ":", err);
    } finally {
      await browser.close();
    }
  }
});

If you want to run all of them serially, you can turn this into an async function and await each call. This way, they will run one by one.
// let's separate it for readability
// Opens a fresh browser, captures a clipped screenshot of record.URL and
// writes it to "Screenshots/<counter>. <domain>- <title>.jpg".
// The browser is always closed — even on navigation failure — so no
// Chromium process leaks when a URL times out.
async function getRecord(record, counter) {
  const url = record.URL;
  const name = record.shortURL; // currently unused; kept for parity with the original
  const counter1 = counter;
  const browser = await puppeteer.launch({
    defaultViewport: {
      width: 1024,
      height: 768
    }
  });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // These were implicit globals in the original — concurrent calls
    // would have clobbered each other's values.
    const title = await page.title();
    const domainRegex = /^(?:https?:\/\/)?(?:[^#\n]+#)?(?:www\.)?([^:\/\n?]+)/img;
    const match = domainRegex.exec(url);
    const width = 1024; //await page.viewport().width;
    const height = 1000; //await page.viewport.height();
    await page.screenshot({
      path: "Screenshots/" + counter1 + ". " + match[1] + "- " + title.replace(/[\W_]+/g, "") + ".jpg",
      clip: {
        x: 0,
        y: 0,
        width: width,
        height: height
      }
    });
  } finally {
    await browser.close();
  }
}
// Serial driver: the handler is async so each record is awaited in turn.
// NOTE(review): `record` and `counter` are not declared in this snippet —
// they are implicit globals (or declared elsewhere in the file); confirm
// `counter` is initialised before the first 'readable' event fires.
parser.on('readable', async function() { // <-- here we make it async
while (record = parser.read()) {
counter += 1;
console.log(record.URL);
await getRecord(record, counter) // <-- and we await each call
}
});
There are other ways like Promise.map and for..of, but let's keep this simpler for now.

You might want to take a look at puppeteer-cluster (disclaimer: I'm the author).
You can do it like this:
// puppeteer-cluster variant: the cluster caps concurrency at 10 browsers
// and re-runs crashed jobs; we only enqueue work from the parser events.
(async () => {
  // create a cluster that handles 10 parallel browsers
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_BROWSER,
    maxConcurrency: 10,
  });
  // define the task executed for every queued { counter, record }
  await cluster.task(async ({ page, data: { counter, record } }) => {
    const url = record.URL;
    await page.goto(url);
    // locals were implicit globals in the original — with 10 parallel
    // tasks they would overwrite each other
    const title = await page.title();
    const domainRegex = /^(?:https?:\/\/)?(?:[^#\n]+#)?(?:www\.)?([^:\/\n?]+)/img;
    const match = domainRegex.exec(url);
    const width = 1024; //await page.viewport().width;
    const height = 1000; //await page.viewport.height();
    await page.screenshot({path: "Screenshots/" + counter + ". " + match[1] + "- " + title.replace(/[\W_]+/g, "") + ".jpg", clip: {x: 0, y: 0, width: width, height: height}});
  });
  // queue your jobs
  let counter = 0; // was never declared in the original (implicit global)
  parser.on('readable', function () {
    let record;
    while ((record = parser.read())) {
      counter += 1;
      cluster.queue({ counter, record });
    }
  });
})();
This will handle 10 parallel browser instances and will also take care of browser crashes and error handling.

If you want to run a set of promises in sequence you could make use of Promise.mapSeries from Bluebird package. I understand that this would mean adding an additional package, but it's simple and does not need you to build a queuing system.
http://bluebirdjs.com/docs/api/promise.mapseries.html

Related

How to scrape image src the right way using puppeteer?

I'm trying to create a function that can capture the src attribute from a website. But all of the most common ways of doing so, aren't working.
This was my original attempt.
// Original attempt from the question: navigate to the menu page and read
// the `src` attribute of the swiper slide images. Per the accepted answer
// this fails for two reasons: the default viewport keeps the lazy-loaded
// images out of view (so they have no src yet), and getAttribute returns
// the original attribute value rather than the live `src` property.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
try {
await page.setDefaultNavigationTimeout(0);
await page.waitForTimeout(500);
await page.goto(
`https://www.sirved.com/restaurant/essex-ontario-canada/dairy-freez/1/menus/3413654`,
{
waitUntil: "domcontentloaded",
}
);
// runs in the browser context; collects src attributes of slide images
const fetchImgSrc = await page.evaluate(() => {
const img = document.querySelectorAll(
"#menus > div.tab-content >div > div > div.swiper-wrapper > div.swiper-slide > img"
);
let src = [];
for (let i = 0; i < img.length; i++) {
src.push(img[i].getAttribute("src"));
}
return src;
});
console.log(fetchImgSrc);
} catch (err) {
console.log(err);
}
await browser.close();
})();
[];
In my next attempt I tried a suggestion and was returned an empty string.
// Second attempt: enlarge the viewport and read the live `src` property
// of every image under #menus via $$eval (the question reports this
// returned an empty string).
await page.setViewport({ width: 1024, height: 768 });
const imgs = await page.$$eval("#menus img", (images) =>
images.map((i) => i.src)
);
console.log(imgs);
And in my final attempt I followed another suggestion and was returned an array with two empty strings inside of it.
// Third attempt: query the lazy-loaded slides directly, but still read
// the `src` attribute — which stays an empty string for lazy-loaded
// images, hence the two empty strings the question reports.
const fetchImgSrc = await page.evaluate(() => {
const img = document.querySelectorAll(".swiper-lazy-loaded");
let src = [];
for (let i = 0; i < img.length; i++) {
src.push(img[i].getAttribute("src"));
}
return src;
});
console.log(fetchImgSrc);
In each attempt I only replaced the function and console log portion of the code. I've done a lot of digging and found these are the most common ways of scraping an image src using puppeteer, and I've used them in other ways, but for some reason right now they aren't working for me. I'm not sure if I have a bug in my code or why it will not work.
To return the src link for the two menu images on this page you can use
// Collect the src attribute of every lazy-loaded swiper image on the page.
// Runs inside the browser context via page.evaluate.
const fetchImgSrc = await page.evaluate(() =>
Array.from(document.querySelectorAll('.swiper-lazy-loaded'), (image) =>
image.getAttribute("src")
)
);
This gives us the expected output
['https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3b9eabc40.jpg', 'https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3bbe93cc6.jpg']
You have two issues here:
Puppeteer by default opens the page in a smaller window and the images to be scraped are lazy loaded, while they are not in the viewport: they won't be loaded (not even have src-s). You need to set your puppeteer browser to a bigger size with page.setViewport.
Element.getAttribute is not advised if you are working with dynamically changing websites: It will always return the original attribute value, which is an empty string in the lazy loaded image. What you need is the src property that is always up-to-date in the DOM. It is a topic of attribute vs property value in JavaScript.
By the way: you can shorten your script with page.$$eval like this:
// Shorter equivalent: enlarge the viewport so the lazy images actually
// load, then read the up-to-date `src` property via $$eval.
await page.setViewport({ width: 1024, height: 768 })
const imgs = await page.$$eval('#menus img', images => images.map(i => i.src))
console.log(imgs)
Output:
[
'https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3b9eabc40.jpg',
'https://images.sirved.com/ChIJq6qqqlrZOogRs_xGxBcn0_w/5caf3bbe93cc6.jpg'
]

Puppeteer create PDF files from HTML data hangs Windows 10 system

I created an App that processes students results by extracting data from multiple excel workbooks. The problem is that using Puppeteer to generate the PDF files, throws the system into a loop till it hangs the system.
Actually, I have tested the same code below using PhantomJs (bundled as pdf-creator-node) and was able to generate 150 PDF files comfortably in 3 minutes. The main reason I dumped PhantomJs is that all the styling in the CSS file was not included, even when I inserted it as an inline style in the header using the replace function of JS. Another reason is that PhantomJs is no longer in active development. I searched the web and found that Puppeteer is the only valid solution with active development and support.
I tried using page.close() at the end of pdfCreator() which is in a loop, and browser.close() at the end of pdfGenerator(). What I am doing wrong?
Here below are the codes in the server.js and PdfGenerator.js files, with a sample of the ERROR, and screenshot of my Task Manager after the system crawled out of hanging state. For HTML generation, I used Mustache. I excluded some lines of codes in server.js because the total character count was over 60k.
server.js
// [codes were removed here]
if(getCode == 'compute-result') {
// declare variable
let setData = null;
let setTitle = 'Results Computation...';
let setArgs = getArgs;
// dataFromFile = ReadFile(pathCodeTextFile);
// setArgs = Number(dataFromFile);
setCode = 'compute-result';
let setView = [];
let setNext = true;
let countTerms = [];
// if(getArg > 0) {
// Final Result computation
const getJson = ReadFile(pathJsonResults);
// const getCtrl = ReadFile(pathJsonCtrl);
const getResultObject = JSON.parse(getJson);
getResult = getResultObject;
const totalResults = getResult.firstTerm.length + getResult.secondTerm.length + getResult.thirdTerm.length;
if(setView.length < 1 && getResult != null) {
setData = 'PDFs for Students Results initiating...';
setView.unshift('Reading saved data...');
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: null, view: JSON.stringify(setView)});
}
Sleep(2000).then(() => {
if(getResult != null) {
setData = 'Students Results will be ready in a moment';
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
const wacthFiles = (file, className, termName, sessionName, completed, pdfList) => {
try {
if(typeof file == 'string' && !FileExists(pathJsonPdfList)) {
if(pdfList.length < 2){
setData = 'Saving PDFs to downladable files...';
}
if(className != null && termName != null && sessionName != null) {
setTitle = `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}...`;
setView.unshift(file);
if(!countTerms.includes(termName)) {
countTerms.push(termName)
}
// setCode = -1000 - pdfList.length;
// console.log('PDF PROGRESS: ', `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}... ${setCode}`);
// when all PDFs are created
if(completed) {
setTitle = setTitle.replace('...', ' [completed]');
setData = 'Result Download button is Active. You may click it now.';
setView.unshift('=== PDF GENERATION COMPLETED ===');
setView.unshift(`A total of ${pdfList.length} students' Results were generated`);
WriteFile(pathJsonPdfList, JSON.stringify(pdfList));
// set donwload button active
setCode = Number(codeTextFilePdfCompleted);
setNext = false;
getResult = null;
let termString = countTerms.toString();
termString = ReplaceAll(termString, '-term', '');
termString = ReplaceAll(termString, ',', '-');
const addTxt = `${className} _${termString} Term${countTerms.length>1?'s':''} (${sessionName})`;
WriteFile(pathCodeTextFile, addTxt);
// console.log('======== PDF GENERATION ENDS ================');
} else {
setCode = -1 * pdfList.length;
}
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
}
} catch (error) {
console.log('ERROR ON WATCHER: ', error);
}
}
if(!FileExists(pathJsonPdfList) && getResult !== null) {
PdfGenerator(getResult, wacthFiles);
}
// Watcher(pathWatchResults, setCode, wacthDir, 10000);
});
// }
}
}
} catch (error) {
})
client.on('disconnect', () => {
console.log('SERVER: Disconnected');
});
server.listen(portApi, () =>{
console.log('Server listens on port 8881')
});
// serve static files
app.use(express.static(pathPublic));
// [codes were removed here]
PdfGenerator.js
The problem lies in these functions: PdfGenerator & createPdf
'use strict';
process.setMaxListeners(Infinity) // fix for Puppeteer MaxListenerExceededWarning
const Puppeteer = require('puppeteer')
const {HtmlGenerator} = require('../components/HtmlGenerator')
const {WriteFile, FileExists, RandomNumber, RoundNumber, IsNumberFraction, ReadFile} = require('../components/Functions')
// load .env configuration outside production
if (process.env.NODE_ENV !== 'production') {
require('dotenv').config();
}
// output directories for each term's result PDFs (from environment)
const pathFirstTermResults = process.env.DIR_FIRST_TERM_RESULTS;
const pathSecondTermResults = process.env.DIR_SECOND_TERM_RESULTS;
const pathThirdTermResults = process.env.DIR_THIRD_TERM_RESULTS;
const publicDir = process.env.DIR_PUBLIC;
const cssFile = process.env.PATH_CSS_FILENAME;
// NOTE(review): '\\'-joined paths are Windows-only; path.join would be portable
const pathCssRaw = __dirname + '\\' + publicDir + '\\' + cssFile;
const pathCss = pathCssRaw.replace(`\\uploads`, '');
const tagCssReplace = process.env.TAG_CSS_REPLACE;
let jsonDir = process.env.PATH_JSON;
jsonDir = jsonDir.split('/').pop();
let htmlDir = process.env.DIR_HTML;
htmlDir = __dirname + '\\' + htmlDir.split('/').pop();
// HTML templates for the three report layouts
const htmlType1 = htmlDir + '\\' + process.env.HTML_TYPE1;
const htmlType2 = htmlDir + '\\' + process.env.HTML_TYPE2;
const htmlType3 = htmlDir + '\\' + process.env.HTML_TYPE3;
const pathJsonPdfList = './' + jsonDir + '/' + process.env.JSON_PDF_LIST_FILENAME;
const pathJsonPdfContent = __dirname + '\\' + jsonDir + '\\' + process.env.JSON_PDF_CONTENT;
// directory keys used by getPath()
const firstTermDir = 'first-term';
const secondTermDir = 'second-term';
const thirdTermDir = 'third-term';
// module-level mutable state shared across generation runs
let cumulativeFirstTermTotalList = {};
let cumulativeSecondTermTotalList = {};
let firstTermOnce = true;
let secondTermOnce = true;
let thirdTermOnce = true;
let isActive = false;
// Resolves the output path for a results file: maps a term directory key
// (`p`) to its configured base directory — falling back to the first-term
// directory for unknown keys — and appends the file name `f`.
const getPath = (p, f) => {
const termDirs = new Map([
[firstTermDir, pathFirstTermResults],
[secondTermDir, pathSecondTermResults],
[thirdTermDir, pathThirdTermResults],
]);
const dir = termDirs.has(p) ? termDirs.get(p) : pathFirstTermResults;
return dir + f
}
// browser window/viewport size used for rendering
const resolution = {
x: 1920,
y: 1080
}
// Chromium launch flags
const args = [
'--disable-gpu',
`--window-size=${resolution.x},${resolution.y}`,
'--no-sandbox',
]
// Renders one student's result to a PDF file at the path derived from
// `term` and `filename`: fills the HTML template selected by
// `templateType` with `content`, injects the shared stylesheet, and
// prints the supplied Puppeteer `page` to disk. `cb` is the progress
// callback forwarded from the caller with the remaining arguments.
// Now async and fully awaited — the original fired an un-awaited inner
// IIFE, so the caller could never know when (or whether) the PDF was
// actually written, and concurrent calls raced on the shared page.
const createPdf = async (page, content, templateType, filename, className, term, sessionName, isProcessActive, pdfFileList, cb) => {
const path = getPath(term, filename);
if(path == null) return;
// `options` was declared twice in the original (plus an unused
// `document` variable); one declaration suffices.
const options = {
path: path,
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
let templateData = '';
switch (templateType) {
case '1':
templateData = ReadFile(htmlType1);
break;
case '2':
templateData = ReadFile(htmlType2);
break;
case '3':
templateData = ReadFile(htmlType3);
break;
default:
templateData = ReadFile(htmlType1);
break;
}
const html = HtmlGenerator(content, templateData);
// skip silently when the template produced nothing, as before
if(html == null || html === '') return;
// progress callback fires before rendering, matching the original order
cb(filename, className, term, sessionName, isProcessActive, pdfFileList);
// get style from .css & replace
const css = ReadFile(pathCss);
await page.setContent(html, { waitUntil: 'networkidle0'});
// page.addStyleTag expects an options object ({content}/{path}/{url}),
// not a raw CSS string as the original passed
await page.addStyleTag({ content: css });
await page.pdf(options);
// NOTE(review): the original called page.close() here, but `page` is
// shared across every createPdf call — closing it broke all subsequent
// jobs. The page is now left open for the caller to dispose of.
}
// Generates per-student result PDFs for up to three terms.
// `json` is either a JSON string or an already-parsed results object;
// `cb` is the progress callback forwarded (via createPdf) for every file.
// Now async: the Puppeteer pipeline is awaited, so the returned promise
// resolves only after all PDFs have been rendered. The original fired an
// un-awaited async IIFE and leaked `browser` as an implicit global, so
// the function returned before any PDF existed and the completion check
// at the bottom always saw zero results.
const pdfGenerator = async (json, cb) => {
let data = {};
let pdfFileList = [];
// accept both a JSON string and a parsed object
if(typeof json == 'string') {
data = JSON.parse(json)
} else {
data = json;
}
try {
// declare defaults
let filename = 'Student' + '.pdf';
let termName = firstTermDir;
const templateType = data.keys.templateType;
const session = data.classInfo.Session;
const sessionName = session.replace('/', '-');
const students = data.students;
const className = data.classInfo.Class_Name;
const recordFirstTerm = data.firstTerm;
const recordSecondTerm = data.secondTerm;
const recordThirdTerm = data.thirdTerm;
let pdfCreatedList = [];
let isReset = false;
let totalResultsExpected = Object.keys(recordFirstTerm).length + Object.keys(recordSecondTerm).length + Object.keys(recordThirdTerm).length;
let totalResultsCount = 0;
let jsonForPdf = {};
let record = {};
let sRecord, path, id, fName, lName;
// get each student
let logEndOnce = true;
let logBeforeOnce = true;
logBeforeOnce && console.log('============== *** ================');
logBeforeOnce && console.log('======== PDF GENERATION BEGINS ================');
// Renders one term's results for every student. Async so each createPdf
// call can be awaited: awaiting is a no-op while createPdf is
// synchronous, and serializes the jobs once it returns a promise.
const computeResult = async (page, setTerm, setRecord, setReset) => {
const termName = setTerm;
const record = setRecord;
let isReset = setReset;
logBeforeOnce && console.log(`====== ${termName} RESULTS BEGINS ======`);
for(let elem of students){
id = elem.id;
fName = elem.firstName;
lName = elem.lastName;
filename = `${lName} ${fName} _${termName} ${sessionName}.pdf`;
sRecord = record[id];
path = getPath(termName, filename);
// create the pdf only when neither the file nor the completion marker exists
if(!FileExists(path) && !FileExists(pathJsonPdfList)){
// generate final JSON for the student
jsonForPdf = finalJson(elem, sRecord, data, termName);
(pdfFileList.length < 1) && WriteFile(pathJsonPdfContent, JSON.stringify(jsonForPdf));
pdfFileList.push({
'term': termName,
'file': filename
});
totalResultsCount = pdfFileList.length;
const pdfDate = new Date();
console.log(`$(unknown) (${totalResultsCount}/${totalResultsExpected}) at ${pdfDate.getHours()}hr${pdfDate.getHours()>1?'s':''} - ${pdfDate.getMinutes()}min${pdfDate.getMinutes()>1?'s':''} - ${pdfDate.getSeconds()}sec${pdfDate.getSeconds()>1?'s':''}`);
isActive = (totalResultsExpected === totalResultsCount)? true: false;
logEndOnce = false;
isReset = true;
await createPdf(page, jsonForPdf, templateType, filename, className, termName, sessionName, isActive, pdfFileList, cb);
}
}
logBeforeOnce && console.log(`====== ${termName} RESULTS ENDS ======`);
}
// get each student result for First Term
const computeFirstTerm = async (p) => {
if(data.keys.firstTerm === '1') {
termName = firstTermDir;
record = recordFirstTerm;
pdfCreatedList = [];
isReset = false;
await computeResult(p, termName, record, isReset)
}
}
// get each student result for Second Term
const computeSecondTerm = async (p) => {
if(data.keys.secondTerm === '1') {
termName = secondTermDir;
record = recordSecondTerm;
pdfCreatedList = [];
isReset = false;
await computeResult(p, termName, record, isReset)
}
}
// get each student result for Third Term
const computeThirdTerm = async (p) => {
if(data.keys.thirdTerm === '1') {
termName = thirdTermDir;
record = recordThirdTerm;
pdfCreatedList = [];
isReset = false;
await computeResult(p, termName, record, isReset)
}
}
// Launch a single browser/page and await the whole pipeline so the
// completion check below runs only after every term has been processed.
const browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
await browser.close()
if(totalResultsExpected === totalResultsCount && totalResultsCount !== 0 && !logEndOnce) {
logEndOnce = true;
logBeforeOnce = false;
console.log('======== PDF GENERATION ENDS ================');
}
} catch (error) {
console.log('==== ERROR IN PDF GENERATION: ', error)
}
}
// export under the PascalCase name used by the caller (PdfGenerator(...))
module.exports = {
PdfGenerator: pdfGenerator
}
ERROR
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
lerna ERR! yarn run start stderr:
<--- Last few GCs --->
[9884:000002D68A73C6B0] 1665171 ms: Scavenge 44.1 (45.8) -> 43.2 (45.8) MB, 223.9 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1684089 ms: Scavenge 44.1 (45.8) -> 43.3 (45.8) MB, 587.3 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1749901 ms: Scavenge 44.2 (45.8) -> 43.3 (45.8) MB, 5099.0 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
<--- JS stacktrace --->
FATAL ERROR: Committing semi space failed. Allocation failed - JavaScript heap out of memory
1: 00007FF6ED61013F
2: 00007FF6ED59F396
3: 00007FF6ED5A024D
4: 00007FF6EDED19EE
5: 00007FF6EDEBBECD
6: 00007FF6EDD5F61C
7: 00007FF6EDD6933F
8: 00007FF6EDD5BF19
9: 00007FF6EDD5A0D0
10: 00007FF6EDD7EA06
11: 00007FF6EDAB1CD5
12: 00007FF6EDF5F3E1
13: 00007FF6EDF602E9
14: 000002D68C4EF69E
error Command failed with exit code 134.
Screenshot of Task Manager, showing over 50 Chromium instances running.
I appreciate any help. I hope this can be resolved to give me a smooth PDF generation.
Thank you.
Example solution (limiting parallel browsers)
I created you a PdfPrinter class which you can integrate into your setup. It allows you to limit the amount of parallel pdf generation jobs, lets you set that limit, and manages opening/closing the browser for you. The PdfPrinter class is somewhat coupled to this use case, but its logic can be adapted into a general-purpose queue.
You can try to integrate that into your code. This is a fully working test example with simplified pdfs (without the part of getting the actual data from the excel..)
As far as I understood your code, you do not need to pass the page around all your functions. First create your html + css and then use the pdfPrinter and let it handle page creation + browser launching..
(I like to code stuff like this so I went straight ahead..)
var puppeteer = require('puppeteer')
// default page.pdf() options; each print job's `path` is merged in later
const defaultPrinterOptions = {
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
/**
 * Queue that limits how many Puppeteer browsers render PDFs in parallel.
 * Jobs are enqueued as closures; each launched browser keeps pulling the
 * next job until the queue is empty, then closes itself.
 */
class PdfPrinter {
maxBrowsers = 2
enqueuedPrintJobs = []
failedJobs = []
browserInstances = 0
// max browser instances in parallel
// NOTE(review): calling `new PdfPrinter()` with no argument overwrites the
// default with undefined, which disables the limit check — confirm callers
// always pass a number.
constructor(maxBrowsers) {
this.maxBrowsers = maxBrowsers
}
/**
 * Enqueues a print job; the exact end moment cannot be known here — use
 * enqueuePrintPromise() to await completion.
 * @param {*} html the html content to print
 * @param {*} css stylesheet to apply to the page
 * @param {*} path destination file of the pdf
 * @param {*} done callback invoked when this particular job has finished
 */
enqueuePrint = (html, css, path, done) => {
// merge custom options with defaultOptions..
const printOptions = {
...defaultPrinterOptions,
// add the path to the options.
path: path
}
// create a function which can be stored in an array
// it will later be grabbed by tryStartPrinter() OR at the time any
// browser freed up..
// the function needs to be passed the actual used browser instance!
this.enqueuedPrintJobs.push(async(browser) => {
// catch the error which may be produced when printing something..
try {
// print the document
await this.print(browser, html, css, printOptions)
} catch (err) {
console.error('error when printing document..CLosing browser and starting a new job!!', printOptions.path)
console.error(err)
// record what failed so it could be retried later..
this.failedJobs.push({ html, css, path: printOptions.path })
// puppeteer can run into errors too!!
// so close the browser and launch a new one!
await this.closeBrowser(browser)
browser = await this.launchBrowser()
}
// after the print, call done() so the wrapping promise is resolved at
// the exact moment this particular print has ended.
done()
// start the next job right now if there are any left.
const job = this.enqueuedPrintJobs.shift()
if (!job) {
console.log('No print jobs available anymore. CLosing this browser instance.. Remaining browsers now:', this.maxBrowsers - this.browserInstances + 1)
await this.closeBrowser(browser)
return
}
// job is actually this function itself! It will be executed
// and automatically grab a new job after completion :)
// we pass the same browser instance to the next job!.
await job(browser)
})
// whenever a print job is added make sure to start the printer:
// this starts a new browser instance if the limit is not exceeded
// (resp. if no browser is instantiated yet),
// and does nothing if the maximum browser count is reached..
this.tryStartPrinter()
}
// same as enqueuePrint except it wraps the job in a promise so we know
// the exact end moment and can await it..
enqueuePrintPromise(html, css, path) {
return new Promise((resolve, reject) => {
try {
this.enqueuePrint(html, css, path, resolve)
} catch (err) {
console.error('unexpected error when setting up print job..', err)
reject(err)
}
})
}
// If the browser instance limit is not reached, instantiates a new browser and runs a print job with it.
// a print job will automatically grab the next job with the created browser if there are any left.
tryStartPrinter = async() => {
// Max browser count in use OR no jobs left.
if (this.browserInstances >= this.maxBrowsers || this.enqueuedPrintJobs.length === 0) {
return
}
// browser instances available!
// create a new one
console.log('launching new browser. Available after launch:', this.maxBrowsers - this.browserInstances - 1)
const browser = await this.launchBrowser()
// run job
const job = this.enqueuedPrintJobs.shift()
await job(browser)
}
closeBrowser = async(browser) => {
// decrement browsers in use!
// important to call before closing the browser!!
this.browserInstances--
await browser.close()
}
launchBrowser = async() => {
// increment browsers in use!
// important to increase before actually launching (async stuff..)
this.browserInstances++
// adjust this launch configuration to your environment..
const browser = await puppeteer.launch({ headless: true })
return browser
}
// The actual print function which creates a pdf.
print = async(browser, html, css, printOptions) => {
console.log('Converting page to pdf. path:', printOptions.path)
// Run pdf creation in a separate page, closed when done.
const page = await browser.newPage()
await page.setContent(html, { waitUntil: 'networkidle0' });
await page.addStyleTag({ content: css });
await page.pdf(printOptions);
await page.close();
}
}
// Demo: exercises PdfPrinter with `testJobCount` simple pdf jobs while
// keeping at most `maxOpenedBrowsers` browsers open in parallel.
// Run the printer inside an `async` function so `await` can be used.
const testPrinterQueue = async() => {
// config
const maxOpenedBrowsers = 5 // amount of browser instances which are allowed to be opened in parallel
const testJobCount = 100 // amount of test pdf jobs to be created
const destDir = 'C:\\somepath' // the directory to store the pdfs in..
// create sample jobs for testing...
const jobs = []
for (let i = 0; i < testJobCount; i++) {
jobs.push({
html: `<h1>job number [${i}]</h1>`,
css: 'h1 { background-color: red; }',
path: require('path').join(destDir, `pdf_${i}.pdf`)
})
}
// track time
const label = 'printed a total of ' + testJobCount + ' pdfs!'
console.time(label)
// run the actual pdf generation..
const printer = new PdfPrinter(maxOpenedBrowsers)
const jobProms = []
for (let job of jobs) {
// enqueue jobs in parallel; each returns a Promise resolved when its pdf is done
jobProms.push(
printer.enqueuePrintPromise(job.html, job.css, job.path)
)
}
console.log('All jobs enqueued!! Wating for finish now.')
// await all the print jobs, resp. the array of promises.
await Promise.all(jobProms)
console.timeEnd(label)
// failed jobs::
console.log('jobs failed:', printer.failedJobs)
// as file:
await require('fs').promises.writeFile('failed-jobs.json', JSON.stringify(printer.failedJobs))
}
// kick off the demo; report any unexpected top-level failure
testPrinterQueue().then(() => {
console.log('done with everyting..')
}).catch(err => {
console.error('unexpected error occured while printing all pages...', err)
})
You only need to adjust the destDir / openedBrowsers and testJobCount vars in the beginning of testPrinterQueue() for getting this to work.
What caused the problem in your code
Let's have a look at this piece
// (quoted from the question) the async IIFE below starts immediately but
// its promise is discarded, so the surrounding synchronous code continues
// before any of the awaited steps have finished; `browser` is also an
// implicit global here.
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
You created an anonymous function which is executed immediately. Within the function all the statements are correctly awaited using await. But if you run this whole piece within a synchronous part of your application, the whole function will start immediately but will NOT be awaited before the next code runs.
Checkout this example:
//utility: promise-based sleep
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
// named async function taking ~2 seconds
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
// demonstrates how a sync function "fires and forgets" async work
function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
AsyncFunction();
// what you have done in your code:
(async() => {
console.log('Async anonymus function started')
await wait(3000)
console.log('Async anonymus function ended')
})()
// note: this line runs before either async block above has finished
console.log('sync function ended.')
}
SyncFunction()
console.log('done')
Note the output:
Async named function started
Async anonymus function started
sync function ended. // => sync function already ended
done // sync function ended and code continues execution.
Async named function ended
Async anonymus function ended
To correctly await your async stuff you need to put your whole application in async scope:
//utility: promise-based sleep
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
// named async function taking ~2 seconds
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
// this is now async!! (the name is kept from the previous example even
// though the function is no longer synchronous)
async function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
await AsyncFunction();
// what you have done in your code:
await (async() => {
console.log('Async anonymus function started')
await wait(3000)
console.log('Async anonymus function ended')
})()
// now every async block above has completed before this line runs
console.log('sync function ended.')
}
SyncFunction().then(() => {
console.log('done')
}).catch(err => {
console.error('unexpected error occured..')
})
This output is what we want
sync function started
Async named function started
Async named function ended
Async anonymus function started
Async anonymus function ended
sync function ended.
done
Hope this helps you understand.
Feel free to leave a comment.

Puppeteer is blocked by the site https://nationsglory.fr/servers when it redirects to the requested page. How to pass this redirect?

Thank you to everyone who will look into my problem.
I was always able to scrape easily with puppeteer but not on this page https://nationsglory.fr/servers.
The latest version of puppeteer prevents me from going on the page, I get stuck at the redirection.
This is why I run my script under the version:
"puppeteer": "7.0.1",
"puppeteer-extra": "3.1.16",
"puppeteer-extra-plugin-stealth": "2.7.4"
With these versions, I manage from time to time to pass and I can scrape the data of this site.
But I wish it would work all the time and I don't know how to make it happen.
Here is my code:
// puppeteer-extra with the stealth plugin to reduce bot detection
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const { log } = require("npm-colorlog");
puppeteer.use(StealthPlugin());
// NOTE(review): declared but never assigned in the visible code — confirm usage
let countryInfos = "";
//true for hidden Chromium
//ancian ip, dos is useless
puppeteer.launch({
headless: false,
timeout: 10 * 60 * 1000,
args: [
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36",
"--no-sandbox",
"--disabe-infobars",
"--disable-setuid-sandbox",
"--window-position=0,0",
"--ignore-certificate-errors",
"--ignore-certificate-errors-spki-list"
],
}).then(async browser => {
console.log('✷ Running browser..')
const page = await browser.newPage();
// await page.setViewport({ width: 800, height: 600 })
await page.goto('https://nationsglory.fr/server/yellow/countries')
// the code gets stuck here because the redirection fails and the page refreshes in a loop
await page.waitForTimeout(500);
await page
.waitForSelector('.lead', { timeout: 10 * 60 * 1000 })
.then(() => console.log('load page!'));
await page.screenshot({ path: 'botTester.png' });
//i screen for show if i pass
const hrefs = await page.$$eval("tr > td > a", (list) => list.map((elm) => elm.href));
const links = [];
hrefs.forEach(hf => {
if (hf.startsWith('https://nationsglory.fr/country/yellow/') == true) {
links.push(hf)
}
});
const linkLength = links.length / 2;
for (let i = 0; i < linkLength; i++) {
let pay = links[i].substring(37, links[i].length)
await page.goto(links[i])
await page
.waitForSelector('.section-title', { timeout: 5 * 60 * 1000 })
.then(() => console.log('load country'));
await page.waitForTimeout(500);
page.waitForSelector('#bodymembers>tr>.pl-4 > a > div');
const claims = await page.evaluate(() => Array.from(document.querySelectorAll(".mb-2"), element => element.textContent));
const powers = await page.evaluate(() => Array.from(document.querySelectorAll(".col-md-3 > .mb-2"), element => element.textContent));
const members = await page.evaluate(() => Array.from(document.querySelectorAll(" >tr>.pl-4 > a > div"), element => element.textContent));
let level = claims[2];
let power = powers[1].split("/");
let claim = claims[4];
power = parseInt(power[0], 10)
claim = parseInt(claim, 10)
countryInfos += "n°" + i + pay + " → ♝ level: " + level + " → ♚ power: " + power + " → ♛ claim: " + claim + "\n" + "→ ♟Members: " + members ";
}
const fs = require('fs');
fs.writeFile('countryInfos.txt', countryInfos, function(err) {
if (err) throw err;
console.log('File created !');
});
await browser.close();
console.log("✨All done, check the console✨");
})
If you have any questions do not hesitate.
Thank you all for your help !

Print all pdf pages except the last one using Puppeteer

I need to generate different footer for the last pdf page. After investigation I realised that the best way is to generate two different pdfs and combine them. It works fine when I need to change footer or use different templates for first page, or in cases when I know which pages should looks different (using pageRanges option), but I can't find a way to get only last (last n) pages in case when total page number is unknown. Any ideas how I can generate pdf for only last (last n) pages?
Any answers will be appreciated.
I'm using Puppeteer v 2.1.0 with node.js v 8.16.0
This is a script which I'm using for generating pdf files now.
const puppeteer = require('puppeteer');
const fs = require('fs');
const DEFAULT_HEADER = '<span></span>';
const DEFAULT_FOOTER_HEIGHT = 90;
const DEFAULT_PAGE_PADDING = 36;
// Renders the HTML file given on the command line to an A4 PDF with an
// optional custom footer template.
// CLI: node script.js <bodyFilePath> <outputFilePath> [footerFilePath]
const createPdf = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const [, , bodyFilePath, outputFilePath, footerFilePath] = process.argv;
    // networkidle0: wait until the page (and its assets) have fully loaded.
    await page.goto(`file:${bodyFilePath}`, { waitUntil: 'networkidle0' });
    let footerTemplate = DEFAULT_HEADER;
    if (footerFilePath) {
      footerTemplate = fs.readFileSync(footerFilePath, 'utf8');
    }
    await page.pdf({
      path: outputFilePath,
      format: 'A4',
      margin: {
        top: DEFAULT_PAGE_PADDING,
        right: DEFAULT_PAGE_PADDING,
        bottom: DEFAULT_FOOTER_HEIGHT,
        left: DEFAULT_PAGE_PADDING,
      },
      printBackground: true,
      displayHeaderFooter: true,
      headerTemplate: DEFAULT_HEADER,
      footerTemplate,
    });
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) {
      // BUG FIX: close() returns a promise and was not awaited, so the
      // unconditional process.exit() below could kill Chromium before it
      // shut down cleanly.
      await browser.close();
    }
    process.exit();
  }
};
createPdf();
Templates which I'm converting to pdf are .html.erb files
Maybe there is a better way to solve this problem, but at this point I've used this approach - I'm generating the export using the same script as above, and then I'm using one more script which opens the previous pdf file, counts the pages and generates two new files (which I'm combining into one file on the backend) - all pages except the last one, and only the last page with a different footer.
const puppeteer = require('puppeteer');
const fs = require('fs');
const pdf = require('pdf-parse');
const DEFAULT_HEADER = '<span></span>';
const DEFAULT_FOOTER_HEIGHT = 90;
const DEFAULT_PAGE_PADDING = 36;
// Second pass: re-renders the HTML, counts pages in the previously generated
// PDF, then writes (a) all pages except the last with the regular footer and
// (b) the last page alone with its own footer, to be merged on the backend.
// CLI: node script.js <bodyFilePath> <outputFilePath> [footerFilePath]
//      [lastPagePath] [lastPageFooterPath]
const createPdf = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const [
      ,
      ,
      bodyFilePath,
      outputFilePath,
      footerFilePath,
      lastPagePath,
      lastPageFooterPath,
    ] = process.argv;
    await page.goto(`file:${bodyFilePath}`, { waitUntil: 'networkidle0' });
    let footerTemplate = DEFAULT_HEADER;
    let lastPageFooterTemplate = DEFAULT_HEADER;
    if (footerFilePath) {
      footerTemplate = fs.readFileSync(footerFilePath, 'utf8');
    }
    if (lastPageFooterPath) {
      lastPageFooterTemplate = fs.readFileSync(lastPageFooterPath, 'utf8');
    }
    // Count pages in the PDF produced by the first pass.
    const dataBuffer = fs.readFileSync(outputFilePath);
    const pdfInfo = await pdf(dataBuffer);
    const numPages = pdfInfo.numpages;
    const baseOptions = {
      path: outputFilePath,
      format: 'A4',
      margin: {
        top: DEFAULT_PAGE_PADDING,
        right: DEFAULT_PAGE_PADDING,
        bottom: DEFAULT_FOOTER_HEIGHT,
        left: DEFAULT_PAGE_PADDING,
      },
      printBackground: true,
      displayHeaderFooter: true,
      headerTemplate: DEFAULT_HEADER,
      pageRanges: `${numPages}`,
      footerTemplate: lastPageFooterTemplate,
    };
    if (numPages === 1) {
      // Single page: it IS the last page — use the last-page footer.
      await page.pdf(baseOptions);
    } else {
      // Pages 1..n-1 with the regular footer (overwrites outputFilePath).
      await page.pdf({
        ...baseOptions,
        footerTemplate,
        pageRanges: `-${numPages - 1}`,
      });
      // The last page alone, with its own footer, into a separate file.
      await page.pdf({
        ...baseOptions,
        path: lastPagePath,
        footerTemplate: lastPageFooterTemplate,
      });
    }
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) {
      // BUG FIX: close() returns a promise and was not awaited, so the
      // unconditional process.exit() below could kill Chromium before it
      // shut down cleanly.
      await browser.close();
    }
    process.exit();
  }
};
createPdf();
Hope this will be helpful for someone with same issue.

puppeteer await page.$$('.className'), but I get only the first 11 element with that class, why?

the code that I am using to scrape a student list:
// Grab every student row element currently present in the DOM.
// NOTE(review): $$ only returns elements that are already rendered — a
// framework that lazily mounts off-screen rows will yield only the first few.
let collection1 = await page.$$('div.layout-2DM8Md')
console.log("Student Online:")
for (let el of collection1) {
// Read the trimmed display name out of each row.
let name = await el.$eval(('div.name-uJV0GL'), node => node.innerText.trim());
console.log(name)
}
It's probably because the contents of the rest of those elements are loaded dynamically with a JavaScript framework like React or Vue. This means they only get loaded when those elements enter the viewport of the browser.
To fix this you will need to write a function that auto scrolls the page so that those elements can get into the viewport and then you have to wait for that function to finish before you collect the data.
The scrolling function:
// Scrolls the page to the bottom in 100px steps (one step every 30ms) so
// that lazily-rendered elements enter the viewport and get mounted before
// the content is collected.
const autoScroll = async (page) => {
  await page.evaluate(async () => {
    await new Promise((resolve) => {
      const step = 100;
      let scrolled = 0;
      const timer = setInterval(() => {
        const fullHeight = document.body.scrollHeight;
        window.scrollBy(0, step);
        scrolled += step;
        // Stop once we have covered the whole document height.
        if (scrolled >= fullHeight) {
          clearInterval(timer);
          resolve();
        }
      }, 30);
    });
  });
}
Then call this function after page.goto() and before you grab the content with page.content(). I also set the viewport width and height then the scrolling goes a little faster:
// Wait for the initial load event before sizing the viewport and scrolling.
await page.goto(url, {waitUntil: 'load'});
// A larger viewport covers more rows per scroll step, so scrolling finishes faster.
await page.setViewport({
width: 1200,
height: 800
});
await autoScroll(page); // The scroll function
const html = await page.content()

Resources