NodeJS scraper needing to increment page and re-run - node.js

I'm building a simple NodeJS web scraper, and I want to re-run the function like a 'for loop' until pageNum = totalNumberOfPages... im having a brain fart, and unable to re-run the function from inside itself, since it returns an array fragment and kills itself. Could someone help me overcome this obstacle? I'm pretty sure it's very simple.
I looked at this and this but didn't figure it out...
const cheerio = require("cheerio");
const axios = require("axios");
let pageNum = 0;
let siteUrl = "https://whatever.com?&page=" + pageNum + "&viewAll=true";
let productArray = [];
let vendor = [];
let productTitle = [];
let plantType = [];
let thcRange = [];
let cbdRange = [];
let price = [];
let totalNumberOfPages = undefined;
// called by getResults()
const fetchData = async () => {
const result = await axios.get(siteUrl);
return cheerio.load(result.data);
};
// this function is called from index.js
const getResults = async () => {
// >>>>>>>>>>>>>>>>>> HOW DO I RERUN FROM HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<
const $ = await fetchData();
// first check how many total pages there are
totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
// use fetched data to grab elements (and their text) and push into arrays defined above
$('.product-tile__vendor').each((index, element) => {
vendor.push($(element).text());
});
$('.product-tile__title').each((index, element) => {
productTitle.push($(element).text());
});
$('.product-tile__plant-type').each((index, element) => {
plantType.push($(element).text());
});
$('.product-tile__properties li:nth-child(2) p').each((index, element) => {
thcRange.push($(element).text());
});
$('.product-tile__properties li:nth-child(3) p').each((index, element) => {
cbdRange.push($(element).text());
});
$('.product-tile__price').each((index, element) => {
price.push($(element).text());
});
// increment page number to get more products if the page count is less than total number of pages
if (pageNum < totalNumberOfPages) {
pageNum ++;
};
//Convert to an array so that we can sort the results.
productArray.push ({
vendors: [...vendor],
productTitle: [...productTitle],
plantType: [...plantType],
thcRange: [...thcRange],
cbdRange: [...cbdRange],
price: [...price],
pageNum
});
// >>>>>>>>>>>>>>>>>> UNTIL HERE I THINK <<<<<<<<<<<<<<<<<<<<<<<<<<<
return productArray;
};
module.exports = getResults;

you can use recursion concept in your code:
which means the function itself will call itself
so what you can do is
const getResults = async () => {
// >>>>>>>>>>>>>>>>>> HOW DO I RERUN FROM HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<
const $ = await fetchData();
// first check how many total pages there are
totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
// use fetched data to grab elements (and their text) and push into arrays defined above
$('.product-tile__vendor').each((index, element) => {
vendor.push($(element).text());
});
$('.product-tile__title').each((index, element) => {
productTitle.push($(element).text());
});
$('.product-tile__plant-type').each((index, element) => {
plantType.push($(element).text());
});
$('.product-tile__properties li:nth-child(2) p').each((index, element) => {
thcRange.push($(element).text());
});
$('.product-tile__properties li:nth-child(3) p').each((index, element) => {
cbdRange.push($(element).text());
});
$('.product-tile__price').each((index, element) => {
price.push($(element).text());
});
// increment page number to get more products if the page count is less than total number of pages
if (pageNum < totalNumberOfPages) {
pageNum ++;
};
//Convert to an array so that we can sort the results.
productArray.push ({
vendors: [...vendor],
productTitle: [...productTitle],
plantType: [...plantType],
thcRange: [...thcRange],
cbdRange: [...cbdRange],
price: [...price],
pageNum
});
// >>>>>>>>>>>>>>>>>> UNTIL HERE I THINK <<<<<<<<<<<<<<<<<<<<<<<<<<<
if(pageNum >= totalNumberOfPages) getResults()
return productArray;
};

Related

how to use await instead of then in promise?

How to correctly resolve a Promise.all(...), I'm trying that after resolving the promise which generates a set of asynchronous requests (which are simple database queries in supabase-pg SQL) I'm iterating the result in a forEach , to make a new request with each of the results of the iterations.
But, try to save the result that it brings me in a new array, which prints fine in the console, but in the response that doesn't work. It comes empty, I understand that it is sending me the response before the promise is finished resolving, but I don't understand why.
In an answer to a previous question I was told to use await before the then, but I didn't quite understand how to do it.
What am I doing wrong?
export const getReportMonthly = async(req: Request & any, res: Response, next: NextFunction) => {
try {
let usersxData: UsersxModalidadxRolxJob[] = [];
let data_monthly: HoursActivityWeeklySummary[] = [];
let attendance_schedule: AttendanceSchedule[] = [];
let time_off_request: TimeOffRequestRpc[] = [];
let configs: IndicatorConfigs[] = [];
const supabaseService = new SupabaseService();
const promises = [
supabaseService.getSummaryWeekRpcWihoutFreelancers(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
data_monthly = dataFromDB as any;
}),
supabaseService.getUsersEntity(res).then(dataFromDB => {
usersxData = dataFromDB as any;
}),
supabaseService.getAttendaceScheduleRpc(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
attendance_schedule = dataFromDB as any;
}),
supabaseService.getTimeOffRequestRpc(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
time_off_request = dataFromDB as any;
}),
supabaseService.getConfigs(res).then(dataFromDB => {
configs = dataFromDB;
}),
];
let attendanceInMonthly = new Array();
await Promise.all(promises).then(() => {
attendance_schedule.forEach(element => {
let start_date = element.date_start.toString();
let end_date = element.date_end.toString();
supabaseService.getTrackedByDateAndIDArray(start_date, end_date).then(item => {
console.log(item);
attendanceInMonthly.push(item);
});
});
})
res.json(attendanceInMonthly)
} catch (error) {
console.log(error);
res.status(500).json({
title: 'API-CIT Error',
message: 'Internal server error'
});
}
If you await a promise you could write the return of this in a variable and work with this normaly.
So instead of your current code you could use the following changed code:
export const getReportMonthly = async(req: Request & any, res: Response, next: NextFunction) => {
try {
let usersxData: UsersxModalidadxRolxJob[] = [];
let data_monthly: HoursActivityWeeklySummary[] = [];
let attendance_schedule: AttendanceSchedule[] = [];
let time_off_request: TimeOffRequestRpc[] = [];
let configs: IndicatorConfigs[] = [];
const supabaseService = new SupabaseService();
const promises = [
supabaseService.getSummaryWeekRpcWihoutFreelancers(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
data_monthly = dataFromDB as any;
}),
supabaseService.getUsersEntity(res).then(dataFromDB => {
usersxData = dataFromDB as any;
}),
supabaseService.getAttendaceScheduleRpc(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
attendance_schedule = dataFromDB as any;
}),
supabaseService.getTimeOffRequestRpc(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
time_off_request = dataFromDB as any;
}),
supabaseService.getConfigs(res).then(dataFromDB => {
configs = dataFromDB;
}),
];
const resolvedPromises = await Promise.all(promises)
const attendanceInMonthly = await Promise.all(
resolvedPromises.map(
async (element) => {
let start_date = element.date_start.toString();
let end_date = element.date_end.toString();
return supabaseService.getTrackedByDateAndIDArray(start_date, end_date)
}
)
)
console.log(attendanceInMonthly) // this should be your finaly resolved promise
res.json(attendanceInMonthly)
} catch (error) {
console.log(error);
res.status(500).json({
title: 'API-CIT Error',
message: 'Internal server error'
});
}
Something like this should your code looks like. I am not sure if this solves exactly your code because your code has some syntax errors wich you have to solve for you.
If I understand correctly, you launch a few requests, among which one (getAttendaceScheduleRpc, which assigns attendance_schedule) is used to launch some extra requests again, and you need to wait for all of these (including the extra requests) before returning?
In that case, the immediate issue is that you perform your extra requests in "subqueries", but you do not wait for them.
A very simple solution would be to properly separate those 2 steps, somehow like in DerHerrGammler's answer, but using attendance_schedule instead of resolvedPromises as input for the 2nd step:
let attendanceInMonthly = new Array();
await Promise.all(promises);
await Promise.all(attendance_schedule.map(async (element) => {
let start_date = element.date_start.toString();
let end_date = element.date_end.toString();
const item = await supabaseService.getTrackedByDateAndIDArray(start_date, end_date);
console.log(item);
attendanceInMonthly.push(item);
});
res.json(attendanceInMonthly);
If you are really looking to fine tune your performance, you could take advantage of the fact that your extra requests depend only on the result of one of your initial requests (getAttendaceScheduleRpc), so you could launch them as soon as the latter is fullfilled, instead of waiting for all the promises of the 1st step:
let attendance_schedule: AttendanceSchedule[] = [];
let attendanceInMonthly = new Array();
const promises = [
supabaseService.getAttendaceScheduleRpc(req.query.fecha_inicio, req.query.fecha_final).then(dataFromDB => {
attendance_schedule = dataFromDB as any;
// Immediately launch your extra (2nd step) requests, without waiting for other 1st step requests
// Make sure to return when all new extra requests are done, or a Promise
// that fullfills when so.
return Promise.all(attendance_schedule.map(async (element) => {
let start_date = element.date_start.toString();
let end_date = element.date_end.toString();
const item = await supabaseService.getTrackedByDateAndIDArray(start_date, end_date);
console.log(item);
attendanceInMonthly.push(item);
});
}),
// etc. for the rest of 1st step requests
];
await Promise.all(promises);
res.json(attendanceInMonthly);

Puppeteer create PDF files from HTML data hangs Windows 10 system

I created an App that processes students results by extracting data from multiple excel workbooks. The problem is that using Puppeteer to generate the PDF files, throws the system into a loop till it hangs the system.
Actually, I have tested same codes below using PhantomJs which is bundled as pdf-creator-node, and was able to generate 150 PDF files comfortably in 3 minutes. The only challenge I dumped PhantomJs is that all the styling in the CSS file was not included, even when I inserted it as an inline style in the header, suing replace function of JS. Another, is that PhantomJs is no longer in active development. I searched the web, and found out that only Puppeteer is the valid solution with active development and support too.
I tried using page.close() at the end of pdfCreator() which is in a loop, and browser.close() at the end of pdfGenerator(). What I am doing wrong?
Here below are the codes in the server.js and PdfGenerator.js files, with a sample of the ERROR, and screenshot of my Task Manager after the system crawled out of hanging state. For HTML generation, I used Mustache. I excluded some lines of codes in server.js because the total character count was over 60k.
server.js
// [codes were removed here]
if(getCode == 'compute-result') {
// declare variable
let setData = null;
let setTitle = 'Results Computation...';
let setArgs = getArgs;
// dataFromFile = ReadFile(pathCodeTextFile);
// setArgs = Number(dataFromFile);
setCode = 'compute-result';
let setView = [];
let setNext = true;
let countTerms = [];
// if(getArg > 0) {
// Final Result computation
const getJson = ReadFile(pathJsonResults);
// const getCtrl = ReadFile(pathJsonCtrl);
const getResultObject = JSON.parse(getJson);
getResult = getResultObject;
const totalResults = getResult.firstTerm.length + getResult.secondTerm.length + getResult.thirdTerm.length;
if(setView.length < 1 && getResult != null) {
setData = 'PDFs for Students Results initiating...';
setView.unshift('Reading saved data...');
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: null, view: JSON.stringify(setView)});
}
Sleep(2000).then(() => {
if(getResult != null) {
setData = 'Students Results will be ready in a moment';
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
const wacthFiles = (file, className, termName, sessionName, completed, pdfList) => {
try {
if(typeof file == 'string' && !FileExists(pathJsonPdfList)) {
if(pdfList.length < 2){
setData = 'Saving PDFs to downladable files...';
}
if(className != null && termName != null && sessionName != null) {
setTitle = `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}...`;
setView.unshift(file);
if(!countTerms.includes(termName)) {
countTerms.push(termName)
}
// setCode = -1000 - pdfList.length;
// console.log('PDF PROGRESS: ', `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}... ${setCode}`);
// when all PDFs are created
if(completed) {
setTitle = setTitle.replace('...', ' [completed]');
setData = 'Result Download button is Active. You may click it now.';
setView.unshift('=== PDF GENERATION COMPLETED ===');
setView.unshift(`A total of ${pdfList.length} students' Results were generated`);
WriteFile(pathJsonPdfList, JSON.stringify(pdfList));
// set donwload button active
setCode = Number(codeTextFilePdfCompleted);
setNext = false;
getResult = null;
let termString = countTerms.toString();
termString = ReplaceAll(termString, '-term', '');
termString = ReplaceAll(termString, ',', '-');
const addTxt = `${className} _${termString} Term${countTerms.length>1?'s':''} (${sessionName})`;
WriteFile(pathCodeTextFile, addTxt);
// console.log('======== PDF GENERATION ENDS ================');
} else {
setCode = -1 * pdfList.length;
}
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
}
} catch (error) {
console.log('ERROR ON WATCHER: ', error);
}
}
if(!FileExists(pathJsonPdfList) && getResult !== null) {
PdfGenerator(getResult, wacthFiles);
}
// Watcher(pathWatchResults, setCode, wacthDir, 10000);
});
// }
}
}
} catch (error) {
})
client.on('disconnect', () => {
console.log('SERVER: Disconnected');
});
server.listen(portApi, () =>{
console.log('Server listens on port 8881')
});
// serve static files
app.use(express.static(pathPublic));
// [codes were removed here]
PdfGenerator.js
The problem lies in these functions: PdfGenerator & createPdf
'use strict';
process.setMaxListeners(Infinity) // fix for Puppeteer MaxListenerExceededWarning
const Puppeteer = require('puppeteer')
const {HtmlGenerator} = require('../components/HtmlGenerator')
const {WriteFile, FileExists, RandomNumber, RoundNumber, IsNumberFraction, ReadFile} = require('../components/Functions')
if (process.env.NODE_ENV !== 'production') {
require('dotenv').config();
}
const pathFirstTermResults = process.env.DIR_FIRST_TERM_RESULTS;
const pathSecondTermResults = process.env.DIR_SECOND_TERM_RESULTS;
const pathThirdTermResults = process.env.DIR_THIRD_TERM_RESULTS;
const publicDir = process.env.DIR_PUBLIC;
const cssFile = process.env.PATH_CSS_FILENAME;
const pathCssRaw = __dirname + '\\' + publicDir + '\\' + cssFile;
const pathCss = pathCssRaw.replace(`\\uploads`, '');
const tagCssReplace = process.env.TAG_CSS_REPLACE;
let jsonDir = process.env.PATH_JSON;
jsonDir = jsonDir.split('/').pop();
let htmlDir = process.env.DIR_HTML;
htmlDir = __dirname + '\\' + htmlDir.split('/').pop();
const htmlType1 = htmlDir + '\\' + process.env.HTML_TYPE1;
const htmlType2 = htmlDir + '\\' + process.env.HTML_TYPE2;
const htmlType3 = htmlDir + '\\' + process.env.HTML_TYPE3;
const pathJsonPdfList = './' + jsonDir + '/' + process.env.JSON_PDF_LIST_FILENAME;
const pathJsonPdfContent = __dirname + '\\' + jsonDir + '\\' + process.env.JSON_PDF_CONTENT;
const firstTermDir = 'first-term';
const secondTermDir = 'second-term';
const thirdTermDir = 'third-term';
let cumulativeFirstTermTotalList = {};
let cumulativeSecondTermTotalList = {};
let firstTermOnce = true;
let secondTermOnce = true;
let thirdTermOnce = true;
let isActive = false;
const getPath = (p, f) => {
let dir = pathFirstTermResults;
switch (p) {
case firstTermDir:
dir = pathFirstTermResults;
break;
case secondTermDir:
dir = pathSecondTermResults;
break;
case thirdTermDir:
dir = pathThirdTermResults;
break;
default:
break;
}
return dir + f
}
const resolution = {
x: 1920,
y: 1080
}
const args = [
'--disable-gpu',
`--window-size=${resolution.x},${resolution.y}`,
'--no-sandbox',
]
const createPdf = (page, content, templateType, filename, className, term, sessionName, isProcessActive, pdfFileList, cb) => {
let path, document, options;
path = getPath(term, filename);
if(path != null) {
let options = {
path: path,
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
let templateData = '';
switch (templateType) {
case '1':
templateData = ReadFile(htmlType1);
break;
case '2':
templateData = ReadFile(htmlType2);
break;
case '3':
templateData = ReadFile(htmlType3);
break;
default:
templateData = ReadFile(htmlType1);
break;
}
(async() => {
const html = HtmlGenerator(content, templateData);
if(html != undefined && html !== '' && html != null) {
// create PDF file
cb(filename, className, term, sessionName, isProcessActive, pdfFileList);
// get style from .css & replace
const css = ReadFile(pathCss);
await page.setContent(html, { waitUntil: 'networkidle0'});
await page.addStyleTag(css);
await page.pdf(options);
page.close();
}
})()
}
}
const pdfGenerator = (json, cb) => {
let data = {};
let pdfFileList = [];
if(typeof json == 'string') {
data = JSON.parse(json)
} else {
data = json;
}
try {
// declare defaults
let filename = 'Student' + '.pdf';
let termName = firstTermDir;
const templateType = data.keys.templateType;
const session = data.classInfo.Session;
const sessionName = session.replace('/', '-');
const students = data.students;
const className = data.classInfo.Class_Name;
const recordFirstTerm = data.firstTerm;
const recordSecondTerm = data.secondTerm;
const recordThirdTerm = data.thirdTerm;
let pdfCreatedList = [];
let isReset = false;
let totalResultsExpected = Object.keys(recordFirstTerm).length + Object.keys(recordSecondTerm).length + Object.keys(recordThirdTerm).length;
let totalResultsCount = 0;
let jsonForPdf = {};
let record = {};
let sRecord, path, id, fName, lName;
// get each student
let logEndOnce = true;
let logBeforeOnce = true;
logBeforeOnce && console.log('============== *** ================');
logBeforeOnce && console.log('======== PDF GENERATION BEGINS ================');
const computeResult = (page, setTerm, setRecord, setReset) => {
const termName = setTerm;
const record = setRecord;
let isReset = setReset;
logBeforeOnce && console.log(`====== ${termName} RESULTS BEGINS ======`);
for(let elem of students){
id = elem.id;
fName = elem.firstName;
lName = elem.lastName;
filename = `${lName} ${fName} _${termName} ${sessionName}.pdf`;
// sRecord = record.filter(function (entry) { return entry[id] !== undefined; });
sRecord = record[id];
path = getPath(termName, filename);
// create pdf
if(!FileExists(path) && !FileExists(pathJsonPdfList)){
// generate final JSON for the student
// isReset = (pdfCreatedList.includes(id))? false: true;
jsonForPdf = finalJson(elem, sRecord, data, termName);
(pdfFileList.length < 1) && WriteFile(pathJsonPdfContent, JSON.stringify(jsonForPdf));
pdfFileList.push({
'term': termName,
'file': filename
});
totalResultsCount = pdfFileList.length;
const pdfDate = new Date();
console.log(`${filename} (${totalResultsCount}/${totalResultsExpected}) at ${pdfDate.getHours()}hr${pdfDate.getHours()>1?'s':''} - ${pdfDate.getMinutes()}min${pdfDate.getMinutes()>1?'s':''} - ${pdfDate.getSeconds()}sec${pdfDate.getSeconds()>1?'s':''}`);
isActive = (totalResultsExpected === totalResultsCount)? true: false;
logEndOnce = false;
// cb(filename, className, termName, sessionName, isActive, pdfFileList);
// WriteFile(path, null);
isReset = true;
createPdf(page, jsonForPdf, templateType, filename, className, termName, sessionName, isActive, pdfFileList, cb);
}
}
logBeforeOnce && console.log(`====== ${termName} RESULTS ENDS ======`);
}
// get each student result for First Term
const computeFirstTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.firstTerm === '1') {
termName = firstTermDir;
record = recordFirstTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Second Term
const computeSecondTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.secondTerm === '1') {
termName = secondTermDir;
record = recordSecondTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Third Term
const computeThirdTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.thirdTerm === '1') {
termName = thirdTermDir;
record = recordThirdTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
if(totalResultsExpected === totalResultsCount && totalResultsCount !== 0 && !logEndOnce) {
logEndOnce = true;
logBeforeOnce = false;
console.log('======== PDF GENERATION ENDS ================');
}
} catch (error) {
console.log('==== ERROR IN PDF GENERATION: ', error)
}
}
module.exports = {
PdfGenerator: pdfGenerator
}
ERROR
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
lerna ERR! yarn run start stderr:
<--- Last few GCs --->
[9884:000002D68A73C6B0] 1665171 ms: Scavenge 44.1 (45.8) -> 43.2 (45.8) MB, 223.9 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1684089 ms: Scavenge 44.1 (45.8) -> 43.3 (45.8) MB, 587.3 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1749901 ms: Scavenge 44.2 (45.8) -> 43.3 (45.8) MB, 5099.0 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
<--- JS stacktrace --->
FATAL ERROR: Committing semi space failed. Allocation failed - JavaScript heap out of memory
1: 00007FF6ED61013F
2: 00007FF6ED59F396
3: 00007FF6ED5A024D
4: 00007FF6EDED19EE
5: 00007FF6EDEBBECD
6: 00007FF6EDD5F61C
7: 00007FF6EDD6933F
8: 00007FF6EDD5BF19
9: 00007FF6EDD5A0D0
10: 00007FF6EDD7EA06
11: 00007FF6EDAB1CD5
12: 00007FF6EDF5F3E1
13: 00007FF6EDF602E9
14: 000002D68C4EF69E
error Command failed with exit code 134.
Screenshot of Task Manager, Chromium running multiple instances of over 50.
I appreciate any help. I hope this can be resolved to give me a smooth PDF generation.
Thank you.
Example solution (limiting parallel browsers)
I created you a PdfPrinter class which you can integrate into your setup. It allows you to limit the amount of parallel pdf generation jobs and allows setting a limit and manages opening/closing the browser for you. The PdfPrinter class is also highly coupled and needed some modification for using it as a general queue. Logicwise this can be modified to be a general queue.
You can try to integrate that into your code. This is a fully working test example with simplified pdfs (without the part of getting the actual data from the excel..)
As far as I understood your code, you do not need to pass the page around all your functions. First create your html + css and then use the pdfPrinter and let it handle page creation + browser launching..
(I like to code stuff like this so I went straight ahead..)
var puppeteer = require('puppeteer')
const defaultPrinterOptions = {
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
class PdfPrinter {
maxBrowsers = 2
enqueuedPrintJobs = []
failedJobs = []
browserInstances = 0
// max browser instances in parallel
constructor(maxBrowsers) {
this.maxBrowsers = maxBrowsers
}
/**
*
* #param {*} html the html content to print
* #param {*} css to apply to the page
* #param {*} printOptions options passed to puppeteer
*/
// enqueues a print but the exact end moment cannot be known..
enqueuePrint = (html, css, path, done) => {
// merge custom options with defaultOptions..
const printOptions = {
...defaultPrinterOptions,
// add the path to the options.
path: path
}
// create a function which can be stored in an array
// it will later be grabbed by startPrinter() OR at the time any
// brwoser freed up..
// the function needs to be passed the actual used browser instance!
this.enqueuedPrintJobs.push(async(browser) => {
// catch the error which may be produced when printing something..
try {
// print the document
await this.print(browser, html, css, printOptions)
} catch (err) {
console.error('error when printing document..CLosing browser and starting a new job!!', printOptions.path)
console.error(err)
// store someting so you now what failed and coudl be retried or something..
this.failedJobs.push({ html, css, path: printOptions.path })
// puppeteer can run into erros too!!
// so close the browser and launch a new one!
await this.closeBrowser(browser)
browser = await this.launchBrowser()
}
// after the print, call done() so the promise is resovled in the right moment when
// this particular print has ended.!
done()
// start the next job right now if there are any left.
const job = this.enqueuedPrintJobs.shift()
if (!job) {
console.log('No print jobs available anymore. CLosing this browser instance.. Remaining browsers now:', this.maxBrowsers - this.browserInstances + 1)
await this.closeBrowser(browser)
return
}
// job is actually this function itself! It will be executed
// and automatically grab a new job after completion :)
// we pass the same browser instance to the next job!.
await job(browser)
})
// whenever a print job added make sure to start the printer
// this starts new browser instances if the limit is not exceeded resp. if no browser is instantiated yet,
// and does nothing if maximum browser count is reached..
this.tryStartPrinter()
}
// same as enqueuePrint except it wraps it in a promise so we can now the
// exact end moment and await it..
enqueuePrintPromise(html, css, path) {
return new Promise((resolve, reject) => {
try {
this.enqueuePrint(html, css, path, resolve)
} catch (err) {
console.error('unexpected error when setting up print job..', err)
reject(err)
}
})
}
// If browser instance limit is not reached will isntantiate a new one and run a print job with it.
// a print job will automatically grab a next job with the created browser if there are any left.
tryStartPrinter = async() => {
// Max browser count in use OR no jobs left.
if (this.browserInstances >= this.maxBrowsers || this.enqueuedPrintJobs.length === 0) {
return
}
// browser instances available!
// create a new one
console.log('launching new browser. Available after launch:', this.maxBrowsers - this.browserInstances - 1)
const browser = await this.launchBrowser()
// run job
const job = this.enqueuedPrintJobs.shift()
await job(browser)
}
closeBrowser = async(browser) => {
// decrement browsers in use!
// important to call before closing browser!!
this.browserInstances--
await browser.close()
}
launchBrowser = async() => {
// increment browsers in use!
// important to increase before actualy launching (async stuff..)
this.browserInstances++
// this code you have to adjust according your enviromnemt..
const browser = await puppeteer.launch({ headless: true })
return browser
}
// The actual print function which creates a pdf.
print = async(browser, html, css, printOptions) => {
console.log('Converting page to pdf. path:', printOptions.path)
// Run pdf creation in seperate page.
const page = await browser.newPage()
await page.setContent(html, { waitUntil: 'networkidle0' });
await page.addStyleTag({ content: css });
await page.pdf(printOptions);
await page.close();
}
}
// testing the PDFPrinter with some jobs.
// make sure to run the printer in an `async` function so u can
// use await...
const testPrinterQueue = async() => {
// config
const maxOpenedBrowsers = 5 // amount of browser instances which are allowed to be opened in parallel
const testJobCount = 100 // amount of test pdf jobs to be created
const destDir = 'C:\\somepath' // the directory to store the pdfs in..
// create sample jobs for testing...
const jobs = []
for (let i = 0; i < testJobCount; i++) {
jobs.push({
html: `<h1>job number [${i}]</h1>`,
css: 'h1 { background-color: red; }',
path: require('path').join(destDir, `pdf_${i}.pdf`)
})
}
// track time
const label = 'printed a total of ' + testJobCount + ' pdfs!'
console.time(label)
// run the actual pdf generation..
const printer = new PdfPrinter(maxOpenedBrowsers)
const jobProms = []
for (let job of jobs) {
// run jobs in parallel. Each job wil be runned async and return a Promise therefor
jobProms.push(
printer.enqueuePrintPromise(job.html, job.css, job.path)
)
}
console.log('All jobs enqueued!! Wating for finish now.')
// helper function which awaits all the print jobs, resp. an array of promises.
await Promise.all(jobProms)
console.timeEnd(label)
// failed jobs::
console.log('jobs failed:', printer.failedJobs)
// as file:
await require('fs').promises.writeFile('failed-jobs.json', JSON.stringify(printer.failedJobs))
}
testPrinterQueue().then(() => {
console.log('done with everyting..')
}).catch(err => {
console.error('unexpected error occured while printing all pages...', err)
})
You only need to adjust the destDir / openedBrowsers and testJobCount vars in the beginning of testPrinterQueue() for getting this to work.
What caused the problem in your code
Let's have a look at this piece
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
You created an anonymous function which is executed immediatly. Within the function all the statements are correctly awaited using await. But if you run this whole piece within a synchronious part of your application, the whole function will start immediatly but NOT been awaited before running next code.
Checkout this example:
//utility
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
AsyncFunction();
// what you have done in your code:
(async() => {
console.log('Async anonymus function started')
await wait(3000)
console.log('Async anonymus function ended')
})()
// what
console.log('sync function ended.')
}
SyncFunction()
console.log('done')
Note the output:
Async named function started
Async anonymus function started
sync function ended. // => sync function already ended
done // sync function ended and code continues execution.
Async named function ended
Async anonymus function ended
To correctly await your async stuff you need to put your whole application in async scope:
//utility
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
// this is now async!!
async function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
await AsyncFunction();
// what you have done in your code:
await (async() => {
console.log('Async anonymus function started')
await wait(3000)
console.log('Async anonymus function ended')
})()
// what
console.log('sync function ended.')
}
SyncFunction().then(() => {
console.log('done')
}).catch(err => {
console.error('unexpected error occured..')
})
This output is what we want
sync function started
Async named function started
Async named function ended
Async anonymus function started
Async anonymus function ended
sync function ended.
done
Hope this helps you understand.
Feel free to leave a comment.

fetch an array of urls not recieving all data

I have an array of urls, I want to fetch every element of that array in order, but when I run the code I got the incorrect order, and some elements is missing
const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var chocolateList = [];
urlArray = [...] // 95 elements
urlArray.forEach((url, index) => {
fetch(url).then(res => res.text())
.then(async (success) => {
const dom = new JSDOM(success); // convert response into DOM
dom.window.document.querySelectorAll('h1')
.forEach((htmlH1, i) => { // get all h1 html tags
if (htmlH1.includes('chocolate')) {
chocolateList.push({name:htmlH1});
}
});
console.log(chocolateList)
})
});
```
outputs an array of approximately 20 elements instead of 95 elements
what im doing wrong?
Try this code, it works. You made a few mistakes in the above, I explained it in the code comments section
const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var chocolateList = [];
urlArray = ['https://www.example.com', 'https://www.example2.com'];
urlArray.forEach((url) => {
fetch(url).then(res => res.text())
.then(async (success) => {
const dom = new JSDOM(success);
//h1 tags array length
let len = dom.window.document.querySelectorAll('h1').length;
//h1 tags array
let h1Arr = dom.window.document.querySelectorAll('h1');
// Check for a length of an h1 tags array
if (len) {
h1Arr.forEach((htmlH1, i) => {
let h1Text = htmlH1.textContent;
//The content that you are recieving is with lot of new-line and carriage return characters.
//So, you should always sanitize the data before proceeding, and covert it to lower case as includes search is a case sensitive search
h1Text = h1Text.replace(/[\r\n]/g, '').trim().toLowerCase();
if (h1Text.includes('chocolate')) {
chocolateList.push({ name: htmlH1 });
}
});
} else {
console.log('No h1 element present on DOM');
}
console.log(chocolateList);
})
});

Access API through GraphQL - loop through multiple pages with Promise.all

I am able to fetch data from the API, and I can loop through the multiple pages of data if there are multiple pages. However, to speed it up, I would like to try and retrieve multiple pages at a time.
But I can't get this code to work.
async people() {
// https://swapi.co/api/people/
// The API returns 10 items per page
const perPage = 10
let promises = []
// Start with an empty array and add the results from each API call
let allResults = []
// Get first page and total number of pages
// based on total number of results (data.count)
let people = await fetch(`https://swapi.co/api/people`)
let data = await people.json()
const totalPages = Math.ceil(data.count / perPage)
// Add results to array
allResults = allResults.concat(data.results)
// If the total results is greater than the results per page,
// get the rest of the results and add to the aLLResults array
if (data.count > perPage) {
for (let page = 2; page <= totalPages; page++) {
promises.push(
new Promise((resolve, reject) => {
people = fetch(`https://swapi.co/api/people/?page=${page}`).
then(response => {
data = response.json()
},
response => {
allResults = allResults.concat(response.results)
}
)
})
)
}
return Promise.all(promises)
}
return allResults
},
then(response => {
data = response.json()
},
response => {
allResults = allResults.concat(response.results)
}
In this section, the allResults = allResults.concat(response.results) is only triggered as the error-handling callback, equivalent to a .catch((err) => { ... }). Is that what you intend?
`

Complex query on more than one child in cloud functions

Here is the structure of my firebase database:
/UserData
/DeviceMgmt
/Counters
/NumberOfAll:
/NumberOfSelected
/TotalDownloaded
...
/Devices
/pushId1
/uid
/signOutTime
/toSelect=true (optional)
/downloaded
/lastDownload
/pushId2
/pushId3
...
And this is my cloud function:
exports.markDevicesForDownload = functions.database.ref('/UserData/DeviceMgmt/Counters/NumberOfSelected').onUpdate( (change) => {
const changeRef = change.after.ref;
const deviceMgmtRef = changeRef.parent.parent; // /UserData/DeviceMgmt
if (change.after.val() === 0 ) { //NumberOfSelected gets 0 value
return deviceMgmtRef.once('value')
.then((snap) => {
const devicesRef = snap.child('Devices').ref;
var average;
var numberOfAllDevices;
var totalDownloaded;
numberOfAllDevices = snap.child('Counters/NumberOfAll').val();
totalDownloaded = snap.child('Counters/TotalDownloaded').val();
average = Math.round(totalDownloaded/numberOfAllDevices);
return devicesRef
.orderByChild('signOutTime')
.equalTo(0)
.once('value',(devices) => {
return devices.ref
.orderByChild('downloaded')
.endAt(average)
.once('value',(devices) => {
devices.forEach((device) => {
device.child('toSelect').ref.set(true);
});
});
});
});
} else {
return false;
}
});
The function triggers when the counter NumberOfSelected = 0;
This happens when under any of device pushId there is no child toSelect. Then the query on downloaded child makes all devices with downloaded less than average set toSelect=true.
I wanted to limit the devices only to those which have signOutTime equal 0.
Somehow that query does not work and all devices are considered.
What I did wrong???
I would push all async tasks into a promise array and then return them all when all tasks complete:
exports.markDevicesForDownload = functions.database.ref('/UserData/DeviceMgmt/Counters/NumberOfSelected').onUpdate((change) => {
const changeRef = change.after.ref;
const deviceMgmtRef = changeRef.parent.parent; // /UserData/DeviceMgmt
if (change.after.val() === 0) { //NumberOfSelected gets 0 value
return deviceMgmtRef.once('value')
.then((snap) => {
const promises = [];
const devicesRef = snap.child('Devices').ref;
var average;
var numberOfAllDevices;
var totalDownloaded;
numberOfAllDevices = snap.child('Counters/NumberOfAll').val();
totalDownloaded = snap.child('Counters/TotalDownloaded').val();
average = Math.round(totalDownloaded / numberOfAllDevices);
const dR = devicesRef
.orderByChild('signOutTime')
.equalTo(0)
.once('value', (devices) => {
const dW = devices.ref
.orderByChild('downloaded')
.endAt(average)
.once('value', (devices) => {
devices.forEach((device) => {
if (device.child("signOutTime").val() === 0){
promises.push(device.child('toSelect').ref.set(true));
}
});
});
promises.push(dW);
});
promises.push(dR);
return Promise.all(promises);
});
} else {
return false;
}
});

Resources