I created an app that processes students' results by extracting data from multiple Excel workbooks. The problem is that using Puppeteer to generate the PDF files throws the system into a loop until it hangs.
Actually, I have tested the same code below using PhantomJS (bundled as pdf-creator-node) and was able to generate 150 PDF files comfortably in 3 minutes. One reason I dumped PhantomJS is that none of the styling in the CSS file was included, even when I inserted it as an inline style in the header using the JS replace function. Another is that PhantomJS is no longer in active development. I searched the web and found that Puppeteer is the only solution with active development and support.
I tried using page.close() at the end of createPdf(), which is called in a loop, and browser.close() at the end of pdfGenerator(). What am I doing wrong?
Below is the code from the server.js and PdfGenerator.js files, with a sample of the ERROR and a screenshot of my Task Manager after the system crawled out of its hanging state. For HTML generation, I used Mustache. I excluded some lines of code in server.js because the total character count was over 60k.
server.js
// [codes were removed here]
if(getCode == 'compute-result') {
// declare variable
let setData = null;
let setTitle = 'Results Computation...';
let setArgs = getArgs;
// dataFromFile = ReadFile(pathCodeTextFile);
// setArgs = Number(dataFromFile);
setCode = 'compute-result';
let setView = [];
let setNext = true;
let countTerms = [];
// if(getArg > 0) {
// Final Result computation
const getJson = ReadFile(pathJsonResults);
// const getCtrl = ReadFile(pathJsonCtrl);
const getResultObject = JSON.parse(getJson);
getResult = getResultObject;
const totalResults = getResult.firstTerm.length + getResult.secondTerm.length + getResult.thirdTerm.length;
if(setView.length < 1 && getResult != null) {
setData = 'PDFs for Students Results initiating...';
setView.unshift('Reading saved data...');
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: null, view: JSON.stringify(setView)});
}
Sleep(2000).then(() => {
if(getResult != null) {
setData = 'Students Results will be ready in a moment';
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
const watchFiles = (file, className, termName, sessionName, completed, pdfList) => {
try {
if(typeof file == 'string' && !FileExists(pathJsonPdfList)) {
if(pdfList.length < 2){
setData = 'Saving PDFs to downloadable files...';
}
if(className != null && termName != null && sessionName != null) {
setTitle = `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}...`;
setView.unshift(file);
if(!countTerms.includes(termName)) {
countTerms.push(termName)
}
// setCode = -1000 - pdfList.length;
// console.log('PDF PROGRESS: ', `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}... ${setCode}`);
// when all PDFs are created
if(completed) {
setTitle = setTitle.replace('...', ' [completed]');
setData = 'Result Download button is Active. You may click it now.';
setView.unshift('=== PDF GENERATION COMPLETED ===');
setView.unshift(`A total of ${pdfList.length} students' Results were generated`);
WriteFile(pathJsonPdfList, JSON.stringify(pdfList));
// set download button active
setCode = Number(codeTextFilePdfCompleted);
setNext = false;
getResult = null;
let termString = countTerms.toString();
termString = ReplaceAll(termString, '-term', '');
termString = ReplaceAll(termString, ',', '-');
const addTxt = `${className} _${termString} Term${countTerms.length>1?'s':''} (${sessionName})`;
WriteFile(pathCodeTextFile, addTxt);
// console.log('======== PDF GENERATION ENDS ================');
} else {
setCode = -1 * pdfList.length;
}
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
}
} catch (error) {
console.log('ERROR ON WATCHER: ', error);
}
}
if(!FileExists(pathJsonPdfList) && getResult !== null) {
PdfGenerator(getResult, watchFiles);
}
// Watcher(pathWatchResults, setCode, wacthDir, 10000);
});
// }
}
}
} catch (error) {
console.log('SERVER ERROR: ', error);
}
})
client.on('disconnect', () => {
console.log('SERVER: Disconnected');
});
server.listen(portApi, () =>{
console.log(`Server listens on port ${portApi}`)
});
// serve static files
app.use(express.static(pathPublic));
// [codes were removed here]
PdfGenerator.js
The problem lies in these functions: PdfGenerator & createPdf
'use strict';
process.setMaxListeners(Infinity) // fix for Puppeteer MaxListenersExceededWarning
const Puppeteer = require('puppeteer')
const {HtmlGenerator} = require('../components/HtmlGenerator')
const {WriteFile, FileExists, RandomNumber, RoundNumber, IsNumberFraction, ReadFile} = require('../components/Functions')
if (process.env.NODE_ENV !== 'production') {
require('dotenv').config();
}
const pathFirstTermResults = process.env.DIR_FIRST_TERM_RESULTS;
const pathSecondTermResults = process.env.DIR_SECOND_TERM_RESULTS;
const pathThirdTermResults = process.env.DIR_THIRD_TERM_RESULTS;
const publicDir = process.env.DIR_PUBLIC;
const cssFile = process.env.PATH_CSS_FILENAME;
const pathCssRaw = __dirname + '\\' + publicDir + '\\' + cssFile;
const pathCss = pathCssRaw.replace(`\\uploads`, '');
const tagCssReplace = process.env.TAG_CSS_REPLACE;
let jsonDir = process.env.PATH_JSON;
jsonDir = jsonDir.split('/').pop();
let htmlDir = process.env.DIR_HTML;
htmlDir = __dirname + '\\' + htmlDir.split('/').pop();
const htmlType1 = htmlDir + '\\' + process.env.HTML_TYPE1;
const htmlType2 = htmlDir + '\\' + process.env.HTML_TYPE2;
const htmlType3 = htmlDir + '\\' + process.env.HTML_TYPE3;
const pathJsonPdfList = './' + jsonDir + '/' + process.env.JSON_PDF_LIST_FILENAME;
const pathJsonPdfContent = __dirname + '\\' + jsonDir + '\\' + process.env.JSON_PDF_CONTENT;
const firstTermDir = 'first-term';
const secondTermDir = 'second-term';
const thirdTermDir = 'third-term';
let cumulativeFirstTermTotalList = {};
let cumulativeSecondTermTotalList = {};
let firstTermOnce = true;
let secondTermOnce = true;
let thirdTermOnce = true;
let isActive = false;
const getPath = (p, f) => {
let dir = pathFirstTermResults;
switch (p) {
case firstTermDir:
dir = pathFirstTermResults;
break;
case secondTermDir:
dir = pathSecondTermResults;
break;
case thirdTermDir:
dir = pathThirdTermResults;
break;
default:
break;
}
return dir + f
}
const resolution = {
x: 1920,
y: 1080
}
const args = [
'--disable-gpu',
`--window-size=${resolution.x},${resolution.y}`,
'--no-sandbox',
]
const createPdf = (page, content, templateType, filename, className, term, sessionName, isProcessActive, pdfFileList, cb) => {
let path, document, options;
path = getPath(term, filename);
if(path != null) {
let options = {
path: path,
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
let templateData = '';
switch (templateType) {
case '1':
templateData = ReadFile(htmlType1);
break;
case '2':
templateData = ReadFile(htmlType2);
break;
case '3':
templateData = ReadFile(htmlType3);
break;
default:
templateData = ReadFile(htmlType1);
break;
}
(async() => {
const html = HtmlGenerator(content, templateData);
if(html != undefined && html !== '' && html != null) {
// create PDF file
cb(filename, className, term, sessionName, isProcessActive, pdfFileList);
// get style from .css & replace
const css = ReadFile(pathCss);
await page.setContent(html, { waitUntil: 'networkidle0'});
await page.addStyleTag(css);
await page.pdf(options);
page.close();
}
})()
}
}
const pdfGenerator = (json, cb) => {
let data = {};
let pdfFileList = [];
if(typeof json == 'string') {
data = JSON.parse(json)
} else {
data = json;
}
try {
// declare defaults
let filename = 'Student' + '.pdf';
let termName = firstTermDir;
const templateType = data.keys.templateType;
const session = data.classInfo.Session;
const sessionName = session.replace('/', '-');
const students = data.students;
const className = data.classInfo.Class_Name;
const recordFirstTerm = data.firstTerm;
const recordSecondTerm = data.secondTerm;
const recordThirdTerm = data.thirdTerm;
let pdfCreatedList = [];
let isReset = false;
let totalResultsExpected = Object.keys(recordFirstTerm).length + Object.keys(recordSecondTerm).length + Object.keys(recordThirdTerm).length;
let totalResultsCount = 0;
let jsonForPdf = {};
let record = {};
let sRecord, path, id, fName, lName;
// get each student
let logEndOnce = true;
let logBeforeOnce = true;
logBeforeOnce && console.log('============== *** ================');
logBeforeOnce && console.log('======== PDF GENERATION BEGINS ================');
const computeResult = (page, setTerm, setRecord, setReset) => {
const termName = setTerm;
const record = setRecord;
let isReset = setReset;
logBeforeOnce && console.log(`====== ${termName} RESULTS BEGINS ======`);
for(let elem of students){
id = elem.id;
fName = elem.firstName;
lName = elem.lastName;
filename = `${lName} ${fName} _${termName} ${sessionName}.pdf`;
// sRecord = record.filter(function (entry) { return entry[id] !== undefined; });
sRecord = record[id];
path = getPath(termName, filename);
// create pdf
if(!FileExists(path) && !FileExists(pathJsonPdfList)){
// generate final JSON for the student
// isReset = (pdfCreatedList.includes(id))? false: true;
jsonForPdf = finalJson(elem, sRecord, data, termName);
(pdfFileList.length < 1) && WriteFile(pathJsonPdfContent, JSON.stringify(jsonForPdf));
pdfFileList.push({
'term': termName,
'file': filename
});
totalResultsCount = pdfFileList.length;
const pdfDate = new Date();
console.log(`${filename} (${totalResultsCount}/${totalResultsExpected}) at ${pdfDate.getHours()}hr${pdfDate.getHours()>1?'s':''} - ${pdfDate.getMinutes()}min${pdfDate.getMinutes()>1?'s':''} - ${pdfDate.getSeconds()}sec${pdfDate.getSeconds()>1?'s':''}`);
isActive = (totalResultsExpected === totalResultsCount)? true: false;
logEndOnce = false;
// cb(filename, className, termName, sessionName, isActive, pdfFileList);
// WriteFile(path, null);
isReset = true;
createPdf(page, jsonForPdf, templateType, filename, className, termName, sessionName, isActive, pdfFileList, cb);
}
}
logBeforeOnce && console.log(`====== ${termName} RESULTS ENDS ======`);
}
// get each student result for First Term
const computeFirstTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.firstTerm === '1') {
termName = firstTermDir;
record = recordFirstTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Second Term
const computeSecondTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.secondTerm === '1') {
termName = secondTermDir;
record = recordSecondTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Third Term
const computeThirdTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.thirdTerm === '1') {
termName = thirdTermDir;
record = recordThirdTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
if(totalResultsExpected === totalResultsCount && totalResultsCount !== 0 && !logEndOnce) {
logEndOnce = true;
logBeforeOnce = false;
console.log('======== PDF GENERATION ENDS ================');
}
} catch (error) {
console.log('==== ERROR IN PDF GENERATION: ', error)
}
}
module.exports = {
PdfGenerator: pdfGenerator
}
ERROR
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
lerna ERR! yarn run start stderr:
<--- Last few GCs --->
[9884:000002D68A73C6B0] 1665171 ms: Scavenge 44.1 (45.8) -> 43.2 (45.8) MB, 223.9 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1684089 ms: Scavenge 44.1 (45.8) -> 43.3 (45.8) MB, 587.3 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1749901 ms: Scavenge 44.2 (45.8) -> 43.3 (45.8) MB, 5099.0 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
<--- JS stacktrace --->
FATAL ERROR: Committing semi space failed. Allocation failed - JavaScript heap out of memory
1: 00007FF6ED61013F
2: 00007FF6ED59F396
3: 00007FF6ED5A024D
4: 00007FF6EDED19EE
5: 00007FF6EDEBBECD
6: 00007FF6EDD5F61C
7: 00007FF6EDD6933F
8: 00007FF6EDD5BF19
9: 00007FF6EDD5A0D0
10: 00007FF6EDD7EA06
11: 00007FF6EDAB1CD5
12: 00007FF6EDF5F3E1
13: 00007FF6EDF602E9
14: 000002D68C4EF69E
error Command failed with exit code 134.
Screenshot of Task Manager showing over 50 Chromium instances running.
I appreciate any help. I hope this can be resolved so I get smooth PDF generation.
Thank you.
Example solution (limiting parallel browsers)
I created a PdfPrinter class which you can integrate into your setup. It lets you limit the number of parallel PDF generation jobs and manages opening/closing the browser for you. The class is fairly coupled to PDF printing, but logic-wise it can be modified into a general-purpose queue.
You can try to integrate it into your code. This is a fully working test example with simplified PDFs (without the part that gets the actual data from the Excel workbooks).
As far as I understood your code, you do not need to pass the page around all your functions. First create your HTML + CSS, then use the PdfPrinter and let it handle page creation and browser launching (see the integration sketch after the test example).
(I like to code stuff like this so I went straight ahead..)
var puppeteer = require('puppeteer')
const defaultPrinterOptions = {
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
class PdfPrinter {
maxBrowsers = 2
enqueuedPrintJobs = []
failedJobs = []
browserInstances = 0
// max browser instances in parallel
constructor(maxBrowsers) {
this.maxBrowsers = maxBrowsers
}
/**
*
* @param {*} html the html content to print
* @param {*} css to apply to the page
* @param {*} printOptions options passed to puppeteer
*/
// enqueues a print but the exact end moment cannot be known..
enqueuePrint = (html, css, path, done) => {
// merge custom options with defaultOptions..
const printOptions = {
...defaultPrinterOptions,
// add the path to the options.
path: path
}
// create a function which can be stored in an array
// it will later be grabbed by startPrinter() OR at the time any
// browser freed up..
// the function needs to be passed the actual used browser instance!
this.enqueuedPrintJobs.push(async(browser) => {
// catch the error which may be produced when printing something..
try {
// print the document
await this.print(browser, html, css, printOptions)
} catch (err) {
console.error('error when printing document.. Closing browser and starting a new job!!', printOptions.path)
console.error(err)
// store something so you know what failed and could be retried or something..
this.failedJobs.push({ html, css, path: printOptions.path })
// puppeteer can run into errors too!!
// so close the browser and launch a new one!
await this.closeBrowser(browser)
browser = await this.launchBrowser()
}
// after the print, call done() so the promise is resolved at the right moment when
// this particular print has ended!
done()
// start the next job right now if there are any left.
const job = this.enqueuedPrintJobs.shift()
if (!job) {
console.log('No print jobs available anymore. Closing this browser instance.. Remaining browsers now:', this.maxBrowsers - this.browserInstances + 1)
await this.closeBrowser(browser)
return
}
// job is actually this function itself! It will be executed
// and automatically grab a new job after completion :)
// we pass the same browser instance to the next job!.
await job(browser)
})
// whenever a print job added make sure to start the printer
// this starts new browser instances if the limit is not exceeded resp. if no browser is instantiated yet,
// and does nothing if maximum browser count is reached..
this.tryStartPrinter()
}
// same as enqueuePrint except it wraps it in a promise so we can know the
// exact end moment and await it..
enqueuePrintPromise(html, css, path) {
return new Promise((resolve, reject) => {
try {
this.enqueuePrint(html, css, path, resolve)
} catch (err) {
console.error('unexpected error when setting up print job..', err)
reject(err)
}
})
}
// If the browser instance limit is not reached, this instantiates a new browser and runs a print job with it.
// A print job will automatically grab the next job with the created browser if there are any left.
tryStartPrinter = async() => {
// Max browser count in use OR no jobs left.
if (this.browserInstances >= this.maxBrowsers || this.enqueuedPrintJobs.length === 0) {
return
}
// browser instances available!
// create a new one
console.log('launching new browser. Available after launch:', this.maxBrowsers - this.browserInstances - 1)
const browser = await this.launchBrowser()
// run job
const job = this.enqueuedPrintJobs.shift()
await job(browser)
}
closeBrowser = async(browser) => {
// decrement browsers in use!
// important to call before closing browser!!
this.browserInstances--
await browser.close()
}
launchBrowser = async() => {
// increment browsers in use!
// important to increase before actually launching (async stuff..)
this.browserInstances++
// adjust this code according to your environment..
const browser = await puppeteer.launch({ headless: true })
return browser
}
// The actual print function which creates a pdf.
print = async(browser, html, css, printOptions) => {
console.log('Converting page to pdf. path:', printOptions.path)
// Run pdf creation in a separate page.
const page = await browser.newPage()
await page.setContent(html, { waitUntil: 'networkidle0' });
await page.addStyleTag({ content: css });
await page.pdf(printOptions);
await page.close();
}
}
// testing the PDFPrinter with some jobs.
// make sure to run the printer in an `async` function so you can
// use await...
const testPrinterQueue = async() => {
// config
const maxOpenedBrowsers = 5 // amount of browser instances which are allowed to be opened in parallel
const testJobCount = 100 // amount of test pdf jobs to be created
const destDir = 'C:\\somepath' // the directory to store the pdfs in..
// create sample jobs for testing...
const jobs = []
for (let i = 0; i < testJobCount; i++) {
jobs.push({
html: `<h1>job number [${i}]</h1>`,
css: 'h1 { background-color: red; }',
path: require('path').join(destDir, `pdf_${i}.pdf`)
})
}
// track time
const label = 'printed a total of ' + testJobCount + ' pdfs!'
console.time(label)
// run the actual pdf generation..
const printer = new PdfPrinter(maxOpenedBrowsers)
const jobProms = []
for (let job of jobs) {
// run jobs in parallel. Each job will be run async and therefore return a Promise
jobProms.push(
printer.enqueuePrintPromise(job.html, job.css, job.path)
)
}
console.log('All jobs enqueued!! Waiting for finish now.')
// helper function which awaits all the print jobs, resp. an array of promises.
await Promise.all(jobProms)
console.timeEnd(label)
// failed jobs::
console.log('jobs failed:', printer.failedJobs)
// as file:
await require('fs').promises.writeFile('failed-jobs.json', JSON.stringify(printer.failedJobs))
}
testPrinterQueue().then(() => {
console.log('done with everyting..')
}).catch(err => {
console.error('unexpected error occured while printing all pages...', err)
})
You only need to adjust the destDir, maxOpenedBrowsers and testJobCount vars at the beginning of testPrinterQueue() to get this working.
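To hook this into the question's code, a minimal integration sketch could look like the following. It assumes your existing helpers (HtmlGenerator, ReadFile, finalJson, getPath, pathCss, htmlType1) behave as in the posted code, and it only handles the first term to keep it short; the printer wiring is the only new part:
// hypothetical wiring of pdfGenerator onto PdfPrinter (first term only);
// HtmlGenerator, ReadFile, finalJson, getPath etc. come from the question's code
const printer = new PdfPrinter(2) // at most 2 parallel browser instances

const pdfGenerator = async (data, cb) => {
    const css = ReadFile(pathCss)
    const templateData = ReadFile(htmlType1)
    const jobs = []
    for (const elem of data.students) {
        const jsonForPdf = finalJson(elem, data.firstTerm[elem.id], data, 'first-term')
        const html = HtmlGenerator(jsonForPdf, templateData)
        const path = getPath('first-term', `${elem.lastName} ${elem.firstName} _first-term.pdf`)
        // enqueue instead of printing directly; the printer caps parallelism
        jobs.push(printer.enqueuePrintPromise(html, css, path))
    }
    await Promise.all(jobs) // resolves once every PDF has been written
    cb && cb() // invoke your progress callback however you need
}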
What caused the problem in your code
Let's have a look at this piece
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
You created an anonymous function which is executed immediately. Within the function all the statements are correctly awaited using await. But if you run this whole piece within a synchronous part of your application, the whole function starts immediately but is NOT awaited before the next code runs.
Check out this example:
//utility
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
AsyncFunction();
// what you have done in your code:
(async() => {
console.log('Async anonymous function started')
await wait(3000)
console.log('Async anonymous function ended')
})()
console.log('sync function ended.')
}
SyncFunction()
console.log('done')
Note the output:
Async named function started
Async anonymous function started
sync function ended. // => sync function already ended
done // sync function ended and code continues execution.
Async named function ended
Async anonymous function ended
To correctly await your async stuff you need to put your whole application in async scope:
//utility
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
// this is now async!!
async function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
await AsyncFunction();
// what you have done in your code:
await (async() => {
console.log('Async anonymous function started')
await wait(3000)
console.log('Async anonymous function ended')
})()
console.log('sync function ended.')
}
SyncFunction().then(() => {
console.log('done')
}).catch(err => {
console.error('unexpected error occurred..', err)
})
This output is what we want
sync function started
Async named function started
Async named function ended
Async anonymous function started
Async anonymous function ended
sync function ended.
done
Hope this helps you understand.
Feel free to leave a comment.
I am trying to send a lot of HTTPS requests, and the processing causes my code to crash. I know I can increase my memory limit, but that won't scale. Uncommenting the code below causes an OOM crash at some point. The solution should probably be to flush a buffer or something, but I am learning Node.js so I'm not sure what to do.
var https = require('https');
// url anonymized for example
var urlArray = ["https://example.com/blah", ....] // 5000 urls here
var options = {
headers: { "x-api-key": "mykey" }
};
for (let dest of urlArray) {
https.request(dest, options, (res) => {
if (res.statusCode != 200) {
console.log(res.statusCode+" "+res.statusMessage+" at "+dest)
}
})
// uncommenting below causes a crash during runtime
// .on("error", (err) =>
// console.log(err.stack))
.end();
}
Node.js, being non-blocking, does not wait for the ith http.request to finish before it moves on to the (i+1)th. So the requests keep accumulating in memory, and since the memory is not big enough, it crashes. What we can do here is execute the requests in batches and wait for each batch to finish before starting the next one. With this, at any instant there will be at most n requests in memory (n is the batch size).
The code will look something like this:
const request = require('request-promise');
const urlArray = ["https://example.com/blah", ....];
async function batchProcess (urlArray){
const batchSize = 100;
// ^^^^^^^^^^
let i=0;
while(i<urlArray.length) {
let batch = [];
for(let j=0; j<batchSize && i<urlArray.length; j++, i++){
batch.push(request({
uri: urlArray[i],
headers: {
'User-Agent': 'Request-Promise',
"x-api-key": "mykey"
},
json: true // Automatically parses the JSON string in the response
}));
}
let batchResult = await Promise.all(batch);
console.log(batchResult);
}
}
batchProcess(urlArray);
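As a side note, request-promise has since been deprecated; the same batching idea works with only the built-in https module if you wrap each request in a Promise yourself. A sketch, assuming the same x-api-key header as in the question:
const https = require('https');

// wrap a single request in a Promise so it can be awaited
function get(url, options) {
    return new Promise((resolve, reject) => {
        https.request(url, options, (res) => {
            let body = '';
            res.on('data', (chunk) => { body += chunk; });
            res.on('end', () => resolve({ status: res.statusCode, body }));
        }).on('error', reject).end();
    });
}

async function batchProcess(urlArray, batchSize = 100) {
    for (let i = 0; i < urlArray.length; i += batchSize) {
        // at most batchSize requests are in flight at any moment
        const batch = urlArray.slice(i, i + batchSize)
            .map((url) => get(url, { headers: { 'x-api-key': 'mykey' } }));
        console.log(await Promise.all(batch));
    }
}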
One way would be to turn them into an async iterable where you can run them one after another and process them as they return (apologies for the TypeScript; just pass it through the TypeScript playground to transpile if you don't know TS):
import fetch from "node-fetch";
class MyParallelCalls implements AsyncIterable<any> {
constructor(private readonly urls: string[]) {}
[Symbol.asyncIterator](): AsyncIterator<any> {
return this.iterator();
}
async *iterator(): AsyncGenerator<any> {
for (const url of this.urls) {
yield (await fetch(url, {headers: { "x-api-key": "mykey" }})).json();
}
}
}
async function processAll() {
const calls = new MyParallelCalls(urls); // `urls` is your string[] of URLs
for await (const call of calls) {
// deal with them as they happen: e.g. pipe them into a process or a destination
console.log(call);
}
}
processAll();
If you want, I can modify the above to batch your calls too. It's easy: just add an option to the constructor for batch size, set how many calls you want per batch, and use Promise.all for doing the yield.
It will look something like below (refactored a little so it's more generic):
import fetch from "node-fetch";
interface MyParallelCallsOptions {
urls: string[];
batchSize: number;
requestInit: RequestInit;
}
class MyParallelCalls<T> implements AsyncIterable<T[]> {
private batchSize = this.options.batchSize;
private currentIndex = 0;
constructor(private readonly options: MyParallelCallsOptions) {}
[Symbol.asyncIterator](): AsyncIterator<T[]> {
return this.iterator();
}
private getBatch(): string[] {
const batch = this.options.urls.slice(this.currentIndex, this.currentIndex + this.batchSize);
this.setNextBatch();
return batch;
}
private setNextBatch(): void {
this.currentIndex = this.currentIndex + this.batchSize;
}
private isLastBatch(): boolean {
// >= so iteration also stops when urls.length is not a multiple of batchSize
return this.currentIndex >= this.options.urls.length;
}
async *iterator(): AsyncGenerator<T[]> {
while (!this.isLastBatch()) {
const batch = this.getBatch();
const requests = batch.map(async (url) => (await fetch(url, this.options.requestInit)).json());
yield Promise.all(requests);
}
}
}
async function processAll() {
const batches = new MyParallelCalls<any>({
urls, // string array of your urls
batchSize: 5,
requestInit: { headers: { "x-api-key": "mykey" } }
});
for await (const batch of batches) {
console.log(batch);
}
}
processAll();
I have a Node.js RESTful API built with the Express.js framework. It is usually hosted by pm2.
One of the services has a very long process. When the front end called the service, the process started up. Because of an error in the database, the process couldn't finish properly, and the error was eventually caught. However, before the process reached the error, another identical process started with the same parameters. So in the meantime two processes were running, one ahead of the other. After a long time, the first process reached the error point and returned an error. Then the second one returned exactly the same thing.
I checked the front end's Network tab and noticed there was actually only one request sent. Where did the second request come from?
Edit 1:
The whole process is: first process sends query to db -> long time wait -> second process starts up -> second process sends query to db -> long time wait -> first process receives db response -> long time wait -> second process receives db response
Edit 2:
The code of the service is as follows:
import { Express, Request, Response } from "express";
import * as multer from "multer";
import * as fs from "fs";
import { Readable, Duplex } from "stream";
import * as uid from "uid";
import { Client } from "pg";
import * as gdal from "gdal";
import * as csv from "csv";
import { SuccessPayload, ErrorPayload } from "../helpers/response";
import { postgresQuery } from "../helpers/database";
import Config from "../config";
export default class ShapefileRoute {
constructor(app: Express) {
// Upload a shapefile
/**
* @swagger
* /shapefile:
* post:
* description: Returns the homepage
* responses:
* 200:
*/
app.post("/shapefile", (req: Request, res: Response, next: Function): void => {
// Create instance of multer
const multerInstance = multer().array("files");
multerInstance(req, res, (err: Error) => {
if (err) {
let payload: ErrorPayload = {
code: 4004,
errorMessage: "Multer upload file error.",
errorDetail: err.message,
hints: "Check error detail"
};
req.reservePayload = payload;
next();
return;
}
// Extract files
let files: any = req.files;
// Extract body
let body: any = JSON.parse(req.body.filesInfo);
// Other params
let writeFilePromises: Promise<any>[] = [];
let copyFilePromises: Promise<any>[] = [];
let rootDirectory: string = Config.uploadRoot;
let outputId: string = uid(4);
// Reset index of those files
let namesIndex: string[] = [];
files.forEach((item: Express.Multer.File, index: number) => {
if(item.originalname.split(".")[1] === "csv" || item.originalname.split(".")[1] === "txt" || item.originalname.split(".")[1] === "shp") {
namesIndex.push(item.originalname);
}
})
// Process and write all files to disk
files.forEach((item: Express.Multer.File, outterIndex: number) => {
if(item.originalname.split(".")[1] === "csv" || item.originalname.split(".")[1] === "txt") {
namesIndex.forEach((indexItem, index) => {
if(indexItem === item.originalname) {
ShapefileRoute.csv(item, index, writeFilePromises, body, rootDirectory, outputId,);
}
})
} else if (item.originalname.split(".")[1] === "shp") {
namesIndex.forEach((indexItem, index) => {
if(indexItem === item.originalname) {
ShapefileRoute.shp(item, index, writeFilePromises, body, rootDirectory, outputId,);
}
})
} else {
ShapefileRoute.shp(item, outterIndex, writeFilePromises, body, rootDirectory, outputId,);
}
})
// Copy files from disk to database
ShapefileRoute.copyFiles(req, res, next, writeFilePromises, copyFilePromises, req.reserveSuperPg, () => {
ShapefileRoute.loadFiles(req, res, next, copyFilePromises, body, outputId)
});
})
});
}
// Process csv file
static csv(file: Express.Multer.File, index: number, writeFilePromises: Promise<any>[], body: any, rootDirectory: string, outputId: string) {
// Streaming file to pivotcsv
writeFilePromises.push(new Promise((resolve, reject) => {
// Get specification from body
let delimiter: string;
let spec: any;
let lrsColumns: string[] = [null, null, null, null, null, null];
body.layers.forEach((jsonItem, i) => {
if (jsonItem.name === file.originalname.split(".")[0]) {
delimiter = jsonItem.file_spec.delimiter;
spec = jsonItem
jsonItem.lrs_cols.forEach((lrsCol) => {
switch(lrsCol.lrs_type){
case "rec_id":
lrsColumns[0] = lrsCol.name;
break;
case "route_id":
lrsColumns[1] = lrsCol.name;
break;
case "f_meas":
lrsColumns[2] = lrsCol.name;
break;
case "t_meas":
lrsColumns[3] = lrsCol.name;
break;
case "b_date":
lrsColumns[4] = lrsCol.name;
break;
case "e_date":
lrsColumns[5] = lrsCol.name;
break;
}
})
}
});
// Pivot csv file
ShapefileRoute.pivotCsv(file.buffer, `${rootDirectory}/${outputId}_${index}`, index, delimiter, outputId, lrsColumns, (path) => {
console.log("got pivotCsv result");
spec.order = index;
resolve({
path: path,
spec: spec
});
}, reject);
}));
}
// Process shapefile
static shp(file: Express.Multer.File, index: number, writeFilePromises: Promise<any>[], body: any, rootDirectory: string, outputId: string) {
// Write file to disk and then call shp2csv to generate csv
writeFilePromises.push(new Promise((resolve, reject) => {
// Write shapefile to disk
fs.writeFile(`${rootDirectory}/shps/${file.originalname}`, file.buffer, (err) => {
// If it is .shp file, resolve it's path and spec
if(file.originalname.split(".")[1] === "shp") {
// Find spec of the shapefile from body
body.layers.forEach((jsonItem, i) => {
if (jsonItem.name === file.originalname.split(".")[0]) {
let recordColumn: string = null;
let routeIdColumn: string = null;
jsonItem.lrs_cols.forEach((lrsLayer) => {
if (lrsLayer.lrs_type === "rec_id") {
recordColumn = lrsLayer.name;
}
if (lrsLayer.lrs_type === "route_id") {
routeIdColumn = lrsLayer.name;
}
})
// Transfer shp to csv
ShapefileRoute.shp2csv(`${rootDirectory}/shps/${file.originalname}`, `${rootDirectory}/${outputId}_${index}`, index, outputId, recordColumn, routeIdColumn, (path, srs) => {
// Add coordinate system, geom column and index of this file to spec
jsonItem.file_spec.proj4 = srs;
jsonItem.file_spec.geom_col = "geom";
jsonItem.order = index;
// Return path and spec
resolve({
path: path,
spec: jsonItem
})
}, (err) => {
reject(err);
})
}
});
} else {
resolve(null);
}
})
}));
}
// Copy files to database
static copyFiles(req: Request, res: Response, next: Function, writeFilePromises: Promise<any>[], copyFilePromises: Promise<any>[], client: Client, callback: () => void) {
// Take all files generated by writefile processes
Promise.all(writeFilePromises)
.then((results) => {
// Remove null results. They are from .dbf .shx etc of shapefile.
const files: any = results.filter(arr => arr);
// Create promise array. This will be triggered after all files are written to database.
files.forEach((file) => {
copyFilePromises.push(new Promise((copyResolve, copyReject) => {
let query: string = `copy lbo.lbo_temp from '${file.path}' WITH NULL AS 'null';`;
// Create super user call
postgresQuery(client, query, (data) => {
copyResolve(file.spec);
}, copyReject);
}));
});
// Trigger upload query
callback()
})
.catch((err) => {
// Response as error if any file generating is wrong
let payload: ErrorPayload = {
code: 4004,
errorMessage: "Something wrong when processing csv and/or shapefile.",
errorDetail: err.message,
hints: "Check error detail"
};
req.reservePayload = payload;
next();
})
}
// Load layers in database
static loadFiles(req: Request, res: Response, next: Function, copyFilePromises: Promise<any>[], body: any, outputId: string) {
Promise.all(copyFilePromises)
.then((results) => {
// Resort all results by the order assigned when creating files
results.sort((a, b) => {
return a.order - b.order;
});
results.forEach((result) => {
delete result.order;
});
// Create JSON for load layer database request
let taskJson = body;
taskJson.layers = results;
let query: string = `select lbo.load_layers2(p_session_id := '${outputId}', p_layers := '${JSON.stringify(taskJson)}'::json)`;
postgresQuery(req.reservePg, query, (data) => {
// Get result
let result = data.rows[0].load_layers2.result;
// Return 4003 error if no result
if (!result) {
let payload: ErrorPayload = {
code: 4003,
errorMessage: "Load layers error.",
errorDetail: data.rows[0].load_layers2.error ? data.rows[0].load_layers2.error.message : "Load layers returns no result.",
hints: "Check error detail"
};
req.reservePayload = payload;
next();
return;
}
let payload: SuccessPayload = {
type: "string",
content: "Upload files done."
};
req.reservePayload = payload;
next();
}, (err) => {
req.reservePayload = err;
next();
});
})
.catch((err) => {
// Response as error if any file generating is wrong
let payload: ErrorPayload = {
code: 4004,
errorMessage: "Something wrong when copy files to database.",
errorDetail: err,
hints: "Check error detail"
};
req.reservePayload = payload;
next();
})
}
// Pivot csv process. Write output csv to disk and return path of the file.
static pivotCsv(buffer: Buffer, outputPath: string, inputIndex: number, delimiter: string, outputId: string, lrsColumns: string[], callback: (path: string) => void, errCallback: (err: Error) => void) {
let inputStream: Duplex = new Duplex();
// Define output stream
let output = fs.createWriteStream(outputPath, {flags: "a"});
// Callback when output stream is done
output.on("finish", () => {
console.log("output stream finish");
callback(outputPath);
});
// Define parser stream
let parser = csv.parse({
delimiter: delimiter
});
// Close output stream when parser stream is end
parser.on("end", () => {
console.log("parser stream end");
output.end();
});
// Write data when a chunck is parsed
let header = [null, null, null, null, null, null];
let attributesHeader = [];
let i = 0;
let datumIndex: boolean = true;
parser.on("data", (chunk) => {
console.log("parser received on chunck: ", i);
if (datumIndex) {
chunk.forEach((datum, index) => {
if (lrsColumns.includes(datum)) {
header[lrsColumns.indexOf(datum)] = index;
} else {
attributesHeader.push({
name: datum,
index: index
})
}
});
datumIndex = false;
} else {
i ++;
// let layer_id = ;
let rec_id = header[0] ? chunk[header[0]] : i;
let route_id = header[1] ? chunk[header[1]] : null;
let f_meas = header[2] ? chunk[header[2]] : null;
let t_meas = header[3] ? chunk[header[3]] : null;
let b_date = header[4] ? chunk[header[4]] : null;
let e_date = header[5] ? chunk[header[5]] : null;
let attributes = {};
attributesHeader.forEach((attribute) => {
attributes[attribute.name] = chunk[attribute.index];
});
let attributesOrdered = {};
Object.keys(attributes).sort().forEach((key) => {
attributesOrdered[key] = attributes[key];
});
let outputData = `${outputId}\t${inputIndex}\t${rec_id}\t${route_id}\tnull\t${f_meas}\t${t_meas}\t${b_date}\t${e_date}\tnull\t${JSON.stringify(attributesOrdered)}\n`;
output.write(outputData);
}
});
inputStream.push(buffer);
inputStream.push(null);
inputStream.pipe(parser);
}
// Write shp and transfer to database format. Return file path and projection.
static shp2csv(inputPath: string, outputPath: string, i: number, ouputId: string, recordColumn: string, routeIdColumn: string, callback: (path: string, prj: string) => void, errCallback: (err: Error) => void) {
let dataset = gdal.open(inputPath);
let layercount = dataset.layers.count();
let layer = dataset.layers.get(0);
let output = fs.createWriteStream(outputPath, {flags: "a"});
output.on("finish", () => {
callback(outputPath, layer.srs.toProj4());
});
layer.features.forEach((feature, featureId) => {
let geom;
let recordId: number = null;
let routeId: string = null;
try {
let geomWKB = feature.getGeometry().toWKB();
let geomWKBString = geomWKB.toString("hex");
geom = geomWKBString;
if (recordColumn) {
recordId = feature.fields.get(recordColumn);
}
if (routeIdColumn) {
routeId = feature.fields.get(routeIdColumn);
}
}
catch (err) {
console.log(err);
}
let attributes = {};
let attributesOrdered = {};
feature.fields.forEach((value, field) => {
if (field != recordColumn && field != routeIdColumn) {
attributes[field] = value;
}
});
Object.keys(attributes).sort().forEach((key) => {
attributesOrdered[key] = attributes[key];
});
output.write(`${ouputId}\t${i.toString()}\t${recordId ? recordId : (featureId + 1).toString()}\t${routeId}\tnull\tnull\tnull\tnull\tnull\t${geom}\t${JSON.stringify(attributesOrdered)}\n`);
});
output.end();
}
}
The browser retries some requests if the server doesn't send a response and the browser hits its timeout value. Each browser may be configured with its own timeout, but 2 minutes sounds like it's probably the browser timeout.
You can't control the browser's timeout from your server. Two minutes is just too long to ask it to wait. You need a different design that responds sooner and then communicates back the eventual result later when it's ready. Either client polling or server push with webSocket/socket.io.
For client polling, you could have the server respond immediately to the first request and return a token (some unique string). Then, the client can ask the server for the response for that token every minute until the server eventually has the response. If the server doesn't yet have the response, it just immediately returns a code that means "no response yet". In that case, the client sets a timer and tries again in a minute, sending the token each time so the server knows which request it is asking about.
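Here's a minimal sketch of that polling flow, assuming an Express app; startLongJob and the in-memory results map are placeholders for your actual long-running work, not from the original answer:
const express = require('express');
const crypto = require('crypto');
const app = express();
const results = new Map(); // token -> result, undefined while still pending

app.post('/job', express.json(), (req, res) => {
    const token = crypto.randomUUID();
    results.set(token, undefined);
    startLongJob(req.body) // placeholder: returns a promise doing the real work
        .then((r) => results.set(token, r))
        .catch((e) => results.set(token, { error: e.message }));
    res.json({ token }); // respond immediately, before the work is done
});

app.get('/job/:token', (req, res) => {
    const result = results.get(req.params.token);
    if (result === undefined) return res.status(202).json({ pending: true });
    res.json(result); // final answer once the long job has finished
});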
For server push, the client creates a persistent webSocket or socket.io connection to the server. When the client makes its long-running request, the server just immediately returns the same type of token described above. Then, when the server is done with the request, it sends the token and the final data over the socket.io connection. The client is listening for incoming messages on that socket.io connection and will receive the final response there.
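And a corresponding sketch of the push variant with socket.io (assumed installed); httpServer and startLongJob are again placeholders:
const { Server } = require('socket.io');
const crypto = require('crypto');
const io = new Server(httpServer); // placeholder: your existing http server

io.on('connection', (socket) => {
    socket.on('start-job', async (params) => {
        const token = crypto.randomUUID();
        socket.emit('job-accepted', { token }); // immediate acknowledgement
        const result = await startLongJob(params); // placeholder long work
        socket.emit('job-done', { token, result }); // push the final data later
    });
});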
I have a use case that needs to use Headless Chrome Network (https://chromedevtools.github.io/devtools-protocol/tot/Network/) to intercept all images requests and find out the image size before saving it (basically discard small images such as icons).
However, I am unable to figure out a way to load the image data in memory before saving it. I need to load it into an Img object to get the width and height. Network.getResponseBody takes a requestId, which I don't have access to in Network.requestIntercepted. Also, Network.loadingFinished always gives me "0" in the encodedDataLength variable. I have no idea why. So my questions are:
How do I intercept all responses from jpg/png requests and get the image data, without saving the file to disk via its URL string and loading it back?
BEST: how do I get the image dimensions from the response headers? Then I wouldn't have to read the data into memory at all.
My code is below:
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs');
(async function() {
async function launchChrome() {
return await chromeLauncher.launch({
chromeFlags: [
'--disable-gpu',
'--headless'
]
});
}
const chrome = await launchChrome();
const protocol = await CDP({
port: chrome.port
});
const {
DOM,
Network,
Page,
Emulation,
Runtime
} = protocol;
await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
await Network.setRequestInterceptionEnabled({enabled: true});
Network.requestIntercepted(({interceptionId, request, resourceType}) => {
if ((request.url.indexOf('.jpg') >= 0) || (request.url.indexOf('.png') >= 0)) {
console.log(JSON.stringify(request));
console.log(resourceType);
if (request.url.indexOf("/unspecified.jpg") >= 0) {
console.log("FOUND unspecified.jpg");
console.log(JSON.stringify(interceptionId));
// console.log(JSON.stringify(Network.getResponseBody(interceptionId)));
}
}
Network.continueInterceptedRequest({interceptionId});
});
Network.loadingFinished(({requestId, timestamp, encodedDataLength}) => {
console.log(requestId);
console.log(timestamp);
console.log(encodedDataLength);
});
Page.navigate({
url: 'https://www.yahoo.com/'
});
Page.loadEventFired(async() => {
protocol.close();
chrome.kill();
});
})();
This should get you 90% of the way there. It gets the body of each image request. You'd still need to base64-decode it, check the size, save it, etc. (see the sketch after the code).
const CDP = require('chrome-remote-interface');
const sizeThreshold = 1024;
async function run() {
try {
var client = await CDP();
const { Network, Page } = client;
// enable events
await Promise.all([Network.enable(), Page.enable()]);
// commands
const _url = "https://google.co.za";
let _pics = [];
Network.responseReceived(async ({requestId, response}) => {
let url = response ? response.url : null;
if ((url.indexOf('.jpg') >= 0) || (url.indexOf('.png') >= 0)) {
const {body, base64Encoded} = await Network.getResponseBody({ requestId }); // throws promise error returning null/undefined so can't destructure. Must be different in inspect shell to app?
_pics.push({ url, body, base64Encoded });
console.log(url, body, base64Encoded);
}
});
await Page.navigate({ url: _url });
await sleep(5000);
// TODO: process _pics - base64Encoded, check body.length > sizeThreshold, save etc...
} catch (err) {
if (err.message && err.message === "No inspectable targets") {
console.error("Either chrome isn't running or you already have another app connected to chrome - e.g. `chrome-remote-interface inspect`")
} else {
console.error(err);
}
} finally {
if (client) {
await client.close();
}
}
}
function sleep(milliseconds = 1000) {
if (milliseconds == 0)
return Promise.resolve();
return new Promise(resolve => setTimeout(() => resolve(), milliseconds))
}
run();
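For the remaining 10%, here is a sketch of the TODO above (decode, filter by size, save), reusing the _pics array and sizeThreshold from the code; the output filenames are made up:
const fs = require('fs').promises;

// decode each captured body, discard small images (icons), save the rest
async function savePics(pics) {
    for (const [i, pic] of pics.entries()) {
        const buf = Buffer.from(pic.body, pic.base64Encoded ? 'base64' : 'utf8');
        if (buf.length < sizeThreshold) continue; // too small, likely an icon
        const ext = pic.url.indexOf('.png') >= 0 ? 'png' : 'jpg';
        await fs.writeFile(`image_${i}.${ext}`, buf);
    }
}
// e.g. call `await savePics(_pics)` where the TODO comment sits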
The standard says headers are case-insensitive.
Ruby and Node both force lowercase headers.
We are using an outside server program (built on the .NET framework) that expects the 'AuthToken' header to be case-sensitive; apparently both don't follow the standard. We need the header to keep its casing in this instance.
At the time of writing, the following setHeader was copied from the _http_outgoing file of Node's core lib:
var http = require('http');
http.OutgoingMessage.prototype.setHeader = function(name, value) {
if (arguments.length < 2) {
throw new Error('`name` and `value` are required for setHeader().');
}
if (this._header) {
throw new Error('Can\'t set headers after they are sent.');
}
// NO LOWER CASE
var key = name//.toLowerCase();
this._headers = this._headers || {};
this._headerNames = this._headerNames || {};
this._headers[key] = value;
this._headerNames[key] = name;
// Since we're re-defining the method, we can't use this part anymore
//if (automaticHeaders[key]) {
// this._removedHeader[key] = false;
//}
};
The lowercasing part is commented out.
So, if you run into this problem: require http and override this method with the version from the Node release you're currently using. It should then work properly. You could do a similar thing by overriding a method in Ruby, but it won't be as quick and easy.
Then this will work:
require('request')
request({url: 'http://myurl.com', headers: {UpperCaseWorks: 'Yay'}})
EDIT: here's the equivalent for a newer version of Node
OutgoingMessage.prototype.setHeader = function setHeader(name, value) {
if (this._header) {
throw new errors.Error('ERR_HTTP_HEADERS_SENT', 'set');
}
validateHeader(name, value);
if (!this[outHeadersKey])
this[outHeadersKey] = {};
// no more lower case
const key = name//.toLowerCase();
this[outHeadersKey][key] = [name, value];
switch (key.length) {
case 10:
if (key === 'connection')
this._removedConnection = false;
break;
case 14:
if (key === 'content-length')
this._removedContLen = false;
break;
case 17:
if (key === 'transfer-encoding')
this._removedTE = false;
break;
}
};
Looks like it calls this local method, which will need to be defined as well:
function validateHeader(name, value) {
let err;
if (typeof name !== 'string' || !name || !checkIsHttpToken(name)) {
err = new errors.TypeError('ERR_INVALID_HTTP_TOKEN', 'Header name', name);
} else if (value === undefined) {
err = new errors.TypeError('ERR_HTTP_INVALID_HEADER_VALUE', value, name);
} else if (checkInvalidHeaderChar(value)) {
debug('Header "%s" contains invalid characters', name);
err = new errors.TypeError('ERR_INVALID_CHAR', 'header content', name);
}
if (err !== undefined) {
Error.captureStackTrace(err, validateHeader);
throw err;
}
}
And this (note that 'internal/http' is not requirable from userland without --expose-internals; the Node 16 answer below shows how to find the symbol instead):
const { outHeadersKey } = require('internal/http');
Anyway, check your version of node for what you are overriding
Piggybacking on Funkodebat's answer, here's my solution for Node 16:
const http = require('http');
// https://github.com/nodejs/node/blob/v16.x/lib/_http_outgoing.js#L574-L587
const { validateHeaderName, validateHeaderValue } = http;
http.OutgoingMessage.prototype.setHeader = function setHeader(name, value) {
if (this._header) {
throw new Error('Cannot set headers after they are sent to the client');
}
validateHeaderName(name);
validateHeaderValue(name, value);
// Extra logic to find kOutHeaders symbol in `this`
const kOutHeaders = Object.getOwnPropertySymbols(this).find(
(sym) => sym.toString() === 'Symbol(kOutHeaders)'
);
let headers = this[kOutHeaders];
if (headers === null) this[kOutHeaders] = headers = Object.create(null);
headers[name] = [name, value]; // toLowerCase removed from here
return this;
};
Looking at the source of the Node.js library on GitHub, you do not need to override OutgoingMessage.prototype.setHeader.
Instead of passing the headers as an Object, you should send them as an Array. Here is a working example:
const http = require('http');
const postData = JSON.stringify({
'msg': 'Hello World!'
});
const options = {
hostname: 'www.google.com',
port: 80,
path: '/upload',
method: 'POST',
// use an Array instead of Object to avoid lowercase transformation
headers: [
['Host' ,'localhost' ],
['X-CustomHeaderFancy' , 'valueForFancyHeader'],
['Content-Type', 'application/json'],
['Content-Length', Buffer.byteLength(postData)]
]
};
const req = http.request(options, (res) => {
console.log(`STATUS: ${res.statusCode}`);
console.log(`HEADERS: ${JSON.stringify(res.headers)}`);
res.setEncoding('utf8');
res.on('data', (chunk) => {
console.log(`BODY: ${chunk}`);
});
res.on('end', () => {
console.log('No more data in response.');
});
});
req.on('error', (e) => {
console.error(`problem with request: ${e.message}`);
});
// Write data to request body
req.write(postData);
req.end();
Inside the source code of https://github.com/nodejs/node/blob/v16.x/lib/_http_client.js#L249 there is a test to check whether the headers are an array; if that is the case, it bypasses the lowercase transformation.
I do not know why this is not documented; it's a very useful feature.