Uploading series of large files in node to a web API - node.js

I am trying to create a Node script that traverses a folder of large files (MP3s containing one-hour music sets) and uploads them to Mixcloud via their API. It works except when it hits the API rate limit and needs to wait x seconds: I set a timeout and loop back to the beginning, but after the set amount of time the process exits without logging any useful output. Is this due to a bug in my code, or to the way I'm running it? I'm currently running it with sudo nohup node index.js > log.log; is that incorrect? My end goal is a script that can be run at the end of every broadcast day at our radio station to upload the archived files.
const restler = require('restler');
const fs = require('fs');
const readDir = require('readdir');
const powerOff = require('power-off');

const options = {
  folder: 'files',
  completefolder: 'complete',
  accesstoken: 'xxxxxxxxxxxxxxxxxxxxxxx'
}

// shut down computer
const shutDown = () => {
  powerOff((err, stderr, stdout) => {
    if (!err && !stderr) {
      console.log(stdout);
    }
  })
};

// uploadFile uploads a file with restler to Mixcloud; if the API returns a rate-limiting object, try again in x seconds
const uploadFile = (folder, filename) => {
  const filepath = `./${folder}/${filename}`
  fs.stat(`./${folder}/${filename}`, function (err, stats) {
    const size = stats.size;
    restler.post(`https://api.mixcloud.com/upload/?access_token=${options.accesstoken}`, {
      multipart: true,
      data: {
        "mp3": restler.file(`./${folder}/${filename}`, null, size, null, 'audio/mpeg'),
        "name": filename,
        // "unlisted": true
        // more data can be added here depending on changes in workflow, automate images etc
      }
    }).on("complete", function (data) {
      const returned = JSON.parse(data);
      if (returned.error) {
        if (returned.error.type == "RateLimitException") {
          // try again in x seconds
          console.log(`uploading too fast, retrying upload of ${filename} after ${returned.error.retry_after} seconds`);
          setTimeout(() => {(folder, filename) => uploadFile}, returned.error.retry_after * 1000);
        }
        else {
          console.log('non-rate-limiting error');
          console.log(returned);
        }
      }
      else {
        console.log('Success!');
        console.log(returned);
        // move uploaded files into completed folder
        fs.rename(`./${folder}/${filename}`, `./${options.completefolder}/${filename}`, (err) => {
          if (err) {
            console.log(err)
          }
          else {
            counter += 1;
            console.log(counter);
            if (counter === files.length) {
              console.log('done');
              shutDown();
            }
          }
        })
      }
    });
  });
};

// get all mp3s and upload all of them
const files = readDir.readSync(`./${options.folder}`, ['**.mp3']);
let counter = 0;
for (var i = 0; i < files.length; i++) {
  uploadFile(options.folder, files[i])
};
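Worth noting about the retry branch above: the setTimeout callback only defines an arrow function ((folder, filename) => uploadFile) and never calls it, so when the timer fires nothing runs and the process exits once the event loop is empty, which matches the silent exit described. A corrected retry (a minimal sketch, reusing the same names as above) would re-invoke uploadFile:

// Retry sketch: re-invoke uploadFile after the wait suggested by the API.
if (returned.error.type == "RateLimitException") {
  const waitMs = returned.error.retry_after * 1000;
  console.log(`uploading too fast, retrying upload of ${filename} after ${returned.error.retry_after} seconds`);
  setTimeout(() => uploadFile(folder, filename), waitMs);
}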

Related

Node.js on multi-core machines for file I/O operations

I'm a bit confused because all the examples I've read about the Node cluster module seem to apply only to web servers and concurrent requests, while for CPU-intensive applications the worker_threads module is recommended instead.
And what about file I/O operations? Imagine I have an array with 1 million filenames: ['1.txt', '2.txt', etc., ..., '1000000.txt'] and I need to do heavy processing on each and then write the resulting file content?
What would be the right way to use all the CPU cores efficiently, spreading the processing of the different filenames across different cores?
Normally I would use this:
const fs = require('fs')
const async = require('async')
const heavyProcessing = require('./heavyProcessing.js')

const files = ['1.txt', '2.txt', ..., '1000000.txt']

async.each(files, function (file, cb) {
  fs.writeFile(file, heavyProcessing(file), function (err) {
    if (!err) cb()
  })
})
Should I now use cluster or worker_threads? And how should I use it?
Does this work?
const fs = require('fs')
const async = require('async')
const heavyProcessing = require('./heavyProcessing.js')
const cluster = require('node:cluster');
const http = require('node:http');
const numCPUs = require('node:os').cpus().length;
const process = require('node:process');

if (cluster.isPrimary) {
  console.log(`Primary ${process.pid} is running`);
  // Fork workers.
  for (let i = 0; i < numCPUs; i++) {
    cluster.fork();
  }
  cluster.on('exit', (worker, code, signal) => {
    console.log(`worker ${worker.process.pid} died`);
  });
} else {
  const files = ['1.txt', '2.txt', ..., '1000000.txt']
  async.each(files, function (file, cb) {
    fs.writeFile(file, heavyProcessing(file), function (err) {
      if (!err) cb()
    })
  })
}
For everyone who is interested: the answer is to use the npm module piscina.
In this gist I explain everything. Node.js is a powerful tool for backend developers, but you must be aware of multi-core processing in order to get the most out of your CPU.
Multi-core processing in Node.js is mostly used for web servers, and Node.js already ships with the cluster module for that out of the box.
Node.js also ships with the worker_threads module, but it is not so easy to work with directly.
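For comparison, here is roughly what driving worker_threads by hand looks like (a minimal sketch; the filename, message shape and per-file work are only illustrative), which is why a pool abstraction like piscina is attractive:

// worker-demo.js (hypothetical): the same file acts as main thread and worker.
const { Worker, isMainThread, parentPort, workerData } = require('node:worker_threads')

if (isMainThread) {
  // Main thread: spawn a worker for one file and wait for its result message.
  const worker = new Worker(__filename, { workerData: { file: '1.txt' } })
  worker.on('message', (msg) => console.log('worker finished:', msg))
  worker.on('error', (err) => console.error('worker failed:', err))
} else {
  // Worker thread: do the heavy per-file work here, then report back.
  parentPort.postMessage({ file: workerData.file, done: true })
}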
Let's create a project that tests single-thread and multi-thread processing of CPU-intensive data and writes some random data to files.
Create the project:
mkdir test-threads && cd test-threads
npm init -y
Install dependencies and create dist/ directory
npm install async progress piscina command-line-args
mkdir dist
Create the file index.js at the root of the project directory
const path = require('path')
const async = require('async')
const ProgressBar = require('progress')
const Piscina = require('piscina')
const commandLineArgs = require('command-line-args')

console.time('main')

const worker = require(path.resolve(__dirname, 'worker.js'))
const piscina = new Piscina({
  filename: path.resolve(__dirname, 'worker.js')
})

const argvOptions = commandLineArgs([
  { name: 'multi-thread', type: Boolean },
  { name: 'iterations', alias: 'i', type: Number }
])

const files = []
for (let i = 0; i < (argvOptions.iterations || 1000); i++) {
  files.push(path.join(__dirname, 'dist', i + '.txt'))
}

var bar = new ProgressBar(':bar', { total: files.length, width: 80 });

async.each(files, function (file, cb) {
  (async function() {
    try {
      const err = argvOptions['multi-thread'] ? (await piscina.run(file)) : worker(file)
      bar.tick()
      if (err) cb(Error(err)); else cb()
    } catch (err) {
      cb(Error(err))
    }
  })();
}, (err) => {
  if (err) {
    console.error('There was an error: ', err)
    process.exitCode = 1
  } else {
    bar.terminate()
    console.log('Success')
    console.timeEnd('main')
    process.exitCode = 0
  }
})
Create now worker.js also at the root of the project directory
const fs = require('fs')

// some CPU-intensive function; the higher baseNumber is, the longer it takes
function mySlowFunction(baseNumber) {
  let result = 0
  for (var i = Math.pow(baseNumber, 7); i >= 0; i--) {
    result += Math.atan(i) * Math.tan(i)
  }
}

module.exports = (file) => {
  try {
    mySlowFunction(parseInt(Math.random() * 10 + 1))
    fs.writeFileSync(file, Math.random().toString())
    return null
  } catch (e) {
    return Error(e)
  }
}
Now run it single-threaded and check the elapsed time, for 1000 and 10000 iterations (one iteration equals one round of data processing plus one file creation):
node index.js -i 1000
node index.js -i 10000
Now compare with the multi-threaded runs to see the advantage:
node index.js --multi-thread -i 1000
node index.js --multi-thread -i 10000
In the test I did (16-core CPU) the difference is huge: with 1000 iterations it went from 1:27.061 (m:ss.mmm) single-threaded to 8.884 s multi-threaded. Also check the files inside dist/ to be sure they were created correctly.
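For completeness, if you wanted to stay with the cluster module from the question rather than a thread pool, the usual pattern is to partition the file list across workers. A rough sketch (the slicing scheme is just one option, and heavyProcessing plus the file list are carried over from the question):

const fs = require('fs')
const cluster = require('node:cluster')
const numCPUs = require('node:os').cpus().length
const heavyProcessing = require('./heavyProcessing.js') // from the question

const files = ['1.txt', '2.txt', '3.txt', '4.txt'] // the full list in the real case

if (cluster.isPrimary) {
  // Give each worker its index so it can pick its own slice of the array.
  for (let i = 0; i < numCPUs; i++) {
    cluster.fork({ WORKER_INDEX: i, WORKER_COUNT: numCPUs })
  }
} else {
  const index = Number(process.env.WORKER_INDEX)
  const count = Number(process.env.WORKER_COUNT)
  // Each worker processes the files whose position matches its index modulo the worker count.
  files
    .filter((_, i) => i % count === index)
    .forEach(file => fs.writeFileSync(file, heavyProcessing(file)))
}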

Puppeteer create PDF files from HTML data hangs Windows 10 system

I created an app that processes students' results by extracting data from multiple Excel workbooks. The problem is that using Puppeteer to generate the PDF files throws the system into a loop until it hangs.
I have tested the same code below using PhantomJS (bundled as pdf-creator-node) and was able to generate 150 PDF files comfortably in 3 minutes. The reason I dropped PhantomJS is that none of the styling in the CSS file was included, even when I inserted it as an inline style in the header using JavaScript's replace function. Another reason is that PhantomJS is no longer in active development. I searched the web and found that Puppeteer is the only valid solution with active development and support.
I tried using page.close() at the end of createPdf(), which is called in a loop, and browser.close() at the end of pdfGenerator(). What am I doing wrong?
Below is the code in the server.js and PdfGenerator.js files, with a sample of the ERROR and a screenshot of my Task Manager after the system crawled out of its hung state. For HTML generation I used Mustache. I excluded some lines of code in server.js because the total character count was over 60k.
server.js
// [codes were removed here]
if(getCode == 'compute-result') {
// declare variable
let setData = null;
let setTitle = 'Results Computation...';
let setArgs = getArgs;
// dataFromFile = ReadFile(pathCodeTextFile);
// setArgs = Number(dataFromFile);
setCode = 'compute-result';
let setView = [];
let setNext = true;
let countTerms = [];
// if(getArg > 0) {
// Final Result computation
const getJson = ReadFile(pathJsonResults);
// const getCtrl = ReadFile(pathJsonCtrl);
const getResultObject = JSON.parse(getJson);
getResult = getResultObject;
const totalResults = getResult.firstTerm.length + getResult.secondTerm.length + getResult.thirdTerm.length;
if(setView.length < 1 && getResult != null) {
setData = 'PDFs for Students Results initiating...';
setView.unshift('Reading saved data...');
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: null, view: JSON.stringify(setView)});
}
Sleep(2000).then(() => {
if(getResult != null) {
setData = 'Students Results will be ready in a moment';
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
const wacthFiles = (file, className, termName, sessionName, completed, pdfList) => {
try {
if(typeof file == 'string' && !FileExists(pathJsonPdfList)) {
if(pdfList.length < 2){
setData = 'Saving PDFs to downloadable files...';
}
if(className != null && termName != null && sessionName != null) {
setTitle = `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}...`;
setView.unshift(file);
if(!countTerms.includes(termName)) {
countTerms.push(termName)
}
// setCode = -1000 - pdfList.length;
// console.log('PDF PROGRESS: ', `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}... ${setCode}`);
// when all PDFs are created
if(completed) {
setTitle = setTitle.replace('...', ' [completed]');
setData = 'Result Download button is Active. You may click it now.';
setView.unshift('=== PDF GENERATION COMPLETED ===');
setView.unshift(`A total of ${pdfList.length} students' Results were generated`);
WriteFile(pathJsonPdfList, JSON.stringify(pdfList));
// set donwload button active
setCode = Number(codeTextFilePdfCompleted);
setNext = false;
getResult = null;
let termString = countTerms.toString();
termString = ReplaceAll(termString, '-term', '');
termString = ReplaceAll(termString, ',', '-');
const addTxt = `${className} _${termString} Term${countTerms.length>1?'s':''} (${sessionName})`;
WriteFile(pathCodeTextFile, addTxt);
// console.log('======== PDF GENERATION ENDS ================');
} else {
setCode = -1 * pdfList.length;
}
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
}
} catch (error) {
console.log('ERROR ON WATCHER: ', error);
}
}
if(!FileExists(pathJsonPdfList) && getResult !== null) {
PdfGenerator(getResult, wacthFiles);
}
// Watcher(pathWatchResults, setCode, wacthDir, 10000);
});
// }
}
}
} catch (error) {
})
client.on('disconnect', () => {
console.log('SERVER: Disconnected');
});
server.listen(portApi, () =>{
console.log('Server listens on port 8881')
});
// serve static files
app.use(express.static(pathPublic));
// [codes were removed here]
PdfGenerator.js
The problem lies in these functions: PdfGenerator & createPdf
'use strict';
process.setMaxListeners(Infinity) // fix for Puppeteer MaxListenerExceededWarning
const Puppeteer = require('puppeteer')
const {HtmlGenerator} = require('../components/HtmlGenerator')
const {WriteFile, FileExists, RandomNumber, RoundNumber, IsNumberFraction, ReadFile} = require('../components/Functions')
if (process.env.NODE_ENV !== 'production') {
require('dotenv').config();
}
const pathFirstTermResults = process.env.DIR_FIRST_TERM_RESULTS;
const pathSecondTermResults = process.env.DIR_SECOND_TERM_RESULTS;
const pathThirdTermResults = process.env.DIR_THIRD_TERM_RESULTS;
const publicDir = process.env.DIR_PUBLIC;
const cssFile = process.env.PATH_CSS_FILENAME;
const pathCssRaw = __dirname + '\\' + publicDir + '\\' + cssFile;
const pathCss = pathCssRaw.replace(`\\uploads`, '');
const tagCssReplace = process.env.TAG_CSS_REPLACE;
let jsonDir = process.env.PATH_JSON;
jsonDir = jsonDir.split('/').pop();
let htmlDir = process.env.DIR_HTML;
htmlDir = __dirname + '\\' + htmlDir.split('/').pop();
const htmlType1 = htmlDir + '\\' + process.env.HTML_TYPE1;
const htmlType2 = htmlDir + '\\' + process.env.HTML_TYPE2;
const htmlType3 = htmlDir + '\\' + process.env.HTML_TYPE3;
const pathJsonPdfList = './' + jsonDir + '/' + process.env.JSON_PDF_LIST_FILENAME;
const pathJsonPdfContent = __dirname + '\\' + jsonDir + '\\' + process.env.JSON_PDF_CONTENT;
const firstTermDir = 'first-term';
const secondTermDir = 'second-term';
const thirdTermDir = 'third-term';
let cumulativeFirstTermTotalList = {};
let cumulativeSecondTermTotalList = {};
let firstTermOnce = true;
let secondTermOnce = true;
let thirdTermOnce = true;
let isActive = false;
const getPath = (p, f) => {
let dir = pathFirstTermResults;
switch (p) {
case firstTermDir:
dir = pathFirstTermResults;
break;
case secondTermDir:
dir = pathSecondTermResults;
break;
case thirdTermDir:
dir = pathThirdTermResults;
break;
default:
break;
}
return dir + f
}
const resolution = {
x: 1920,
y: 1080
}
const args = [
'--disable-gpu',
`--window-size=${resolution.x},${resolution.y}`,
'--no-sandbox',
]
const createPdf = (page, content, templateType, filename, className, term, sessionName, isProcessActive, pdfFileList, cb) => {
let path, document, options;
path = getPath(term, filename);
if(path != null) {
let options = {
path: path,
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
let templateData = '';
switch (templateType) {
case '1':
templateData = ReadFile(htmlType1);
break;
case '2':
templateData = ReadFile(htmlType2);
break;
case '3':
templateData = ReadFile(htmlType3);
break;
default:
templateData = ReadFile(htmlType1);
break;
}
(async() => {
const html = HtmlGenerator(content, templateData);
if(html != undefined && html !== '' && html != null) {
// create PDF file
cb(filename, className, term, sessionName, isProcessActive, pdfFileList);
// get style from .css & replace
const css = ReadFile(pathCss);
await page.setContent(html, { waitUntil: 'networkidle0'});
await page.addStyleTag(css);
await page.pdf(options);
page.close();
}
})()
}
}
const pdfGenerator = (json, cb) => {
let data = {};
let pdfFileList = [];
if(typeof json == 'string') {
data = JSON.parse(json)
} else {
data = json;
}
try {
// declare defaults
let filename = 'Student' + '.pdf';
let termName = firstTermDir;
const templateType = data.keys.templateType;
const session = data.classInfo.Session;
const sessionName = session.replace('/', '-');
const students = data.students;
const className = data.classInfo.Class_Name;
const recordFirstTerm = data.firstTerm;
const recordSecondTerm = data.secondTerm;
const recordThirdTerm = data.thirdTerm;
let pdfCreatedList = [];
let isReset = false;
let totalResultsExpected = Object.keys(recordFirstTerm).length + Object.keys(recordSecondTerm).length + Object.keys(recordThirdTerm).length;
let totalResultsCount = 0;
let jsonForPdf = {};
let record = {};
let sRecord, path, id, fName, lName;
// get each student
let logEndOnce = true;
let logBeforeOnce = true;
logBeforeOnce && console.log('============== *** ================');
logBeforeOnce && console.log('======== PDF GENERATION BEGINS ================');
const computeResult = (page, setTerm, setRecord, setReset) => {
const termName = setTerm;
const record = setRecord;
let isReset = setReset;
logBeforeOnce && console.log(`====== ${termName} RESULTS BEGINS ======`);
for(let elem of students){
id = elem.id;
fName = elem.firstName;
lName = elem.lastName;
filename = `${lName} ${fName} _${termName} ${sessionName}.pdf`;
// sRecord = record.filter(function (entry) { return entry[id] !== undefined; });
sRecord = record[id];
path = getPath(termName, filename);
// create pdf
if(!FileExists(path) && !FileExists(pathJsonPdfList)){
// generate final JSON for the student
// isReset = (pdfCreatedList.includes(id))? false: true;
jsonForPdf = finalJson(elem, sRecord, data, termName);
(pdfFileList.length < 1) && WriteFile(pathJsonPdfContent, JSON.stringify(jsonForPdf));
pdfFileList.push({
'term': termName,
'file': filename
});
totalResultsCount = pdfFileList.length;
const pdfDate = new Date();
console.log(`${filename} (${totalResultsCount}/${totalResultsExpected}) at ${pdfDate.getHours()}hr${pdfDate.getHours()>1?'s':''} - ${pdfDate.getMinutes()}min${pdfDate.getMinutes()>1?'s':''} - ${pdfDate.getSeconds()}sec${pdfDate.getSeconds()>1?'s':''}`);
isActive = (totalResultsExpected === totalResultsCount)? true: false;
logEndOnce = false;
// cb(filename, className, termName, sessionName, isActive, pdfFileList);
// WriteFile(path, null);
isReset = true;
createPdf(page, jsonForPdf, templateType, filename, className, termName, sessionName, isActive, pdfFileList, cb);
}
}
logBeforeOnce && console.log(`====== ${termName} RESULTS ENDS ======`);
}
// get each student result for First Term
const computeFirstTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.firstTerm === '1') {
termName = firstTermDir;
record = recordFirstTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Second Term
const computeSecondTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.secondTerm === '1') {
termName = secondTermDir;
record = recordSecondTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Third Term
const computeThirdTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.thirdTerm === '1') {
termName = thirdTermDir;
record = recordThirdTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
if(totalResultsExpected === totalResultsCount && totalResultsCount !== 0 && !logEndOnce) {
logEndOnce = true;
logBeforeOnce = false;
console.log('======== PDF GENERATION ENDS ================');
}
} catch (error) {
console.log('==== ERROR IN PDF GENERATION: ', error)
}
}
module.exports = {
PdfGenerator: pdfGenerator
}
ERROR
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
lerna ERR! yarn run start stderr:
<--- Last few GCs --->
[9884:000002D68A73C6B0] 1665171 ms: Scavenge 44.1 (45.8) -> 43.2 (45.8) MB, 223.9 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1684089 ms: Scavenge 44.1 (45.8) -> 43.3 (45.8) MB, 587.3 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1749901 ms: Scavenge 44.2 (45.8) -> 43.3 (45.8) MB, 5099.0 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
<--- JS stacktrace --->
FATAL ERROR: Committing semi space failed. Allocation failed - JavaScript heap out of memory
1: 00007FF6ED61013F
2: 00007FF6ED59F396
3: 00007FF6ED5A024D
4: 00007FF6EDED19EE
5: 00007FF6EDEBBECD
6: 00007FF6EDD5F61C
7: 00007FF6EDD6933F
8: 00007FF6EDD5BF19
9: 00007FF6EDD5A0D0
10: 00007FF6EDD7EA06
11: 00007FF6EDAB1CD5
12: 00007FF6EDF5F3E1
13: 00007FF6EDF602E9
14: 000002D68C4EF69E
error Command failed with exit code 134.
Screenshot of Task Manager: Chromium running over 50 instances.
I appreciate any help. I hope this can be resolved to give me a smooth PDF generation.
Thank you.
Example solution (limiting parallel browsers)
I created a PdfPrinter class for you which you can integrate into your setup. It lets you limit the number of parallel PDF generation jobs and manages opening and closing the browser for you. The class is fairly tightly coupled to this use case; logic-wise it could be modified into a general-purpose queue.
You can try to integrate it into your code. This is a fully working test example with simplified PDFs (without the part that gets the actual data from the Excel files).
As far as I understood your code, you do not need to pass the page around all your functions. First create your HTML + CSS, then hand them to the PdfPrinter and let it handle page creation and browser launching.
(I like to code stuff like this so I went straight ahead..)
var puppeteer = require('puppeteer')

const defaultPrinterOptions = {
  format: 'A4',
  printBackground: true,
  margin: {
    left: '0px',
    top: '0px',
    right: '0px',
    bottom: '0px'
  }
}

class PdfPrinter {
  maxBrowsers = 2
  enqueuedPrintJobs = []
  failedJobs = []
  browserInstances = 0

  // max browser instances in parallel
  constructor(maxBrowsers) {
    this.maxBrowsers = maxBrowsers
  }

  /**
   * @param {*} html the html content to print
   * @param {*} css to apply to the page
   * @param {*} printOptions options passed to puppeteer
   */
  // enqueues a print but the exact end moment cannot be known..
  enqueuePrint = (html, css, path, done) => {
    // merge custom options with defaultOptions..
    const printOptions = {
      ...defaultPrinterOptions,
      // add the path to the options.
      path: path
    }
    // create a function which can be stored in an array
    // it will later be grabbed by startPrinter() OR at the time any
    // browser freed up..
    // the function needs to be passed the actual used browser instance!
    this.enqueuedPrintJobs.push(async(browser) => {
      // catch the error which may be produced when printing something..
      try {
        // print the document
        await this.print(browser, html, css, printOptions)
      } catch (err) {
        console.error('error when printing document.. Closing browser and starting a new job!!', printOptions.path)
        console.error(err)
        // store something so you know what failed and could be retried or something..
        this.failedJobs.push({ html, css, path: printOptions.path })
        // puppeteer can run into errors too!!
        // so close the browser and launch a new one!
        await this.closeBrowser(browser)
        browser = await this.launchBrowser()
      }
      // after the print, call done() so the promise is resolved at the right moment when
      // this particular print has ended!
      done()
      // start the next job right now if there are any left.
      const job = this.enqueuedPrintJobs.shift()
      if (!job) {
        console.log('No print jobs available anymore. Closing this browser instance.. Remaining browsers now:', this.maxBrowsers - this.browserInstances + 1)
        await this.closeBrowser(browser)
        return
      }
      // job is actually this function itself! It will be executed
      // and automatically grab a new job after completion :)
      // we pass the same browser instance to the next job!
      await job(browser)
    })
    // whenever a print job is added make sure to start the printer
    // this starts new browser instances if the limit is not exceeded resp. if no browser is instantiated yet,
    // and does nothing if the maximum browser count is reached..
    this.tryStartPrinter()
  }

  // same as enqueuePrint except it wraps it in a promise so we know the
  // exact end moment and can await it..
  enqueuePrintPromise(html, css, path) {
    return new Promise((resolve, reject) => {
      try {
        this.enqueuePrint(html, css, path, resolve)
      } catch (err) {
        console.error('unexpected error when setting up print job..', err)
        reject(err)
      }
    })
  }

  // If the browser instance limit is not reached, instantiate a new one and run a print job with it.
  // a print job will automatically grab a next job with the created browser if there are any left.
  tryStartPrinter = async() => {
    // Max browser count in use OR no jobs left.
    if (this.browserInstances >= this.maxBrowsers || this.enqueuedPrintJobs.length === 0) {
      return
    }
    // browser instances available!
    // create a new one
    console.log('launching new browser. Available after launch:', this.maxBrowsers - this.browserInstances - 1)
    const browser = await this.launchBrowser()
    // run job
    const job = this.enqueuedPrintJobs.shift()
    await job(browser)
  }

  closeBrowser = async(browser) => {
    // decrement browsers in use!
    // important to call before closing the browser!!
    this.browserInstances--
    await browser.close()
  }

  launchBrowser = async() => {
    // increment browsers in use!
    // important to increase before actually launching (async stuff..)
    this.browserInstances++
    // this code you have to adjust according to your environment..
    const browser = await puppeteer.launch({ headless: true })
    return browser
  }

  // The actual print function which creates a pdf.
  print = async(browser, html, css, printOptions) => {
    console.log('Converting page to pdf. path:', printOptions.path)
    // Run pdf creation in a separate page.
    const page = await browser.newPage()
    await page.setContent(html, { waitUntil: 'networkidle0' });
    await page.addStyleTag({ content: css });
    await page.pdf(printOptions);
    await page.close();
  }
}
// testing the PdfPrinter with some jobs.
// make sure to run the printer in an `async` function so you can
// use await...
const testPrinterQueue = async() => {
  // config
  const maxOpenedBrowsers = 5 // amount of browser instances which are allowed to be opened in parallel
  const testJobCount = 100 // amount of test pdf jobs to be created
  const destDir = 'C:\\somepath' // the directory to store the pdfs in..

  // create sample jobs for testing...
  const jobs = []
  for (let i = 0; i < testJobCount; i++) {
    jobs.push({
      html: `<h1>job number [${i}]</h1>`,
      css: 'h1 { background-color: red; }',
      path: require('path').join(destDir, `pdf_${i}.pdf`)
    })
  }

  // track time
  const label = 'printed a total of ' + testJobCount + ' pdfs!'
  console.time(label)

  // run the actual pdf generation..
  const printer = new PdfPrinter(maxOpenedBrowsers)
  const jobProms = []
  for (let job of jobs) {
    // run jobs in parallel. Each job will run async and return a Promise, therefore
    jobProms.push(
      printer.enqueuePrintPromise(job.html, job.css, job.path)
    )
  }
  console.log('All jobs enqueued!! Waiting for finish now.')

  // await all the print jobs, resp. an array of promises.
  await Promise.all(jobProms)
  console.timeEnd(label)

  // failed jobs:
  console.log('jobs failed:', printer.failedJobs)
  // as file:
  await require('fs').promises.writeFile('failed-jobs.json', JSON.stringify(printer.failedJobs))
}

testPrinterQueue().then(() => {
  console.log('done with everything..')
}).catch(err => {
  console.error('unexpected error occurred while printing all pages...', err)
})
You only need to adjust the destDir, maxOpenedBrowsers and testJobCount variables at the beginning of testPrinterQueue() to get this working.
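To wire this into the original generator, the per-student loop would stop passing a page around and instead hand the rendered HTML plus the CSS to the printer. A rough sketch only (finalJson, HtmlGenerator, getPath, ReadFile and pathCss are the helpers from the question, and the enclosing function is assumed to be made async or to collect the promises):

// Inside computeResult(), replacing the createPdf(page, ...) call (sketch only).
const printer = new PdfPrinter(4) // created once, near the top of pdfGenerator()
const css = ReadFile(pathCss)

const jsonForPdf = finalJson(elem, sRecord, data, termName)
const html = HtmlGenerator(jsonForPdf, templateData)
const outPath = getPath(termName, filename)

// Enqueue instead of printing directly; await it (or collect the promises
// and Promise.all them) so pdfGenerator knows when everything has finished.
await printer.enqueuePrintPromise(html, css, outPath)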
What caused the problem in your code
Let's have a look at this piece
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
You created an anonymous function which is executed immediately. Within the function all the statements are correctly awaited using await. But because you run this whole piece from a synchronous part of your application, the function starts immediately and is NOT awaited before the next code runs.
Checkout this example:
// utility
function wait(ms) {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}

const AsyncFunction = async() => {
  console.log('Async named function started')
  // simulate execution time of 2 seconds
  await wait(2000)
  console.log('Async named function ended')
};

function SyncFunction() {
  console.log('sync function started')
  // example of async function execution within a sync function..
  AsyncFunction();
  // what you have done in your code:
  (async() => {
    console.log('Async anonymus function started')
    await wait(3000)
    console.log('Async anonymus function ended')
  })()
  console.log('sync function ended.')
}

SyncFunction()
console.log('done')
Note the output:
Async named function started
Async anonymus function started
sync function ended. // => sync function already ended
done // sync function ended and code continues execution.
Async named function ended
Async anonymus function ended
To correctly await your async stuff you need to put your whole application in async scope:
// utility
function wait(ms) {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}

const AsyncFunction = async() => {
  console.log('Async named function started')
  // simulate execution time of 2 seconds
  await wait(2000)
  console.log('Async named function ended')
};

// this is now async!!
async function SyncFunction() {
  console.log('sync function started')
  // example of async function execution within a sync function..
  await AsyncFunction();
  // what you have done in your code:
  await (async() => {
    console.log('Async anonymus function started')
    await wait(3000)
    console.log('Async anonymus function ended')
  })()
  console.log('sync function ended.')
}

SyncFunction().then(() => {
  console.log('done')
}).catch(err => {
  console.error('unexpected error occurred..')
})
This output is what we want
sync function started
Async named function started
Async named function ended
Async anonymus function started
Async anonymus function ended
sync function ended.
done
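Applied to the PdfGenerator.js from the question, one way to follow that advice is to stop firing the Puppeteer block as a fire-and-forget IIFE and await it instead, for example by making pdfGenerator itself async. A sketch of just the changed part (assuming the callers are updated to await or .then() it):

const pdfGenerator = async (json, cb) => {
  // ... all the existing setup and computeResult/compute*Term definitions ...
  const browser = await Puppeteer.launch({
    headless: true,
    handleSIGINT: false,
    args: args,
  });
  const page = await browser.newPage();
  await page.setViewport({ width: resolution.x, height: resolution.y });
  await computeFirstTerm(page);
  await computeSecondTerm(page);
  await computeThirdTerm(page);
  await browser.close();
  // the completion check can now run here, after everything has truly finished
};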
Hope this helps you understand.
Feel free to leave a comment.

Write data from Socket.io to Google Cloud Storage

I get audio data from this package in my React Native project and I then send the data chunks over socket.io to my Node.js server.
AudioRecord.on('data', data => {
  // base64-encoded audio data chunks
  const binary = Buffer.from(data, 'base64').toString('utf-8')
  socketClient.io.emit('binaryData', binary)
})
The data that my server receives is readable because I send that data through this transform onto a speech recognition service that returns accurate results.
const speechAudioInputStreamTransform = new Transform({
  transform: (chunk, encoding, callback) => {
    if (newStream && lastAudioInput.length !== 0) {
      // Approximate math to calculate time of chunks
      const chunkTime = streamingLimit / lastAudioInput.length
      if (chunkTime !== 0) {
        if (bridgingOffset < 0) {
          bridgingOffset = 0
        }
        if (bridgingOffset > finalRequestEndTime) {
          bridgingOffset = finalRequestEndTime
        }
        const chunksFromMS = Math.floor(
          (finalRequestEndTime - bridgingOffset) / chunkTime
        )
        bridgingOffset = Math.floor(
          (lastAudioInput.length - chunksFromMS) * chunkTime
        )
        for (let i = chunksFromMS; i < lastAudioInput.length; i++) {
          if (recognizeStream) {
            recognizeStream.write(lastAudioInput[i])
          }
        }
      }
      newStream = false
    }
    audioInput.push(chunk)
    if (recognizeStream) {
      recognizeStream.write(chunk)
    }
    callback()
  },
})
However when I write this data to Google Cloud Storage the file saves and contains data but seems corrupted as it is unplayable.
const { Storage } = require('@google-cloud/storage')
const storage = new Storage(storageOptions)
const bucket = storage.bucket(bucketName)
const file = filename + '.wav'
const storageFile = bucket.file(file).createWriteStream()

client.on('binaryData', (data) => {
  const buffer = Buffer.from(data)
  if (recognizeStream != null) {
    storageFile.write(buffer)
    speechAudioInputStreamTransform.write(buffer)
  }
})
When the user on the RN side terminates the session, storageFile.end() is called.
Any help would be greatly appreciated. Thanks

Problems when I upload images and show them in angular and node using the GraphicsMagick

I use this code to upload images in Node:
req.file('image[]')
  .upload({
    maxBytes: 5000000, // Files limit (in bytes)
    dirname: path.resolve(sails.config.appPath, 'assets/images/user') // Path to copy the files
  }, function whenDone(err, uploadedFiles) {
    if (err) {
      // here must delete the created user before the error
      console.log(err);
    }
    var image_real_name = '';
    var json = [];
    for (var i = 0; i < uploadedFiles.length; i++) {
      image_real_name = 'images/user/' + path.basename(uploadedFiles[i].fd);
      json.push(image_real_name);
    }
    res(json);
  });
but I needed to compress the images to save space on the server, so I used gm (GraphicsMagick):
var receiver = new Writable({ objectMode: true });
receiver._write = function(file, enc, cb) {
  // The output stream to pipe to
  var output = require('fs').createWriteStream('assets/images/user/' + file.fd);
  gm(file).resize('200', '200').stream().pipe(output);
  cb();
};

req.file('image[]').upload(receiver, function(err, files) {
  if (err) {
    // here must delete the created user before the error
    console.log(err);
  }
  var image_real_name = '';
  var json = [];
  for (var i = 0; i < files.length; i++) {
    image_real_name = 'images/user/' + path.basename(files[i].fd);
    json.push(image_real_name);
  }
  res(json);
});
I am using Angular, and when I change the image path through binding, the new image should be shown, but it does not load. With the first code everything works perfectly, but using gm it no longer does; I have to reload the page for the image to be displayed.
Update:
Apparently the image has not been fully processed yet; if a setTimeout is added, the image is displayed correctly:
setTimeout((dataService: DataService, image: any) => {
  dataService.getUser().image = image;
}, 1000, this.data, dataI.json().url);
Any idea?
There is a finish event emitted when the file has been fully written. I solved the problem by adding the following:
output.on("finish", function() { return cb(); });
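Put together with the receiver from the question, that means calling the upload callback only once the resized image has been fully flushed to disk. A minimal sketch:

var receiver = new Writable({ objectMode: true });
receiver._write = function(file, enc, cb) {
  var output = require('fs').createWriteStream('assets/images/user/' + file.fd);
  gm(file).resize('200', '200').stream().pipe(output);
  // Only signal completion once the resized file is fully written,
  // so the client never requests a half-processed image.
  output.on('finish', function() { return cb(); });
  output.on('error', cb);
};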

Node.js Readable stream slows over time, CPU usage falls

I am trying to launch a cluster that streams files (newline-delimited JSON) from Google Cloud Storage and transforms each row after fetching data from MongoDB. After transforming a row, I want to store it in Google BigQuery, 10000 rows at a time. All of this is working, but the issue is that the rate at which the streamed files are processed decreases significantly over time.
I have set up the Node application on one server and MongoDB on another, both 8-core machines with 30 GB RAM. When the script is executed, the CPU usage for the application server and the MongoDB server is initially around 70%-75%. After 30 minutes the CPU usage falls to 10% and then finally to 1%. The application throws no exceptions. In the application log I can see that it finished processing a few files and took up new files for processing. One execution can be observed running from a little after 3:00 PM until almost 5:20 PM.
var cluster = require('cluster'),
os = require('os'),
numCPUs = os.cpus().length,
async = require('async'),
fs = require('fs'),
google = require('googleapis'),
bigqueryV2 = google.bigquery('v2'),
gcs = require('@google-cloud/storage')({
projectId: 'someproject',
keyFilename: __dirname + '/auth.json'
}),
dataset = bigquery.dataset('somedataset'),
bucket = gcs.bucket('somebucket.appspot.com'),
JSONStream = require('JSONStream'),
Transform = require('stream').Transform,
MongoClient = require('mongodb').MongoClient,
mongoUrl = 'mongodb://localhost:27017/bigquery',
mDb,
groupA,
groupB;
var rows = [],
rowsLen = 0;
function transformer() {
var t = new Transform({ objectMode: true });
t._transform = function(row, encoding, cb) {
// Get some information from mongodb and attach it to the row
if (row) {
groupA.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.OA_SA': 1, _id: 0 }
}, function(err, a) {
if (err) return cb();
groupB.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.WZ11CD': 1, _id: 0 }
}, function(err, b) {
if (err) return cb();
row.groupA = a ? a.properties.OA_SA : null;
row.groupB = b ? b.properties.WZ11CD : null;
// cache processed rows in memory
rows[rowsLen++] = { json: row };
if (rowsLen >= 10000) {
// batch insert rows in bigquery table
// and free memory
log('inserting 10000')
insertRowsAsStream(rows.splice(0, 10000));
rowsLen = rows.length;
}
cb();
});
});
} else {
cb();
}
};
return t;
}
var log = function(str) {
console.log(str);
}
function insertRowsAsStream(rows, callback) {
bigqueryV2.tabledata.insertAll({
"projectId": 'someproject',
"datasetId": 'somedataset',
"tableId": 'sometable',
"resource": {
"kind": "bigquery#tableDataInsertAllRequest",
"rows": rows
}
}, function(err, res) {
if (res && res.insertErrors && res.insertErrors.length) {
console.log(res.insertErrors[0].errors)
err = err || new Error(JSON.stringify(res.insertErrors));
}
});
}
function startStream(fileName, cb) {
// stream a file from Google cloud storage
var file = bucket.file(fileName),
called = false;
log(`Processing file ${fileName}`);
file.createReadStream()
.on('data', noop)
.on('end', function() {
if (!called) {
called = true;
cb();
}
})
.pipe(JSONStream.parse())
.pipe(transformer())
.on('finish', function() {
log('transformation ended');
if (!called) {
called = true;
cb();
}
});
}
function processFiles(files, cpuIdentifier) {
if (files.length == 0) return;
var fn = [];
for (var i = 0; i < files.length; i++) {
fn.push(function(cb) {
startStream(files.pop(), cb);
});
}
// process 3 files in parallel
async.parallelLimit(fn, 3, function() {
log(`child process ${cpuIdentifier} completed the task`);
fs.appendFile(__dirname + '/complete_count.txt', '1');
});
}
if (cluster.isMaster) {
for (var ii = 0; ii < numCPUs; ii++) {
cluster.fork();
}
} else {
MongoClient.connect(mongoUrl, function(err, db) {
if (err) throw (err);
mDb = db;
groupA = mDb.collection('groupageo');
groupB = mDb.collection('groupbgeo');
processFiles(files, process.pid);
// `files` is an array of file names
// each file is in newline json delimited format
// ["1478854974993/000000000000.json","1478854974993/000000000001.json","1478854974993/000000000002.json","1478854974993/000000000003.json","1478854974993/000000000004.json","1478854974993/000000000005.json"]
});
}
Okay, I have found the culprit! The Google APIs Node.js client library makes use of a module called "stream-events" which implements Streams 0.8. Streams 0.8 do not control the rate at which they emit the 'data' event based on the consumer's ability to consume data. The rate-controlling feature was introduced in Streams 1.0. So this essentially meant that the readable stream was throwing data at MongoDB at a rate it was not able to process.
Solution:
I used the 'request' module instead of Google's client library. I supplied a signed URL to the request module which in turn fetched results as a stream which I could pipe into my transformer.
Take Away:
Always check the modules you use for the stream versions they are using.
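For illustration, the signed-URL approach described above could look roughly like this (a sketch only; bucket, JSONStream, log and transformer() are from the code above, and the expiry value is arbitrary):

const request = require('request')

function startStream(fileName, cb) {
  // Ask GCS for a time-limited signed URL instead of using the client library's read stream.
  bucket.file(fileName).getSignedUrl({
    action: 'read',
    expires: Date.now() + 60 * 60 * 1000 // valid for one hour
  }, function(err, url) {
    if (err) return cb(err);
    // request() returns a stream with proper backpressure, so pipe() slows the
    // download while the MongoDB lookups in transformer() are in flight.
    request(url)
      .pipe(JSONStream.parse())
      .pipe(transformer())
      .on('finish', function() {
        log('transformation ended');
        cb();
      });
  });
}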
