Puppeteer creating PDF files from HTML data hangs Windows 10 system - node.js

I created an app that processes students' results by extracting data from multiple Excel workbooks. The problem is that using Puppeteer to generate the PDF files throws the system into a loop until it hangs.
Actually, I tested the same code below using PhantomJS, which is bundled as pdf-creator-node, and was able to generate 150 PDF files comfortably in 3 minutes. The main reason I dropped PhantomJS is that none of the styling in the CSS file was applied, even when I inserted it as an inline style in the header using the replace function of JS. Another reason is that PhantomJS is no longer in active development. I searched the web and found that Puppeteer is the only viable solution with active development and support.
I tried using page.close() at the end of createPdf(), which is called in a loop, and browser.close() at the end of pdfGenerator(). What am I doing wrong?
Below is the code in the server.js and PdfGenerator.js files, with a sample of the ERROR and a screenshot of my Task Manager after the system crawled out of its hung state. For HTML generation I used Mustache. I excluded some lines of code in server.js because the total character count was over 60k.
server.js
// [codes were removed here]
if(getCode == 'compute-result') {
// declare variable
let setData = null;
let setTitle = 'Results Computation...';
let setArgs = getArgs;
// dataFromFile = ReadFile(pathCodeTextFile);
// setArgs = Number(dataFromFile);
setCode = 'compute-result';
let setView = [];
let setNext = true;
let countTerms = [];
// if(getArg > 0) {
// Final Result computation
const getJson = ReadFile(pathJsonResults);
// const getCtrl = ReadFile(pathJsonCtrl);
const getResultObject = JSON.parse(getJson);
getResult = getResultObject;
const totalResults = getResult.firstTerm.length + getResult.secondTerm.length + getResult.thirdTerm.length;
if(setView.length < 1 && getResult != null) {
setData = 'PDFs for Students Results initiating...';
setView.unshift('Reading saved data...');
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: null, view: JSON.stringify(setView)});
}
Sleep(2000).then(() => {
if(getResult != null) {
setData = 'Students Results will be ready in a moment';
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
const wacthFiles = (file, className, termName, sessionName, completed, pdfList) => {
try {
if(typeof file == 'string' && !FileExists(pathJsonPdfList)) {
if(pdfList.length < 2){
setData = 'Saving PDFs to downloadable files...';
}
if(className != null && termName != null && sessionName != null) {
setTitle = `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}...`;
setView.unshift(file);
if(!countTerms.includes(termName)) {
countTerms.push(termName)
}
// setCode = -1000 - pdfList.length;
// console.log('PDF PROGRESS: ', `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}... ${setCode}`);
// when all PDFs are created
if(completed) {
setTitle = setTitle.replace('...', ' [completed]');
setData = 'Result Download button is Active. You may click it now.';
setView.unshift('=== PDF GENERATION COMPLETED ===');
setView.unshift(`A total of ${pdfList.length} students' Results were generated`);
WriteFile(pathJsonPdfList, JSON.stringify(pdfList));
// set download button active
setCode = Number(codeTextFilePdfCompleted);
setNext = false;
getResult = null;
let termString = countTerms.toString();
termString = ReplaceAll(termString, '-term', '');
termString = ReplaceAll(termString, ',', '-');
const addTxt = `${className} _${termString} Term${countTerms.length>1?'s':''} (${sessionName})`;
WriteFile(pathCodeTextFile, addTxt);
// console.log('======== PDF GENERATION ENDS ================');
} else {
setCode = -1 * pdfList.length;
}
client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
}
}
} catch (error) {
console.log('ERROR ON WATCHER: ', error);
}
}
if(!FileExists(pathJsonPdfList) && getResult !== null) {
PdfGenerator(getResult, wacthFiles);
}
// Watcher(pathWatchResults, setCode, wacthDir, 10000);
});
// }
}
}
} catch (error) {
// [codes were removed here]
}
})
client.on('disconnect', () => {
console.log('SERVER: Disconnected');
});
server.listen(portApi, () =>{
console.log('Server listens on port 8881')
});
// serve static files
app.use(express.static(pathPublic));
// [codes were removed here]
PdfGenerator.js
The problem lies in these functions: PdfGenerator & createPdf
'use strict';
process.setMaxListeners(Infinity) // fix for Puppeteer MaxListenerExceededWarning
const Puppeteer = require('puppeteer')
const {HtmlGenerator} = require('../components/HtmlGenerator')
const {WriteFile, FileExists, RandomNumber, RoundNumber, IsNumberFraction, ReadFile} = require('../components/Functions')
if (process.env.NODE_ENV !== 'production') {
require('dotenv').config();
}
const pathFirstTermResults = process.env.DIR_FIRST_TERM_RESULTS;
const pathSecondTermResults = process.env.DIR_SECOND_TERM_RESULTS;
const pathThirdTermResults = process.env.DIR_THIRD_TERM_RESULTS;
const publicDir = process.env.DIR_PUBLIC;
const cssFile = process.env.PATH_CSS_FILENAME;
const pathCssRaw = __dirname + '\\' + publicDir + '\\' + cssFile;
const pathCss = pathCssRaw.replace(`\\uploads`, '');
const tagCssReplace = process.env.TAG_CSS_REPLACE;
let jsonDir = process.env.PATH_JSON;
jsonDir = jsonDir.split('/').pop();
let htmlDir = process.env.DIR_HTML;
htmlDir = __dirname + '\\' + htmlDir.split('/').pop();
const htmlType1 = htmlDir + '\\' + process.env.HTML_TYPE1;
const htmlType2 = htmlDir + '\\' + process.env.HTML_TYPE2;
const htmlType3 = htmlDir + '\\' + process.env.HTML_TYPE3;
const pathJsonPdfList = './' + jsonDir + '/' + process.env.JSON_PDF_LIST_FILENAME;
const pathJsonPdfContent = __dirname + '\\' + jsonDir + '\\' + process.env.JSON_PDF_CONTENT;
const firstTermDir = 'first-term';
const secondTermDir = 'second-term';
const thirdTermDir = 'third-term';
let cumulativeFirstTermTotalList = {};
let cumulativeSecondTermTotalList = {};
let firstTermOnce = true;
let secondTermOnce = true;
let thirdTermOnce = true;
let isActive = false;
const getPath = (p, f) => {
let dir = pathFirstTermResults;
switch (p) {
case firstTermDir:
dir = pathFirstTermResults;
break;
case secondTermDir:
dir = pathSecondTermResults;
break;
case thirdTermDir:
dir = pathThirdTermResults;
break;
default:
break;
}
return dir + f
}
const resolution = {
x: 1920,
y: 1080
}
const args = [
'--disable-gpu',
`--window-size=${resolution.x},${resolution.y}`,
'--no-sandbox',
]
const createPdf = (page, content, templateType, filename, className, term, sessionName, isProcessActive, pdfFileList, cb) => {
let path, document, options;
path = getPath(term, filename);
if(path != null) {
let options = {
path: path,
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
let templateData = '';
switch (templateType) {
case '1':
templateData = ReadFile(htmlType1);
break;
case '2':
templateData = ReadFile(htmlType2);
break;
case '3':
templateData = ReadFile(htmlType3);
break;
default:
templateData = ReadFile(htmlType1);
break;
}
(async() => {
const html = HtmlGenerator(content, templateData);
if(html != undefined && html !== '' && html != null) {
// create PDF file
cb(filename, className, term, sessionName, isProcessActive, pdfFileList);
// get style from .css & replace
const css = ReadFile(pathCss);
await page.setContent(html, { waitUntil: 'networkidle0'});
await page.addStyleTag(css);
await page.pdf(options);
page.close();
}
})()
}
}
const pdfGenerator = (json, cb) => {
let data = {};
let pdfFileList = [];
if(typeof json == 'string') {
data = JSON.parse(json)
} else {
data = json;
}
try {
// declare defaults
let filename = 'Student' + '.pdf';
let termName = firstTermDir;
const templateType = data.keys.templateType;
const session = data.classInfo.Session;
const sessionName = session.replace('/', '-');
const students = data.students;
const className = data.classInfo.Class_Name;
const recordFirstTerm = data.firstTerm;
const recordSecondTerm = data.secondTerm;
const recordThirdTerm = data.thirdTerm;
let pdfCreatedList = [];
let isReset = false;
let totalResultsExpected = Object.keys(recordFirstTerm).length + Object.keys(recordSecondTerm).length + Object.keys(recordThirdTerm).length;
let totalResultsCount = 0;
let jsonForPdf = {};
let record = {};
let sRecord, path, id, fName, lName;
// get each student
let logEndOnce = true;
let logBeforeOnce = true;
logBeforeOnce && console.log('============== *** ================');
logBeforeOnce && console.log('======== PDF GENERATION BEGINS ================');
const computeResult = (page, setTerm, setRecord, setReset) => {
const termName = setTerm;
const record = setRecord;
let isReset = setReset;
logBeforeOnce && console.log(`====== ${termName} RESULTS BEGINS ======`);
for(let elem of students){
id = elem.id;
fName = elem.firstName;
lName = elem.lastName;
filename = `${lName} ${fName} _${termName} ${sessionName}.pdf`;
// sRecord = record.filter(function (entry) { return entry[id] !== undefined; });
sRecord = record[id];
path = getPath(termName, filename);
// create pdf
if(!FileExists(path) && !FileExists(pathJsonPdfList)){
// generate final JSON for the student
// isReset = (pdfCreatedList.includes(id))? false: true;
jsonForPdf = finalJson(elem, sRecord, data, termName);
(pdfFileList.length < 1) && WriteFile(pathJsonPdfContent, JSON.stringify(jsonForPdf));
pdfFileList.push({
'term': termName,
'file': filename
});
totalResultsCount = pdfFileList.length;
const pdfDate = new Date();
console.log(`${filename} (${totalResultsCount}/${totalResultsExpected}) at ${pdfDate.getHours()}hr${pdfDate.getHours()>1?'s':''} - ${pdfDate.getMinutes()}min${pdfDate.getMinutes()>1?'s':''} - ${pdfDate.getSeconds()}sec${pdfDate.getSeconds()>1?'s':''}`);
isActive = (totalResultsExpected === totalResultsCount)? true: false;
logEndOnce = false;
// cb(filename, className, termName, sessionName, isActive, pdfFileList);
// WriteFile(path, null);
isReset = true;
createPdf(page, jsonForPdf, templateType, filename, className, termName, sessionName, isActive, pdfFileList, cb);
}
}
logBeforeOnce && console.log(`====== ${termName} RESULTS ENDS ======`);
}
// get each student result for First Term
const computeFirstTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.firstTerm === '1') {
termName = firstTermDir;
record = recordFirstTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Second Term
const computeSecondTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.secondTerm === '1') {
termName = secondTermDir;
record = recordSecondTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
// get each student result for Third Term
const computeThirdTerm = (p) => {
return new Promise((resolve) => {
if(data.keys.thirdTerm === '1') {
termName = thirdTermDir;
record = recordThirdTerm;
pdfCreatedList = [];
isReset = false;
computeResult(p, termName, record, isReset)
}
resolve()
})
}
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
if(totalResultsExpected === totalResultsCount && totalResultsCount !== 0 && !logEndOnce) {
logEndOnce = true;
logBeforeOnce = false;
console.log('======== PDF GENERATION ENDS ================');
}
} catch (error) {
console.log('==== ERROR IN PDF GENERATION: ', error)
}
}
module.exports = {
PdfGenerator: pdfGenerator
}
ERROR
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
lerna ERR! yarn run start stderr:
<--- Last few GCs --->
[9884:000002D68A73C6B0] 1665171 ms: Scavenge 44.1 (45.8) -> 43.2 (45.8) MB, 223.9 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1684089 ms: Scavenge 44.1 (45.8) -> 43.3 (45.8) MB, 587.3 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0] 1749901 ms: Scavenge 44.2 (45.8) -> 43.3 (45.8) MB, 5099.0 / 0.0 ms (average mu = 0.956, current mu = 0.952) allocation failure
<--- JS stacktrace --->
FATAL ERROR: Committing semi space failed. Allocation failed - JavaScript heap out of memory
1: 00007FF6ED61013F
2: 00007FF6ED59F396
3: 00007FF6ED5A024D
4: 00007FF6EDED19EE
5: 00007FF6EDEBBECD
6: 00007FF6EDD5F61C
7: 00007FF6EDD6933F
8: 00007FF6EDD5BF19
9: 00007FF6EDD5A0D0
10: 00007FF6EDD7EA06
11: 00007FF6EDAB1CD5
12: 00007FF6EDF5F3E1
13: 00007FF6EDF602E9
14: 000002D68C4EF69E
error Command failed with exit code 134.
Screenshot of Task Manager: over 50 Chromium instances running.
I appreciate any help. I hope this can be resolved so I get smooth PDF generation.
Thank you.

Example solution (limiting parallel browsers)
I created a PdfPrinter class for you which you can integrate into your setup. It lets you limit the number of parallel PDF generation jobs and manages opening/closing the browser for you. The class is fairly coupled to this use case, but logic-wise it can be modified into a general-purpose queue.
You can try to integrate that into your code. This is a fully working test example with simplified PDFs (without the part that gets the actual data from the Excel files).
As far as I understood your code, you do not need to pass the page around all your functions. First create your HTML + CSS, then use the PdfPrinter and let it handle page creation and browser launching.
(I like coding stuff like this, so I went straight ahead.)
var puppeteer = require('puppeteer')
const defaultPrinterOptions = {
format: 'A4',
printBackground: true,
margin: {
left: '0px',
top: '0px',
right: '0px',
bottom: '0px'
}
}
class PdfPrinter {
maxBrowsers = 2
enqueuedPrintJobs = []
failedJobs = []
browserInstances = 0
// max browser instances in parallel
constructor(maxBrowsers) {
this.maxBrowsers = maxBrowsers
}
/**
 * Enqueues a print job; the exact end moment cannot be known from here.
 * @param {*} html the html content to print
 * @param {*} css the css to apply to the page
 * @param {*} path the destination file path for the pdf
 * @param {*} done callback invoked when this particular job has finished
 */
enqueuePrint = (html, css, path, done) => {
// merge custom options with defaultOptions..
const printOptions = {
...defaultPrinterOptions,
// add the path to the options.
path: path
}
// create a function which can be stored in an array.
// It will later be grabbed by tryStartPrinter() OR whenever any
// browser frees up..
// The function needs to be passed the actual browser instance to use!
this.enqueuedPrintJobs.push(async(browser) => {
// catch the error which may be produced when printing something..
try {
// print the document
await this.print(browser, html, css, printOptions)
} catch (err) {
console.error('error when printing document.. Closing browser and starting a new job!!', printOptions.path)
console.error(err)
// store something so you know what failed and could be retried later..
this.failedJobs.push({ html, css, path: printOptions.path })
// puppeteer can run into errors too!!
// so close the browser and launch a new one!
await this.closeBrowser(browser)
browser = await this.launchBrowser()
}
// after the print, call done() so the promise is resolved at the right moment,
// when this particular print has ended!
done()
// start the next job right now if there are any left.
const job = this.enqueuedPrintJobs.shift()
if (!job) {
console.log('No print jobs available anymore. Closing this browser instance.. Remaining browsers now:', this.maxBrowsers - this.browserInstances + 1)
await this.closeBrowser(browser)
return
}
// job is actually this function itself! It will be executed
// and automatically grab a new job after completion :)
// we pass the same browser instance to the next job!.
await job(browser)
})
// whenever a print job is added, make sure to start the printer.
// This starts a new browser instance if the limit is not exceeded (resp. if no browser is instantiated yet),
// and does nothing if the maximum browser count is reached..
this.tryStartPrinter()
}
// same as enqueuePrint except it wraps the job in a promise so we can know the
// exact end moment and await it..
enqueuePrintPromise(html, css, path) {
return new Promise((resolve, reject) => {
try {
this.enqueuePrint(html, css, path, resolve)
} catch (err) {
console.error('unexpected error when setting up print job..', err)
reject(err)
}
})
}
// If the browser instance limit is not reached, this will instantiate a new one and run a print job with it.
// A print job will automatically grab the next job with the created browser if there are any left.
tryStartPrinter = async() => {
// Max browser count in use OR no jobs left.
if (this.browserInstances >= this.maxBrowsers || this.enqueuedPrintJobs.length === 0) {
return
}
// browser instances available!
// create a new one
console.log('launching new browser. Available after launch:', this.maxBrowsers - this.browserInstances - 1)
const browser = await this.launchBrowser()
// run job
const job = this.enqueuedPrintJobs.shift()
await job(browser)
}
closeBrowser = async(browser) => {
// decrement browsers in use!
// important to call before closing browser!!
this.browserInstances--
await browser.close()
}
launchBrowser = async() => {
// increment browsers in use!
// important to increment before actually launching (async stuff..)
this.browserInstances++
// adjust this code according to your environment..
const browser = await puppeteer.launch({ headless: true })
return browser
}
// The actual print function which creates a pdf.
print = async(browser, html, css, printOptions) => {
console.log('Converting page to pdf. path:', printOptions.path)
// Run pdf creation in a separate page.
const page = await browser.newPage()
await page.setContent(html, { waitUntil: 'networkidle0' });
await page.addStyleTag({ content: css });
await page.pdf(printOptions);
await page.close();
}
}
// testing the PdfPrinter with some jobs.
// make sure to run the printer in an `async` function so you can
// use await...
const testPrinterQueue = async() => {
// config
const maxOpenedBrowsers = 5 // amount of browser instances which are allowed to be opened in parallel
const testJobCount = 100 // amount of test pdf jobs to be created
const destDir = 'C:\\somepath' // the directory to store the pdfs in..
// create sample jobs for testing...
const jobs = []
for (let i = 0; i < testJobCount; i++) {
jobs.push({
html: `<h1>job number [${i}]</h1>`,
css: 'h1 { background-color: red; }',
path: require('path').join(destDir, `pdf_${i}.pdf`)
})
}
// track time
const label = 'printed a total of ' + testJobCount + ' pdfs!'
console.time(label)
// run the actual pdf generation..
const printer = new PdfPrinter(maxOpenedBrowsers)
const jobProms = []
for (let job of jobs) {
// run jobs in parallel. Each job will run async and therefore return a Promise
jobProms.push(
printer.enqueuePrintPromise(job.html, job.css, job.path)
)
}
console.log('All jobs enqueued!! Waiting for finish now.')
// helper function which awaits all the print jobs, resp. an array of promises.
await Promise.all(jobProms)
console.timeEnd(label)
// failed jobs::
console.log('jobs failed:', printer.failedJobs)
// as file:
await require('fs').promises.writeFile('failed-jobs.json', JSON.stringify(printer.failedJobs))
}
testPrinterQueue().then(() => {
console.log('done with everyting..')
}).catch(err => {
console.error('unexpected error occurred while printing all pages...', err)
})
You only need to adjust the destDir / maxOpenedBrowsers and testJobCount vars at the beginning of testPrinterQueue() to get this working.
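If you want to wire this into your result generator, a rough sketch could look like the following. It assumes the HtmlGenerator, ReadFile, getPath and finalJson helpers plus the pathCss, htmlType1 and firstTermDir constants from your PdfGenerator.js, and it only shows the first-term loop; the other terms would follow the same pattern.
// Sketch only: wiring the PdfPrinter queue into the original pdfGenerator.
// HtmlGenerator, ReadFile, getPath, finalJson, pathCss, htmlType1 and
// firstTermDir are assumed to come from the original PdfGenerator.js.
const printer = new PdfPrinter(2) // at most 2 parallel browsers

const pdfGenerator = async (json, cb) => {
    const data = typeof json === 'string' ? JSON.parse(json) : json
    const css = ReadFile(pathCss)
    const template = ReadFile(htmlType1)
    const jobs = []
    for (const elem of data.students) {
        // build the per-student JSON and HTML exactly as before
        const jsonForPdf = finalJson(elem, data.firstTerm[elem.id], data, firstTermDir)
        const html = HtmlGenerator(jsonForPdf, template)
        const filename = `${elem.lastName} ${elem.firstName} _${firstTermDir}.pdf`
        // enqueue instead of printing directly; the printer throttles the browsers
        jobs.push(printer.enqueuePrintPromise(html, css, getPath(firstTermDir, filename)))
    }
    await Promise.all(jobs) // resolves once every PDF has been written
}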
What caused the problem in your code
Let's have a look at this piece
(async () => {
browser = await Puppeteer.launch({
headless: true,
handleSIGINT: false,
args: args,
});
const page = await browser.newPage();
await page.setViewport({
width: resolution.x,
height: resolution.y,
})
await computeFirstTerm(page);
await computeSecondTerm(page);
await computeThirdTerm(page);
browser.close()
})()
You created an anonymous function which is executed immediately. Within the function all the statements are correctly awaited using await. But if you run this whole piece within a synchronous part of your application, the whole function will start immediately but will NOT be awaited before the next code runs.
Checkout this example:
//utility
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
AsyncFunction();
// what you have done in your code:
(async() => {
console.log('Async anonymous function started')
await wait(3000)
console.log('Async anonymous function ended')
})()
console.log('sync function ended.')
}
SyncFunction()
console.log('done')
Note the output:
Async named function started
Async anonymous function started
sync function ended. // => sync function already ended
done // sync function ended and code continues execution.
Async named function ended
Async anonymous function ended
To correctly await your async stuff you need to put your whole application in async scope:
//utility
function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}
const AsyncFunction = async() => {
console.log('Async named function started')
// simulate execution time of 2 seconds
await wait(2000)
console.log('Async named function ended')
};
// this is now async!!
async function SyncFunction() {
console.log('sync function started')
// example of async function execution within a sync function..
await AsyncFunction();
// what you have done in your code:
await (async() => {
console.log('Async anonymous function started')
await wait(3000)
console.log('Async anonymous function ended')
})()
console.log('sync function ended.')
}
SyncFunction().then(() => {
console.log('done')
}).catch(err => {
console.error('unexpected error occurred..', err)
})
This output is what we want
sync function started
Async named function started
Async named function ended
Async anonymous function started
Async anonymous function ended
sync function ended.
done
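Applied to your PdfGenerator.js, a minimal sketch (keeping your original names) makes pdfGenerator itself async and awaits the browser block instead of firing it off. Note that createPdf has the same problem: its inner async IIFE is never awaited, and neither is its page.close().
// Sketch: the IIFE from pdfGenerator, hoisted into the function and awaited.
const pdfGenerator = async (json, cb) => {
    // ... existing setup code ...
    const browser = await Puppeteer.launch({
        headless: true,
        handleSIGINT: false,
        args: args,
    })
    const page = await browser.newPage()
    await page.setViewport({ width: resolution.x, height: resolution.y })
    await computeFirstTerm(page)
    await computeSecondTerm(page)
    await computeThirdTerm(page)
    await browser.close()
}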
Hope this helps you understand.
Feel free to leave a comment.

Related

Deleting child's child elements while web scrapping and writing it to a html file using NodeJS puppeteer

I'm doing web scraping and writing the data to another HTML file.
On the line "const content = await page.$eval('.eApVPN', e => e.innerHTML);" I'm fetching the inner HTML of a div; this div has multiple p tags, and inside those p tags there are multiple hyperlink (a) tags.
I want to remove those a tags, but I'm unable to do so.
const fs = require('fs').promises;
const helps = require('./_helpers');
const OUTDIR = './results/dataset/'
fs.stat(OUTDIR).catch(async (err) => {
if (err.message.includes('Result Director Doesnt Exist')) {
await fs.mkdir(OUTDIR);
}
await fs.mkdir(OUTDIR);
});
const scraperObject = {
async scraper(browser){
const dataSet = await helps.readCSV('./results/dataset.csv');
console.log("dataset is : ", dataset);
var cookies = null
let page = await browser.newPage();
for (let i = 0; i < dataSet.length ; i++) {
let url = dataSet[i].coinPage
const filename = dataSet[i].symbol;
try{
console.log(`Navigating to ${url}...`);
await page.goto(url);
if (cookies == null){
cookies = await page.cookies();
await fs.writeFile('./storage/cookies', JSON.stringify(cookies, null, 2));
}
await helps.autoScroll(page);
await page.waitForSelector('.eApVPN');
const content = await page.$eval('.eApVPN', e => e.innerHTML);
await fs.writeFile(`${OUTDIR}${filename}.html`, content, (error) => { console.log(error); });
console.log("Written to HTML successfully!");
} catch (err){
console.log(err, '------->', dataSet[i].symbol);
}
}
await page.close();
}
}
module.exports = scraperObject;
Unfortunately, Puppeteer doesn't have native functionality to remove nodes. However, you can use the .evaluate method to run any JavaScript against the current document. For example, a script which removes your nodes would look something like this:
await page.evaluate((sel) => {
var elements = document.querySelectorAll(sel);
for(var i=0; i< elements.length; i++){
elements[i].remove()
}
}, ".eApVPN>a")
The above code will remove any <a> nodes that are direct children of a node with the eApVPN class. Then you can extract the data with your $eval selector, as the sketch below shows.
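Putting both steps together, a minimal sketch; note that .eApVPN>a only matches direct children, so since your links sit inside p tags, the descendant selector '.eApVPN a' is what you'd want here:
// Remove the anchor nodes first, then extract the cleaned-up inner HTML.
await page.evaluate((sel) => {
    document.querySelectorAll(sel).forEach((el) => el.remove());
}, '.eApVPN a');
const content = await page.$eval('.eApVPN', (e) => e.innerHTML);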

Do Node.js workers never time out?

I have an iteration that can take up to hours to complete.
Example:
do{
//this is an api action
let response = await fetch_some_data;
// other database action
await perform_operation();
next = response.next;
}while(next);
I am assuming that the operation doesn't time out, but I don't know that for sure.
Any explanation of how Node.js behaves in this situation is highly appreciated. Thanks.
Update:
The actual development code is as under:
const Shopify = require('shopify-api-node');
const shopServices = require('../../../../services/shop_services/shop');
const { create } = require('../../../../controllers/products/Products');
exports.initiate = async (redis_client) => {
redis_client.lpop(['sync'], async function (err, reply) {
if (reply === null) {
console.log("Queue Empty");
return true;
}
let data = JSON.parse(reply),
shopservices = new shopServices(data),
shop_data = await shopservices.get()
.catch(error => {
console.log(error);
});
const shopify = new Shopify({
shopName: shop_data.name,
accessToken: shop_data.access_token,
apiVersion: '2020-04',
autoLimit: false,
timeout: 60 * 1000
});
let params = { limit: 250 };
do {
try {
let response = await shopify.product.list(params);
if (await create(response, shop_data)) {
console.log(`${data.current}`);
};
data.current += data.offset;
params = response.nextPageParameters;
} catch (error) {
console.log("here");
console.log(error);
params = false;
};
} while (params);
});
}
Everything is working fine so far. I just want to make sure that the execution will always complete in Node. This function is called by a cron every minute, and the data for processing is provided by queue data.

NodeJS scraper needing to increment page and re-run

I'm building a simple NodeJS web scraper, and I want to re-run the function like a 'for loop' until pageNum = totalNumberOfPages... I'm having a brain fart and am unable to re-run the function from inside itself, since it returns an array fragment and kills itself. Could someone help me overcome this obstacle? I'm pretty sure it's very simple.
I looked at this and this but didn't figure it out...
const cheerio = require("cheerio");
const axios = require("axios");
let pageNum = 0;
let siteUrl = "https://whatever.com?&page=" + pageNum + "&viewAll=true";
let productArray = [];
let vendor = [];
let productTitle = [];
let plantType = [];
let thcRange = [];
let cbdRange = [];
let price = [];
let totalNumberOfPages = undefined;
// called by getResults()
const fetchData = async () => {
const result = await axios.get(siteUrl);
return cheerio.load(result.data);
};
// this function is called from index.js
const getResults = async () => {
// >>>>>>>>>>>>>>>>>> HOW DO I RERUN FROM HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<
const $ = await fetchData();
// first check how many total pages there are
totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
// use fetched data to grab elements (and their text) and push into arrays defined above
$('.product-tile__vendor').each((index, element) => {
vendor.push($(element).text());
});
$('.product-tile__title').each((index, element) => {
productTitle.push($(element).text());
});
$('.product-tile__plant-type').each((index, element) => {
plantType.push($(element).text());
});
$('.product-tile__properties li:nth-child(2) p').each((index, element) => {
thcRange.push($(element).text());
});
$('.product-tile__properties li:nth-child(3) p').each((index, element) => {
cbdRange.push($(element).text());
});
$('.product-tile__price').each((index, element) => {
price.push($(element).text());
});
// increment page number to get more products if the page count is less than total number of pages
if (pageNum < totalNumberOfPages) {
pageNum ++;
};
//Convert to an array so that we can sort the results.
productArray.push ({
vendors: [...vendor],
productTitle: [...productTitle],
plantType: [...plantType],
thcRange: [...thcRange],
cbdRange: [...cbdRange],
price: [...price],
pageNum
});
// >>>>>>>>>>>>>>>>>> UNTIL HERE I THINK <<<<<<<<<<<<<<<<<<<<<<<<<<<
return productArray;
};
module.exports = getResults;
You can use the recursion concept in your code, which means the function will call itself. So what you can do is:
const getResults = async () => {
// >>>>>>>>>>>>>>>>>> HOW DO I RERUN FROM HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<
const $ = await fetchData();
// first check how many total pages there are
totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
// use fetched data to grab elements (and their text) and push into arrays defined above
$('.product-tile__vendor').each((index, element) => {
vendor.push($(element).text());
});
$('.product-tile__title').each((index, element) => {
productTitle.push($(element).text());
});
$('.product-tile__plant-type').each((index, element) => {
plantType.push($(element).text());
});
$('.product-tile__properties li:nth-child(2) p').each((index, element) => {
thcRange.push($(element).text());
});
$('.product-tile__properties li:nth-child(3) p').each((index, element) => {
cbdRange.push($(element).text());
});
$('.product-tile__price').each((index, element) => {
price.push($(element).text());
});
// increment page number to get more products if the page count is less than total number of pages
if (pageNum < totalNumberOfPages) {
pageNum ++;
};
//Convert to an array so that we can sort the results.
productArray.push ({
vendors: [...vendor],
productTitle: [...productTitle],
plantType: [...plantType],
thcRange: [...thcRange],
cbdRange: [...cbdRange],
price: [...price],
pageNum
});
// >>>>>>>>>>>>>>>>>> UNTIL HERE I THINK <<<<<<<<<<<<<<<<<<<<<<<<<<<
// re-run until the last page has been scraped, then fall through and return
if (pageNum < totalNumberOfPages) return getResults();
return productArray;
};
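One thing to watch: siteUrl is built once at startup, so incrementing pageNum alone will keep fetching page 0. A minimal sketch of rebuilding the URL inside fetchData:
// Rebuild the URL on every call so the incremented pageNum takes effect.
const fetchData = async () => {
    const url = `https://whatever.com?&page=${pageNum}&viewAll=true`;
    const result = await axios.get(url);
    return cheerio.load(result.data);
};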

Deleting all messages in discord.js text channel

Ok, so I searched for a while, but I couldn't find any information on how to delete all messages in a discord channel. And by all messages I mean every single message ever written in that channel. Any clues?
Try this
async () => {
    let fetched;
    do {
        fetched = await channel.fetchMessages({ limit: 100 });
        await channel.bulkDelete(fetched);
    } while (fetched.size >= 2);
}
Discord does not allow bots to delete more than 100 messages per call, so you can't delete every message in a channel in one go. You can delete up to 100 messages at a time using bulkDelete.
Example:
const Discord = require("discord.js");
const client = new Discord.Client();
const prefix = "!";
client.on("ready" () => {
console.log("Successfully logged into client.");
});
client.on("message", msg => {
if (msg.content.toLowerCase().startsWith(prefix + "clearchat")) {
async function clear() {
msg.delete();
const fetched = await msg.channel.fetchMessages({limit: 99});
msg.channel.bulkDelete(fetched);
}
clear();
}
});
client.login("BOT_TOKEN");
Note: it has to be inside an async function for the await to work.
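Equivalently, a minimal sketch that makes the message handler itself async (using the same discord.js v11-era fetchMessages API as above), so no inner helper function is needed:
client.on("message", async (msg) => {
    if (msg.content.toLowerCase().startsWith(prefix + "clearchat")) {
        await msg.delete();
        // fetch up to 99 messages and bulk-delete them in one call
        const fetched = await msg.channel.fetchMessages({ limit: 99 });
        await msg.channel.bulkDelete(fetched);
    }
});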
Here's my improved version that is quicker and lets you know when it's done in the console, but you'll have to run it for each username you used in the channel (if you changed your username at some point):
// Turn on Developer Mode under User Settings > Appearance > Developer Mode (at the bottom)
// Then open the channel you wish to delete all of the messages (could be a DM) and click the three dots on the far right.
// Click "Copy ID" and paste that instead of LAST_MESSAGE_ID.
// Copy / paste the below script into the JavaScript console.
var before = 'LAST_MESSAGE_ID';
var your_username = ''; //your username
var your_discriminator = ''; //that 4 digit code e.g. username#1234
var foundMessages = false;
clearMessages = function(){
const authToken = document.body.appendChild(document.createElement`iframe`).contentWindow.localStorage.token.replace(/"/g, "");
const channel = window.location.href.split('/').pop();
const baseURL = `https://discordapp.com/api/channels/${channel}/messages`;
const headers = {"Authorization": authToken };
let clock = 0;
let interval = 500;
function delay(duration) {
return new Promise((resolve, reject) => {
setTimeout(() => resolve(), duration);
});
}
fetch(baseURL + '?before=' + before + '&limit=100', {headers})
.then(resp => resp.json())
.then(messages => {
return Promise.all(messages.map((message) => {
before = message.id;
foundMessages = true;
if (
message.author.username == your_username
&& message.author.discriminator == your_discriminator
) {
return delay(clock += interval).then(() => fetch(`${baseURL}/${message.id}`, {headers, method: 'DELETE'}));
}
}));
}).then(() => {
if (foundMessages) {
foundMessages = false;
clearMessages();
} else {
console.log('DONE CHECKING CHANNEL!!!')
}
});
}
clearMessages();
The previous script I found for deleting your own messages without a bot...
// Turn on Developer Mode under User Settings > Appearance > Developer Mode (at the bottom)
// Then open the channel you wish to delete all of the messages (could be a DM) and click the three dots on the far right.
// Click "Copy ID" and paste that instead of LAST_MESSAGE_ID.
// Copy / paste the below script into the JavaScript console.
// If you're in a DM you will receive a 403 error for every message the other user sent (you don't have permission to delete their messages).
var before = 'LAST_MESSAGE_ID';
clearMessages = function(){
const authToken = document.body.appendChild(document.createElement`iframe`).contentWindow.localStorage.token.replace(/"/g, "");
const channel = window.location.href.split('/').pop();
const baseURL = `https://discordapp.com/api/channels/${channel}/messages`;
const headers = {"Authorization": authToken };
let clock = 0;
let interval = 500;
function delay(duration) {
return new Promise((resolve, reject) => {
setTimeout(() => resolve(), duration);
});
}
fetch(baseURL + '?before=' + before + '&limit=100', {headers})
.then(resp => resp.json())
.then(messages => {
return Promise.all(messages.map((message) => {
before = message.id;
return delay(clock += interval).then(() => fetch(`${baseURL}/${message.id}`, {headers, method: 'DELETE'}));
}));
}).then(() => clearMessages());
}
clearMessages();
Reference: https://gist.github.com/IMcPwn/0c838a6248772c6fea1339ddad503cce
This will work on discord.js version 12.2.0.
Just put this inside your client's "message" event handler
and type the command: !nuke-this-channel.
Every message in the channel will get wiped;
then a Kim Jong Un meme will be posted.
if (msg.content.toLowerCase() == '!nuke-this-channel') {
async function wipe() {
var msg_size = 100;
while (msg_size == 100) {
await msg.channel.bulkDelete(100)
.then(messages => msg_size = messages.size)
.catch(console.error);
}
msg.channel.send(`<@${msg.author.id}>\n> ${msg.content}`, { files: ['http://www.quickmeme.com/img/cf/cfe8938e72eb94d41bbbe99acad77a50cb08a95e164c2b7163d50877e0f86441.jpg'] })
}
wipe()
}
This will work as long as your bot has the appropriate permissions.
module.exports = {
name: "clear",
description: "Clear messages from the channel.",
args: true,
usage: "<number greater than 0, less than 100>",
execute(message, args) {
const amount = parseInt(args[0]) + 1;
if (isNaN(amount)) {
return message.reply("that doesn't seem to be a valid number.");
} else if (amount <= 1 || amount > 100) {
return message.reply("you need to input a number between 1 and 99.");
}
message.channel.bulkDelete(amount, true).catch((err) => {
console.error(err);
message.channel.send(
"there was an error trying to prune messages in this channel!"
);
});
},
};
In case you didn't read the DiscordJS docs, you should have an index.js file that looks a little something like this:
const fs = require("fs"); // needed for readdirSync below
const Discord = require("discord.js");
const { prefix, token } = require("./config.json");
const client = new Discord.Client();
client.commands = new Discord.Collection();
const commandFiles = fs
.readdirSync("./commands")
.filter((file) => file.endsWith(".js"));
for (const file of commandFiles) {
const command = require(`./commands/${file}`);
client.commands.set(command.name, command);
}
//client portion:
client.once("ready", () => {
console.log("Ready!");
});
client.on("message", (message) => {
if (!message.content.startsWith(prefix) || message.author.bot) return;
const args = message.content.slice(prefix.length).split(/ +/);
const commandName = args.shift().toLowerCase();
if (!client.commands.has(commandName)) return;
const command = client.commands.get(commandName);
if (command.args && !args.length) {
let reply = `You didn't provide any arguments, ${message.author}!`;
if (command.usage) {
reply += `\nThe proper usage would be: \`${prefix}${command.name} ${command.usage}\``;
}
return message.channel.send(reply);
}
try {
command.execute(message, args);
} catch (error) {
console.error(error);
message.reply("there was an error trying to execute that command!");
}
});
client.login(token);
Another approach could be cloning the channel and deleting the one with the messages you want deleted:
// Clears all messages from a channel by cloning channel and deleting old channel
async function clearAllMessagesByCloning(channel) {
// Clone channel
const newChannel = await channel.clone()
console.log(newChannel.id) // Do with this new channel ID what you want
// Delete old channel
channel.delete()
}
I prefer this method over the others listed in this thread because it most likely takes less time to process and (I'm guessing) puts the Discord API under less stress. Also, channel.bulkDelete() can only delete messages that are newer than two weeks, which means you won't be able to delete every channel message if your channel has messages older than two weeks.
The possible downside is the channel changing its id. If you rely on storing ids in a database or such, don't forget to update those documents with the id of the newly cloned channel, as sketched below.
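For example, if the ids live in a MongoDB-style collection, the update could look roughly like this (a sketch; the collection and field names are made up):
// Hypothetical persistence update: swap the stored id for the clone's id.
const newChannel = await channel.clone();
await db.collection('channels').updateOne(
    { channelId: channel.id },              // the old id you stored earlier
    { $set: { channelId: newChannel.id } }  // the replacement id
);
await channel.delete();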
Here's @Kiyokodyele's answer but with some changes from @user8690818's answer.
(async () => {
    let deleted;
    do {
        // the second argument skips messages older than two weeks,
        // which bulkDelete cannot remove (the call would throw otherwise)
        deleted = await channel.bulkDelete(100, true);
    } while (deleted.size != 0);
})();

Use headless chrome to intercept image request data

I have a use case that needs the Headless Chrome Network domain (https://chromedevtools.github.io/devtools-protocol/tot/Network/) to intercept all image requests and find out the image size before saving it (basically discarding small images such as icons).
However, I am unable to figure out a way to load the image data in memory before saving it. I need to load it into an Img object to get the width and height. Network.getResponseBody takes a requestId, which I don't have access to in Network.requestIntercepted. Also, Network.loadingFinished always gives me "0" in the encodedDataLength variable, and I have no idea why. So my questions are:
How do I intercept all responses from jpg/png requests and get the image data, without saving the file to disk via the URL string and loading it back?
BEST: how do I get the image dimensions from the response headers? Then I wouldn't have to read the data into memory at all.
My code is below:
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs');
(async function() {
async function launchChrome() {
return await chromeLauncher.launch({
chromeFlags: [
'--disable-gpu',
'--headless'
]
});
}
const chrome = await launchChrome();
const protocol = await CDP({
port: chrome.port
});
const {
DOM,
Network,
Page,
Emulation,
Runtime
} = protocol;
await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
await Network.setRequestInterceptionEnabled({enabled: true});
Network.requestIntercepted(({interceptionId, request, resourceType}) => {
if ((request.url.indexOf('.jpg') >= 0) || (request.url.indexOf('.png') >= 0)) {
console.log(JSON.stringify(request));
console.log(resourceType);
if (request.url.indexOf("/unspecified.jpg") >= 0) {
console.log("FOUND unspecified.jpg");
console.log(JSON.stringify(interceptionId));
// console.log(JSON.stringify(Network.getResponseBody(interceptionId)));
}
}
Network.continueInterceptedRequest({interceptionId});
});
Network.loadingFinished(({requestId, timestamp, encodedDataLength}) => {
console.log(requestId);
console.log(timestamp);
console.log(encodedDataLength);
});
Page.navigate({
url: 'https://www.yahoo.com/'
});
Page.loadEventFired(async() => {
protocol.close();
chrome.kill();
});
})();
This should get you 90% of the way there: it gets the body of each image request. You'd still need to base64-decode, check the size, save the file, etc.; a sketch of that follows the code.
const CDP = require('chrome-remote-interface');
const sizeThreshold = 1024;
async function run() {
try {
var client = await CDP();
const { Network, Page } = client;
// enable events
await Promise.all([Network.enable(), Page.enable()]);
// commands
const _url = "https://google.co.za";
let _pics = [];
Network.responseReceived(async ({requestId, response}) => {
let url = response ? response.url : null;
if ((url.indexOf('.jpg') >= 0) || (url.indexOf('.png') >= 0)) {
const {body, base64Encoded} = await Network.getResponseBody({ requestId }); // throws promise error returning null/undefined so can't destructure. Must be different in inspect shell to app?
_pics.push({ url, body, base64Encoded });
console.log(url, body, base64Encoded);
}
});
await Page.navigate({ url: _url });
await sleep(5000);
// TODO: process _pics - base64Encoded, check body.length > sizeThreshold, save etc...
} catch (err) {
if (err.message && err.message === "No inspectable targets") {
console.error("Either chrome isn't running or you already have another app connected to chrome - e.g. `chrome-remote-interface inspect`")
} else {
console.error(err);
}
} finally {
if (client) {
await client.close();
}
}
}
function sleep(milliseconds = 1000) {
if (milliseconds == 0)
return Promise.resolve();
return new Promise(resolve => setTimeout(() => resolve(), milliseconds))
}
run();
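To cover the TODO in the snippet above, here is a rough sketch of decoding, size-checking and saving the collected _pics. As a bonus it reads PNG dimensions straight from the IHDR header (width and height are big-endian uint32s at byte offsets 16 and 20), which partly answers the "BEST" question; JPEGs need SOF-marker parsing, where a library such as image-size is easier.
const fs = require('fs').promises;
const path = require('path');

// Sketch: decode each intercepted body, drop small images, save the rest.
async function processPics(pics, outDir, sizeThreshold = 1024) {
    const pngSignature = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);
    for (const [i, pic] of pics.entries()) {
        const buf = pic.base64Encoded ? Buffer.from(pic.body, 'base64') : Buffer.from(pic.body);
        if (buf.length <= sizeThreshold) continue; // discard icons and other small images
        if (buf.slice(0, 8).equals(pngSignature)) {
            // PNG: the IHDR chunk follows the 8-byte signature; width/height at offsets 16/20.
            console.log(pic.url, `${buf.readUInt32BE(16)}x${buf.readUInt32BE(20)}`);
        }
        await fs.writeFile(path.join(outDir, `img_${i}`), buf);
    }
}
You would call it after the sleep, e.g. await processPics(_pics, './results');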
