How to interrupt puppeteer-cluster execution inside an infinite loop? - node.js

I'm learning how to use puppeteer-cluster and I have a question: how can I interrupt a puppeteer-cluster run that sits inside an infinite loop by pressing a key?
The code would be something like this:
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs').promises;

function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

async function run() {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 2,
    monitor: true,
  });

  await cluster.task(async ({ page, data: acc }) => {
    // Do task ~2 minutes
  });

  // In case of problems, log them
  cluster.on('taskerror', (err, data) => {
    console.log(`Error crawling ${data}: ${err.message}`);
  });

  // Read the accs.csv file from the current directory
  const csvFile = await fs.readFile(__dirname + '/accs.csv', 'utf8');
  const lines = csvFile.split('\n');

  while (true) {
    // for each account in the file
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      cluster.queue(line);
    }
    // sleep for a moment...
    await sleep(60000);
  }

  // unreachable while the loop above runs forever
  await cluster.idle();
  await cluster.close();
}

try {
  run();
} catch (e) {
  console.log(e.message);
}

I managed to do it the way I usually do, using readline. I had thought it wouldn't work because of the monitor puppeteer-cluster shows in the terminal.
If anyone needs an example of how it's done, check this: Break infinite loop user input nodejs
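For completeness, a minimal sketch of that readline approach (the "q" key, the stopRequested flag, and the elided cluster calls are just illustration, not anything puppeteer-cluster provides):

const readline = require('readline');

function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// Put stdin into raw mode so single keypresses arrive without Enter.
readline.emitKeypressEvents(process.stdin);
if (process.stdin.isTTY) process.stdin.setRawMode(true);

let stopRequested = false;
process.stdin.on('keypress', (str, key) => {
  if (str === 'q' || (key && key.ctrl && key.name === 'c')) {
    stopRequested = true;
    console.log('Stop requested, finishing the current pass...');
  }
});

async function run() {
  // ...launch the cluster and register the task exactly as in the question...
  while (!stopRequested) {
    // cluster.queue(line) for each account here
    await sleep(1000);
  }
  // await cluster.idle();
  // await cluster.close();
  if (process.stdin.isTTY) process.stdin.setRawMode(false);
  process.stdin.pause(); // let the process exit once the cluster is closed
}

run().catch(err => console.log(err.message));

The keypress handler only flips a flag; the loop itself decides when to stop, so the cluster still gets a chance to idle and close cleanly.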

Related

NodeJS Worker Threads - message sending before function finished?

I am using a pool of workers to complete some CPU-intensive tasks in Node.
However, I have a problem with my code below. When a task is first run, everything goes as expected: the pool is created, then runTest is called, which sends a message to a thread that runs the task successfully and responds with a message. However, on a second call, the worker's callback (via parentPort.postMessage) seems to run before the work in the threaded file is done. Is this an issue with using promises here? I see the "This run took X ms" log before the "Done" message shows. Why is this happening? Is something wrong with my WorkerPool class?
Am I not able to use any async/promise logic within the worker?
The version of Node I'm using is 14.
I largely followed the example set in this doc: https://nodejs.org/api/async_context.html#class-asyncresource
workerPool.js:
const { AsyncResource } = require("async_hooks");
const { EventEmitter } = require("events");
const path = require("path");
const { Worker } = require("worker_threads");

const kTaskInfo = Symbol("kTaskInfo");
const kWorkerFreedEvent = Symbol("kWorkerFreedEvent");
const { MONGODB_URI } = process.env;

class WorkerPoolTaskInfo extends AsyncResource {
  constructor(callback) {
    super("WorkerPoolTaskInfo");
    this.callback = callback;
  }

  done(err, result) {
    console.log("<<<<<<<<<<<");
    this.runInAsyncScope(this.callback, null, err, result);
    this.emitDestroy(); // TaskInfos are used only once.
  }
}

class WorkerPool extends EventEmitter {
  constructor(numThreads, workerFile) {
    super();
    this.numThreads = numThreads;
    this.workerFile = workerFile;
    this.workers = [];
    this.freeWorkers = [];
    for (let i = 0; i < numThreads; i++) this.addNewWorker();
  }

  addNewWorker() {
    const worker = new Worker(path.resolve(__dirname, this.workerFile), {
      workerData: { MONGODB_URI },
    });
    worker.on("message", (result) => {
      // In case of success: Call the callback that was passed to `runTest`,
      // remove the `TaskInfo` associated with the Worker, and mark it as free
      // again.
      worker[kTaskInfo].done(null, result);
      worker[kTaskInfo] = null;
      this.freeWorkers.push(worker);
      this.emit(kWorkerFreedEvent);
    });
    worker.on("error", (err) => {
      // In case of an uncaught exception: Call the callback that was passed to
      // `runTest` with the error.
      if (worker[kTaskInfo]) worker[kTaskInfo].done(err, null);
      else this.emit("error", err);
      // Remove the worker from the list and start a new Worker to replace the
      // current one.
      this.workers.splice(this.workers.indexOf(worker), 1);
      this.addNewWorker();
    });
    this.workers.push(worker);
    this.freeWorkers.push(worker);
    this.emit(kWorkerFreedEvent);
  }

  runTest(data, callback) {
    if (this.freeWorkers.length === 0) {
      // No free threads, wait until a worker thread becomes free.
      console.log("No free threads. Process queued.");
      this.once(kWorkerFreedEvent, () => this.runTest(data, callback));
      return;
    }
    const worker = this.freeWorkers.pop();
    worker[kTaskInfo] = new WorkerPoolTaskInfo(callback);
    worker.postMessage(data);
  }

  close() {
    for (const worker of this.workers) worker.terminate();
  }
}

module.exports = WorkerPool;
The index.js file where workers are called:
return new Promise(async (resolve, reject) => {
  threadPool.runTest(
    { ...some data },
    (err, result) => {
      if (err) {
        console.error("Bad error from test:", err);
        reject(err);
      }
      const endTime = new Date();
      // this callback is fired before the message is supposed to be posted...
      console.log("Run Test Complete");
      console.log(`This run took ${endTime - startTime} ms`);
      resolve(true);
    }
  );
});
};
The code that actually runs in the worker:
const { parentPort, threadId, workerData } = require("worker_threads");

parentPort.on("message", async (data) => {
  // await func1(...)
  // await func2(...)
  console.log("Done");
  parentPort.postMessage("Done stuff");
});

NodeJS no code running after worker code finishes

I'm having an issue with my worker code. Currently I have some code that scans through two database tables, finds matches, and then adds some data from one table to the other, creating a new table. This is a large set of data, so I'm using worker threads to speed it up.
This all works fine; however, once the worker threads are complete, no other code runs. I've tried adding the function LogData everywhere I can and it will not run. I've even added the console.log("Finished building merge table") and that doesn't run either. Even the parentResolve doesn't happen, as I don't see the console.log("parentResolve") message.
If anyone can help me I would really appreciate it.
const calculateFactorialwithWorker = async () => {
  const SCCM = await ProgramDev.find({ "program name": { $not: { $regex: ".*[(]KB*[)]*" } } }).limit(8000)
  const sccmLength = SCCM.length
  mongoose.connection.close()
  return new Promise(async (parentResolve, parentReject) => {
    const numbers = [...new Array(sccmLength)].map((_, i) => i);
    const segmentSize = Math.ceil(sccmLength / userCPUCount);
    const segments = [];
    for (let segmentIndex = 0; segmentIndex < userCPUCount; segmentIndex++) {
      const start = segmentIndex * segmentSize;
      const end = start + segmentSize;
      const segment = numbers.slice(start, end)
      segments.push(segment);
    }
    try {
      const results = await Promise.all(
        segments.map(
          segment =>
            new Promise((resolve, reject) => {
              const worker = new Worker(workerPath, {
                workerData: segment,
              });
              worker.on('message', resolve);
              worker.on('error', reject);
              worker.on('exit', (code) => {
                if (code !== 0)
                  reject(new Error(`Worker stopped with exit code ${code}`));
              });
            })
        ));
      parentResolve(() => {
        console.log("parentResolve")
      })
    } catch (e) {
      parentReject(e)
    }
  });
};

calculateFactorialwithWorker().then(() => {
  console.log("Finished building merge table")
  LogData
})
Add an if/else block in the worker exit event handler. When exit fires with code === 0, there is currently no resolve/reject to handle it, so those promises are never resolved or rejected.
Ref. https://nodejs.org/api/worker_threads.html#worker_threads_event_exit
Also, I rewrote your code a bit, because some of the promise wrappers are unnecessary.
const calculateFactorialwithWorker = async () => {
  try {
    const SCCM = await ProgramDev.find({
      "program name": { $not: { $regex: ".*[(]KB*[)]*" } },
    }).limit(8000);
    const sccmLength = SCCM.length;
    const numbers = [...new Array(sccmLength)].map((_, i) => i);
    const segmentSize = Math.ceil(sccmLength / userCPUCount);
    const segments = [];
    for (let segmentIndex = 0; segmentIndex < userCPUCount; segmentIndex++) {
      const start = segmentIndex * segmentSize;
      const end = start + segmentSize;
      const segment = numbers.slice(start, end);
      segments.push(segment);
    }
    const promises = segments.map(
      segment =>
        new Promise((resolve, reject) => {
          const worker = new Worker(workerPath, {
            workerData: segment,
          });
          worker.on("message", resolve);
          worker.on("error", reject);
          worker.on("exit", code => {
            if (code !== 0) {
              reject(new Error(`Worker stopped with exit code ${code}`));
            } else {
              resolve();
            }
          });
        })
    );
    await Promise.all(promises);
  } catch (err) {
    throw new Error(err);
  }
};

calculateFactorialwithWorker()
  .then(() => {
    console.log("Finished building merge table");
    LogData();
  })
  .catch(console.log)
  .finally(() => {
    mongoose.connection.close();
  });

debug in chromium / puppeteer doesn't populate evaluate script

So I have the following code launching the browser:
this.browser = await puppeteer.launch({ headless: false, devtools: true, slowMo: 200 });
this.page = await this.browser.newPage();
await this.page.goto(pageUrl);
let result = await this.page.evaluate(() => {
  const labels = document.querySelectorAll("li.product-item");
  let productList = [];
  for (let product of labels) {
    productList.push(product);
  }
  debugger;
  // filter only product stacks that have a price
  const productStacks = productList.filter(product => product.querySelector("span.price-wrapper") !== null);
  let results = productStacks.map(product => {
    return product.querySelector("span.price-wrapper").getAttribute("data-price-amount");
  });
  return results;
});
So Chromium starts up and pauses at the appropriate code (as best as I can tell). I can even see the local variables populate with the expected result and step through the code. However, the open file puppeteer_evaluation_script is not populated with the evaluation script and keeps only the contents below, so I'm stepping through blind.
//# sourceURL=__puppeteer_evaluation_script__
Occasionally, after many minutes, it does actually populate with the code. I have no idea what's wrong; I've tried updating to the latest Node LTS and Puppeteer but get the same behavior.
I don't know what causes this issue, but here are two possible solutions.
To avoid getting:
//# sourceURL=__puppeteer_evaluation_script__
you can expose a function:
const puppeteer = require('puppeteer');

var browser = null;
var page = null;

(async () =>
{
  browser = await puppeteer.launch(
  {
    headless: false,
    devtools: true,
  });
  page = await browser.newPage();
  await page.goto("https://google.com/");

  // Expose a function
  await page.exposeFunction("nothing", () => null);

  await page.evaluate(async function()
  {
    debugger;
    console.log("Do task");
  });
})();
In case that stops working in the future, I made a wrapper that uses eval(), since the source does appear when evaluating that way.
It works with both sync and async functions, and it supports passing arguments and returning values.
function evaluateFixed(page, realFunction)
{
  return page.evaluate(async function(realFunction, args)
  {
    var func = eval(`(${realFunction})`);
    if (func.constructor.name === "AsyncFunction")
      return await func(...args);
    else
      return func(...args);
  },
  realFunction.toString(), Array.from(arguments).slice(2));
}
(async () =>
{
  browser = await puppeteer.launch(
  {
    headless: false,
    devtools: true,
  });
  page = await browser.newPage();
  await page.goto("https://google.com/");
  console.log("Doing test");

  let res = await evaluateFixed(page, async function(x, y, z)
  {
    debugger;
    console.log("Do task", x, y, z);
    function sleep(amount)
    {
      return new Promise((resolve) => setTimeout(resolve, amount));
    }
    for (let i = 0; i < 10; i++)
    {
      console.log("on seconds", i);
      await sleep(1000);
    }
    return { "fee": "foo" };
  }, 1, "two", { "three": 3 });
  console.log("Res 1", res);

  res = await evaluateFixed(page, () =>
  {
    debugger;
    return 1 + 2;
  });
  console.log("Res 2", res);
})();
I was experiencing the same problem.
I thought the timeout for loading the external files might not be sufficient, so I added the parameter slowMo: 1000 and that solved it for me.
Good luck.
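For reference, a minimal sketch of where that option goes (the value 1000 is the one from this answer; the other flags match the question):

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    devtools: true,
    slowMo: 1000, // delay every Puppeteer operation by 1000 ms
  });
  // ...newPage(), goto(), evaluate() as in the question...
  await browser.close();
})();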

Puppeteer waitForSelector is not working inside loop

My task is form submission with different data, so I am using Puppeteer and a for...of loop.
Code example:
const puppeteer = require('puppeteer');
const data = require('data.json'); // ~30 products

(async () => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      ignoreHTTPSErrors: true,
      defaultViewport: null,
    });
    const page2 = await browser.newPage();
    await page2.goto('mywebsite', { waitUntil: 'domcontentloaded' });

    let counter = 0;
    for (let product of data) {
      // Waiting for some selector, after that do something with it
      await page2.waitForSelector("#someSelector", { visible: true });
      await page2.type("#someSelector", product.someData);
      //
      // ... A lot of code that is similar to the above is here
      //
      // Go back after all things done
      await page2.waitFor(2000);
      await page2.waitForSelector('[title="home"]', { visible: true });
      await page2.click('[title="home"]', { clickCount: 1 });
      counter++;
      console.log(
        `===========================================================================${counter}`
      );
    }
  } catch (err) {
    throw new Error(err);
  }
})();
The problem is that this works, but not always; for example, the loop can run 15 times and then fall over, or it can go through a full cycle without failing.
The error is always the same:
UnhandledPromiseRejectionWarning: Error: TimeoutError: waiting for selector "#someSelector" failed: timeout 30000ms exceeded
However, if I check the page, everything is there; the elements are on the page, but Puppeteer does not see them. How can I fix this?
My current solution for this is a retry function:
const chalk = require("chalk");
const util = require("util");
const delay = util.promisify(setTimeout);

async function retry(fn, retryDelay = 200, numRetries = 5) {
  for (let i = 0; i < numRetries; i++) {
    try {
      return await fn();
    } catch (err) {
      console.log(
        "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
      );
      console.log(chalk.yellow(err));
      if (i === numRetries - 1) throw err;
      await delay(retryDelay);
      retryDelay = retryDelay * 2;
    }
  }
}
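For context, a sketch of how this helper wraps the flaky steps inside the question's loop (page2, data, and the selectors are the placeholders from the question; retry is the function defined above):

for (let product of data) {
  // Retry the wait-and-type step with exponential backoff instead of letting
  // a single slow render kill the whole run.
  await retry(async () => {
    await page2.waitForSelector("#someSelector", { visible: true });
    await page2.type("#someSelector", product.someData);
  });

  // ...the remaining form steps, each wrapped the same way...

  await retry(async () => {
    await page2.waitForSelector('[title="home"]', { visible: true });
    await page2.click('[title="home"]', { clickCount: 1 });
  });
}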

Handling promises inside the forEach loop

I am trying to run a series of tasks. Each task is dynamic and can have different rules to follow. This will be executed on AWS Lambda.
I have an array of JSON records. Each record has a body with the task name in it, and it also has attributes.
I need to dynamically load a JavaScript file named after the body.
I need to wait until everything inside that task has finished, or until it fails (regardless of where). If it fails, I need to write that data onto the current record inside the forEach loop.
I have the old issue where my forEach finishes first, without waiting for the tasks to complete.
This is the forEach loop:
const jobLoader = require('./Helpers/jobLoader');

event.Records.forEach(record => {
  const { body: jobName } = record;
  const { messageAttributes } = record;
  const job = jobLoader.loadJob(jobName);

  job.runJob(messageAttributes).then(res => {
    console.log('Show results');
    return; // resume another record from forEach
  }).catch(err => {
    record.failed = true;
    record.failureMessage = err.message;
    console.log('I errored');
  });

  console.log('All Done');
});
The problem is that the message "All Done" is printed first, and only then is "Show results" printed; the results only come back from the database after the loop has already moved on.
This is the file that loads a task:
exports.loadJob = (jobName) => {
  const job = require(`../Tasks/${jobName}`);
  return job;
};
This is the file that contains the actual task:
const mySqlConnector = require('../Storage/mySql');

exports.runJob = async (params) => {
  let payload = {};

  let dataToSend = await getUserName(params.userId.stringValue);
  payload.dataToSend = dataToSend;

  let moreDataToSend = await getEvenMoreData(params.userId.stringValue);
  payload.moreDataToSend = moreDataToSend;

  return await sendData(payload);
};

const getUserName = async (userId) => {
  const query = 'SELECT * FROM user_data';
  return await mySqlConnector.handler(query);
};

const getEvenMoreData = async (userId) => {
  const query = 'SELECT * FROM user_data';
  return await mySqlConnector.handler(query);
};

const sendData = (payload) => {
  // this should be Axios sending data
};
And this is the mySql connector itself:
const mysql = require('promise-mysql');

exports.handler = async (query) => {
  return mysql.createConnection({
    host: '127.0.0.1',
    user: 'root',
    password: '',
    database: 'crm'
  }).then(conn => {
    let result = conn.query(query);
    conn.end();
    return result;
  }).then(rows => {
    // console.log("These are rows:" + rows);
    return rows;
  }).catch(error => {
    return error;
  });
};
The task file can have any number of things it needs to complete, and this will differ as I start adding tasks.
I need job.runJob to complete, or to catch an error from wherever it originated, before I continue with the forEach.
I have tried using map and whatnot, but the end result is always the same.
What am I doing wrong?
You can use the Promise.all method:
const promises = event.Records.map(record => {
  const { body: jobName } = record;
  const { messageAttributes } = record;
  const job = jobLoader.loadJob(jobName);
  return job.runJob(messageAttributes).then(res => {
    console.log('Show results', res);
  }).catch(err => {
    record.failed = true;
    record.failureMessage = err.message;
    console.log('I errored');
    throw new Error('Your error !');
  });
});

try {
  const results = await Promise.all(promises);
  console.log('All done');
} catch (e) {
  console.log('Something has an error', e);
}
Don't forget to make your enclosing function async!
I managed to solve it, and still keep details about the execution:
Something like this:
for (let prop in event.Records) {
  const { body: jobName } = event.Records[prop];
  const { messageAttributes } = event.Records[prop];
  const job = jobLoader.loadJob(jobName);
  await job.runJob(messageAttributes).then(res => {
    console.log('Show results', res);
  }).catch(err => {
    event.Records[prop].failed = true;
    event.Records[prop].failureMessage = err.message;
    console.log('I errored');
  });
}
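A closely related option, not used in the answers above: Promise.allSettled (Node 12.9+) keeps the jobs running concurrently like the Promise.all answer, while still letting you record a per-record failure afterwards. A minimal sketch, assuming the same event.Records shape and jobLoader as the question, inside an async handler:

const jobLoader = require('./Helpers/jobLoader');

const outcomes = await Promise.allSettled(
  event.Records.map(record =>
    jobLoader.loadJob(record.body).runJob(record.messageAttributes)
  )
);

// Mark each record with the outcome of its own job.
outcomes.forEach((outcome, i) => {
  if (outcome.status === 'rejected') {
    event.Records[i].failed = true;
    event.Records[i].failureMessage = outcome.reason.message;
  }
});
console.log('All Done');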
