node.js async/await or generic-pool causes infinite loop? - node.js

I was trying to create an automation script for work. It is supposed to use multiple Puppeteer instances to process input strings simultaneously.
The task queue and the number of Puppeteer instances are controlled by the package generic-pool.
Strangely, when I run the script on Ubuntu or Debian, it seems to fall into an infinite loop and tries to launch an infinite number of Puppeteer instances, while on Windows the output is normal.
const puppeteer = require('puppeteer');
const genericPool = require('generic-pool');
const faker = require('faker');
let options = require('./options');
let i = 0;
let proxies = [...options.proxy];

const pool = genericPool.createPool({
    create: async () => {
        i++;
        console.log(`create instance ${i}`);
        if (!proxies.length) {
            proxies = [...options.proxy];
        }
        let { control = null, proxy } = proxies.pop();
        let instance = await puppeteer.launch({
            headless: true,
            args: [
                `--proxy-server=${proxy}`,
            ]
        });
        instance._own = {
            proxy,
            tor: control,
            numInstance: i,
        };
        return instance;
    },
    destroy: async instance => {
        console.log('destroy instance', instance._own.numInstance);
        await instance.close()
    },
}, {
    max: 3,
    min: 1,
});

async function run(emails = []) {
    console.log('Processing', emails.length);
    const promises = emails.map(email => {
        console.log('Processing', email)
        pool.acquire()
            .then(browser => {
                console.log(`${email} handled`)
                pool.destroy(browser);
            })
    })
    await Promise.all(promises)
    await pool.drain();
    await pool.clear();
}

let emails = [a, b, c, d, e, ];

run(emails)
Output
create instance 1
Processing 10
Processing Stacey_Haley52
Processing Polly.Block
create instance 2
Processing Shanny_Hudson59
Processing Vivianne36
Processing Jayda_Ullrich
Processing Cheyenne_Quitzon
Processing Katheryn20
Processing Jamarcus74
Processing Lenore.Osinski
Processing Hobart75
create instance 3
create instance 4
create instance 5
create instance 6
create instance 7
create instance 8
create instance 9
Is it because of my async functions? How can I fix it?
I appreciate your help!
Edit 1: modified according to what #James suggested.

The main problem you are trying to solve:
It is supposed to use multiple puppeteer instances to process input strings simultaneously.
Promise Queue
You can use a rather simple solution that involves a promise queue. We can use the p-queue package to limit the concurrency as we wish. I have used this on multiple scraping projects to test things out.
Here is how you can use it.
// emails to handle
let emails = [a, b, c, d, e, ];
// create a promise queue
const PQueue = require('p-queue');
// create queue with concurrency, ie: how many instances we want to run at once
const queue = new PQueue({
    concurrency: 1
});
// single task processor (proxy, control and i come from the question's scope)
const createInstance = async (email) => {
    let instance = await puppeteer.launch({
        headless: true,
        args: [
            `--proxy-server=${proxy}`,
        ]
    });
    instance._own = {
        proxy,
        tor: control,
        numInstance: i,
    };
    console.log('email:', email)
    return instance;
}
// add tasks to queue
for (let email of emails) {
    queue.add(async () => createInstance(email))
}
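If you need to know when the whole queue has been processed, p-queue also exposes an onIdle() promise. A small usage sketch (note: depending on your p-queue version, the class may be a default export, e.g. require('p-queue').default):

// wait until every queued task has settled, then report
queue.onIdle().then(() => {
    console.log('all emails handled');
});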
Generic Pool Infinite Loop Problem
I removed all the puppeteer-related code from your sample and saw that it was still producing infinite output to the console.
create instance 70326
create instance 70327
create instance 70328
create instance 70329
create instance 70330
create instance 70331
...
Now, if you test a few times, you will see it falls into the loop only when something in your code is crashing. The culprit is the pool.acquire() promise, which just re-queues creation on error.
To find out what is causing the crash, use the following events:
pool.on("factoryCreateError", function(err) {
console.log('factoryCreateError',err);
});
pool.on("factoryDestroyError", function(err) {
console.log('factoryDestroyError',err);
});
There are some issues related to this (a defensive workaround sketch follows the list):
acquire() never resolves/rejects if factory always rejects, here.
About the acquire function in pool.js, here.
.acquire() doesn't reject when resource creation fails, here.
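Until those issues are resolved, a workaround is to catch failures inside create() yourself, so you can see exactly what is crashing. A minimal sketch, assuming the same proxies array and launch options from the question; it only logs and rethrows, so the error surfaces through the factoryCreateError handler above instead of silently re-queuing:

const pool = genericPool.createPool({
    create: async () => {
        try {
            const { control = null, proxy } = proxies.pop();
            return await puppeteer.launch({
                headless: true,
                args: [`--proxy-server=${proxy}`],
            });
        } catch (err) {
            // surfaced via the factoryCreateError handler above
            console.error('create failed:', err);
            throw err;
        }
    },
    destroy: instance => instance.close(),
}, { max: 3, min: 1 });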
Good luck!

You want to return from your map callback rather than await; also, don't await inside the destroy call, return the result instead, and then you can chain these, e.g.
const promises = emails.map(e => pool.acquire().then(pool.destroy));
Alternatively, you could just get rid of destroy completely, e.g.
pool.acquire().then(b => b.close())
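Putting both points together, a corrected run() could look like this sketch; the key change from the question's version is that the map callback returns the acquire chain, so Promise.all actually waits for it:

async function run(emails = []) {
    console.log('Processing', emails.length);
    const promises = emails.map(email => {
        console.log('Processing', email);
        // return the chain so Promise.all can wait on it
        return pool.acquire().then(browser => {
            console.log(`${email} handled`);
            return pool.destroy(browser);
        });
    });
    await Promise.all(promises);
    await pool.drain();
    await pool.clear();
}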

Related

Async/Await doesn't await

I'm building a Node.js CLI application. Users choose some tasks to run, and based on that, the tasks should do their work and then spinners (using the ora package) should show success and stop spinning.
The issue here is that the spinner succeeds while the tasks are still going on, which means it doesn't wait.
I tried the typical async/await approach, with an async function that awaits each function under its condition. Didn't work.
I tried Promise.all. Didn't work.
I tried waterfall. Same.
Here's the code of the task runner; I create an array of functions and pass it to waterfall (the async-waterfall package) or to Promise.all():
const runner = async () => {
    let tasks = [];
    spinner.start('Running tasks');
    if (syncOptions.includes('taskOne')) {
        tasks.push(taskOne);
    }
    if (syncOptions.includes('taskTwo')) {
        tasks.push(taskTwo);
    }
    if (syncOptions.includes('taskThree')) {
        tasks.push(taskThree);
    }
    if (syncOptions.includes('taskFour')) {
        tasks.push(taskFour);
    }
    // Option One
    waterfall(tasks, () => {
        spinner.succeed('Done');
    });
    // Option Two
    Promise.all(tasks).then(() => {
        spinner.succeed('Done');
    });
};
Here's an example of one of the functions:
const os = require('os');
const fs = require('fs');
const homedir = os.homedir();
const outputDir = `${homedir}/output`;
const file = `${homedir}/.file`;
const targetFile = `${outputDir}/.file`;
module.exports = async () => {
    await fs.writeFileSync(targetFile, fs.readFileSync(file));
};
I tried researching the concepts and talked to the five best people I know who can write JS properly. No clue. What am I doing wrong?
You don't show us all your code, but the first warning sign is that it doesn't appear you are actually running taskOne(), taskTwo(), etc...
You are pushing what look like functions into an array with code like:
tasks.push(taskFour);
And, then attempting to do:
Promise.all(tasks).then(...)
That won't do anything useful because the tasks themselves are never executed. To use Promise.all(), you need to pass it an array of promises, not an array of functions.
So, you would use:
tasks.push(taskFour());
and then:
Promise.all(tasks).then(...);
And, all this assumes that taskOne(), taskTwo(), etc... are functions that return a promise that resolves/rejects when their asynchronous operation is complete.
In addition, you also need to either await Promise.all(...) or return Promise.all() so that the caller will be able to know when they are all done. Since this is the last line of your function, I'd generally just use return Promise.all(...) and this will let the caller get the resolved results from all the tasks (if those are relevant).
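For example, a hypothetical caller can then reliably wait on it:

// only runs once every task has settled
runner()
    .then(() => console.log('all tasks finished'))
    .catch(err => console.error('a task failed:', err));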
Also, this doesn't make much sense:
module.exports = async () => {
    await fs.writeFileSync(targetFile, fs.readFileSync(file));
};
You're using two synchronous file operations. They are not asynchronous and do not use promises so there's no reason to put them in an async function or to use await with them. You're mixing two models incorrectly. If you want them to be synchronous, then you can just do this:
module.exports = () => {
    fs.writeFileSync(targetFile, fs.readFileSync(file));
};
If you want them to be asynchronous and return a promise, then you can do this:
module.exports = async () => {
    return fs.promises.writeFile(targetFile, await fs.promises.readFile(file));
};
Your implementation was attempting to be half and half. Pick one architecture or the other (synchronous or asynchronous) and be consistent in the implementation.
FYI, the fs module now has multiple versions of fs.copyFile() so you could also use that and let it do the copying for you. If this file was large, copyFile() would likely use less memory in doing so.
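For instance, a minimal promise-based sketch of the same module using fs.promises.copyFile (available since Node 10), reusing the file and targetFile paths from above:

module.exports = () => {
    // copies without buffering the whole file in memory
    return fs.promises.copyFile(file, targetFile);
};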
As for your use of waterfall(), it is probably not necessary here and waterfall uses a very different calling model than Promise.all() so you certainly can't use the same model with Promise.all() as you do with waterfall(). Also, waterfall() runs your functions in sequence (one after the other) and you pass it an array of functions that have their own calling convention.
So, assuming that taskOne, taskTwo, etc... are functions that return a promise that resolve/reject when their asynchronous operations are done, then you would do this:
const runner = () => {
    let tasks = [];
    spinner.start('Running tasks');
    if (syncOptions.includes('taskOne')) {
        tasks.push(taskOne());
    }
    if (syncOptions.includes('taskTwo')) {
        tasks.push(taskTwo());
    }
    if (syncOptions.includes('taskThree')) {
        tasks.push(taskThree());
    }
    if (syncOptions.includes('taskFour')) {
        tasks.push(taskFour());
    }
    return Promise.all(tasks).then(() => {
        spinner.succeed('Done');
    });
};
This would run the tasks in parallel.
If you want to run the tasks in sequence (one after the other), then you would do this:
const runner = async () => {
    spinner.start('Running tasks');
    if (syncOptions.includes('taskOne')) {
        await taskOne();
    }
    if (syncOptions.includes('taskTwo')) {
        await taskTwo();
    }
    if (syncOptions.includes('taskThree')) {
        await taskThree();
    }
    if (syncOptions.includes('taskFour')) {
        await taskFour();
    }
    spinner.succeed('Done');
};
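One more consideration, as a hedged suggestion: in both versions, if any task rejects, the spinner keeps spinning forever. Wrapping the awaits in try/catch lets you surface the failure with ora's spinner.fail():

const runner = async () => {
    spinner.start('Running tasks');
    try {
        if (syncOptions.includes('taskOne')) {
            await taskOne();
        }
        // ...remaining tasks as above...
        spinner.succeed('Done');
    } catch (err) {
        spinner.fail(`Failed: ${err.message}`);
    }
};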

Can a SharedArrayBuffer be picked up by garbage collection in Node?

I'm trying to build a Node application using worker threads, divided into three parts.
The primary thread that delegates tasks
A dedicated worker thread that updates shared data
A pool of worker threads that run calculations on shared data
The shared data is in the form of several SharedArrayBuffer objects operating like a pseudo-database. I would like to be able to update the data without needing to pause calculations, and I'm ok with a few tasks using slightly stale data. The flow I've come up with is:
Primary thread passes data to update thread
Update thread creates a whole new SharedArrayBuffer and populates it with updated data.
Update thread returns a pointer to the new buffer back to primary thread.
Primary thread caches the latest pointer in a variable, overwriting its previous value, and passes it to each worker thread with each task.
Worker threads don't retain these pointers at all after executing their operations.
The problem is, this seems to create a memory leak in the resident set size (RSS) when I run a prototype that frequently makes updates and swaps out the shared buffers. Garbage collection appears to make a couple of passes removing the discarded buffers, but then memory climbs continuously until the application slows and eventually hangs or crashes.
How can I guarantee that a SharedArrayBuffer will get picked up by garbage collection when I'm done with it, or is it even possible? I've seen hints to the effect that as long as all references to it are removed from all threads it will eventually be collected, but no clear answer.
I'm using the threads.js library to abstract the worker thread operations. Here's a summary of my prototype:
app.ts:
import { ModuleThread, Pool, spawn, Worker } from "threads";
import { WriterModule } from "./workers/writer-worker";
import { CalculateModule } from "./workers/calculate-worker";

class App {
    calculatePool = Pool<ModuleThread<CalculateModule>>(
        () => spawn(new Worker('./workers/calculate-worker')), { size: 6 });
    writerThread: ModuleThread<WriterModule>;
    sharedBuffer: SharedArrayBuffer;
    dataView: DataView;

    constructor() {
        this.sharedBuffer = new SharedArrayBuffer(1000000);
        this.dataView = new DataView(this.sharedBuffer);
    }

    async start(): Promise<void> {
        this.writerThread = await spawn<WriterModule>(new Worker('./workers/writer-worker'));
        await this.writerThread.init(this.sharedBuffer);
        await this.update();
        // Arbitrary delay between updates
        setInterval(() => this.update(), 5000);
        while (true) {
            // Arbitrary delay between tasks
            await new Promise<void>(resolve => setTimeout(() => resolve(), 250));
            this.calculate();
        }
    }

    async update(): Promise<void> {
        const updates: any[] = [];
        // generates updates
        this.sharedBuffer = await this.writerThread.update(updates);
        this.dataView = new DataView(this.sharedBuffer);
    }

    async calculate(): Promise<void> {
        const task = this.calculatePool.queue(async (calc) => calc.calculate(this.sharedBuffer));
        const sum: number = await task;
        // Use result
    }
}

const app = new App();
app.start();
writer-worker.ts:
import { expose } from "threads";

let sharedBuffer: SharedArrayBuffer;

const writerModule = {
    async init(startingBuffer: SharedArrayBuffer): Promise<void> {
        sharedBuffer = startingBuffer;
    },
    async update(data: any[]): Promise<SharedArrayBuffer> {
        // Arbitrary update time
        await new Promise<void>(resolve => setTimeout(() => resolve(), 500));
        const newSharedBuffer = new SharedArrayBuffer(1000000);
        // Copy some values from the old buffer over, perform some mutations, etc.
        sharedBuffer = newSharedBuffer;
        return sharedBuffer;
    },
}

export type WriterModule = typeof writerModule;
expose(writerModule);
calculate-worker.ts:
import { expose } from "threads";

const calculateModule = {
    async calculate(sharedBuffer: SharedArrayBuffer): Promise<number> {
        const view = new DataView(sharedBuffer);
        // Arbitrary calculation time
        await new Promise<void>(resolve => setTimeout(() => resolve(), 100));
        // Run arbitrary calculation
        return sum;
    }
}

export type CalculateModule = typeof calculateModule;
expose(calculateModule);

How to stop async code from running Node.JS

I'm creating a program where I constantly run and stop async code, but I need a good way to stop the code.
Currently, I have tried two methods:
Method 1:
When a method is running and another method is called to stop the first one, I start an infinite loop to stop that code from running and then remove the method from the queue (array).
I'm 100% sure this is the worst way to accomplish it, and it works very poorly.
Code:
class test {
    async Start() {
        const response = await request(options);
        if (stopped) {
            while (true) {
                await timeout(10)
            }
        }
    }
}
Code 2:
var tests = [];

Start() {
    const test = new test();
    tests.push(test)
    tests.Start();
}

Stop() {
    tests.forEach((t, i) => { t.stopped = true; });
    tests = [];
}
Method 2:
I load the different methods into Workers, and when I need to stop the code, I just terminate the Worker.
It always takes a lot of time (about 1 second) to create the Worker, so it's not the best approach, since I need the code to run without 1-2 second pauses.
Code:
const Worker = require("tiny-worker");
const code = new Worker(path.resolve(__dirname, "./Code/Code.js"))
Stopping:
code.terminate()
Is there any other way that I can stop async code?
The program makes requests using the Node.js request-promise module, so the program is waiting on requests; it's hard to stop the code without one of the two methods above.
Is there any other way that I can stop async code?
Keep in mind the basics of how Node.js works; I think there is some misunderstanding here.
It executes the current function in the current context; if it encounters an async operation, the event loop schedules its execution somewhere in the future. There is no way to remove that scheduled execution.
More info on the event loop here.
In general, to manage these kinds of situations you should use flags or semaphores.
The program contains Request using nodejs Request-promise module, so program is waiting for requests, it's hard to stop the code
If you need to hard "stop the code" you can do something like
function stop() {
    process.exit()
}
But if I'm getting it right, you're launching requests every x amount of time, and at some point you need to stop sending requests without handling the responses.
You can't de-schedule the response-handling portion, but you can add some logic to it so that, when it eventually runs, it checks whether the "request loop" has been stopped.
let loop_is_stopped = false
let sending_loop = null

async function sendRequest() {
    const response = await request(options) // "wait here"
    // following lines are scheduled after the request promise is resolved
    if (loop_is_stopped) {
        return
    }
    // do something with the response
}

function start() {
    sending_loop = setInterval(sendRequest, 1000)
}

function stop() {
    loop_is_stopped = true
    clearInterval(sending_loop)
}
module.exports = { start, stop }
We can use Promise.all without killing the whole app (process.exit()); here is my example (you can use another trigger for calling controller.abort()):
const controller = new AbortController();

class Workflow {
    static async startTask() {
        await new Promise((res) => setTimeout(() => {
            res(console.log('RESOLVE'))
        }, 3000))
    }
}

class ScheduleTask {
    static async start() {
        return await Promise.all([
            new Promise((_res, rej) => { if (controller.signal.aborted) return rej('YAY') }),
            Workflow.startTask()
        ])
    }
}

setTimeout(() => {
    controller.abort()
    console.log("ABORTED!!!");
}, 1500)

const run = async () => {
    try {
        await ScheduleTask.start()
        console.log("DONE")
    } catch (err) {
        console.log("ERROR", err.name)
    }
}

run()
// ABORTED!!!
// RESOLVE
"DONE" will never be showen.
res will be complited
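One hedged refinement: the first promise checks controller.signal.aborted only once, at creation, so an abort that arrives later is never observed by it. Since AbortSignal is an EventTarget in Node, you can listen for the abort event and reject promptly:

// rejects as soon as abort() fires, not only if already aborted
const abortPromise = new Promise((_res, rej) => {
    if (controller.signal.aborted) return rej(new Error('aborted'));
    controller.signal.addEventListener('abort', () => rej(new Error('aborted')));
});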
Maybe it would be better to run your code as a script with its own process.pid; when we need to interrupt this functionality, we can kill that process by pid from another place in your code with process.kill.
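A minimal sketch of that idea, assuming the long-running work lives in a hypothetical worker.js; child_process.fork and process.kill are standard Node APIs:

const { fork } = require('child_process');

// run the long-lived work in its own process
const child = fork('./worker.js');

// later, stop it abruptly by pid (sends SIGTERM by default)
function stop() {
    process.kill(child.pid);
}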

what is the right way to fork a loop in node.js

So I have created a server which collects data and writes it into a db in a never-ending loop.
server.listen(3001, () => {
    doFullScan();
});

async function doFullScan() {
    while (true) {
        await collectAllData();
    }
}
collectAllData() is a method which checks for available projects, loops through each project, collects some data, and writes it into the db.
async function collectAllData() {
    // doing something
    const projectNames = ['array with projects name'];
    // this loop takes too much time
    for (let project in projectNames) {
        await collectProjectData(project);
    }
    // doing something
}
The problem is that the whole loop takes too much time, so I would like to speed it up by multithreading the loop and using all of my computer's cores on it.
How should I do it?
There is the cluster library, with examples at https://nodejs.org/docs/latest/api/cluster.html, but I don't want to create new servers. I want to spawn children which will do a task and exit after they have done their job.
So there is const { fork } = require('child_process');, but I'm not exactly sure how to make each fork run only the collectProjectData() method.
You can do it natively without any third party libraries.
Right now, your for...loop is running each one after the other.
Option 1
Use Promise.all and .map
await Promise.all(projectNames.map(async (projectName) => {
    await collectProjectData(projectName);
}));
Note, if you use .map, it will kick off all of them at the same time, which might be too much if projectNames continues to grow.
This is the complete opposite of what yours is doing currently.
Option 2
There is a middle way...running batches in sequence, but items inside each batch asynchronously.
const chunk = (a, l) => a.length === 0 ? [] : [a.slice(0, l)].concat(chunk(a.slice(l), l));
const batchSize = 10;
const projectNames = ['array with projects name'];
let projectNamesInChunks = chunk(projectNames, batchSize);

for (let chunk of projectNamesInChunks) {
    await Promise.all(chunk.map(async (projectName) => {
        await collectProjectData(projectName);
    }));
}
I recommend using bluebird's Promise.map (http://bluebirdjs.com/docs/api/promise.map.html); that way you can control the level of concurrency as you wish, like this:
await Promise.map(projectNames, collectProjectData, {concurrency: 3})

ES6 - make multiple requests for multiple user accounts in parallel

I am building an express.js web application, and for one of the API requests I need to make multiple requests for multiple user accounts in parallel and return one object.
I tried using generators and Promise.all but I have 2 problems:
It doesn't run in parallel for all user accounts.
My code finishes after the response has already been returned.
Here is the code I wrote:
function getAccountsDetails(req, res) {
    let accounts = [ '1234567890', '7856239487'];
    let response = { accounts: [] };
    _.forEach(accounts, Promise.coroutine(function *(accountId) {
        let [ firstResponse, secondResponse, thirdResponse ] = yield Promise.all([
            firstRequest(accountId),
            secondRequest(accountId),
            thirdRequest(accountId)
        ]);
        let userObject = Object.assign(
            {},
            firstResponse,
            secondResponse,
            thirdResponse
        );
        response.accounts.push(userObject);
    }));
    res.json(response);
}
_.forEach is not aware of Promise.coroutine and it is not using the return values.
Since you're already using bluebird, you can use its promise-aware helpers instead:
function getAccountsDetails(req, res) {
    let accounts = [ '1234567890', '7856239487'];
    return Promise.map(accounts, (accountId) => Promise.props({ // wait for the object's promises
        firstResponse: firstRequest(accountId),
        secondResponse: secondRequest(accountId),
        thirdResponse: thirdRequest(accountId)
    })).tap(r => res.json(r)); // it's useful to still return the promise
}
And that should be the code in its entirety.
Coroutines are great, but they're useful for synchronizing asynchronous stuff - in your case you actually do want the concurrency features.
