Loop through an API GET request with a variable URL - node.js

I am trying to call CompaniesHouse API and fetch companies registered between November and February. The approach I took is to pick a starting index(a company registered in November) and a stop index(a company registered in February) and loop through to get the companies registered between the start and stop index. Like so:
var needle = require("needle");

var startIdx = 11059000;
var stopIdx = 11211109;

for (idx = startIdx; idx < stopIdx; idx++) {
    needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
        username: key, password: ""
    })
    .then(function(data) {
    })
    .catch(function(err) {
        console.log('Call the locksmith!' + err)
    });
}
But this doesn't work, as it gives either a timeout or a socket hangup error.
The API is currently in beta and some features are yet to be implemented.

Because the for loop runs synchronously and your calls to needle() are asynchronous and therefore do not block, you end up attempting to start more than 100,000 network requests at once. This overwhelms either your local computer or the target server and you start getting socket errors.
For this many requests, you need to run them X at a time so no more than X are in flight at the same time. To maximize performance, you will have to figure out what value of X you want to use because it will depend upon the target server and how it handles lots of simultaneous requests. It is generally safe to start with a value of 5 and then increase it from there to test higher values.
If you were processing an array, there are a number of pre-built options to run X requests at once. The simplest is to use a pre-built concurrency management option such as Bluebird's Promise.map(). Or you can write your own. You can see examples of both here: Make several requests to an API that can only handle 20 request a minute
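For reference, here's a minimal sketch of that kind of pre-built option, using Bluebird's Promise.map() with its concurrency option (urls and fetchOne are hypothetical placeholders, not part of the original code):

const Promise = require("bluebird");

// urls is a hypothetical array of request targets;
// fetchOne(url) is assumed to return a promise per request
Promise.map(urls, url => fetchOne(url), { concurrency: 5 })
    .then(results => console.log(results.length, "requests finished"));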
But, since you are not processing an array, but are just incrementing a number for each successive request, I couldn't find a pre-built option that does that. So, I wrote a general purpose one where you can fill in the function that will increment your index:
// fn gets called on each iteration - must return a promise
// limit is the max number of requests in flight at once
// cnt is the number of times to call fn
// options is optional and can be {continueOnError: true}
// runN returns a promise that resolves with the results array.
// If continueOnError is set, then the results array contains error
// values too (presumed to be instanceof Error so the caller can
// discern them from regular values)
function runN(fn, limit, cnt, options = {}) {
    return new Promise((resolve, reject) => {
        let inFlightCntr = 0;
        let results = [];
        let cntr = 0;
        let doneCnt = 0;

        function run() {
            while (inFlightCntr < limit && cntr < cnt) {
                let resultIndex = cntr++;
                ++inFlightCntr;
                fn().then(result => {
                    --inFlightCntr;
                    ++doneCnt;
                    results[resultIndex] = result;
                    run();      // run any more that still need to be run
                }).catch(err => {
                    --inFlightCntr;
                    ++doneCnt;
                    if (options.continueOnError) {
                        // assumes error is instanceof Error so caller can tell the
                        // difference between a genuine result and an error
                        results[resultIndex] = err;
                        run();  // run any more that still need to be run
                    } else {
                        reject(err);
                    }
                });
            }
            if (doneCnt === cnt) {
                resolve(results);
            }
        }
        run();
    });
}
Then, you could use it like this:

const needle = require("needle");

const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;
let idxCntr = startIdx;

runN(function() {
    let idx = idxCntr++;
    return needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
        username: key, password: ""
    });
}, numConcurrent, stopIdx - startIdx + 1, { continueOnError: true }).then(results => {
    console.log(results);
}).catch(err => {
    console.log(err);
});
To minimize memory use, you can use a .then() handler on your call to needle() and trim down the response to only what you need in the final array:
const needle = require("needle");

const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;
let idxCntr = startIdx;

runN(function() {
    let idx = idxCntr++;
    return needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
        username: key, password: ""
    }).then(response => {
        // construct the smallest possible response here and then return it
        // to minimize memory use for your 100,000+ requests
        return response.someProperty;
    });
}, numConcurrent, stopIdx - startIdx + 1, { continueOnError: true }).then(results => {
    console.log(results);
}).catch(err => {
    console.log(err);
});

A simple Promise.all implementation can help:

var needle = require("needle");

var startIdx = 11059000;
var stopIdx = 11211109;
const promises = [];

for (idx = startIdx; idx < stopIdx; idx++) {
    promises.push(
        needle('get', "https://api.companieshouse.gov.uk/company/" + idx, {
            username: key, password: ""
        })
    );
}

Promise.all(promises)
    .then(results => { console.log(results); })
    .catch(err => console.log(err));
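Note that Promise.all rejects as soon as any single request fails, and this version still starts every request at once, so the concurrency-limited approach above is safer for 150,000+ URLs. If you do want every outcome regardless of individual failures, the built-in Promise.allSettled() (Node 12.9+) collects them all; a minimal sketch:

Promise.allSettled(promises).then(outcomes => {
    // each outcome is { status: 'fulfilled', value } or { status: 'rejected', reason }
    const successes = outcomes.filter(o => o.status === 'fulfilled').map(o => o.value);
    console.log(successes.length + " of " + outcomes.length + " requests succeeded");
});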

Related

Can't break while loop from within after obtaining result from a spawned process

I have been trying to make a video listing function that uses node.js's spawn to spawn a yt-dlp process, whose output gets stored in a database.
It works (though not as expected: the save order gets messed up even then) when I give it the size of the playlist it must process, but when the submitted playlist size is not known, I can't stop the while loop that I have been using to run it.
Here is the function:
const { Sequelize, DataTypes } = require('sequelize'); // including this just in case
const { spawn } = require("child_process");

async function list_background(body_url, start_num, stop_num, chunk_size) {
    // sleep just to make it possible to catch
    // await sleep(2 * 1000);
    console.log('\nlisting in background');
    var i = 0;
    var dont_stop = true;
    // need to find a way to make the loop work only until the time we get a response
    // an empty response means we should stop
    // while (dont_stop) { // this is disastrous as the variable never gets updated
    while (i < 10) {
        // prepare an empty string to append all the data to
        var response = '';
        // make the start and stop numbers
        start_num = parseInt(start_num) + chunk_size;
        stop_num = parseInt(stop_num) + chunk_size;
        console.log("\nsupplied data:", "\ni:", i, "\nbody_url:", body_url, "\nstart_num:", start_num, "\nstop_num:", stop_num, "\nchunk_size", chunk_size);
        // actually spawn the thing
        const yt_list = spawn("yt-dlp", ["--playlist-start", start_num, "--playlist-end", stop_num, "--flat-playlist",
            "--print", '%(title)s\t%(id)s\t%(webpage_url)s', body_url]);
        yt_list.stdout.on("data", async data => {
            response += data;
        });
        yt_list.stderr.on("data", data => {
            response = `stderr: ${data}`;
        });
        yt_list.on('error', (error) => {
            response = `error: ${error.message}`;
        });
        // apparently await has no effect on this expression
        // but then how are we supposed to know when to stop?
        // the listing only ends when dont_stop is false
        yt_list.on("close", async (code) => {
            end = `child process exited with code ${code}`;
            response_list = response.split("\n");
            // remove the "" from the end of the list
            response_list.pop();
            // get the status at the end
            console.log("\ndata after processing\ni:", i, "response:\n", response, "\nresponse_list:", response_list, "\nresponse_list.length:", response_list.length, "\n");
            if (response_list == '') {
                // basically when the response is empty it means that all
                // the items have been listed and the function can just return
                // this should then break the outer listing loop
                console.log("no videos found", "\ni:", i, "\n");
                // break won't work as `Jump target cannot cross function boundary.ts(1107)`
                // so I am returning false to dont_stop and if dont_stop is true then the loop
                // should stop in the next iteration
                dont_stop = false;
            } else {
                // adding the items to db
                console.log("adding items to db", "\ni:", i, "\n");
                await Promise.all(response_list.map(async (element) => {
                    var items = element.split("\t");
                    // console.log(items, items.length, "\ni:", i, "\n");
                    // update the videos too here by looking for any changes that could have been made
                    // use findOrCreate here to update the entries
                    if (items.length == 3) {
                        try {
                            if (items[0] == "[Deleted video]" || items[0] == "[Private video]") {
                                item_available = false;
                            } else {
                                item_available = true;
                            }
                            const [found, created] = await vid_list.findOrCreate({
                                where: { url: items[2] },
                                defaults: {
                                    id: items[1],
                                    reference: body_url,
                                    title: items[0],
                                    downloaded: false,
                                    available: item_available
                                }
                            })
                            // if (created)
                            //     console.log("\nsaved", items[0], "\ni:", i, "\n");
                            // else
                            if (found) {
                                if (!item_available) {
                                    found.available = false;
                                    // console.log("\nfound", items[0], "updated", "\ni:", i, "\n");
                                } else {
                                    // console.log("\nfound", items[0], "no changes", "\ni:", i, "\n");
                                }
                                found.changed('updatedAt', true);
                            }
                        } catch (error) {
                            // remember to uncomment this later; the sequelize errors are not relevant here now
                            // console.error(error);
                        }
                    }
                }));
                dont_stop = true;
            }
        });
        console.log('\n\ndont_stop', dont_stop, "\ni:", i, "\n");
        i++;
    }
    console.log('\noutside the loop, and presumably done', "\ni:", i, "\n");
}
This is the test data that I use:

const daft_punk_essentials = { url: "https://www.youtube.com/playlist?list=PLSdoVPM5WnneERBKycA1lhN_vPM6IGiAg", size: 22 };
// first 10 will be listed by the main method so the number of videos that we should get here is total-10
list_background(daft_punk_essentials['url'], 1, 10, 10);
I recorded the output of the execution to find out what is happening:
can't_stop.log
From my observations, I have found that the spawn doesn't start until after the loop has finished, which is why I had to limit it to 10; without a limit it just crashes my computer (see the log file for how that happens).
Now, I know about await Promise.all() to wait for its internal stuff to complete, but I don't get how to implement this for a while loop that needs to process parts of a list in order to add them to a db.
I am not sure if this is the right approach. I used a while loop because there can be up to 5000 videos in a playlist, and using a for loop to make chunks would be wasteful if the playlist has, say, < 500 videos.
The beauty of using promises and async/await is that you can use normal flow of control programming with loops, break, return, etc... because your code isn't running inside of event triggered callback functions which have no control over the higher level scope.
So, the first thing to clean up here is to take all the .on() event handling from the spawn() and wrap it into a promise, so that it can all be abstracted away in a separate function that you can use await on.
Then, I'd also suggest breaking some of the complication you have into separate functions as that will also allow you to more simply see and control the flow.
I did not follow everything you were trying to do in this loop or how you want to handle all possible error conditions so I'm sure this will need some further tweaking, but here's the general idea.
Synopsis of Changes
Put the spawn operation into a separate function, which I called getVideoInfo(), that returns a promise that resolves/rejects when it's done. This wraps all the .on() event handlers in a promise that the caller can deal with more simply.
Break out the functionality that adds items to the DB into its own function. This is done just to simplify the code and make the main control flow easier to follow and see and write.
Just use a while (true) loop and when you're done, you can simply return. No need for stop loop variables or any of that.
Here's the general idea for how that could look (you will likely have to fix up some details and error handling since I can't run this myself).
const { Sequelize, DataTypes } = require('sequelize'); // including this just in case
const { spawn } = require("child_process");

function getVideoInfo(body_url, start_num, stop_num) {
    return new Promise((resolve, reject) => {
        // actually spawn the thing
        let response = "";
        const yt_list = spawn("yt-dlp", [
            "--playlist-start",
            start_num,
            "--playlist-end",
            stop_num,
            "--flat-playlist",
            "--print", '%(title)s\t%(id)s\t%(webpage_url)s',
            body_url
        ]);
        yt_list.stdout.on("data", data => {
            response += data;
        });
        yt_list.stderr.on("data", data => {
            reject(new Error(`stderr: ${data}`));
        });
        yt_list.on("close", (code) => {
            resolve(response);
        });
        yt_list.on("error", reject);
    });
}

async function addItemsToDb(response_list, body_url) {
    // adding the items to db
    console.log("adding items to db");
    await Promise.all(response_list.map(async (element) => {
        const items = element.split("\t");
        // update the videos too here by looking for any changes that could have been made
        // use findOrCreate here to update the entries
        if (items.length === 3) {
            try {
                const item_available = items[0] !== "[Deleted video]" && items[0] !== "[Private video]";
                const [found, created] = await vid_list.findOrCreate({
                    where: { url: items[2] },
                    defaults: {
                        id: items[1],
                        reference: body_url,
                        title: items[0],
                        downloaded: false,
                        available: item_available
                    }
                });
                if (found) {
                    if (!item_available) {
                        found.available = false;
                    }
                    found.changed('updatedAt', true);
                }
            } catch (error) {
                // remember to uncomment this later; the sequelize errors are not relevant here now
                // console.error(error);
            }
        }
    }));
}

async function list_background(body_url, start_num, stop_num, chunk_size) {
    console.log('\nlisting in background');
    start_num = parseInt(start_num);
    stop_num = parseInt(stop_num);
    while (true) {
        // make the start and stop numbers
        start_num += chunk_size;
        stop_num += chunk_size;
        const response = await getVideoInfo(body_url, start_num, stop_num);
        const response_list = response.split("\n");
        // remove the "" from the end of the list
        response_list.pop();
        // an empty list means we've run off the end of the playlist
        if (response_list.length === 0) {
            return;
        } else {
            await addItemsToDb(response_list, body_url);
        }
    }
}
P.S. I don't understand why you're adding chunk_size to start_num before you ever use it. It seems like you'd want to do that after you do the first iteration so you start at start_num, not start at start_num + chunk_size. But, this is how your original code was written so I left it that way.
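For what it's worth, if you did want the first iteration to start at start_num, a minimal reordering of the loop above would advance the window only after it has been used (same assumptions as the sketch above):

while (true) {
    const response = await getVideoInfo(body_url, start_num, stop_num);
    // ... split, check, and store the response as above ...
    // advance the window only after this iteration has used it
    start_num += chunk_size;
    stop_num += chunk_size;
}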

BatchWrite in AWS DynamoDB skipping some items

I am trying to write items to AWS DynamoDB using the Node SDK. The problem I am facing is that when I write batch items to AWS in parallel using threads, some of the items are not written to the database. The number of items written is random: if I run my code 3 times, one time it would be 150, the next it would be 200, and the third time it could be 135.
In addition, when I write the items sequentially without threads, some of the items are still missing, though fewer of them. For instance, if the total number of items is 300, then 298 items are written. I investigated the problem to see if there are any unprocessed items, but the batchWrite method returns nothing, which suggests all the items are being processed correctly. Please note that I have OnDemand provisioning for the database, so I do not expect any throttling issues. So here is my code.
exports.run = async function() {
    // This is the function which runs first !!!!!
    const data = await getArrayOfObjects();
    console.log("TOTAL PRICE CHANGES");
    console.log(data.length);
    const batchesOfData = makeBatches(data);
    const threads = new Set();
    console.log("**********");
    console.log(batchesOfData.length);
    console.log("**********");
    for (let i = 0; i < batchesOfData.length; i++) {
        console.log("BATCH!!!!!");
        console.log(i);
        console.log(batchesOfData[i].length);
        // Sequential approach
        const response = await compensationHelper.createItems(batchesOfData[i]);
        console.log("RESPONSE");
        console.log(response);
        // Parallel approach
        // const workerResult = await runService(batchesOfData[i])
        // console.log("WORKER RESULT!!!!")
        // console.log(workerResult);
    }
}
exports.updateItemsInBatch = async function(data, tableName) {
    console.log("WRITING DATA");
    console.log(data.length);
    const batchItems = {
        RequestItems: {},
    };
    batchItems.RequestItems[tableName] = data;
    try {
        const result = await documentClient.batchWrite(batchItems).promise();
        console.log("UNPROCESSED ITEMS");
        console.log(result);
        if (result instanceof Error) {
            console.log(`[Error]: ${JSON.stringify(Error)}`);
            throw new Error(result);
        }
        return Promise.resolve(true);
    } catch (err) {
        console.error(`[Error]: ${JSON.stringify(err.message)}`);
        return Promise.reject(new Error(err));
    }
};

exports.convertToAWSCompatibleFormat = function(data) {
    const awsCompatibleData = [];
    data.forEach(record => awsCompatibleData.push({ PutRequest: { Item: record } }));
    return awsCompatibleData;
};

const createItems = async function(itemList) {
    try {
        const objectsList = [];
        for (let index = 0; index < itemList.length; index++) {
            try {
                const itemListObj = itemList[index];
                const ObjToBeInserted = {
                    // some data assignments here
                };
                objectsList.push(ObjToBeInserted);
                if (
                    objectsList.length >= AWS_BATCH_SIZE ||
                    index === itemList.length - 1
                ) {
                    const awsCompatibleFormat = convertToAWSCompatibleFormat(
                        objectsList
                    );
                    await updateItemsInBatch(
                        awsCompatibleFormat,
                        process.env.myTableName
                    );
                }
            } catch (error) {
                console.log(`[Error]: ${JSON.stringify(error)}`);
            }
        }
        return Promise.resolve(true);
    } catch (err) {
        return Promise.reject(new Error(err));
    }
};
const makeBatches = products => {
    const productBatches = [];
    let countr = -1;
    for (let index = 0; index < products.length; index++) {
        if (index % AWS_BATCH_SIZE === 0) {
            countr++;
            productBatches[countr] = [];
            if (countr === MAX_BATCHES) {
                break;
            }
        }
        try {
            productBatches[countr].push(products[index]);
        } catch (error) {
            continue;
        }
    }
    return productBatches;
};
async function runService(workerData) {
    return new Promise((resolve, reject) => {
        const worker = new Worker(path.join(__dirname, './worker.js'), { workerData });
        worker.on('message', resolve);
        worker.on('error', reject);
        worker.on('exit', (code) => {
            if (code !== 0)
                reject(new Error(`Worker stopped with exit code ${code}`));
        });
    });
}

// My worker file
'use strict';
const { workerData, parentPort } = require('worker_threads');
const createItems = require('myscripts');

// You can do any heavy stuff here, in a synchronous way
// without blocking the "main thread"
console.log("I AM A NEW THREAD");
createItems(workerData);
// console.log('Going to write tons of content on file ' + workerData);
parentPort.postMessage({ fileName: workerData, status: 'Done' });
From the boto3 documentation:
If one or more of the following is true, DynamoDB rejects the entire batch write operation:
One or more tables specified in the BatchWriteItem request does not exist.
Primary key attributes specified on an item in the request do not match those in the corresponding table's primary key schema.
You try to perform multiple operations on the same item in the same BatchWriteItem request. For example, you cannot put and delete the same item in the same BatchWriteItem request.
Your request contains at least two items with identical hash and range keys (which essentially is two put operations).
There are more than 25 requests in the batch.
Any individual item in a batch exceeds 400 KB.
The total request size exceeds 16 MB.
To me, it looks like some of this is true. At my job, we also had a problem where one batch contained 2 identical primary and secondary keys, so the whole batch was discarded. I know it's not node.js, but we used this to overcome that problem.
It is batch_writer(overwrite_by_pkeys), and it is used to keep only the last occurrence of the same primary and sort key in the batch. If only a small portion of your data is duplicate data and you do not need to keep it, you can use this. BUT if you need to save all your data, I do not advise you to use this functionality.
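In node.js you could approximate that de-duplication yourself before calling batchWrite. Here's a sketch (hashKey and rangeKey are assumptions about your table's schema) that keeps only the last occurrence of each key pair:

// putRequests is an array of { PutRequest: { Item: {...} } } entries
function dedupeByKeys(putRequests, hashKey, rangeKey) {
    const byKey = new Map();
    for (const req of putRequests) {
        const item = req.PutRequest.Item;
        // later entries overwrite earlier ones with the same key pair
        byKey.set(item[hashKey] + "|" + item[rangeKey], req);
    }
    return [...byKey.values()];
}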
I don't see where you are checking the response for UnprocessedItems. Batch operations will often return a list of items it didn't process. As is documented, BatchWriteItem "can write up to 16 MB of data, which can comprise as many as 25 put or delete requests."
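A retry loop along these lines (a sketch built on the same AWS SDK v2 DocumentClient the question's code already uses) would re-submit whatever DynamoDB reports back in UnprocessedItems:

async function batchWriteWithRetry(batchItems) {
    let request = batchItems;
    while (true) {
        const result = await documentClient.batchWrite(request).promise();
        const unprocessed = result.UnprocessedItems || {};
        if (Object.keys(unprocessed).length === 0) {
            return;
        }
        // re-submit only what DynamoDB didn't take; in real code you'd
        // also want exponential backoff between attempts
        request = { RequestItems: unprocessed };
    }
}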
I had a duplicate keys issue, meaning that the primary and the sort key had duplicate values in the batch. However, in my case this error was not returned from the AWS BatchWrite method when my timestamp was in fractions of a second, like 2020-02-09T08:02:36.71, which was a bit surprising. I resolved the issue by making my createdAt (sort key) more granular, like this => 2020-02-09T08:02:36.7187, thus making it non-repetitive.
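One way to get that extra granularity (a hypothetical helper; the four-digit fraction just mirrors the example above) is to append a rotating digit so rows created in the same millisecond still get distinct sort keys:

// hypothetical helper: extends the millisecond ISO timestamp with one
// rotating digit so same-millisecond writes get distinct sort keys
let tick = 0;
function granularCreatedAt() {
    tick = (tick + 1) % 10;
    return new Date().toISOString().replace('Z', '') + tick; // e.g. 2020-02-09T08:02:36.7187
}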

NodeJS find query inside for loop results after the loops end

var user_id = '98-XXXXXXXX';

Contact.find({ user_id: user_id })
    .exec(function(err, results) {
        if (err) {
            return next(err);
        }
        var finalArray = [];
        for (var i = 0; i < results[0].Total; i++) {
            if (results[0].Contacts[i].name != "SPAM") {
                for (var j = 0; j < results[0].Contacts[i].phoneNumbers.length; j++) {
                    var number = results[0].Contacts[i].phoneNumbers[j].number;
                    number = number.replace(/ /g, '');
                    var user_id = number.substr(number.length - 10);
                    Login.find({ user_id: user_id })
                        .exec(function(err, results) {
                            if (err) {
                                return next(err);
                            }
                            var intCount = results.length;
                            if (intCount > 0) {
                                console.log('called');
                                finalArray.push(results[0]);
                                console.log(finalArray);
                            }
                        });
                }
            }
            // console.log(i, results[0].Total - 1);
            // if (i == results[0].Total - 1)
        }
        console.log('Ended Here', finalArray);
        var responseTosend = { "message": finalArray, "user_id": user_id };
        return res.send(responseTosend);
    });
'Ended Here' [] comes up first, empty; after that I get the results of the Login.find query, which are correct. Any ideas how to get finalArray after all the calculation?
Thanks in advance
Since the functions inside the loops return promises, the code execution has to wait until all those promises are resolved before sending the response. Consider using Promise.all or Promise.map to wait. Reference
As already mentioned, a structure like this will not return the results, but the intermediate functions or objects before they are finished, since Node.js does not know it should await the results first.
const x = [1, 2, 3];
let results = [];

for (let i = 0; i < x.length; i++) {
    results.push(someAsyncJobLikeADatabaseCall(x[i]));
}

// this will not log the results, but an array of still-pending promises
console.log(results);
Here is a better version using promises and the .map function. Notice how we replaced the for loop with .map() (which you could see as a shorthand for .forEach() + push() or for() + push()). Mongoose returns promises if configured right, so you don't even have to define them manually, and we can directly return them in .map.
const x = [1, 2, 3];

async function getAsyncResults(array) {
    // map returns an array, this time an array of promises
    const promises = array.map(number => someAsyncJobLikeADatabaseCall(number));
    // Promise.all resolves once all the promises in the array have resolved
    return Promise.all(promises);
}

// inside an async function:
try {
    let results = await getAsyncResults(x);
    // this will return the results you expect
    console.log(results);
} catch (err) {
    console.log('Some error', err);
}

Create promises on queries inside a for loop

I'm trying to write Node.js code that does the below.
Connect to a Salesforce instance.
Get the past 7 days, and loop through them.
Run 2 queries inside them and push the result to an Array.
Display the value in another function.
Here is my JS code.
var jsforce = require("jsforce");
var moment = require('moment');

function connectToEP() {
    var main_Obj = {};
    var response_Obj = {};
    var pastSevenDaysArray = [];
    var conn = new jsforce.Connection();
    var beforeSevenDays = moment().subtract(7, 'days').format('YYYY-MM-DD');
    var today = moment().startOf('day');
    var i = 0;

    conn.login("myUid", "myPwd").then(() => {
        console.log("Connected To Dashboard");
        for (var m = moment(beforeSevenDays); m.diff(today, 'days') <= 0; m.add(1, 'days')) {
            conn.query("SELECT SUM(Total_ETA_of_all_tasks__c), SUM(Total_ETA__C) from Daily_Update__c where DAY_ONLY(createddate)= " + m.format('YYYY-MM-DD')).then(() => {
                console.log("B1");
                var z = response_Obj.aggrRes;
                response_Obj.aggrRes = res;
                pastSevenDaysArray.push({ z: res });
                console.log("B1 Exit");
            }).then(() => {
                conn.query("SELECT count(Id), Task_Type__c FROM Daily_Task__c where DAY_ONLY(createddate) = " + m.format('YYYY-MM-DD') + " group by Task_Type__c").then(() => {
                    console.log("B2");
                    var z = response_Obj.aggrRes;
                    response_Obj.aggrRes = res;
                    pastSevenDaysArray.push({ z: res });
                    console.log("B2 Exit");
                })
            })
        }
        return Promise.resolve(pastSevenDaysArray);
    }).then((data) => {
        console.log(typeof data);
        updateMessage(JSON.stringify(data));
        console.log(typeof data);
    });
}

function updateMessage(message) {
    console.log("XXXXXXXXXXXX");
    console.log(message);
    console.log("XXXXXXXXXXXX");
}

function socketNotificationReceived() {
    console.log("socket salesforce rec");
    connectToEP();
}

socketNotificationReceived();
When I run this code, the output that I get is:
socket salesforce rec
Connected To Dashboard
object
XXXXXXXXXXXX
[]
XXXXXXXXXXXX
object
B1
B1
B1
B1
B1
B1
B1
B1
I'm very new to this JS platform and unable to get the promises concepts :(. Please let me know where I am going wrong and how I can fix it.
An explanation of what's going on would be very helpful for my future projects.
Thanks
The thing I always do when I get confused is to decompose. Build the pieces one by one, and make sure each works. Trying to understand your code, I get something like this...
A function each for logging in, getting a "task sum" from the db and getting a "task count" from the db. (Task sum/count is what I guessed the queries were up to. Rename as you see fit).
var jsforce = require("jsforce");
var moment = require('moment');

function login(conn) {
    return conn.login("myUid", "myPwd");
}

function queryTaskSumForDay(conn, m) {
    return conn.query("SELECT SUM(Total_ETA_of_all_tasks__c), SUM(Total_ETA__C) from Daily_Update__c where DAY_ONLY(createddate)= " + m.format('YYYY-MM-DD'));
}

function queryTaskCountForDay(conn, m) {
    return conn.query("SELECT count(Id), Task_Type__c FROM Daily_Task__c where DAY_ONLY(createddate) = " + m.format('YYYY-MM-DD') + " group by Task_Type__c");
}
With those working, it should be easy to get a sum and a count for a given day. Rather than returning these in an array (containing two objects that each have a "z" property as your code did), I opted for the simpler single object that has a sum and count property. You may need to change this to suit your design. Notice the use of Promise.all() to resolve two promises together...
function sumAndCountForDay(conn, m) {
    let sum = queryTaskSumForDay(conn, m);
    let count = queryTaskCountForDay(conn, m);
    return Promise.all([sum, count]).then(results => {
        return { sum: results[0], count: results[1] };
    });
}
With that working, it should be easy to get an array of sum-count objects for a period of seven days using your moment logic and the Promise.all() idea...
function sumAndCountForPriorWeek(conn) {
    let promises = [];
    let beforeSevenDays = moment().subtract(7, 'days').format('YYYY-MM-DD');
    let today = moment().startOf('day');
    for (let m = moment(beforeSevenDays); m.diff(today, 'days') <= 0; m.add(1, 'days')) {
        promises.push(sumAndCountForDay(conn, m));
    }
    return Promise.all(promises);
}
With that working (notice the pattern here?), your OP function is tiny and nearly fully tested, because we tested all of its parts...
function connectToEP() {
    let conn = new jsforce.Connection();
    return login(conn).then(() => {
        return sumAndCountForPriorWeek(conn);
    }).then(result => {
        console.log(JSON.stringify(result));
        return result;
    }).catch(error => {
        console.log('error: ' + JSON.stringify(error));
        return error;
    });
}
I think your general structure should be something like this. The biggest issue is not returning promises when you need to. A "for loop" of promises is a little difficult to step through, but if you can run them in parallel then the easiest thing to do is Promise.all. If you need to aggregate the data before you can perform the next query, then you need multiple Promise.all().then()'s. The reason you get an empty array [] is that your for loop creates the promises but doesn't wait until they finish.
var jsforce = require("jsforce");
var moment = require('moment');

function connectToEP() {
    // connectToEP now returns a promise
    return conn.login("myUid", "myPwd").then(() => {
        console.log("Connected To Dashboard");
        let myQueries = [];
        for (start; condition; incrementer) {
            myQueries.push( // Add all these query promises to the parallel queue
                conn.query(someQuery)
                    .then((res) => {
                        return res;
                    })
                    .then((res) => {
                        return conn.query(someQuery).then((res) => {
                            return someData;
                        });
                    })
            );
        }
        return Promise.all(myQueries); // Waits for all queries to finish...
    }).then((allData) => { // allData is an array of all the promise results
        return updateMessage(JSON.stringify(allData));
    });
}

Limiting asynchronous calls in Node.js

I've got a Node.js app that gets a list of files locally and uploads them to a server. This list could contain thousands of files.
for (var i = 0; i < files.length; i++) {
    upload_file(files[i]);
}
If I execute this with thousands of files, upload_file will get called thousands of times all at once, and most likely die (or at least struggle). In the synchronous world, we'd create a thread pool and limit it to a certain number of threads. Is there a simple way to limit how many asynchronous calls get executed at once?
As usual, I recommend Caolan McMahon's async module.
Make your upload_file function take a callback as its second parameter:
var async = require("async");

function upload_file(file, callback) {
    // Do funky stuff with file
    callback();
}

var queue = async.queue(upload_file, 10); // Run ten simultaneous uploads

queue.drain = function() {
    console.log("All files are uploaded");
};

// Queue your files for upload
queue.push(files);

queue.concurrency = 20; // Increase to twenty simultaneous uploads
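(Note: in async v3 and later, the drain handler is registered by calling it with a callback, queue.drain(() => console.log("All files are uploaded")), rather than by assignment as above.)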
The answer above, re: async on npm, is the best answer, but if you'd like to learn more about control flow:
You should look into control flow patterns. There's a wonderful discussion on control flow patterns in Chapter 7 of Mixu's Node Book. Namely, I'd look at the example in 7.2.3: Limited parallel - an asynchronous, parallel, concurrency limited for loop.
I've adapted his example:
function doUpload(file, done) {
    // perform file read & upload here, then call done()
}

var files = [...];
var limit = 10;   // concurrent read / upload limit
var running = 0;  // number of running async file operations

function uploader() {
    while (running < limit && files.length > 0) {
        var file = files.shift();
        doUpload(file, function() {
            running--;
            if (files.length > 0)
                uploader();
        });
        running++;
    }
}

uploader();
You should try queueing. I assume that a callback is fired when upload_file() finishes. Something like this should do the trick (untested):
function upload_files(files, maxSimultaneousUploads, callback) {
    var runningUploads = 0,
        startedUploads = 0,
        finishedUploads = 0;

    function next() {
        runningUploads--;
        finishedUploads++;
        if (finishedUploads == files.length) {
            callback();
        } else {
            // Make sure that we are running at the maximum capacity.
            queue();
        }
    }

    function queue() {
        // Run as many uploads as possible while not exceeding the given limit.
        while (startedUploads < files.length && runningUploads < maxSimultaneousUploads) {
            runningUploads++;
            upload_file(files[startedUploads++], next);
        }
    }

    // Start the upload!
    queue();
}
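Usage would then look something like this (assuming upload_file takes (file, callback), as in the answers above):

upload_files(files, 10, function() {
    console.log("All uploads finished");
});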
The other answers seem to be outdated. This can be solved easily using parallelLimit from async. Below is how to use it. I haven't tested it.
const { parallelLimit } = require('async');

var tasks = files.map(function(f) {
    return function(callback) {
        upload_file(f, callback);
    };
});

parallelLimit(tasks, 10, function() {
});
No external libraries. Just plain JS.
It can be resolved using recursion.
The idea is that initially we immediately start the maximum allowed number of uploads and each of these requests should recursively initiate a new upload on its completion.
In this example I populate successful responses together with errors, and I execute all requests, but it's possible to slightly modify the algorithm if you want to terminate the batch upload on the first failure.
async function batchUpload(files, limit) {
    limit = Math.min(files.length, limit);
    return new Promise((resolve, reject) => {
        const responsesOrErrors = new Array(files.length);
        let startedCount = 0;
        let finishedCount = 0;
        let hasErrors = false;

        function recursiveUpload() {
            let index = startedCount++;
            uploadFile(files[index])
                .then(res => {
                    responsesOrErrors[index] = res;
                })
                .catch(error => {
                    responsesOrErrors[index] = error;
                    hasErrors = true;
                })
                .finally(() => {
                    finishedCount++;
                    if (finishedCount === files.length) {
                        hasErrors ? reject(responsesOrErrors) : resolve(responsesOrErrors);
                    } else if (startedCount < files.length) {
                        recursiveUpload();
                    }
                });
        }

        for (let i = 0; i < limit; i++) {
            recursiveUpload();
        }
    });
}

async function uploadFile(file) {
    console.log(`${file} started`);
    const delay = Math.floor(Math.random() * 1500);
    return new Promise((resolve, reject) => {
        setTimeout(() => {
            if (delay <= 1000) {
                console.log(`${file} finished successfully`);
                resolve(`${file} success`);
            } else {
                console.log(`${file} finished with error`);
                reject(`${file} error`);
            }
        }, delay);
    });
}

const files = new Array(10).fill('file').map((file, index) => `${file}_${index + 1}`);

batchUpload(files, 3)
    .then(responses => console.log('All successful', responses))
    .catch(responsesWithErrors => console.log('All done, with several failed', responsesWithErrors));
