I've got a Node.js app that gets a list of file locally and uploads them to a server. This list could contain thousands of files.
for (var i = 0; i < files.length; i++) {
upload_file(files[i]);
}
If I execute this with thousands of files, upload_file will get called thousands of times all at once, and most likely die (or at least struggle). In the synchronous world, we'd create a thread pool and limit it to a certain number of threads. Is there a simple way to limit how many asynchronous calls get executed at once?
As usual, I recommend Caolan McMahon's async module.
Make your upload_file function take a callback as it's second parameter:
var async = require("async");
function upload_file(file, callback) {
// Do funky stuff with file
callback();
}
var queue = async.queue(upload_file, 10); // Run ten simultaneous uploads
queue.drain = function() {
console.log("All files are uploaded");
};
// Queue your files for upload
queue.push(files);
queue.concurrency = 20; // Increase to twenty simultaneous uploads
The answer above, re: async on NPM is the best answer, but if you'd like to learn more about control flow:
You should look into control flow patterns. There's a wonderful discussion on control flow patterns in Chapter 7 of Mixu's Node Book. Namely, I'd look at the example in 7.2.3: Limited parallel - an asynchronous, parallel, concurrency limited for loop.
I've adapted his example:
function doUpload() {
// perform file read & upload here...
}
var files = [...];
var limit = 10; // concurrent read / upload limit
var running = 0; // number of running async file operations
function uploader() {
while(running < limit && files.length > 0) {
var file = files.shift();
doUpload(file, function() {
running--;
if(files.length > 0)
uploader();
});
running++;
}
}
uploader();
You should try queueing. I assume that a callback is fired when upload_file() finishes. Something like this should do the trick (untested):
function upload_files(files, maxSimultaneousUploads, callback) {
var runningUploads = 0,
startedUploads = 0,
finishedUploads = 0;
function next() {
runningUploads--;
finishedUploads++;
if (finishedUploads == files.length) {
callback();
} else {
// Make sure that we are running at the maximum capacity.
queue();
}
}
function queue() {
// Run as many uploads as possible while not exceeding the given limit.
while (startedUploads < files.length && runningUploads < maxSimultaneousUploads) {
runningUploads++;
upload_file(files[startedUploads++], next);
}
}
// Start the upload!
queue();
}
The others answers seem to be outdated. This can be solved easily using paralleLimit from async. Below is how to use it. I haven't tested it.
var tasks = files.map(function(f) {
return function(callback) {
upload_file(f, callback)
}
});
parallelLimit(tasks, 10, function(){
});
No external libraries. Just plain JS.
It can be resolved using recursion.
The idea is that initially we immediately start the maximum allowed number of uploads and each of these requests should recursively initiate a new upload on its completion.
In this example I populate successful responses together with errors and I execute all requests but it's possible to slightly modify algorithm if you want to terminate batch upload on the first failure.
async function batchUpload(files, limit) {
limit = Math.min(files.length, limit);
return new Promise((resolve, reject) => {
const responsesOrErrors = new Array(files.length);
let startedCount = 0;
let finishedCount = 0;
let hasErrors = false;
function recursiveUpload() {
let index = startedCount++;
uploadFile(files[index])
.then(res => {
responsesOrErrors[index] = res;
})
.catch(error => {
responsesOrErrors[index] = error;
hasErrors = true;
})
.finally(() => {
finishedCount++;
if (finishedCount === files.length) {
hasErrors ? reject(responsesOrErrors) : resolve(responsesOrErrors);
} else if (startedCount < files.length) {
recursiveUpload();
}
});
}
for (let i = 0; i < limit; i++) {
recursiveUpload();
}
});
}
async function uploadFile(file) {
console.log(`${file} started`);
const delay = Math.floor(Math.random() * 1500);
return new Promise((resolve, reject) => {
setTimeout(() => {
if (delay <= 1000) {
console.log(`${file} finished successfully`);
resolve(`${file} success`);
} else {
console.log(`${file} finished with error`);
reject(`${file} error`);
}
}, delay);
});
}
const files = new Array(10).fill('file').map((file, index) => `${file}_${index + 1}`);
batchUpload(files, 3)
.then(responses => console.log('All successfull', responses))
.catch(responsesWithErrors => console.log('All with several failed', responsesWithErrors));
Related
I am trying to write Items to AWS dynamo db using node SDK. The problem I am facing is that when I write batch items to AWS in parallel using threads, some of the items are not written to database. The number of items are written are random. For instance, If I run my code 3 times, at one time it would be 150, next it would 200 and third time it could be 135. In addition, when I write the items sequentially without threads, even then some of the items are not written.However, in this case the items are less missing. For instance if the total number of items is 300 then the items written are 298. I investigated the problem to see if there any unprocessed items but the batchWrite method returns nothing. It means that all the items are being processed correctly. Please note that I have OnDemand provision for my respective database so I do not expect any throttling issues. So here is my code.
exports.run = async function() {
**This is the function which runs first !!!!!**
const data = await getArrayOfObjects();
console.log("TOTAL PRICE CHANGES")
console.log(data.length)
const batchesOfData = makeBatches(data)
const threads = new Set();
console.log("**********")
console.log(batchesOfData.length)
console.log("**********")
for(let i = 0; i < batchesOfData.length; i++) {
console.log("BATCH!!!!!")
console.log(i)
console.log(batchesOfData[i].length)
// Sequential Approach
const response = await compensationHelper.createItems(batchesOfData[i])
console.log("RESPONSE")
console.log(response)
Parallel approach
// const workerResult = await runService(batchesOfData[i])
// console.log("WORKER RESUULT!!!!")
// console.log(workerResult);
}
}
exports.updateItemsInBatch = async function(data, tableName) {
console.log("WRITING DATA")
console.log(data.length)
const batchItems = {
RequestItems: {},
};
batchItems.RequestItems[tableName] = data;
try {
const result = await documentClient.batchWrite(batchItems).promise();
console.log("UNPROCESSED ITEMS")
console.log(result)
if (result instanceof Error) {
console.log(`[Error]: ${JSON.stringify(Error)}`);
throw new Error(result);
}
return Promise.resolve(true);
} catch (err) {
console.error(`[Error]: ${JSON.stringify(err.message)}`);
return Promise.reject(new Error(err));
}
};
exports.convertToAWSCompatibleFormat = function(data) {
const awsCompatibleData = [];
data.forEach(record => awsCompatibleData.push({ PutRequest: { Item: record } }));
return awsCompatibleData;
};
const createItems = async function(itemList) {
try {
const objectsList = [];
for (let index = 0; index < itemList.length; index++) {
try {
const itemListObj = itemList[index];
const ObjToBeInserted = {
// some data assignments here
};
objectsList.push(ObjToBeInserted);
if (
objectsList.length >= AWS_BATCH_SIZE ||
index === itemList.length - 1
) {
const awsCompatiableFormat = convertToAWSCompatibleFormat(
objectsList
);
await updateItemsInBatch(
awsCompatiableFormat,
process.env.myTableName
);
}
} catch (error) {
console.log(`[Error]: ${JSON.stringify(error)}`);
}
}
return Promise.resolve(true);
} catch (err) {
return Promise.reject(new Error(err));
}
};
const makeBatches = products => {
const productBatches = [];
let countr = -1;
for (let index = 0; index < products.length; index++) {
if (index % AWS_BATCH_SIZE === 0) {
countr++;
productBatches[countr] = [];
if (countr === MAX_BATCHES) {
break;
}
}
try {
productBatches[countr].push(products[index]);
} catch (error) {
continue;
}
}
return productBatches;
};
async function runService(workerData) {
return new Promise((resolve, reject) => {
const worker = new Worker(path.join(__dirname, './worker.js'), { workerData });
worker.on('message', resolve);
worker.on('error', reject);
worker.on('exit', (code) => {
if (code !== 0)
reject(new Error(`Worker stopped with exit code ${code}`));
})
})
}
// My worker file
'use strict';
const { workerData, parentPort } = require('worker_threads')
const creatItems = require('myscripts')
// You can do any heavy stuff here, in a synchronous way
// without blocking the "main thread"
console.log("I AM A NEW THREAD")
createItems(workerData)
// console.log('Going to write tons of content on file '+workerData);
parentPort.postMessage({ fileName: workerData, status: 'Done' })
From boto3 documentation:
If one or more of the following is true, DynamoDB rejects the entire batch write operation:
One or more tables specified in the BatchWriteItem request does not exist.
Primary key attributes specified on an item in the request do not match those in the corresponding table's primary key schema.
You try to perform multiple operations on the same item in the same BatchWriteItem request. For example, you cannot put and delete the same item in the same BatchWriteItem request.
Your request contains at least two items with identical hash and range keys (which essentially is two put operations).
There are more than 25 requests in the batch.
Any individual item in a batch exceeds 400 KB.
The total request size exceeds 16 MB.
To me, it looks some of this is true. At my job, we also had a problem that one batch contained 2 identical primary and secondary keys in the batch so the whole batch was discarded. I know it's not node.js, but we used this to overcome that problem.
It is batch_writer(overwrite_by_pkeys) and it is used to overwrite the last occurance of the same primary and last key in the batch. If only a small portion of your data is duplicate data and you do not need to save it, you can use this. BUT if you need to save all your data, I do not advise you to use this functionality.
I don't see where you are checking the response for UnprocessedItems. Batch operations will often return a list of items it didn't process. As is documented, BatchWriteItem "can write up to 16 MB of data, which can comprise as many as 25 put or delete requests."
I had duplicate keys issue which means that primary and the sort key had duplicate values in the batch, however, in my case this error was not returned from the AWS BatchWrite method if my timestamp was in fraction of seconds 2020-02-09T08:02:36.71, which was a bit surprising. I resolved the issue by making my createdAt(sort key) to be more granular like this => 2020-02-09T08:02:36.7187 Thus making it non-repetitive.
I am trying to call CompaniesHouse API and fetch companies registered between November and February. The approach I took is to pick a starting index(a company registered in November) and a stop index(a company registered in February) and loop through to get the companies registered between the start and stop index. Like so:
var needle = require("needle");
var startIdx = 11059000;
var stopIdx = 11211109;
for(idx = startIdx; idx < stopIdx; idx++)
{
needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
username: key,password:""
})
.then(function(data) {
})
.catch(function(err) {
console.log('Call the locksmith!' + err)
})
}
But this doesn't work as gives either a timeout or socket hangup error.
The API is currently in beta and some features are still yet to be implemented.
Because the for loop runs synchronously and your calls to needle() are asynchronous and therefore do not block, you end up attempting to start more than 100,000 network requests at once. This overwhelms either your local computer or the target server and you start getting socket errors.
For this many requests, you need to run them X at a time so no more than X are in flight at the same time. To maximize performance, you will have to figure out what value of X you want to use because it will depend upon the target server and how it handles lots of simultaneous requests. It is generally safe to start with a value of 5 and then increase it from there to test higher values.
If you were processing an array, there are a number of pre-built options to run X requests at once. The simplest is to use a pre-built concurrency management operation such as Bluebird. Or you can write your own. You can see examples of both here: Make several requests to an API that can only handle 20 request a minute
But, since you are not processing an array, but are just incrementing a number for each successive request, I couldn't find a pre-built option that does that. So, I wrote a general purpose one where you can fill in the function that will increment your index:
// fn gets called on each iteration - must return a promise
// limit is max number of requests to be in flight at once
// cnt is number of times to call fn
// options is optional and can be {continueOnError: true}
// runN returns a promise that resolves with results array.
// If continueOnError is set, then results array
// contains error values too (presumed to be instanceof Error so caller can discern
// them from regular values)
function runN(fn, limit, cnt, options = {}) {
return new Promise((resolve, reject) => {
let inFlightCntr = 0;
let results = [];
let cntr = 0;
let doneCnt = 0;
function run() {
while (inFlightCntr < limit && cntr < cnt) {
let resultIndex = cntr++;
++inFlightCntr;
fn().then(result => {
--inFlightCntr;
++doneCnt;
results[resultIndex] = result;
run(); // run any more that still need to be run
}).catch(err => {
--inFlightCntr;
++doneCnt;
if (options.continueOnError) {
// assumes error is instanceof Error so caller can tell the
// difference between a genuine result and an error
results[resultIndex] = err;
run(); // run any more that still need to be run
} else {
reject(err);
}
});
}
if (doneCnt === cnt) {
resolve(results);
}
}
run();
});
}
Then, you could use this like this:
const needle = require("needle");
const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;
let idxCntr = startIdx;
runN(function() {
let idx = idxCntr++;
return needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
username: key,password:""
});
}, numConcurrent, stopIdx - startIdx + 1, {continueOnError: true}).then(results => {
console.log(results);
}).catch(err => {
console.log(err);
});
To minimize memory use, you can use a .then() handler on your call to needle() and trim down the response to only what you need in the final array:
const needle = require("needle");
const startIdx = 11059000;
const stopIdx = 11211109;
const numConcurrent = 5;
let idxCntr = startIdx;
runN(function() {
let idx = idxCntr++;
return needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
username: key,password:""
}).then(response => {
// construct the smallest possible response here and then return it
// to minimize memory use for your 100,000+ requests
return response.someProperty;
});
}, numConcurrent, stopIdx - startIdx + 1, {continueOnError: true}).then(results => {
console.log(results);
}).catch(err => {
console.log(err);
});
var needle = require("needle");
var startIdx = 11059000;
var stopIdx = 11211109;
const promises = [];
for(idx = startIdx; idx < stopIdx; idx++)
{
promises.push(
needle('get', "https://api.companieshouse.gov.uk/company/"+idx, {
username: key,password:""
})
)
}
Promise.all(promises).then(results => {console.log(results);}).catch(err => console.log(err));
A simple Promise.all implementation can help.
So this is the code I used to crawl my pages (i'm using request and cheerio modules:
for (let j = 1; j < nbRequest; j++)
{
const currentPromise = new Promise((resolve, reject) => {
request(
`https://www.url${j}`,
(error, response, body) => {
if (error || !response) {
console.log("Error: " + error);
}
console.log("Status code: " + response.statusCode + ", Connected to the page");
var $ = cheerio.load(body);
let output = {
ranks: [],
names: [],
numbers: [],
};
$('td.rangCell').each(function( index ) {
if ($(this).text().trim() != "Rang")
{
output.ranks.push($(this).text().trim().slice(0, -1));
nbRanks = nb_ranks+1;
}
});
$('td.nameCell:has(label)').each(function( index ) {
output.names.push($(this).find('label.nameValue > a').text().trim());
});
$('td.numberCell').each(function( index ) {
if ($(this).text().trim() != "Nombre")
{
output.numbers.push($(this).text().trim());
}
});
console.log("HERE 1");
return resolve(output);
}
);
});
promises.push(currentPromise);
}
after that I'm parsing and saving the result in a csv file using a node module.
At this point i've been able to crawl about 100 pages, but when it comes to much bigger numbers (1000+) I'm receiving a 500 response meaning that i'm being kicked i think.
So i think the best solution is to delay requests, but i didn't find the solution.
Do you guys have any idea and how the code would look like ?
what you are looking for is called "Control Flow", you can achieve this by using async.queue for example.
If you add every request to the the queue you can control the amount of parallel requests with the amount of workers. And you could add setTimeouts to the final part of the request's callback to achieve the delaying of requests.
Additionally I'd suggest using a "crawler" package (instead of building your own) e.g. npm-crawler as they ship with build in rate-limiting and have already taken care of other things that you might face next :) e.g. user-agent pool
Update:
const async = require("async");
const delayTime = 1500; //wait 1,5 seconds after every new request
getRequestPromise(csvLine){
return new Promise( make you request here );
}
const asyncQueue = async.queue(function(task, callback) {
getRequestPromise(task).then(_ => {
setTimeout(() => {
callback(null);
}, delayTime);
});
}, 1); //1 one request at a time
for(csv){ //pseudo
asyncQueue.push(csv[i], () => {});
}
asyncQueue.drain = () => {
console.log("finished.");
};
I have an HTTP Get request and I want to parse the response and save it to my database.
If i call crawl(i) independentely i get good results. But i have to call crawl() from 1 to 2000.
I get good results but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module queue function but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What i am crawling
My node functions :
function getOptions(i) {
return {
host: 'magicseaweed.com',
path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
method: 'GET'
}
};
function crawl(i){
var req = http.request(getOptions(i), function(res) {
res.on('data', function (body) {
parseLocation(body);
});
});
req.end();
}
function parseLocation(body){
parser.parseString(body, function(err, result) {
if(result && typeof result.rss != 'undefined') {
var locationTitle = result.rss.channel[0].title;
var locationString = result.rss.channel[0].item[0].link[0];
var location = new Location({
id: locationString.split('/')[2],
name: locationTitle
});
location.save();
}
});
}
N = 2 //# of simultaneous tasks
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
q.drain = function() {
console.log('Crawling done.');
}
for(var i = 0; i < 100; i++){
q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] WELL, after a lot of testing it seems that the service I am crawling cannot handle so many request that fast. Because when I do each requests sequentially, I can get all the good responses.
Is there a way to SLOW DOWN ASYNC queue method?
You should have a look at this great module, async which simplifies async tasks like this. You can use queue, simple example:
N = # of simultaneous tasks
var q = async.queue(function (task, callback) {
somehttprequestfunction(task.url, function(){
callback();
}
}, N);
q.drain = function() {
console.log('all items have been processed');
}
for(var i = 0; i < 2000; i++){
q.push({url:"http://somewebsite.com/"+i+"/feed/"});
}
It will have a window of ongoing actions and the tasks room will be available for a future task if you only invoke the callback function. Difference is, your code now opens 2000 connections immidiately and obviously the failure rate is high. Limiting it to a reasonable value, 5,10,20 (depends on site and connection) will result in a better sucess rate. If a request fails, you can always try it again, or push the task to another async queue for another trial. The key point is to invoke callback() in queue function, so that a room will be available when it is done.
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
You'are executing next task immediately after starting the previous one, in this way, the queue is just meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next/* name this argument as 'next' is more meaningful */) {
crawl(task.url, function () {
// after this one is done, start next one.
next();
});
// or, more simple way, crawl(task.url, next);
}, N);
Another option if you want. Vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function() {
incrementer++
if(incrementer == 100){
clearInterval(myInterval)
//when done parse results array
}
//make request here
//push request result to array here
}, 500);
Invokes the function every half second. Easy way to force sync and exit after x requests.
I know I am a little late to the question, however here is a solution I wrote to slow down the number of requests when testing an api endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/")
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;
fs.readFile('test.txt', function(err, data){
var parsedItems = JSON.parse(data);
var urlparts = []
// create a queue
for (let year of range(1975, 2016)) {
for (var make in parsedItems[year]){
console.log(year, make, '/models/' + year + '/' + make)
urlparts.push({urlpart:'/models/' + year + '/' + make, year: year, make: make})
}
}
// start dequeue
waitDequeue();
// This function calls itself after the makeRequest promise completes
function waitDequeue(){
var item = urlparts.pop()
if (item){
makeRequest(item)
.then(function(){
// wait this time before next dequeue
setTimeout(function() {
waitDequeue();
}, 3000);
})
} else {
write(parsedItems)
}
}
// make a request, mutate parsedItems then resolve
function makeRequest(item){
return new Promise((resolve, reject)=>{
request
.get(item.urlpart)
.set(options.auth[0], options.auth[1])
.set(options.type[0], options.type[1])
.end(function(err, res) {
if (err) return done1(err);
console.log(res.body)
res.body.forEach(function(model){
parsedItems[item.year][item.make][model] = {}
});
resolve()
})
})
}
// write the results back to the file
function write(parsedItems){
fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
console.log(err)
})
}
})
A little late but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler eg:
var q = async.priorityQueue(function(task, callback) {
// your code process here for each task
//when ready to complete the task delay it by calling
async.whilst( //wait 6 seconds
function() {
return count < 10;
},
function(callback) {
count++;
setTimeout(function() {
callback(null, count);
}, 1000);
},
function (err, n) {
// n seconds have passed
callback(); //callback to q handler
}
); //whilst
} , 5);
I'm playing around with node.js, trying to re-write a particularly poorly designed part of my production system at work. So far, so good, I use rabbitmq for messaging, and my node.js part of the system runs ghostscript command line tool to convert tiff files to pdf. Obviously I need to make sure I'm not running more than some fixed amount of conversions at a time. What would be the best way to do this with node? I understand that maybe node.js isn't really about running heavy disk IO stuff, but I'm having too much fun with it to quit.
I was considering just using a blocking call to execute command line utilities but the thing is that some messages don't require this conversion and there's no need to delay their processing.
[Update] node-batches seems more appropriate.
I think you need something like forEachLimit (the following snippet was extracted from the async library)
forEachLimit = function (arr, limit, iterator, callback) {
callback = callback || function () {};
if (!arr.length || limit <= 0) {
return callback();
}
var completed = 0;
var started = 0;
var running = 0;
(function replenish () {
if (completed === arr.length) {
return callback();
}
while (running < limit && started < arr.length) {
iterator(arr[started], function (err) {
if (err) {
callback(err);
callback = function () {};
}
else {
completed += 1;
running -= 1;
if (completed === arr.length) {
callback();
}
else {
replenish();
}
}
});
started += 1;
running += 1;
}
})();
};
Usage:
var fileToConvert = ['file1', 'file2', 'file3']
maxConcurrency = 4;
function fnIter(item, callback){
console.log('converting', item);
// Convertion happen here
require('child_process').exec("some -f "+item, function(error, stdout, stderr){
callback(stderr); // stderr should be "null" if everything went good.
});
}
function fnDone(){
console.log('done !');
}
forEachLimit(fileToConvert, maxConcurrency, fnIter, fnDone);