How do I manage a large number of HTTP requests in node - node.js

Been looking around the net for an answer to this, but not found anything conclusive.
I have a node application that (potentially) needs to make a large number of HTTP GET requests.
Let's say http://foo.com/bar allows an 'id' query parameter, and I have a large number of IDs to process (~1k), i.e.
http://foo.com/bar?id=100
http://foo.com/bar?id=101
etc.
What libraries that folks have used might be best suited to this task?
I guess I'm looking for something between a queue and a connection pool:
The setup:
A large array of IDs exists to be processed (up to ~1k IDs)
The process:
Some kind of pool containing X number of 'workers' is defined
Each worker takes an ID and makes a request (with up to X concurrent workers running at a time)
When a worker completes, it takes the next ID from the array and processes that
etc. until all IDs have been processed
Any experience welcome

It was actually a lot simpler than I initially thought, and only requires Bluebird (I'm paraphrasing here a little bit since my final code ended up much more complex):
var Promise = require('bluebird');
...
var allResults = [];
...
Promise.map(idList, (id) => {
// For each ID in idList, make a HTTP call
return http.get( ... url: 'http://foo.com/bar?id=' + id ... )
.then((httpResposne) => {
return allResults.push(httpResposne);
})
.catch((err) => {
var errMsg = 'ERROR: [' + err + ']';
console.log(errMsg + (err.stack ? '\n' + err.stack : ''));
});
}, { concurrency: 10 }) // Max of 10 concurrent HTTP calls at once
.then(() => {
// All requests are now complete, return all results
return res.json(allResults);
});

Related

Ability to provide insights from Redis Bull Queue data

I have an application that makes API calls to another system, and it queues these API calls in a queue using Bull and Redis.
However, occasionally it gets bogged down with lots of API calls, or something stops working properly, and I want an easy way for users to check if the system is just "busy". (Otherwise, if they perform some action, and 10 minutes later it hasn't completed, they'll keep trying it again, and then we get a backlog of more entries (and in some cases data issues where they've issued duplicate parts, etc.)
Here's what a single "key" looks like for a successful API call in the queue:
HSET "bull:webApi:4822" "timestamp" "1639085540683"
HSET "bull:webApi:4822" "returnvalue" "{"id":"e1df8bb4-fb6c-41ad-ba62-774fe64b7882","workOrderNumber":"WO309967","status":"success"}"
HSET "bull:webApi:4822" "processedOn" "1639085623027"
HSET "bull:webApi:4822" "data" "{"id":"e1df8bb4-fb6c-41ad-ba62-774fe64b7882","token":"eyJ0eXAiOiJKV1QiL....dQVyEpXt64Fznudfg","workOrder":{"members":{"lShopFloorLoad":true,"origStartDate":"2021-12-09T00:00:00","origRequiredQty":2,"requiredQty":2,"requiredDate":"2021-12-09T00:00:00","origRequiredDate":"2021-12-09T00:00:00","statusCode":"Released","imaItemName":"Solid Pin - Black","startDate":"2021-12-09T00:00:00","reference":"HS790022053","itemId":"13840402"}},"socketId":"3b9gejTZjAXsnEITAAvB","type":"Create WO"}"
HSET "bull:webApi:4822" "delay" "0"
HSET "bull:webApi:4822" "priority" "0"
HSET "bull:webApi:4822" "name" "__default__"
HSET "bull:webApi:4822" "opts" "{"lifo":true,"attempts":1,"delay":0,"timestamp":1639085540683}"
HSET "bull:webApi:4822" "finishedOn" "1639085623934"
You can see in this case it took 83 seconds to process. (1639085540 - 1639085623)
I'd like to be able to provide summary metrics like:
Most recent API call was added to queue X seconds ago
Most recent successful API call completed X seconds ago and took XX seconds to
complete.
I'd also like to be able to provide a list of the 50 most recent API calls, formatted in a nice way and tagged with "success", "pending", or "failed".
I'm fairly new to Redis and Bull, and I'm trying to figure out how to query this data (using Redis in Node.js) and return this data as JSON to the application.
I can pull a list of keys like this:
// #route GET /status
async function status(req, res) {
const client = createClient({
url: `redis://${REDIS_SERVER}:6379`
});
try {
client.on('error', (err) => console.log('Redis Client Error', err));
await client.connect();
const value = await client.keys('*');
res.json(value)
} catch (error) {
console.log('ERROR getting status: ', error.message, new Date())
res.status(500).json({ message: error.message })
} finally {
client.quit()
}
}
Which will return ["bull:webApi:3","bull:webApi:1","bull:webApi:2"...]
But how can I pull the values associated to the respective keys?
And how can I find the key with the highest number, and then pull the details for the "last 50". In SQL, it would be like doing a ORDER BY key_number DESC LIMIT 50 - but I'm not sure how to do it in Redis.
I'm a bit late here, but if you're not set on manually digging around in Redis, I think Bull's API, in particular Queue#getJobs(), has everything you need here, and should be much easier to work with. Generally, you shouldn't have to reach into Redis to do any common tasks like this, that's what Bull is for!
If I understand you goal correctly, you should be able to do something like:
const Queue = require('bull')
async function status (req, res) {
const { listNum = 10 } = req.params
const api_queue = new Queue('webApi', `redis://${REDIS_SERVER}:6379`)
const current_timestamp_sec = new Date().getTime() / 1000 // convert to seconds
const recent_jobs = await api_queue.getJobs(null, 0, listNum)
const results = recent_jobs.map(job => {
const processed_on_sec = job.processedOn / 1000
const finished_on_sec = job.finishedOn / 1000
return {
request_data: job.data,
return_data: job.returnvalue,
processedOn: processed_on_sec,
finishedOn: finished_on_sec,
duration: finished_on_sec - processed_on_sec,
elapsedSinceStart: current_timestamp_sec - processed_on_sec,
elapsedSinceFinished: current_timestamp_sec - finished_on_sec
}
})
res.json(results)
}
That will get you the most recent numList* jobs in your queue. I haven't tested this full code, and I'll leave the error handling and adding of your custom fields to the job data up to you, but the core of it is solid and I think that should meet your needs without ever having to think about how Bull stores things in Redis.
I also included a suggestion on how to deal with timestamps a bit more nicely, you don't need to do string processing to convert milliseconds to seconds. If you need them to be integers you can wrap them in Math.floor().
* at least that many, anyway - see the second note below
A couple notes:
The first argument of getJobs() is a list of statuses, so if you want to look at just completed jobs, you can pass ['completed'], or completed and active, do ['completed', 'active'], etc. If no list is provided (null) it defaults to all statuses.
As mentioned in the reference I linked, the limit here is per state - so you'll likely get more than listNum jobs back. It doesn't seem like that should be a problem for your use case, but if it is, you can sort the list returned (probably by job id) and just return the first listNum - you're guaranteed to get at least that many (assuming there are that many jobs in your queue), and won't get more than 6*listNum (since there are 6 states).
Folks new to Bull can get nervous about instantiating a Queue object to do stuff like this - but don't be! By itself a Queue instance doesn't do anything, it's just an interface to the given queue. It won't start processing jobs until you call process() to add a processor. This is, incidentally, also how you'd add jobs from a separate process than you run your queues in, but of course nothing will be added unless you call add().
So I've figured out how to pull the data I need. I'm not saying it's a good method, and I'm open to suggestions; but it seems to work to provide a filtered JSON return with the needed data, without changing how the queue functions.
Here's what it looks like:
// #route GET /status/:listNum
async function status(req, res) {
const { listNum = 10} = req.params
const client = createClient({
url: `redis://${REDIS_SERVER}:6379`
});
try {
client.on('error', (err) => console.log('Redis Client Error', err));
await client.connect();
// Find size of queue database
const total_keys = await client.sendCommand(['DBSIZE']);
const upper_value = total_keys;
const lower_value = total_keys - listNum;
// Generate array
const range = (start, stop) => Array.from({ length: (start - stop) + 1}, (_, i) => start - (i));
var queue_ids = range(upper_value, lower_value)
queue_ids = queue_ids.filter(function(x){ return x > 0 }); // Filer out anything less than zero
// Current timestamp in seconds
const current_timestamp = parseInt(String(new Date().getTime()).slice(0, -3)); // remove microseconds ("now")
var response = []; // Initialize array
for(id of queue_ids){ // Loop through queries
// Query value
var value = await client.HGETALL('bull:webApi:'+id);
if(Object.keys(value).length !== 0){ // if returned a value
// Grab most of the request (exclude the token & socketId to save space, not used)
var request_data = JSON.parse(value.data)
request_data.token = '';
request_data.socketId = '';
// Grab & calculate desired times
const processedOn = value.processedOn.slice(0, -3); // remove microseconds ("start")
const finishedOn = value.finishedOn.slice(0, -3); // remove microseconds ("done")
const duration = finishedOn - processedOn; // (seconds)
const elapsedSinceStart = current_timestamp - processedOn;
const elapsedSinceFinished = current_timestamp - finishedOn;
// Grab the returnValue
const return_data = value.returnValue;
// ignoring queue keys of: opts, priority, delay, name, timestamp
const object_data = {request_data: request_data, processedOn: processedOn, finishedOn: finishedOn, return_data: return_data, duration: duration, elapsedSinceStart: elapsedSinceStart, elapsedSinceFinished: elapsedSinceFinished }
response.push(object_data);
}
}
res.json(response);
} catch (error) {
console.log('ERROR getting status: ', error.message, new Date());
res.status(500).json({ message: error.message });
} finally {
client.quit();
}
}
It's looping the Redis query, so I wouldn't want to use this for hundreds of keys, but for 10 or even 50 I'm thinking it should work.
For now I've resorted to getting the total number of keys and working backwards:
await client.sendCommand(['DBSIZE']);
In my case it will return a total number slightly higher than the highest key id (~ a handful of status keys), but at least gets close, and then I just filter out any non-responses.
I've looked at ZRANGE a bit, but I can't figure out how to get it to give me the last id. When I have a Redis database (Bull Queue) like this:
If there's a simple Redis command I can run that will return "3", I'd probably use that instead. (since bull:webApi:3 has the highest number)
(In actual use case, this might be 9555 or some high number; I just want to get the highest numbered key that exists.)
For now I'll try using the method I've come up with above.

How to process large number of requests with promise all

I have about 5000 links and I need to crawl all those. So Im wonder is there a better approach than this. Here is my code.
let urls = [ 5000 urls go here ];
const doms = await getDoms(urls);
// processing and storing the doms
getDoms = (urls) => {
let data = await Promise.all(urls.map(url => {
return getSiteCrawlPromise(url)
}));
return data;
}
getSiteCrawlPromise = (url) => {
return new Promise((resolve, reject) => {
let j = request.jar();
request.get({url: url, jar: j}, function(err, response, body) {
if(err)
return resolve({ body: null, jar: j, error: err});
return resolve({body: body, jar: j, error: null});
});
})
}
Is there a mechanism implemented in promise so it can devide the jobs to multiple threads and process. then return the output as a whole ?
and I don't want to devide the urls into smaller fragments and process those fragments
The Promise object represents the eventual completion (or failure) of an asynchronous operation, and its resulting value.
There is no in-built mechanism in Promises to "divide jobs into multiple threads and process". If you must do that, you'll have to fragment the urls array into smaller arrays and queue the fragmented arrays onto separate crawler instances simultaneously.
But, there is absolutely no need to go that way, since you're using node-js and node-crawler, you can use the maxConnections option of the node-crawler. This is what it was built for and the end result would be the same. You'll be crawling the urls on multiple threads, without wasting time and effort on manual chunking and handling of multiple crawler instances, or depending on any concurrency libraries.
There isn't such a mechanism built-in to Javascript, at least right now.
You can use third-party Promise libraries that offer more features, like Bluebird, in which you can make use of their concurrency feature:
const Promise = require('bluebird');
// Crawl all URLs, with 10 concurrent "threads".
Promise.map(arrayOfUrls, url => {
return /* promise for crawling the url */;
}, { concurrency: 10 });
Another option is to use a dedicated throttling library (I recommend highly bottleneck), which lets you express any generic kind of rate limit. The syntax in that case would be similar to what you already have:
const Bottleneck = require('bottleneck');
const limit = new Bottleneck({ maxConcurrent: 10 });
const getCallSitePromise = limit.wrap(url => {
// the body of your getCallSitePromise function, as normal
});
// getDoms stays exactly the same
You can solve this problem yourself, but bringing one (or both!) of the libraries above will save you a lot of code.

How to run asynchronous tasks synchronous?

I'm developing an app with the following node.js stack: Express/Socket.IO + React. In React I have DataTables, wherein you can search and with every keystroke the data gets dynamically updated! :)
I use Socket.IO for data-fetching, so on every keystroke the client socket emits some parameters and the server calls then the callback to return data. This works like a charm, but it is not garanteed that the returned data comes back in the same order as the client sent it.
To simulate: So when I type in 'a', the server responds with this same 'a' and so for every character.
I found the async module for node.js and tried to use the queue to return tasks in the same order it received it. For simplicity I delayed the second incoming task with setTimeout to simulate a slow performing database-query:
Declaration:
const async = require('async');
var queue = async.queue(function(task, callback) {
if(task.count == 1) {
setTimeout(function() {
callback();
}, 3000);
} else {
callback();
}
}, 10);
Usage:
socket.on('result', function(data, fn) {
var filter = data.filter;
if(filter.length === 1) { // TEST SYNCHRONOUSLY
queue.push({name: filter, count: 1}, function(err) {
fn(filter);
// console.log('finished processing slow');
});
} else {
// add some items to the queue
queue.push({name: filter, count: filter.length}, function(err) {
fn(data.filter);
// console.log('finished processing fast');
});
}
});
But the way I receive it in the client console, when I search for abc is as follows:
ab -> abc -> a(after 3 sec)
I want it to return it like this: a(after 3sec) -> ab -> abc
My thought is that the queue runs the setTimeout and then goes further and eventually the setTimeout gets fired somewhere on the event loop later on. This resulting in returning later search filters earlier then the slow performing one.
How can i solve this problem?
First a few comments, which might help clear up your understanding of async calls:
Using "timeout" to try and align async calls is a bad idea, that is not the idea about async calls. You will never know how long an async call will take, so you can never set the appropriate timeout.
I believe you are misunderstanding the usage of queue from async library you described. The documentation for the queue can be found here.
Copy pasting the documentation in here, in-case things are changed or down:
Creates a queue object with the specified concurrency. Tasks added to the queue are processed in parallel (up to the concurrency limit). If all workers are in progress, the task is queued until one becomes available. Once a worker completes a task, that task's callback is called.
The above means that the queue can simply be used to priorities the async task a given worker can perform. The different async tasks can still be finished at different times.
Potential solutions
There are a few solutions to your problem, depending on your requirements.
You can only send one async call at a time and wait for the first one to finish before sending the next one
You store the results and only display the results to the user when all calls have finished
You disregard all calls except for the latest async call
In your case I would pick solution 3 as your are searching for something. Why would you use care about the results for "a" if they are already searching for "abc" before they get the response for "a"?
This can be done by giving each request a timestamp and then sort based on the timestamp taking the latest.
SOLUTION:
Server:
exports = module.exports = function(io){
io.sockets.on('connection', function (socket) {
socket.on('result', function(data, fn) {
var filter = data.filter;
var counter = data.counter;
if(filter.length === 1 || filter.length === 5) { // TEST SYNCHRONOUSLY
setTimeout(function() {
fn({ filter: filter, counter: counter}); // return to client
}, 3000);
} else {
fn({ filter: filter, counter: counter}); // return to client
}
});
});
}
Client:
export class FilterableDataTable extends Component {
constructor(props) {
super();
this.state = {
endpoint: "http://localhost:3001",
filters: {},
counter: 0
};
this.onLazyLoad = this.onLazyLoad.bind(this);
}
onLazyLoad(event) {
var offset = event.first;
if(offset === null) {
offset = 0;
}
var filter = ''; // filter is the search character
if(event.filters.result2 != undefined) {
filter = event.filters.result2.value;
}
var returnedData = null;
this.state.counter++;
this.socket.emit('result', {
offset: offset,
limit: 20,
filter: filter,
counter: this.state.counter
}, function(data) {
returnedData = data;
console.log(returnedData);
if(returnedData.counter === this.state.counter) {
console.log('DATA: ' + JSON.stringify(returnedData));
}
}
This however does send unneeded data to the client, which in return ignores it. Somebody any idea's for further optimizing this kind of communication? For example a method to keep old data at the server and only send the latest?

Querying WordPress WP-API for lots of posts (using promises!)

I am working on a Node.js application which uses the WordPress JSON API as a kind of headless CMS. When the application spins up, we query out to the WP database and pull in the information we need (using Axios), manipulate it, and store it temporarily.
Simple enough - but one of our post categories in the CMS has a rather large number of entries. For some godforsaken reason, WordPress has capped the API request limit to a maximum of 99 posts at a time, and requires that we write a loop that can send concurrent API requests until all the data has been pulled.
For instance, if we have 250 posts of some given type, I need to hit that route three separate times, specifying the specific "page" of data I want each time.
Per the docs, https://developer.wordpress.org/rest-api/using-the-rest-api/pagination/, I have access to a ?page= query string that I can use to send these requests concurrently. (i.e. ...&page=2)
I also have access to X-WP-Total in the headers object, which gives me the total number of posts within the given category.
However, these API calls are part of a nested promise chain, and the whole process needs to return a promise I can continue chaining off of.
The idea is to make it dynamic so it will always pull all of the data, and return it as one giant array of posts. Here's what I have, which is functional:
const request = require('axios');
module.exports = (request_url) => new Promise((resolve, reject) => {
// START WITH SMALL ARBITRARY REQUEST TO GET TOTAL NUMBER OF POSTS FAST
request.get(request_url + '&per_page=1').then(
(apiData) => {
// SETUP FOR PROMISE.ALL()
let promiseArray = [];
// COMPUTE HOW MANY REQUESTS WE NEED
// ALWAYS ROUND TOTAL NUMBER OF PAGES UP TO GET ALL THE DATA
const totalPages = Math.ceil(apiData.headers['x-wp-total']/99);
for (let i = 1; i <= totalPages; i++) {
promiseArray.push( request.get(`${request_url}&per_page=99&page=${i}`) )
};
resolve(
Promise.all(promiseArray)
.then((resolvedArray) => {
// PUSH IT ALL INTO A SINGLE ARRAY
let compiledPosts = [];
resolvedArray.forEach((axios_response) => {
// AXIOS MAKES US ACCESS W/RES.DATA
axios_response.data.forEach((post) => {
compiledPosts.push(post);
})
});
// RETURN AN ARRAY OF ALL POSTS REGARDLESS OF LENGTH
return compiledPosts;
}).catch((e) => { console.log('ERROR'); reject(e);})
)
}
).catch((e) => { console.log('ERROR'); reject(e);})
})
Any creative ideas to make this pattern better?
I have exactly the same question. In my case, I use Vue Resource :
this.$resource('wp/v2/media').query().then((response) => {
let pagesNumber = Math.ceil(response.headers.get('X-WP-TotalPages'));
for(let i=1; i <= pagesNumber; i++) {
this.$resource('wp/v2/media?page='+ i).query().then((response) => {
this.medias.push(response.data);
this.medias = _.flatten(this.medias);
console.log(this.medias);
});
}
I'm pretty sure there is a better workaround to achieve this.

A persistent multiset that is built up sequentially and processed at once for Node.js

In Node.js I am trying to get the following behaviour: During the runtime of my Express application I accumulate several IDs of objects that need further processing. For further processing, I need to transmit these IDs to a different service. The other service however, cannot handle a lot of requests, but rather requires batch transmission. Hence I need to accumulate a lot of individual requests to a bigger one while allowing persistence.
tl;dr — Over 15 minutes, several IDs are accumulated in my application, then after this 15 minute-window, they're all emitted at once. At the same time, the next windows is opened.
From my investigation, this is probably the abstract data type of a multiset: The items in my multiset (or bag) can have duplicates (hence the multi-), they're packaged by the time window, but have no index.
My infrastructure is already using redis, but I am not sure whether there's a way to accumulate data into one single job. Or is there? Are there any other sensible ways to achieve this kind of behavior?
I might be misunderstanding some of the subtlety of your particular situation, but here goes.
Here is a simple sketch of some code that processes a batch of 10 items at a time. The way you would do this differs slightly depending on whether the processing step is synchronous or asynchronous. You don't need anything more complicated than an array for this, since arrays have constant time push and length methods and that is the only thing you need to do. You may want to add another option to flush the batch after a given item in inserted.
Synchronous example:
var batch = [];
var batchLimit = 10;
var sendItem = function (item) {
batch.push(item);
if (item.length >= batchLimit) {
processBatchSynch(batch);
batch = [];
}
}
Asynchronous example:
// note that in this case the job of emptying the batch array
// has to be done inside the callback.
var batch = [];
var batchLimit = 10;
// your callback might look something like function(err, data) { ... }
var sendItem = function (item, cb) {
batch.push(item);
if (item.length >= batchLimit) {
processBatchAsync(batch, cb);
}
}
I came up with a npm module to solve this specific problem using a MySQL database for persistence:
persistent-bag: This is to bags like redis is to queues. A bag (or multiset) is filled over time and processed at once.
On instantiation of the object, the required table is created in the provided MySQL database if necessary.
var PersistentBag = require('persistent-bag');
var bag = new PersistentBag({
host: 'localhost',
port: '3306',
user: 'root',
password: '',
database: 'test'
});
Then items can be .add()ed during the runtime of any number of applications:
var item = {
title: 'Test item to store to bag'
};
bag.add(item, function (err, itemId) {
console.log('Item id: ' + itemId);
});
Working with the emitted, aggregated items every 15 minutes is done like in kue for redis by subscribing to .process():
bag.process(function worker(bag, done) {
// bag.data is now an array of all items
doSomething(bag.data, function () {
done();
});
});

Resources