Inconsistent request behavior in Node when requesting a large number of links? - node.js

I am currently using this piece of code to connect to a massive list of links (a total of 2458 links, dumped at https://pastebin.com/2wC8hwad) to get feeds from numerous sources, and to deliver them to users of my program.
It basically splits one massive array into multiple batches (arrays), then forks a process that requests each link in a batch and checks for a 200 status code. Only when a batch is complete is the next batch sent for processing, and when it's all done the forked process is disconnected. However, I'm facing an apparent inconsistency in how this logic performs, particularly the part where it requests the links.
const req = require('./request.js')
const process = require('child_process')
const linkList = require('./links.json')
let processor
console.log(`Total length: ${linkList.length}`) // 2458 links
const batchLength = 400
const batchList = [] // Contains batches (arrays) of links
let currentBatch = []
for (var i in linkList) {
  if (currentBatch.length < batchLength) currentBatch.push(linkList[i])
  else {
    batchList.push(currentBatch)
    currentBatch = []
    currentBatch.push(linkList[i])
  }
}
if (currentBatch.length > 0) batchList.push(currentBatch)
console.log(`Batch list length by default is ${batchList.length}`)
// cutDownBatchList(1)
console.log(`New batch list length is ${batchList.length}`)
const startTime = new Date()
getBatchIsolated(0, batchList)
let failCount = 0
function getBatchIsolated (batchNumber) {
  console.log('Starting batch #' + batchNumber)
  let completedLinks = 0
  const currentBatch = batchList[batchNumber]
  if (!processor) processor = process.fork('./request.js')
  for (var u in currentBatch) { processor.send(currentBatch[u]) }
  processor.on('message', function (linkCompletion) {
    if (linkCompletion === 'failed') failCount++
    if (++completedLinks === currentBatch.length) {
      if (batchNumber !== batchList.length - 1) setTimeout(getBatchIsolated, 500, batchNumber + 1)
      else finish()
    }
  })
}

function finish () {
  console.log(`Completed, time taken: ${((new Date() - startTime) / 1000).toFixed(2)}s. (${failCount}/${linkList.length} failed)`)
  processor.disconnect()
}
function cutDownBatchList (maxBatches) {
  for (var r = batchList.length - 1; batchList.length > maxBatches && r >= 0; r--) {
    batchList.splice(r, 1)
  }
  return batchList
}
Below is request.js, which uses needle. (However, for some strange reason it may hang indefinitely on a particular site - in that case, I just use this workaround.)
const needle = require('needle')

function connect (link, callback) {
  const options = {
    timeout: 10000,
    read_timeout: 8000,
    follow_max: 5,
    rejectUnauthorized: true
  }
  const request = needle.get(link, options)
    .on('header', (statusCode, headers) => {
      if (statusCode === 200) callback(null, link)
      else request.emit('err', new Error(`Bad status code (${statusCode})`))
    })
    .on('err', err => callback(err, link))
}

process.on('message', function (linkRequest) {
  connect(linkRequest, function (err, link) {
    if (err) {
      console.log(`Couldn't connect to ${link} (${err})`)
      process.send('failed')
    } else process.send('success')
  })
})
In theory, I think this should perform perfectly fine - it spawns a separate process to handle the dirty work in sequential batches so it's not overloaded, and it's very scalable. However, when using the full list of 2458 links split into 7 batches, I often get massive numbers of "socket hang up" errors on random batches on almost every trial, similar to what would happen if I requested all the links at once.
If I cut the number of batches down to 1 using the function cutDownBatchList, it performs perfectly fine on almost every trial. This is all happening on a Debian Linux VPS with two 3.1GHz vCores and 4 GB RAM from OVH, on Node v6.11.2.
One thing I also noticed is that if I increase the timeout to 30000 (30 sec) in request.js for 7 batches, it works as intended - yet it works perfectly fine with a much lower timeout when I cut it down to 1 batch. And if I request all 2458 links at once with the higher timeout, I also face no issues (which basically makes this mini algorithm useless, since the whole point of batching was to allow a lower timeout). This all goes back to the inconsistent behavior issue.
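Since concurrency within a batch is the one variable I haven't isolated yet, this is the kind of in-flight cap I've considered adding to request.js (just a sketch, not my current code - queue, inFlight and limit are names I made up):
// Queue incoming links and keep at most `limit` requests in flight,
// instead of firing a whole batch's worth of sockets at once.
const queue = []
let inFlight = 0
const limit = 50 // made-up value, would need tuning

function next () {
  if (inFlight >= limit || queue.length === 0) return
  inFlight++
  connect(queue.shift(), function (err) {
    inFlight--
    process.send(err ? 'failed' : 'success')
    next()
  })
}

process.on('message', function (linkRequest) {
  queue.push(linkRequest)
  next()
})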
The best TLDR I can do: trying to request a bunch of links in sequential batches in a forked child process succeeds almost every time with a lower number of batches, yet fails consistently with the full number of batches, even though the behavior should be the same since each batch is handled in isolation.
Any help would be greatly appreciated in solving this issue as I just cannot for the life of me figure it out!

Related

Why is my read query to Firebase Realtime Database so slow?

I have a database in a Firebase Realtime Database with data that looks like this:
root
|_ history
   |_ {userId}
      |_ {n1}
      |  |_ ...
      |_ {n2}
      |_ {n...}
Nodes n are keyed with a date integer value. Each n node has at least 60 keys, with some values being arrays, max 5 levels deep.
Query times were measured in a fashion similar to this:
const startTime = performance.now();
await query();
const endTime = performance.now();
logger.info(`Query completed in ${endTime - startTime} ms`);
I have a function that queries for n nodes under history/${userId} with keys between and inclusive of the start and end values:
await admin
  .database()
  .ref(`history/${userId}`)
  .orderByKey()
  .startAt(`${start}`)
  .endAt(`${end}`)
  .once("value")
This query is executed in a callable cloud function. It currently takes approximately 2-3 seconds, returning approximately 225 nodes. The total number of n nodes is currently less than 300. Looking through my logs, it looks like queries that returned 0 nodes took approximately 500 milliseconds.
Why are the queries so slow? Am I misunderstanding something about Firebase's Realtime Database?
I've run a few performance tests for you to compare against.
I populated my database with this script:
for (var i=0; i < 500; i++) {
  ref.push({
    refresh_at: Date.now() + Math.round(Math.random() * 60 * 1000)
  });
}
This led to a JSON of this form:
{
  "-MlWgH51ia7Iz7ubZb7K" : {
    "refresh_at" : 1633726623247
  },
  "-MlWgH534FgMlb7J4bH2" : {
    "refresh_at" : 1633726586126
  },
  "-MlWgH54gd-uW_M7e6J-" : {
    "refresh_at" : 1633726597651
  },
  ...
}
When retrieved in its entirety through the API, the snapshot.val() for this JSON is 26,001 characters long.
Client-side JavaScript SDK in jsbin
I first tested with the regular client-side JavaScript SDK in a jsbin, using a simple script similar to yours. Adapted for jsbin, the code I ran is:
ref.orderByChild("refresh_at")
  .endAt(Date.now())
  .limitToLast(1000) // 👈 This is what we'll vary
  .once("value")
  .then(function(snapshot) {
    var endTime = performance.now();
    console.log('Query completed in '+Math.round(endTime - startTime)+'ms, retrieved '+snapshot.numChildren()+" nodes, for a total JSON size of "+JSON.stringify(snapshot.val()).length+" chars");
  });
Running it a few times, and changing the limit that I marked, leads to:
Limit | Snapshot size | Average time in ms
------|---------------|-------------------
500   | 26,001        | 350ms - 420ms
100   | 5,201         | 300ms - 350ms
10    | 521           | 300ms - 320ms
Node.js Admin SDK
I ran the same test against the exact same data set with a local Node.js Admin SDK script, modified to run 10 times:
for (var i=0; i < 10; i++) {
  const startTime = Date.now();
  const snapshot = await ref.orderByChild("refresh_at")
    .endAt(Date.now())
    .limitToLast(10)
    .once("value")
  const endTime = Date.now();
  console.log('Query completed in '+Math.round(endTime - startTime)+'ms, retrieved '+snapshot.numChildren()+" nodes, for a total JSON size of "+JSON.stringify(snapshot.val()).length+" chars");
};
The results:
Limit | Snapshot size | Time in ms
------|---------------|-----------------------------------------------------------
500   | 26,001        | 507ms, 78ms, 70ms, 65ms, 65ms, 61ms, 64ms, 65ms, 81ms, 62ms
100   | 5,201         | 442ms, 59ms, 56ms, 59ms, 55ms, 54ms, 54ms, 55ms, 57ms, 56ms
10    | 521           | 437ms, 52ms, 49ms, 52ms, 51ms, 51ms, 52ms, 50ms, 52ms, 50ms
So what you can see is that the first run is similar to (but slightly slower than) the JavaScript SDK, and subsequent runs are then a lot faster. This makes sense, as on the initial run the client establishes its (web socket) connection to the database server, which includes a few round trips to determine the right server. Subsequent calls seem more bandwidth-constrained.
Ordering by key
I also tested with ref.orderByKey().startAt("-MlWgH5QUkP5pbQIkVm0").endAt("-MlWgH5Rv5ij42Vel5Sm") in Node.js and got very similar results to ordering by child.
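For reference, a minimal sketch of that key-range test (same ref and timing pattern as the script above; the keys come from my generated data set):
const startTime = Date.now();
const snapshot = await ref.orderByKey()
  .startAt("-MlWgH5QUkP5pbQIkVm0")
  .endAt("-MlWgH5Rv5ij42Vel5Sm")
  .once("value");
console.log('Query completed in ' + (Date.now() - startTime) + 'ms, retrieved ' + snapshot.numChildren() + ' nodes');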
Add the field that you are using for the query to the Realtime Database rules.
For example
{
  "rules": {
    ".read": "auth.uid != null",
    ".write": "auth.uid != null",
    "v1": {
      "history": {
        ".indexOn": "refresh_at"
      }
    }
  }
}

MacOS Catalina freezing+crashing after running Node.JS load test script

I wrote up a simple load-testing script that runs N hits to an HTTP endpoint over M async parallel lanes. Each lane waits for the previous request to finish before starting a new one. For my specific use case, the script randomly picks a numeric "width" parameter to add to the URL each time. The endpoint returns between 200k and 900k of image data on each request, depending on the width parameter, but my script does not care about this data and simply relies on garbage collection to clean it up.
const fetch = require('node-fetch');

const MIN_WIDTH = 200;
const MAX_WIDTH = 1600;
const loadTestUrl = `
http://load-testing-server.com/endpoint?width={width}
`.trim();

async function fetchAll(url) {
  const res = await fetch(url, {
    method: 'GET'
  });
  if (!res.ok) {
    throw new Error(res.statusText);
  }
  // Note: the response body is never read here; as described above,
  // the script relies on garbage collection to clean it up.
}
async function doSingleRun(runs, id) {
  const runStart = Date.now();
  console.log(`(id = ${id}) - Running ${runs} times...`);
  for (let i = 0; i < runs; i++) {
    const start = Date.now();
    const width = Math.floor(Math.random() * (MAX_WIDTH - MIN_WIDTH)) + MIN_WIDTH;
    try {
      const result = await fetchAll(loadTestUrl.replace('{width}', `${width}`));
      const duration = Date.now() - start;
      console.log(`(id = ${id}) - Width ${width} Success. ${i+1}/${runs}. Duration: ${duration}`)
    } catch (e) {
      const duration = Date.now() - start;
      console.log(`(id = ${id}) - Width ${width} Error fetching. ${i+1}/${runs}. Duration: ${duration}`, e)
    }
  }
  console.log(`(id = ${id}) - Finished run. Duration: ` + (Date.now() - runStart));
}
(async function () {
  const RUNS = 200;
  const parallelRuns = 10;
  const promises = [];
  const parallelRunStart = Date.now();
  console.log(`Running ${parallelRuns} parallel runs`)
  for (let i = 0; i < parallelRuns; i++) {
    promises.push(doSingleRun(RUNS, i))
  }
  await Promise.all(promises);
  console.log(`Finished parallel runs. Duration ${Date.now() - parallelRunStart}`)
})();
When I run this in Node 14.17.3 on my MacBook Pro running MacOS 10.15.7 (Catalina) with even a modest parallel lane number of 3, after about 120 (x 3) hits of the endpoint the following happens in succession:
1. Console output ceases in the terminal for the script, indicating the script has halted.
2. Other applications such as my browser are unable to make network connections.
3. Within 1-2 minutes, other applications on my machine begin to slow down and eventually freeze up.
4. My entire system crashes with a kernel panic and the machine reboots.
panic(cpu 2 caller 0xffffff7f91ba1ad5): userspace watchdog timeout: remoted connection watchdog expired, no updates from remoted monitoring thread in 60 seconds, 30 checkins from thread since monitoring enabled 640 seconds ago after loadservice: com.apple.logd, total successful checkins since load (642 seconds ago): 64, last successful checkin: 10 seconds ago
service: com.apple.WindowServer, total successful checkins since load (610 seconds ago): 60, last successful checkin: 10 seconds ago
I can very easily stop the progression of these symptoms by doing a Ctrl+C in the terminal and force quitting the script. Everything quickly gets back to normal, and I can repeat the experiment multiple times before allowing it to crash my machine.
I've monitored Activity Monitor during the progression: there is very little (~1%) CPU usage, memory usage reaches maybe 60-70 MB, and it is pretty evident that network activity peaks during the script's run.
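One thing I may try (purely an assumption on my part, not a confirmed diagnosis) is draining each response body so sockets are released promptly - as written, fetchAll never consumes the body:
async function fetchAll(url) {
  const res = await fetch(url, { method: 'GET' });
  if (!res.ok) {
    throw new Error(res.statusText);
  }
  // Read and discard the body so the underlying socket can be freed
  // right away instead of lingering until garbage collection.
  await res.buffer();
}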
In my search for others with this problem, only two Stack Overflow questions came close:
node.js hangs other programs on my mac
Node script causes system freeze when uploading a lot of files
Anyone have any idea why this would happen? It seems very dangerous that a single app/script could so easily bring down a machine without being killed first by the OS.

How can I run multiple instances of a for loop in NodeJS?

I have a function which returns the usage of a CPU core with the help of a library called cpu-stat:
const cpuStat = require('cpu-stat')

var coreCount = cpuStat.totalCores()
var memArr = []

function getCoreUsage(i) {
  return new Promise(async(resolve) => {
    if (i === 0) {
      cpuStat.usagePercent({coreIndex: i, sampleMs: 1000},
        async function(err, percent, seconds) {
          if (err) {resolve(console.log(err))}
          x = await percent
          resolve("Core0: " + x.toFixed(2) + "%");
        });
    } else {
      cpuStat.usagePercent({coreIndex: i, sampleMs: 1000},
        async function(err, percent, seconds) {
          if (err) {resolve(console.log(err))}
          x = await percent
          resolve(x);
        });
    }
  })
}
This function is called whenever a client requests a specific route:
function singleCore() {
  return new Promise(async(resolve) => {
    for (i=0; i <= coreCount; i++) {
      if (i < coreCount) {core = await getCoreUsage(i), memArr.push(core)}
      else if (i === coreCount) {resolve(memArr), memArr = []}
    }
  })
}
Now, this works just fine on machines that have fewer than 8 cores. The problem I am running into is that on a high-core-count CPU like a Xeon or a Threadripper, with 56 or 64 cores, getting the usage takes close to a minute. To solve this, I thought of executing the loop body for each core on different threads so the time comes down to one or two seconds (high-core-count CPUs have a lot of threads as well, so this probably won't be a problem).
But, I can't figure out how to do this. I looked into the child_process documentation and I think this can probably be done. Please correct me if I am wrong. Also, please suggest a better way if you know one.
This usagePercent function works by:
1. looking at the cycle-count values in os.cpus()[index] from the os package,
2. delaying for the chosen time, probably with setTimeout, and
3. looking at the cycle counts again and computing the difference.
You'll get reasonably valid results if you use much shorter time intervals than one second.
Or you can rework the code in the package to do the computation for all cores in step 3 and return an array rather than just one number.
Or you can use Promise.all() to run these tests concurrently.
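For example, here's a minimal sketch of the Promise.all() approach, reusing getCoreUsage() from the question (allCores is a name I made up): all per-core samples start at once, so the total wait is roughly one sampleMs rather than coreCount × sampleMs.
async function allCores() {
  const jobs = [];
  for (let i = 0; i < coreCount; i++) {
    jobs.push(getCoreUsage(i)); // each core starts sampling immediately
  }
  return Promise.all(jobs); // resolves to an array of per-core results
}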

CosmosDB insertion loop stops inserting after a certain number of iterations (Node.js)

I'm doing a few tutorials on CosmosDB. I've got the database set up with the Core (SQL) API, and I'm using Node.js to interface with it. For development, I'm using the emulator.
This is the bit of code that I'm running:
const CosmosClient = require('@azure/cosmos').CosmosClient

process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";

const options = {
  endpoint: 'https://localhost:8081',
  key: REDACTED,
  userAgentSuffix: 'CosmosDBJavascriptQuickstart'
};

const client = new CosmosClient(options);

(async () => {
  let cost = 0;
  let i = 0
  while (i < 2000) {
    i += 1
    console.log(i+" Creating record, running cost:"+cost)
    let response = await client.database('TestDB').container('TestContainer').items.upsert({}).catch(console.log);
    cost += response.requestCharge;
  }
})()
This, without fail, stops at around iteration 1565 and doesn't continue. I've tried it with different payloads without much difference (it may do a few more or a few fewer iterations, but it almost always stops around that number).
On the flip side, a similar .NET Core example works great to insert 10,000 documents:
double cost = 0.0;
int i = 0;
while (i < 10000)
{
    i++;
    ItemResponse<dynamic> resp = await this.container.CreateItemAsync<dynamic>(new { id = Guid.NewGuid() });
    cost += resp.RequestCharge;
    Console.WriteLine("Created item {0} Operation consumed {1} RUs. Running cost: {2}", i, resp.RequestCharge, cost);
}
So I'm not sure what's going on.
So, after a bit of fiddling, this doesn't seem to have anything to do with CosmosDB or its library.
I was running this in the debugger, and Node would just crap out after x iterations. I noticed that if I didn't use a console.log it would actually work. Also, if I ran the script with node file.js it also worked. So there seems to be some sort of issue with debugging the script while also printing to the console. Not exactly sure what's up with that, but I'm going to go ahead and mark this as solved.

fs.readFileSync seems faster than fs.readFile - is it OK to use for a web app in production?

I know that when developing in Node you should always try to avoid blocking (sync) functions and go with async functions; however, I did a little test to see how they compare.
I need to open a JSON file that contains i18n data (like date and time formats, etc) and pass that data to a class that uses this data to format numbers, etc in my view.
It would be kind of awkward to start wrapping all the class's methods inside callbacks, so if possible, I would use the synchronous version instead.
console.time('one');
console.time('two');
fs.readFile( this.dir + "/" + locale + ".json", function (err, data) {
  if (err) cb( err );
  console.timeEnd('one');
});
var data = fs.readFileSync( this.dir + "/" + locale + ".json" );
console.timeEnd('two');
This results in the following lines in my console:
two: 1ms
one: 159ms
It seems that fs.readFileSync is about 150 times faster than fs.readFile - it takes about 1 ms to load a 50KB JSON file (minified). All my JSON files are around 50-100KB.
I was also thinking of somehow memoizing or saving this JSON data to the session, so that the file is read only once per session (or when the user changes their locale). I'm not entirely sure how to do that; it's just an idea.
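Something like this minimal sketch is what I have in mind (getLocaleData is a made-up helper, and I'm assuming locale files never change while the app runs):
const fs = require('fs');
const path = require('path');

const cache = {}; // locale -> parsed i18n data, read at most once per process

function getLocaleData(dir, locale, cb) {
  if (cache[locale]) return process.nextTick(cb, null, cache[locale]);
  fs.readFile(path.join(dir, locale + '.json'), 'utf8', function (err, data) {
    if (err) return cb(err);
    try {
      cache[locale] = JSON.parse(data);
    } catch (e) {
      return cb(e);
    }
    cb(null, cache[locale]);
  });
}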
Is it okay to use fs.readFileSync in my case or will I get in trouble later?
No, it is not OK to use a blocking API call in a node server as you describe. Your site's responsiveness to many concurrent connections will take a huge hit. It's also just blatantly violating the #1 principle of node.
The key to node working is that while it is waiting on IO, it is doing CPU/memory processing at the same time. This requires asynchronous calls exclusively. So if you have 100 clients reading 100 JSON files, node can ask the OS to read those 100 files but while waiting for the OS to return the file data when it is available, node can be processing other aspects of those 100 network requests. If you have a single synchronous call, ALL of your client processing stops entirely while that operation completes. So client number 100's connection waits with no processing whatsoever while you read files for clients 1, 2, 3, 4, and so on sequentially. This is Failville.
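To make that concrete, here's a toy sketch of the two styles side by side (hypothetical handler names; res stands for any response object):
const fs = require('fs');

// Non-blocking: node can have 100 of these in flight at once; the event
// loop keeps serving other clients while the OS reads each file.
function handleRequestAsync(locale, res) {
  fs.readFile(locale + '.json', 'utf8', function (err, data) {
    if (err) return res.end('error');
    res.end(data);
  });
}

// Blocking: every other connection waits while this read completes.
function handleRequestSync(locale, res) {
  var data = fs.readFileSync(locale + '.json', 'utf8'); // event loop stalls here
  res.end(data);
}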
Here's another analogy. If you went to a restaurant and were the only customer, you would probably get faster service if a single person sat you, took your order, cooked it, served it to you, and handled the bill without the coordination overhead of dealing with the host/hostess, server, head chef, line cooks, cashiers, etc. However, with 100 customers in the restaurant, the extra coordination means things happen in parallel and the overall responsiveness of the restaurant is increased way beyond what it would be if a single person were trying to handle 100 customers on their own.
You are blocking the callback of the asynchronous read with your synchronous read - remember, Node is single-threaded.
Now, I understand that the time difference is still amazing, but you should try with a file that takes much, much longer to read, and imagine that many, many clients will do the same; only then will the overhead pay off.
That should answer your question: yes, you will run into trouble if you are serving thousands of requests with blocking IO.
After a lot of time and a lot of learning and practice, I've tried once more, and I've found the answer and can show some example:
const fs = require('fs');

const syncTest = () => {
  let startTime = +new Date();
  const results = [];
  const files = [];
  for (let i=0, len=4; i<len; i++) {
    files.push(fs.readFileSync(`file-${i}.txt`));
  };
  for (let i=0, len=360; i<len; i++) results.push(Math.sin(i), Math.cos(i));
  console.log(`Sync version: ${+new Date() - startTime}`);
};

const asyncTest = () => {
  let startTime = +new Date();
  const results = [];
  const files = [];
  for (let i=0, len=4; i<len; i++) {
    // Callback receives (err, data); note the timing log below fires
    // before these reads have completed.
    fs.readFile(`file-${i}.txt`, (err, file) => files.push(file));
  };
  for (let i=0, len=360; i<len; i++) results.push(Math.sin(i), Math.cos(i));
  console.log(`Async version: ${+new Date() - startTime}`);
};

syncTest();
asyncTest();
Yes, it's correct to handle things asynchronously in a server-side environment. But if the use case is different - say, generating a build in a client-side JS project, reading and writing JSON files for different flavors along the way - it doesn't matter that much. We needed a fast way to create a minified build for deployment, and that's where the synchronous calls come into the picture.
I've tried to measure the real difference in speed between fs.readFileSync() and fs.readFile() when reading 3 different files from an SD card, with some math calculation between the reads, and I don't understand where the speed difference always shown in Node diagrams comes from - the ones where Node is faster even at simple operations like reading the same file 3 times in roughly the time needed to read it once.
I understand that it is undoubtedly useful that the server can do other work while it reads a file, but a lot of diagrams on YouTube or in books are not precise, because in a situation like the one below, async Node is slower than sync at reading small files (here: 85kB, 170kB, 255kB).
var fs = require('fs');

var startMeasureTime = () => {
  var start = new Date().getTime();
  return start;
};

// sync version
console.log('Start');
var start = startMeasureTime();
for (var i = 1; i <= 3; i++) {
  var fileName = `Lorem-${i}.txt`;
  var fileContents = fs.readFileSync(fileName);
  // The template below uses a literal ${1}, which is why every line of
  // the console output says "File 1".
  console.log(`File ${1} was downloaded(${fileContents.length/1000}KB) after ${new Date().getTime() - start}ms from start.`);
  if (i === 1) {
    var hardMath = 3*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9;
  };
};

// async version
setImmediate(() => {
  console.log('Start');
  var start = startMeasureTime();
  for (var i = 1; i <= 3; i++) {
    var fileName = `Lorem-${i}.txt`;
    fs.readFile(fileName, {encoding: 'utf8'}, (err, fileContents) => {
      console.log(`File ${1} was downloaded(${fileContents.length/1000}KB) after ${new Date().getTime() - start}ms from start.`);
    });
    if (i === 1) {
      var hardMath = 3*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9;
    };
  };
});
This is from console:
Start
File 1 was downloaded(255.024KB) after 2ms from start.
File 1 was downloaded(170.016KB) after 5ms from start.
File 1 was downloaded(85.008KB) after 6ms from start.
Start
File 1 was downloaded(255.024KB) after 10ms from start.
File 1 was downloaded(85.008KB) after 11ms from start.
File 1 was downloaded(170.016KB) after 12ms from start.
