Node - Readable stream pipe() overwrites previous streams in a for loop - node.js

I am trying to stream a data collection to multiple files with the code below:
for (var key in data) {
  // skip if collection length is 0
  if (data[key].length > 0) {
    // Use the key and jobId to open file for appending
    let filePath = folderPath + '/' + key + '_' + jobId + '.txt';
    // Using stream to append the data output to file, which should perform better when file gets big
    let rs = new Readable();
    let n = data[key].length;
    let i = 0;
    rs._read = function () {
      rs.push(data[key][i++]);
      if (i === n) {
        rs.push(null);
      }
    };
    rs.pipe(fs.createWriteStream(filePath, {flags: 'a', encoding: 'utf-8'}));
  }
}
However, I end up with all files populated with the same data, which is the array for the last key in the data object. It seems the readable stream is overwritten on each loop iteration, and the pipe() into the writable stream doesn't start until the for loop has finished. How is that possible?

The reason your code is probably not working is that the rs._read method is called asynchronously, and your key variable is function-scoped (because of the var keyword).
Every rs stream you create closes over the same key variable, so by the end of the main loop each of those callbacks sees the same value.
If you change var to let, a new key variable is created in each iteration, which solves the problem: each _read function gets its own copy of key instead of a shared one.
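As a minimal sketch, that change applied to the loop from the question looks like this; only the loop declaration differs, everything else stays the same:

for (let key in data) {            // `let` gives each iteration its own `key`
  if (data[key].length > 0) {
    let filePath = folderPath + '/' + key + '_' + jobId + '.txt';
    let rs = new Readable();
    let n = data[key].length;
    let i = 0;
    rs._read = function () {
      rs.push(data[key][i++]);     // now closes over this iteration's `key`
      if (i === n) rs.push(null);
    };
    rs.pipe(fs.createWriteStream(filePath, {flags: 'a', encoding: 'utf-8'}));
  }
}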

This is happening because the key you're defining in the loop statement is not block-scoped. This is not a problem at first, but when you create a closure over it inside the rs._read function, all subsequent stream reads use the last known value, which is the last key in the data object.
While we're at it, here is a bit of refactoring to make the code cleaner and more reusable:
const writeStream = (folderPath, index, jobId) => {
  const filePath = `${folderPath}/${index}_${jobId}.txt`;
  return fs.createWriteStream(filePath, {
    flags: 'a', encoding: 'utf-8'
  });
}

// data is a plain object in the question, so iterate its entries
// rather than calling forEach on it directly
Object.entries(data).forEach(([key, value]) => {
  const length = value.length;
  if (length > 0) {
    const rs = new Readable();
    const n = length;
    let i = 0;
    rs._read = () => {
      rs.push(value[i++]);
      if (i === n) rs.push(null);
    }
    rs.pipe(writeStream(folderPath, key, jobId));
  }
});
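If you're on a recent Node version (12+), stream.Readable.from() can build the readable stream straight from each array, so the manual _read implementation isn't needed at all. A sketch, not part of the original answer, assuming the same data, folderPath and jobId:

const { Readable } = require('stream');

Object.entries(data).forEach(([key, value]) => {
  if (value.length === 0) return;
  const filePath = `${folderPath}/${key}_${jobId}.txt`;
  // Readable.from turns any iterable (here, the array of chunks) into a readable stream
  Readable.from(value).pipe(fs.createWriteStream(filePath, { flags: 'a', encoding: 'utf-8' }));
});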

Related

NodeJS debugging coderbyte Task

In this Node.js debugging challenge, you have a program that performs a GET request on the route https://coderbyte.com/api/challenges/json/age-counting, which returns JSON containing a data key whose value is a string of items in the format key=string, age=integer. Your goal is to print out the key IDs between indices 10 and 15 in string format (e.g. q3kg6,mgqpf,tg2vm,...). The program provided already parses the data and loops through each item, but you need to figure out how to fix it so that it correctly populates keyArray with only the key IDs.
Here is the code given by Coderbyte:
const https = require('https');

https.get('https://coderbyte.com/api/challenges/json/age-counting', (resp) => {
  let content = '';

  // push data character by character to "content" variable
  resp.on('data', (c) => content += c);

  // when finished reading all data, parse it for what we need now
  resp.on('end', () => {
    let jsonContent = JSON.parse(content);
    let jsonContentArray = jsonContent.data.split(',');
    let keyArray = [];
    for (let i = 0; i < jsonContentArray.length; i++) {
      let keySplit = jsonContentArray[i].trim().split('=');
      keyArray.push(keySplit[1]);
    }
    console.log(keyArray.toString());
  });
});
Here is a corrected version that skips the age entries and only collects the key IDs in the requested range:

const https = require('https');

https.get('https://coderbyte.com/api/challenges/json/age-counting', (resp) => {
  let content = '';

  // push data character by character to "content" variable
  resp.on('data', (c) => content += c);

  // when finished reading all data, parse it for what we need now
  resp.on('end', () => {
    let jsonContent = JSON.parse(content);
    let jsonContentArray = jsonContent.data.split(',');
    let keyArray = [];
    console.log(jsonContentArray.length);
    // The data alternates key=..., age=..., so the key entries sit at the even
    // indices; stepping by 2 visits only the keys and ignores the ages.
    // Example of the data string:
    // "data":"key=IAfpK, age=58, key=WNVdi, age=64, key=jp9zt, age=47, key=0Sr4C, age=68, key=CGEqo, age=76, key=IxKVQ, age=79, key=eD221, age=29, key=XZbHV, age=32, key=k1SN5, age=88, .....etc
    for (let i = 0; i < jsonContentArray.length; i += 2) {
      // extract only the key ID by trimming the whitespace, splitting the entry
      // on "=" and taking the second element (the ID after "key=")
      let key = jsonContentArray[i].trim().split('=')[1];
      // since keys are at every second position, the key's index is i / 2,
      // so i from 20 up to (but not including) 30 covers the keys between indices 10 and 15
      if (i >= 20 && i < 30) {
        keyArray.push(key);
      }
    }
    console.log(keyArray);
  });
});

Can't get variable value in Node.js

I tried to make the function async, but when I print attacks it prints {} with nothing in it, while when I print the values right after adding them to attacks I can see them. Why is that, and how can I use the value?
var fs = require('fs');
var http = require('http');

var attacks = {};
var phase_name;
var directory = 'cti-master\\enterprise-attack\\attack-pattern\\';

// getting all files names.
async function getData(directory) {
  fs.readdir(directory, (err, files) => {
    if (err) { return; }
    var fileNum = 0;
    // opening all the files and sorting the data in them.
    while (fileNum < files.length - 1) {
      fs.readFile(directory + files[fileNum], 'utf8', (err, data) => {
        // parsing the data from json.
        var fileData = JSON.parse(data);
        // sometimes there is no phase name.
        if (fileData['objects'][0]['kill_chain_phases'] == undefined) { phase_name = undefined; }
        else { phase_name = fileData['objects'][0]['kill_chain_phases'][0]['phase_name']; }
        // sorting data by name to make it easier later.
        attacks[fileData['objects'][0]['name']] = {
          id: fileData['objects'][0]['id'],
          type: fileData['objects'][0]['type'],
          description: fileData['objects'][0]['description'],
          x_mitre_platforms: fileData['objects'][0]['x_mitre_platforms'],
          x_mitre_detection: fileData['objects'][0]['x_mitre_detection'],
          phase_name: phase_name
        };
      });
      fileNum += 1;
    }
  });
  var keys = Object.keys(attacks);
  console.log(attacks);
}
getData(directory);
The log is empty because your console.log runs before the asynchronous fs.readFile callbacks have finished; Node does not wait for them. Basically, you can improve this code by using the async/await method.
But if you want to stick with this code, I am just suggesting this logic: move your log inside an if block whose condition is "print only once the expected file count has been reached".
For example:

if (fileNum === files.length) {
  var keys = Object.keys(attacks);
  console.log(attacks);
}

So now the log prints only when this condition is satisfied, which means after all the files have been processed.
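For the async/await route mentioned above, a minimal sketch (assuming the same directory layout and JSON structure as in the question) could look like this:

const fs = require('fs');
const path = require('path');

const directory = 'cti-master\\enterprise-attack\\attack-pattern\\';

async function getData(directory) {
  const attacks = {};
  // fs.promises.readdir/readFile return promises, so await pauses until each one finishes
  const files = await fs.promises.readdir(directory);
  for (const file of files) {
    const data = await fs.promises.readFile(path.join(directory, file), 'utf8');
    const obj = JSON.parse(data)['objects'][0];
    attacks[obj['name']] = {
      id: obj['id'],
      type: obj['type'],
      description: obj['description'],
      x_mitre_platforms: obj['x_mitre_platforms'],
      x_mitre_detection: obj['x_mitre_detection'],
      // sometimes there is no phase name
      phase_name: obj['kill_chain_phases'] ? obj['kill_chain_phases'][0]['phase_name'] : undefined
    };
  }
  return attacks;
}

// attacks is only complete once the returned promise resolves
getData(directory).then((attacks) => console.log(attacks));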

wait for writestream to finish before executing next function

I have two functions.
The first function reads all the files in a folder and writes their data to a new file.
The second function takes that new file (output of function 1) as input and creates another file. Therefore it has to wait until the write stream of function 1 has finished.
const fs = require('fs');
const path = require('path');

async function f1(inputDir, outputFile) {
  let stream = fs.createWriteStream(outputFile, {flags: 'a'}); // new data should be appended to outputFile piece by piece (hence flag a)
  let files = await fs.promises.readdir(inputDir);
  for (let file of files) {
    let pathOfCurrentFile = path.join(inputDir, file);
    let stat = fs.statSync(pathOfCurrentFile);
    if (stat.isFile()) {
      let data = fs.readFileSync(pathOfCurrentFile, 'utf8');
      // now the data is being modified for output
      let result = data + 'other stuff';
      stream.write(result);
    }
  }
  stream.end();
}
function f2(inputFile, outputFile) {
  let newData = doStuffWithMy(inputFile);
  let stream = fs.createWriteStream(outputFile);
  stream.write(newData);
  stream.end();
}

f1('myFiles', 'myNewFile.txt');
f2('myNewFile.txt', 'myNewestFile.txt');
Here's what happens:
'myNewFile.txt' (output of f1) is created correctly
'myNewestFile.txt' is created but is either empty or only contains one or two words (it should contain a long text)
When I use a timeout before executing f2 it works fine, but I can't use a timeout because there can be thousands of input files in inputDir, so I need a way to do this dynamically.
I've experimented with async/await, callbacks, promises etc., but that stuff seems to be a little too advanced for me; I couldn't get it to work.
Is there anything else I can try?
Since you asked about a synchronous version, here's what that could look like. This should only be used in a single user script or in startup code, not in a running server. A server should only use asynchronous file I/O.
// synchronous version
function f1(inputDir, outputFile) {
  let outputHandle = fs.openSync(outputFile, "a");
  try {
    let files = fs.readdirSync(inputDir, {withFileTypes: true});
    for (let f of files) {
      if (f.isFile()) {
        let pathOfCurrentFile = path.join(inputDir, f.name);
        let data = fs.readFileSync(pathOfCurrentFile, 'utf8');
        fs.writeSync(outputHandle, data);
      }
    }
  } finally {
    fs.closeSync(outputHandle);
  }
}

function f2(inputFile, outputFile) {
  let newData = doStuffWithMy(inputFile);
  fs.writeFileSync(outputFile, newData);
}

f1('myFiles', 'myNewFile.txt');
f2('myNewFile.txt', 'myNewestFile.txt');
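If you later want to move back to streams, one asynchronous option (a sketch, not part of the answer above; f1Async is a hypothetical name) is to make f1 async and wait for the write stream's 'finish' event before starting f2, using events.once (Node 11.13+):

const { once } = require('events');

async function f1Async(inputDir, outputFile) {
  const stream = fs.createWriteStream(outputFile, { flags: 'a' });
  const files = await fs.promises.readdir(inputDir, { withFileTypes: true });
  for (const f of files) {
    if (f.isFile()) {
      const data = await fs.promises.readFile(path.join(inputDir, f.name), 'utf8');
      stream.write(data + 'other stuff'); // same modification as in the question
    }
  }
  stream.end();
  await once(stream, 'finish'); // resolves only after everything has been flushed to the file
}

f1Async('myFiles', 'myNewFile.txt')
  .then(() => f2('myNewFile.txt', 'myNewestFile.txt'));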

wait for previous stream to be empty before allowing reading

Say I have a file that contains a list of integers, one per line. I use fs.createReadStream and pipe that into split (so that each chunk is an integer). Then I pipe that into a duplex stream that is supposed to add the numbers and write the sum by piping into fs.createWriteStream.
var fs = require('fs');
var stream = require('stream');
var split = require('split');

var addIntegers = new stream.Duplex();
addIntegers.sum = 0;

addIntegers._read = function(size) {
  this.push(this.sum + '\n');
}

addIntegers._write = function(chunk, encoding, done) {
  this.sum += +chunk;
  done();
}

fs.createReadStream('list-of-integers.txt')
  .pipe(split())
  .pipe(addIntegers)
  .pipe(fs.createWriteStream('sum.txt'));
When I run this, sum.txt just gets continually filled with zeroes and the program never terminates (as expected). How do I wait for the input stream (split) to be empty before allowing the output stream (fs.createWriteStream) to read from addIntegers?
I figured it out.
I decided to use a Transform stream instead (thanks mscdex) because it has a method (_flush) that gets called after all written data is consumed. The working code is below. Don't forget to npm i split :)
var fs = require('fs');
var stream = require('stream');
var split = require('split');

var addIntegers = new stream.Transform();
addIntegers.sum = 0;

addIntegers._transform = function(chunk, encoding, done) {
  this.sum += +chunk;
  done();
}

addIntegers._flush = function(done) {
  this.push(this.sum + '\n');
  done(); // signal that flushing is complete so the stream can finish
}

fs.createReadStream('list-of-integers.txt')
  .pipe(split())
  .pipe(addIntegers)
  .pipe(fs.createWriteStream('sum.txt'));

Reasons why a node stream read() function might return null?

I am building a node application that reads a CSV file from the file system, analyzes the file, and then parses the file using the csv-parse module. Each of these steps is in the form of a stream, piped one into the next.
The trouble I am having is that for some files, the parse step can read the stream, but for others, the read() method returns null on the readable event and I don't know why.
In the code below, specifically, I will sometimes see data come through on calling read() on the parser stream, and other times it will return null. CSV files that succeed always succeed, and CSV files that fail always fail. I tried to determine some difference between the files, but other than using different field names in the first row, and slightly different data in the body, I can't see any significant difference between the source files.
What are some reasons that a node stream's read() function might return null after a readable event?
Sample code:
var parse = require('csv-parse');
var fs = require('fs');
var stream = require('stream');

var byteCounter = new stream.Transform({objectMode : true});
byteCounter.setEncoding('utf8');

var totalBytes = 0;

// count the bytes we have read
byteCounter._transform = function (chunk, encoding, done) {
  var data = chunk.toString();
  if (this._lastLineData) {
    data = this._lastLineData + data;
  }
  var lines = data.split('\n');
  // this is because each chunk will probably not end precisely at the end of a line:
  this._lastLineData = lines.splice(lines.length - 1, 1)[0];
  lines.forEach(function(line) {
    totalBytes += line.length + 1; // we add an extra byte for the end-of-line
    this.push(line);
  }, this);
  done();
};

byteCounter._flush = function (done) {
  if (this._lastLineData) {
    this.push(this._lastLineData);
  }
  this._lastLineData = null;
  done();
};

// csv parser
var parser = parse({
  delimiter: ",",
  comment: "#",
  skip_empty_lines: true,
  auto_parse: true,
  columns: true
});

parser.on('readable', function(){
  var row;
  while (null !== (row = parser.read())) {
    // do stuff
  }
});

// start by reading a file, piping to byteCounter, then pipe to parser.
var myPath = "/path/to/file.csv";
var options = {
  encoding : "utf-8"
};
fs.createReadStream(myPath, options).pipe(byteCounter).pipe(parser);
