Collect data from a readable stream into a variable - node.js

I'm trying to implement a writable stream that will save the data that is written to it into a variable. This is the implementation of the writable stream:
var util = require('util');
var Writable = require('stream').Writable;

function Collector()
{
    Writable.call(this, {objectMode: true});
    this.entities = [];
}

util.inherits(Collector, Writable);

Collector.prototype._write = function (chunk, encoding, callback)
{
    this.entities.push(chunk);
    callback();
};

module.exports = Collector;
and this is how I'm trying to test it out:
var fs = require('fs');
var Tokenizer = require('./tokenizer');
var Processor = require('../parser');
var Collector = require('./collector.js');

var tokenizer = new Tokenizer();
var processor = new Processor();
var collector = new Collector();

var readable = fs.createReadStream('./test/fixtures/test.dxf');

readable.pipe(tokenizer)
    .pipe(processor) // if this is piped to stdout, lots of data
    .pipe(collector);

console.log(collector.entities); // logs an empty array
I'm not sure why, but the entities property is empty after everything has been piped. If I console.log this.entities within the _write function, the data is available.
Ultimately I want to be able to call a function that returns an array whose elements are made up of data chunks from Processor. Collector was some hacking to see how I could do it, but I haven't gotten very far.
How can I store chunks from a readable stream into a variable and return them from a function?

It returns an empty array because the streaming has not finished yet. You should listen for the finish event in order to properly get your entities array:
var fs = require('fs');
var Tokenizer = require('./tokenizer');
var Processor = require('../parser');
var Collector = require('./collector.js');

var tokenizer = new Tokenizer();
var processor = new Processor();
var collector = new Collector();

var readable = fs.createReadStream('./test/fixtures/test.dxf');

readable.pipe(tokenizer)
    .pipe(processor)
    .pipe(collector)
    .on('finish', function() {
        console.log(collector.entities);
    });
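If you need this as a function that returns the collected chunks, one option (not shown in the original answer) is to wrap the pipeline in a Promise that resolves on the collector's finish event. A minimal sketch, reusing the modules from the question; collectEntities is a hypothetical helper name:
var fs = require('fs');
var Tokenizer = require('./tokenizer');
var Processor = require('../parser');
var Collector = require('./collector.js');

// Hypothetical helper: resolves with the collected entities once the pipeline finishes.
function collectEntities(path) {
    return new Promise(function (resolve, reject) {
        var collector = new Collector();
        fs.createReadStream(path)
            .pipe(new Tokenizer())
            .pipe(new Processor())
            .pipe(collector)
            .on('finish', function () { resolve(collector.entities); })
            .on('error', reject); // note: errors on the earlier streams need their own handlers
    });
}

collectEntities('./test/fixtures/test.dxf').then(function (entities) {
    console.log(entities);
});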

Related

Pause Node csv-parser when uploading data

I am working with Node csv-parser and my read stream is from stdin. I am piping that to the parser. Inside the parser's readable handler, I am making an async HTTP call to upload this data. During this time, I want the parser to pause reading until the async call has finished executing.
var parse = require('csv-parse');
var output = [];

// Create the parser
var parser = parse({delimiter: ',', columns: true, trim: true});

parser.on('readable', function () {
    while (record = parser.read()) {
        console.log('Still reading');
        var jsonRec = convertIpToInt(record);
        var jsonData = JSON.stringify(jsonRec);
        output.push(new Buffer(jsonData + "\n"));
        //parser.pause();
        var assertPromise = uploadBatches1(indexName, function() {
            //parser.resume();
            console.log('Returned');
        });
    }
});

function uploadBatches1(indexToAssert, cb) {
    //uploads data got from parser
}
Since csv-parse exposes a readable stream, you can pause the stream with parser.pause(). Likewise, when your upload is complete, you can call parser.resume(). You can also check whether you need to resume beforehand using parser.isPaused().
You can read more about readable streams and what the pause method does here:
https://nodejs.org/api/stream.html#stream_readable_pause
Based on the above, you should be handling the parser's 'data' event instead of the while (record = parser.read()) loop. Pausing the stream does nothing if you are actively calling the read method yourself.
Consider restructuring your code like so, and then pipe into the parser from a filesystem read (a short sketch of that follows the code):
var parse = require('csv-parse');
var output = [];

// Create the parser
var parser = parse({delimiter: ',', columns: true, trim: true});

parser.on('data', function (record) {
    console.log('Still reading');
    var jsonRec = convertIpToInt(record);
    var jsonData = JSON.stringify(jsonRec);
    output.push(new Buffer(jsonData + "\n"));
    parser.pause();
    var assertPromise = uploadBatches1(indexName, function() {
        parser.resume();
        console.log('Returned');
    });
});

function uploadBatches1(indexToAssert, cb) {
    //uploads data got from parser
}

fs.createReadStream('b.mp3') reduce latency

I'm playing around with events triggering sounds (500 ms long), so I use the lame library.
var lame = require('lame');
var fs = require('fs');
var Speaker = require('speaker');
while(listening) {
    if(eventIsFired) {
        fs.createReadStream('b.mp3')
            .pipe(new lame.Decoder)
            .pipe(new Speaker);
    }
}
Is there any way to preload the stream/file, so I won't need to load it on every single event? It actually blocks my whole while loop, and making it async didn't work. How can I reduce latency and make it more efficient?
You can cache the mp3 file to a buffer, then convert that buffer to a readable stream when you need it.
var lame = require('lame');
var fs = require('fs');
var Speaker = require('speaker');
var Readable = require('stream').Readable;

var mp3Buffer = fs.readFileSync('b.mp3');

while(listening) {
    if(eventIsFired) {
        bufferToReadableStream(mp3Buffer)
            .pipe(new lame.Decoder)
            .pipe(new Speaker);
    }
}

function bufferToReadableStream(buffer) {
    let stream = new Readable();
    stream.push(buffer);
    stream.push(null);
    return stream;
}
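On newer Node versions (12+), stream.Readable.from can replace the manual push-based helper. A small alternative sketch, not part of the original answer:
var Readable = require('stream').Readable;

function bufferToReadableStream(buffer) {
    // Readable.from accepts an iterable, so wrapping the buffer in an array
    // yields a stream that emits the whole buffer as a single chunk.
    return Readable.from([buffer], { objectMode: false });
}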

Node.js create object.stdin

I have an object in node that has .stdin, .stderr and .stdout objects attached to it. I have stdout and stderr working as Writable streams and I'm having some trouble getting stdin to work as a Readable stream.
In the use case I'm building for, I could do the following:
var stream = require('stream');
var util = require('util');
var Readable = stream.Readable;

var My_Stdin = function(){
    Readable.call(this, {objectMode: true});
};

util.inherits(My_Stdin, Readable);

My_Stdin.prototype._read = function(data) {
    this.emit(data);
};

function myObject(){
    this.stdin = new My_Stdin;
    this.stdin.on('data', function(data){
        process.stdout.write(data);
    });
}

var object = new myObject;
process.stdin.pipe(object.stdin);
Right now I'm getting an error that says:
_stream_readable.js:525
var ret = dest.write(chunk);
^
TypeError: dest.write is not a function
How do I properly implement a stdin on my object similar to child_process.spawn?
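No answer is included here, but the error itself points at the cause: pipe() calls dest.write(), so the pipe target has to be a Writable (or Duplex) stream, and a plain Readable has no write method. A minimal sketch of one way to get spawn-like behaviour, using PassThrough streams; this is an illustration of that idea, not code from the original thread:
var stream = require('stream');

function MyObject() {
    // PassThrough is both writable and readable, so external code can
    // pipe into object.stdin and the object can read what arrives.
    this.stdin = new stream.PassThrough();
    this.stdout = new stream.PassThrough();
    this.stderr = new stream.PassThrough();
}

var object = new MyObject();

// Echo whatever comes in on stdin back out on stdout.
object.stdin.pipe(object.stdout);

process.stdin.pipe(object.stdin);
object.stdout.pipe(process.stdout);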

Read csv files in stream and store them in database

I have a few huge csv files that I need to store in a mongo database. Because these files are so big, I need to use streams. I pause the stream while the data is being written into the database.
var fs = require('fs');
var csv = require('csv');
var mongo = require('mongodb');

var db = mongo.MongoClient.connect...

var readStream = fs.createReadStream('hugefile.csv');

readStream.on('data', function(data) {
    readStream.pause();
    csv.parse(data.toString(), { delimiter: ','}, function(err, output) {
        db.collection(coll).insert(data, function(err) {
            readStream.resume();
        });
    });
});

readStream.on('end', function() {
    logger.info('file stored');
});
But csv.parse throws an error, because I would need to read the file line by line to handle it as csv and convert it to json for mongodb. Maybe I should not pause the stream, but use an interface instead. I haven't found a solution for this yet.
Any help would be appreciated!
I think you might want to create a stream of lines from your raw data stream.
Here is an example from the split package. https://www.npmjs.com/package/split
fs.createReadStream(file)
    .pipe(split())
    .on('data', function (line) {
        // each chunk now is a separate line!
    })
Adapted to your example, it might look like this:
var readStream = fs.createReadStream('hugefile.csv');
var lineStream = readStream.pipe(split());

lineStream.on('data', function(data) {
    // remaining code unmodified
});
I'm unsure if bulk() was a thing back in '15, but whoever is trying to import items from large sources should consider using it.
var fs = require('fs');
var csv = require('fast-csv');
var mongoose = require('mongoose');

var db = mongoose.connect...

var counter = 0; // to keep count of values in the bulk()
const BULK_SIZE = 1000;
var bulkItem = Item.collection.initializeUnorderedBulkOp();

var readStream = fs.createReadStream('hugefile.csv');
const csvStream = csv.fromStream(readStream, { headers: true });

csvStream.on('data', data => {
    counter++;
    bulkItem.insert(data);
    if (counter === BULK_SIZE) {
        csvStream.pause();
        bulkItem.execute((err, result) => {
            if (err) console.log(err);
            counter = 0;
            bulkItem = Item.collection.initializeUnorderedBulkOp();
            csvStream.resume();
        });
    }
});

One symbol at a time stream with node

I'm trying to implement a stream which returns one symbol from a file on each 'data' event.
I ended up with the code below:
var util = require('util'),
    fs = require('fs'),
    Readable = require('stream').Readable;

var SymbolReadStream = function(filename, options) {
    Readable.call(this);
    this._readable = fs.createReadStream(filename, options).pause();
    self = this;
    this._readable.on('readable', function() {
        var chunk;
        chunk = self._readable.read(1);
        // I believe the problem is here
        self._readable.pause();
    });
};

util.inherits(SymbolReadStream, Readable); // inherit the prototype methods

SymbolReadStream.prototype._read = function() {
    this._readable.resume();
};

var r = new SymbolReadStream("test.txt", {
    encoding: 'utf8',
});

r.on('data', function(el) {
    console.log(el);
});
but this code doesn't work. Please help.
Is there an easier way to achieve the behavior?
This post gives a great clue on how to answer your question.
Also, you should take a look at pipe, which would be a cleaner way to accomplish what you're trying to do: piping an adapter to the file stream instead of wrapping it up.
That said, personally I wouldn't reinvent the wheel here and would just look for modules that can accomplish this, especially "split" modules, making them split on every char instead of on new lines. As an example, event-stream has a split method that "takes the same arguments as string.split except it defaults to '\n' instead of ','". So the logic would be to try myStream.pipe(es.split('')), but the module treats this like myStream.pipe(es.split()), which breaks on lines. So here's my solution, using a regex to say "break on each char":
var es = require('event-stream');
var fs = require('fs');
var symbolStream = fs.createReadStream(filename, options).pipe(es.split(/(?!$)/));
EDIT: event-stream seems to use the split module internally, so you can even try
var split = require('split');
var fs = require('fs');
var symbolStream = fs.createReadStream(filename, options).pipe(split(/(?!$)/));
(a loose falsy check in that module is what converts '' into the default \r\n matcher, which is why the regex is still needed)
In your stream implementation, no 'data' event is ever emitted to the handler, which is why console.log is never called. After adding the events, the file will be streamed symbol by symbol. Example below:
var util = require('util'),
    fs = require('fs'),
    Readable = require('stream').Readable;

function SymbolReadStream(filename, options) {
    if (!(this instanceof SymbolReadStream)) {
        return new SymbolReadStream(filename, options);
    }
    Readable.call(this);
    this._readable = fs.createReadStream(filename, options);
}

util.inherits(SymbolReadStream, Readable); // inherit the prototype methods

SymbolReadStream.prototype._read = function() {
    var self = this;
    this._readable.on('readable', function() {
        var chunk;
        while (null !== (chunk = self._readable.read(1))) {
            self.emit('data', chunk);
        }
    });
    this._readable.on('end', function() {
        self.emit('end');
    });
};

var r = new SymbolReadStream("test.txt", {
    encoding: 'utf8',
});

r.on('data', function(el) {
    console.log(el);
});

r.on('end', function(el) {
    console.log('done');
});
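For reference, the Readable contract is to call this.push(chunk) from _read rather than emitting 'data' by hand, and the example above also re-registers its 'readable' listener on every _read call. A sketch of my own (not from either answer) that follows the push-based contract; CharReadStream is a hypothetical name:
var util = require('util'),
    fs = require('fs'),
    Readable = require('stream').Readable;

function CharReadStream(filename, options) {
    Readable.call(this, { encoding: 'utf8' });
    this._source = fs.createReadStream(filename, options);
    var self = this;
    // Register the source listeners once, in the constructor.
    this._source.on('readable', function() { self._pump(); });
    this._source.on('end', function() { self.push(null); }); // signal EOF downstream
}
util.inherits(CharReadStream, Readable);

CharReadStream.prototype._read = function() {
    this._pump();
};

CharReadStream.prototype._pump = function() {
    var ch;
    while (null !== (ch = this._source.read(1))) {
        if (!this.push(ch)) break; // stop when downstream applies backpressure
    }
};

var r = new CharReadStream("test.txt", { encoding: 'utf8' });
r.on('data', function(ch) { console.log(ch); });
r.on('end', function() { console.log('done'); });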
