HTTP POST Elasticsearch event stream bulk - node.js

I have a node.js program that uses streams to read a file (nodejs event stream setting a variable per stream).
I would like to use the same program to write this data into Elasticsearch. I wrote up a small write function:
var writeFunction = function(data) {
    //console.log(data);
    var client = request.newClient("http://localhost:9200");
    client.post('/newtest3/1', data, function(err, res, body) {
        return console.log(res.statusCode);
    });
};
and hooked this up with the streaming
var processMyFile = function(file) {
    var stream = getStream(file);
    var nodeName = stream.nodeName;
    stream
        .pipe(es.split())
        .on('end', endFunction)
        .pipe(es.map(function(data, cb) {
            processFunction(nodeName, data, cb);
        }))
        .pipe(es.map(function(data, cb) {
            writeFunction(data);
        }));
};
The above works as expected and writes the data asynchronously, except that it takes a long time. It also seems to act as a buffer, since the write takes much longer than the read (an advantage of using pipe).
I know there is a bulk interface in Elasticsearch and I can import using that, for example the shakespeare.json example in the Kibana getting started guide (http://www.elasticsearch.org/guide/en/kibana/current/using-kibana-for-the-first-time.html). But this means I would need to create a file in the format needed by the bulk import and then run a curl command etc., and I would like to avoid creating a temporary file.
Is there an easier way to import data into Elasticsearch faster as part of the streaming process?
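(For reference, the bulk API expects newline-delimited JSON in which each document is preceded by an action line; with the index and type used above, a request body would look roughly like this:)
{ "index" : { "_index" : "newtest3", "_type" : "1" } }
{ "field1" : "value1" }
{ "index" : { "_index" : "newtest3", "_type" : "1" } }
{ "field2" : "value2" }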

elasticsearch-streams will help you use the bulk interface with streaming, without the need to write a JSON file first.
I believe your code would look more or less like this:
var TransformToBulk = require('elasticsearch-streams').TransformToBulk;
var WritableBulk = require('elasticsearch-streams').WritableBulk;
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client();

var bulkExec = function(bulkCmds, callback) {
    client.bulk({
        index: 'newtest3',
        type: '1',
        body: bulkCmds
    }, callback);
};

var ws = new WritableBulk(bulkExec);
var toBulk = new TransformToBulk(function getIndexTypeId(doc) { return { _id: doc.id }; });

var processMyFile = function(file) {
    var stream = getStream(file);
    stream
        .pipe(toBulk)
        .pipe(ws)
        .on('close', endFunction)
        .on('error', endFunction);
};
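If the per-line processing from the question still needs to happen before indexing, the es.split()/es.map() steps of the original pipeline can presumably stay in front of toBulk, roughly like this:
var processMyFile = function(file) {
    var stream = getStream(file);
    var nodeName = stream.nodeName;
    stream
        .pipe(es.split())
        .pipe(es.map(function(data, cb) {
            processFunction(nodeName, data, cb);
        }))
        .pipe(toBulk)
        .pipe(ws)
        .on('close', endFunction)
        .on('error', endFunction);
};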

Related

Obtaining offset when appending to file

I am trying to implement a simple Express handler which appends the request body to a file and returns the offset in the file where the write has been performed. Based on some research, it seems there is no ftell-like function in Node.js, and similar questions (like fs.createReadStream() at specific position of file) refer to using the start parameter or seeking manually, which I do not see how to use in combination with appending to a file (when the offset at the end of the file is unknown). I am currently stuck with code like:
app.put('/:blob', (req, res) => {
    var blob = req.params.blob;
    var blobpath = path.join(__dirname, 'data/' + blob);
    var stream = fs.createWriteStream(blobpath, {flags: 'a'});
    var pos = fs.tell(stream); // <-- I do not know how to do this

    function handle(data) {
        stream.write(data);
        req.once('data', handle);
    }
    req.once('data', handle);

    req.on('end', function() {
        stream.end();
        res.json({offset: pos});
    });
});
Could you please help me how I should achieve my goal?
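(Not from the original thread, but one possible approach, sketched under the assumption that no other writer appends to the file concurrently: read the current size with fs.stat before opening the append stream; that size is the offset where this request's write will start.)
app.put('/:blob', (req, res) => {
    var blobpath = path.join(__dirname, 'data', req.params.blob);
    fs.stat(blobpath, (err, stats) => {
        var offset = err ? 0 : stats.size;           // the file may not exist yet
        var stream = fs.createWriteStream(blobpath, {flags: 'a'});
        req.pipe(stream);                            // pipe handles backpressure and calls end()
        stream.on('finish', () => res.json({offset: offset}));
    });
});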

How to Download a Large CSV file with node.js and react.js

I want to download a Mongo collection (with a huge number of records) as a CSV from React through Node.js.
Currently what I do is load the data into Node first, process it, and then resolve a promise. For small collections this works fine, but when the number of records gets higher it returns a 502 error after a long time. What is the best way to do this?
This is a basic code snippet which helped me solve the above problem.
const collection = ...;
const query = ...;
const select = ...;
const cursor = mongodb.collection(collection).find(query, select);
var csv = require('csv');

// The transfer function: map a Mongo document to a flat row for the CSV
function transfer(doc) {
    return {
        Address: doc.address,
        State: doc.state.abbreviation
    };
}

function(req, resp) {
    const cursor = ...;
    // The transfer function (above)
    const transfer = ...;
    const fileName = "Download.csv";

    resp.setHeader('Content-disposition', `attachment; filename=${fileName}`);
    resp.writeHead(200, { 'Content-Type': 'text/csv' });
    resp.flushHeaders();

    // Stream the query result
    cursor.stream()
        .pipe(csv.transform(transfer))
        .pipe(csv.stringify({header: true}))
        .pipe(resp);
}
Split the process into parts and make use of Redis. Parse the CSV and push each row as a job onto a Redis queue, then start new worker processes in parallel to fetch jobs from the queue and process them (a sketch of the producer/consumer wiring follows the worker code below). If your system has multiple cores you can use the cluster module to split the worker processes. Note that processes don't have shared memory.
Redis connection
var kue = require("kue");
var queue = kue.createQueue({
    redis: `redis://${redisHost}:${redisPort}`,
    jobEvents: false
});
Worker Process
var cluster = require('cluster');
var clusterWorkerSize = require('os').cpus().length;
var workers = [];

for (var i = 0; i < clusterWorkerSize; i++) {
    var worker = cluster.fork();
    workers.push(worker);
}
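(A minimal sketch of the producer/consumer wiring with kue; the 'csv-row' job type, parsedRow, and handleRow() are hypothetical names used for illustration:)
// Producer: push one job per parsed CSV row onto the queue
queue.create('csv-row', { row: parsedRow }).save(function(err) {
    if (err) console.error('failed to enqueue row', err);
});

// Consumer (runs in each forked worker): pull jobs from the queue and process them
queue.process('csv-row', function(job, done) {
    handleRow(job.data.row, done); // call done(err) when the row has been handled
});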

Using stream-combiner and Writable Streams (stream-adventure)

I'm working on nodeschool.io's stream-adventure. The challenge:
Write a module that returns a readable/writable stream using the
stream-combiner module. You can use this code to start with:
var combine = require('stream-combiner')

module.exports = function () {
    return combine(
        // read newline-separated json,
        // group books into genres,
        // then gzip the output
    )
}
Your stream will be written a newline-separated JSON list of science fiction
genres and books. All the books after a "type":"genre" row belong in that
genre until the next "type":"genre" comes along in the output.
{"type":"genre","name":"cyberpunk"}
{"type":"book","name":"Neuromancer"}
{"type":"book","name":"Snow Crash"}
{"type":"genre","name":"space opera"}
{"type":"book","name":"A Deepness in the Sky"}
{"type":"book","name":"Void"}
Your program should generate a newline-separated list of JSON lines of genres,
each with a "books" array containing all the books in that genre. The input
above would yield the output:
{"name":"cyberpunk","books":["Neuromancer","Snow Crash"]}
{"name":"space opera","books":["A Deepness in the Sky","Void"]}
Your stream should take this list of JSON lines and gzip it with
zlib.createGzip().
HINTS
The stream-combiner module creates a pipeline from a list of streams,
returning a single stream that exposes the first stream as the writable side and
the last stream as the readable side like the duplexer module, but with an
arbitrary number of streams in between. Unlike the duplexer module, each
stream is piped to the next. For example:
var combine = require('stream-combiner');
var stream = combine(a, b, c, d);
will internally do a.pipe(b).pipe(c).pipe(d) but the stream returned by
combine() has its writable side hooked into a and its readable side hooked
into d.
As in the previous LINES adventure, the split module is very handy here. You
can put a split stream directly into the stream-combiner pipeline.
Note that split can send empty lines too.
If you end up using split and stream-combiner, make sure to install them
into the directory where your solution file resides by doing:
`npm install stream-combiner split`
Note: when you test the program, the source stream is automatically inserted into the program, so it's perfectly fine to have split() as the first parameter in combine(split(), etc., etc.)
I'm trying to solve this challenge without using the 'through' package.
My code:
var combiner = require('stream-combiner');
var stream = require('stream')
var split = require('split');
var zlib = require('zlib');

module.exports = function() {
    var ws = new stream.Writable({decodeStrings: false});

    function ResultObj() {
        name: '';
        books: [];
    }

    ws._write = function(chunk, enc, next) {
        if(chunk.length === 0) {
            next();
        }
        chunk = JSON.parse(chunk);
        if(chunk.type === 'genre') {
            if(currentResult) {
                this.push(JSON.stringify(currentResult) + '\n');
            }
            var currentResult = new ResultObj();
            currentResult.name = chunk.name;
        } else {
            currentResult.books.push(chunk.name);
        }
        next();

        var wsObj = this;
        ws.end = function(d) {
            wsObj.push(JSON.stringify(currentResult) + '\n');
        }
    }

    return combiner(split(), ws, zlib.createGzip());
}
My code does not work and returns 'Cannot pipe. Not readable'. Can someone point out to me where I'm going wrong?
Any other comments on how to improve are welcome too...
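(Not from the original thread, but for context: the 'Cannot pipe. Not readable' error most likely comes from putting a plain Writable in the middle of combine(), since the middle streams get piped onward and therefore need a readable side. A minimal sketch of one way around it, still without through, is to use a Transform stream, which is both writable and readable:)
var combiner = require('stream-combiner');
var split = require('split');
var zlib = require('zlib');
var Transform = require('stream').Transform;

module.exports = function() {
    var current = null;
    var grouper = new Transform({
        objectMode: true,
        transform: function(line, enc, next) {
            if (line.length === 0) return next();   // split() can emit empty lines
            var row = JSON.parse(line);
            if (row.type === 'genre') {
                if (current) this.push(JSON.stringify(current) + '\n');
                current = { name: row.name, books: [] };
            } else {
                current.books.push(row.name);
            }
            next();
        },
        flush: function(done) {                     // emit the last genre when input ends
            if (current) this.push(JSON.stringify(current) + '\n');
            done();
        }
    });
    return combiner(split(), grouper, zlib.createGzip());
};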

How to reset nodejs stream?

How to reset nodejs stream?
How to read stream again in nodejs?
Thanks in advance!
var fs = require('fs');
var lineReader = require('line-reader');

// proxy.txt = only 3 lines
var readStream = fs.createReadStream('proxy.txt');

lineReader.open(readStream, function (err, reader) {
    for (var i = 0; i < 6; i++) {
        reader.nextLine(function(err, line) {
            if (err) {
                readStream.reset(); // ???
            } else {
                console.log(line);
            }
        });
    }
});
There are two ways of solving your problem. As someone commented before, you could simply wrap all that in a function and, instead of resetting, simply read the file again.
Of course this won't work well with HTTP requests, for example, so the other way, provided that you take a much bigger memory usage into account, is to simply accumulate your data.
What you'd need is to implement some sort of "rewindable stream": essentially a Transform stream that keeps a list of all the buffers and writes them to a piped stream on a rewind method.
Take a look at the Node API for streams here; the methods should look somewhat like this:
const { Transform, PassThrough } = require('stream');

class Rewindable extends Transform {
    constructor() {
        super();
        this.accumulator = [];
    }

    _transform(buf, enc, cb) {
        this.accumulator.push(buf);
        cb();
    }

    rewind() {
        var stream = new PassThrough();
        this.accumulator.forEach((chunk) => stream.write(chunk));
        stream.end(); // signal end so consumers of the rewound stream see 'end'
        return stream;
    }
}
And you would use it like this:
var readStream = fs.createReadStream('proxy.txt');
var rewindableStream = readStream.pipe(new Rewindable());

(...).on("whenever-you-want-to-reset", () => {
    var rewound = rewindableStream.rewind();
    // ...and do whatever you like with your stream.
});
Actually I think I'll add this to my scramjet. :)
Edit
I released the logic above in the rereadable-stream npm package. The upshot over the stream depicted here is that you can now control the buffer length and get rid of data that has already been read, while keeping a window of count items so you can tail a number of chunks backwards.

Is it possible to register multiple listeners to a child process's stdout data event? [duplicate]

I need to run two commands in series that need to read data from the same stream.
After piping a stream into another, the buffer is emptied so I can't read data from that stream again. This doesn't work:
var spawn = require('child_process').spawn;
var fs = require('fs');
var request = require('request');

var inputStream = request('http://placehold.it/640x360');
var identify = spawn('identify', ['-']);

inputStream.pipe(identify.stdin);

var chunks = [];
identify.stdout.on('data', function(chunk) {
    chunks.push(chunk);
});
identify.stdout.on('end', function() {
    var size = getSize(Buffer.concat(chunks)); // width
    var convert = spawn('convert', ['-', '-scale', size * 0.5, 'png:-']);
    inputStream.pipe(convert.stdin);
    convert.stdout.pipe(fs.createWriteStream('half.png'));
});

function getSize(buffer) {
    return parseInt(buffer.toString().split(' ')[2].split('x')[0]);
}
Request complains about this:
Error: You cannot pipe after data has been emitted from the response.
Changing the inputStream to fs.createReadStream yields the same issue, of course.
I don't want to write into a file but to reuse, in some way, the stream that request produces (or any other, for that matter).
Is there a way to reuse a readable stream once it finishes piping?
What would be the best way to accomplish something like the above example?
You have to create a duplicate of the stream by piping it to two streams. You can create a simple duplicate with a PassThrough stream, which simply passes the input through to the output.
const spawn = require('child_process').spawn;
const PassThrough = require('stream').PassThrough;

const a = spawn('echo', ['hi user']);
const b = new PassThrough();
const c = new PassThrough();

a.stdout.pipe(b);
a.stdout.pipe(c);

let count = 0;
b.on('data', function (chunk) {
    count += chunk.length;
});
b.on('end', function () {
    console.log(count);
    c.pipe(process.stdout);
});
Output:
8
hi user
The first answer only works if the streams take roughly the same amount of time to process the data. If one takes significantly longer, the faster one will request new data, consequently overwriting the data still being used by the slower one (I had this problem after trying to solve it using a duplicate stream).
The following pattern worked very well for me. It uses a library based on Streams2, streamz, and Promises to synchronize async streams via a callback. Using the familiar example from the first answer:
var spawn = require('child_process').spawn;
var pass = require('stream').PassThrough;
var streamz = require('streamz').PassThrough;
var Promise = require('bluebird');

var a = spawn('echo', ['hi user']);
var b = new pass();
var c = new pass();

a.stdout.pipe(streamz(combineStreamOperations));

function combineStreamOperations(data, next) {
    Promise.join(b, c, function(b, c) { // perform n operations on the same data
        next(); // request more
    });
}

var count = 0;
b.on('data', function(chunk) { count += chunk.length; });
b.on('end', function() { console.log(count); c.pipe(process.stdout); });
You can use this small npm package I created:
readable-stream-clone
With this you can reuse readable streams as many times as you need
For the general problem, the following code works fine:
var PassThrough = require('stream').PassThrough;

var a = new PassThrough();
var b1 = new PassThrough();
var b2 = new PassThrough();

a.pipe(b1);
a.pipe(b2);

b1.on('data', function(data) {
    console.log('b1:', data.toString());
});
b2.on('data', function(data) {
    console.log('b2:', data.toString());
});

a.write('text');
I have a different solution: write to two streams simultaneously. Naturally, the time to write will be the sum of the two write times, but I use it to respond to a download request where I want to keep a copy of the downloaded file on my server (actually I use an S3 backup, so I cache the most-used files locally to avoid multiple file transfers).
/**
 * A utility class made to write to a file while answering a file download request
 */
class TwoOutputStreams {
    constructor(streamOne, streamTwo) {
        this.streamOne = streamOne;
        this.streamTwo = streamTwo;
    }

    setHeader(header, value) {
        if (this.streamOne.setHeader)
            this.streamOne.setHeader(header, value);
        if (this.streamTwo.setHeader)
            this.streamTwo.setHeader(header, value);
    }

    write(chunk) {
        this.streamOne.write(chunk);
        this.streamTwo.write(chunk);
    }

    end() {
        this.streamOne.end();
        this.streamTwo.end();
    }
}
You can then use this as a regular output stream:
const twoStreamsOut = new TwoOutputStreams(fileOut, responseStream)
and pass it to your method as if it were a response or a file output stream.
If you have async operations on the PassThrough streams, the answers posted here won't work.
A solution that works for async operations includes buffering the stream content and then creating streams from the buffered result.
To buffer the result you can use concat-stream:
const Promise = require('bluebird');
const concat = require('concat-stream');

const getBuffer = function(stream) {
    return new Promise(function(resolve, reject) {
        var gotBuffer = function(buffer) {
            resolve(buffer);
        };
        var concatStream = concat(gotBuffer);
        stream.on('error', reject);
        stream.pipe(concatStream);
    });
};
To create streams from the buffer you can use:
const { Readable } = require('stream');

const getBufferStream = function(buffer) {
    const stream = new Readable();
    stream.push(buffer);
    stream.push(null);
    return Promise.resolve(stream);
};
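(A sketch of how the two helpers might be wired together; firstConsumer and secondConsumer are placeholders for whatever writable streams you pipe into, e.g. identify.stdin and convert.stdin from the question:)
getBuffer(inputStream)
    .then(function(buffer) {
        return Promise.all([getBufferStream(buffer), getBufferStream(buffer)]);
    })
    .then(function(streams) {
        streams[0].pipe(firstConsumer);
        streams[1].pipe(secondConsumer);
    });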
What about piping into two or more streams not at the same time ?
For example :
var PassThrough = require('stream').PassThrough;

var mybinaryStream = stream.start(); // never-ending audio stream
var file1 = fs.createWriteStream('file1.wav', {encoding: 'binary'});
var file2 = fs.createWriteStream('file2.wav', {encoding: 'binary'});
var mypass = new PassThrough();

mybinaryStream.pipe(mypass);
mypass.pipe(file1);

setTimeout(function() {
    mypass.pipe(file2);
}, 2000);
The above code does not produce any errors, but file2 is empty.
