Obtaining offset when appending to file - node.js

I am trying to implement a simple Express handler which appends the request body to a file and returns the offset in the file at which the write was performed. From my research, it seems there is no ftell-like function in Node.js, and similar questions (like fs.createReadStream() at specific position of file) suggest using the start parameter or seeking manually, which I do not see how to combine with appending to a file (where the offset of the end of the file is unknown). I am currently stuck with code like
app.put('/:blob', (req, res) => {
  var blob = req.params.blob;
  var blobpath = path.join(__dirname, 'data/' + blob);
  var stream = fs.createWriteStream(blobpath, {flags: 'a'});
  var pos = fs.tell(stream); // <-- I do not know how to do this
  function handle(data) {
    stream.write(data);
    req.once('data', handle);
  }
  req.once('data', handle);
  req.on('end', function() {
    stream.end();
    res.json({offset: pos});
  });
});
Could you please help me figure out how to achieve my goal?
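For reference, one commonly suggested approach (a sketch under assumptions, not from the original question): since appending with the 'a' flag always writes at the current end of the file, you can fs.stat() the file just before opening the stream and treat its size as the offset where the appended data begins. This assumes no other writer appends to the same file concurrently.
app.put('/:blob', (req, res) => {
  var blobpath = path.join(__dirname, 'data', req.params.blob);
  // Sketch: the current file size is where the append will start.
  fs.stat(blobpath, function(err, stats) {
    var offset = err ? 0 : stats.size; // a missing file means the append starts at offset 0
    var stream = fs.createWriteStream(blobpath, {flags: 'a'});
    req.pipe(stream); // ends the write stream when the request body ends
    stream.on('finish', function() {
      res.json({offset: offset});
    });
  });
});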

Related

Stop function from being invoked multiple times

I'm in the process of building a file upload component that allows you to pause/resume file uploads.
The standard way to achieve this seems to be to break the file into chunks on the client machine, then send the chunks along with book-keeping information up to the server, which stores the chunks in a staging directory and merges them together once it has received all of them. So, this is what I am doing.
I am using node/express and I'm able to get the files fine, but I'm running into an issue because my merge_chunks function is being invoked multiple times.
Here's my call stack:
router.post('/api/videos',
  upload.single('file'),
  validate_params,
  rename_uploaded_chunk,
  check_completion_status,
  merge_chunks,
  record_upload_date,
  videos.update,
  send_completion_notice
);
The check_completion_status function is implemented as follows:
/* Recursively check to see if we have every chunk of a file */
var check_completion_status = function (req, res, next) {
  var current_chunk = 1;
  var see_if_chunks_exist = function () {
    fs.exists(get_chunk_file_name(current_chunk, req.file_id), function (exists) {
      if (current_chunk > req.total_chunks) {
        next();
      } else if (exists) {
        current_chunk++;
        see_if_chunks_exist();
      } else {
        res.sendStatus(202);
      }
    });
  };
  see_if_chunks_exist();
};
The file names in the staging directory have the chunk numbers embedded in them, so the idea is to see if we have a file for every chunk number. The function should only call next() once for a given (complete) file.
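(get_chunk_file_name is not shown in the question; a plausible, purely illustrative shape would be something like this.)
// Hypothetical helper, only to illustrate the naming scheme described above:
// the chunk number is embedded in the file name inside the staging directory.
var get_chunk_file_name = function (chunk_number, file_id) {
  return path.join('staging', file_id + '.' + chunk_number);
};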
However, my merge_chunks function is being invoked multiple times (usually between 1 and 4). Logging does reveal that it's only invoked after I've received all of the chunks.
With this in mind, my assumption here is that it's the async nature of the fs.exists function that's causing the issue.
Even though the nth invocation of check_completion_status may start before I have all of the chunks, by the time it gets to its nth call to fs.exists(), more chunks may have arrived and been processed concurrently, so the function can keep going and in some cases reach the end and call next(). However, the chunks that arrived concurrently also correspond to their own invocations of check_completion_status, which will also call next(), because by then we obviously have all of the files.
This is causing issues because I didn't account for this when I wrote merge_chunks.
For completeness, here's the merge_chunks function:
var merge_chunks = (function () {
  var pipe_chunks = function (args) {
    args.chunk_number = args.chunk_number || 1;
    if (args.chunk_number > args.total_chunks) {
      args.write_stream.end();
      args.next();
    } else {
      var file_name = get_chunk_file_name(args.chunk_number, args.file_id);
      var read_stream = fs.createReadStream(file_name);
      read_stream.pipe(args.write_stream, {end: false});
      read_stream.on('end', function () {
        // once we're done with the chunk we can delete it and move on to the next one.
        fs.unlink(file_name);
        args.chunk_number += 1;
        pipe_chunks(args);
      });
    }
  };
  return function (req, res, next) {
    var out = path.resolve('videos', req.video_id);
    var write_stream = fs.createWriteStream(out);
    pipe_chunks({
      write_stream: write_stream,
      file_id: req.file_id,
      total_chunks: req.total_chunks,
      next: next
    });
  };
}());
Currently, I'm receiving an error because the second invocation of the function is trying to read the chunks that have already been deleted by the first invocation.
What is the typical pattern for handling this type of situation? I'd like to avoid a stateful architecture if possible. Is it possible to cancel pending handlers right before calling next() in check_completion_status?
If you just want to make it work ASAP, I would use a lock (much like a db lock) to lock the resource so that only one of the requests processes the chunks. Simply create a unique id on the client and send it along with the chunks. Then store that unique id in some sort of data structure and look it up prior to processing. The example below is far from optimal (in fact this map will keep growing, which is bad), but it should demonstrate the concept:
// Create a map (an array would work too) and keep track of the video ids that were processed.
// This map will persist through each request.
var processedVideos = {};

var check_completion_status = function (req, res, next) {
  var current_chunk = 1;
  var see_if_chunks_exist = function () {
    fs.exists(get_chunk_file_name(current_chunk, req.file_id), function (exists) {
      if (processedVideos[req.query.uniqueVideoId]) {
        res.sendStatus(202);
      } else if (current_chunk > req.total_chunks) {
        processedVideos[req.query.uniqueVideoId] = true;
        next();
      } else if (exists) {
        current_chunk++;
        see_if_chunks_exist();
      } else {
        res.sendStatus(202);
      }
    });
  };
  see_if_chunks_exist();
};

Running function once inside node stream pipe chain

I'm using vinyl-fs to write a simple pipeline that loads markdown files, converts them to HTML, and saves them to disk. This is all working.
However, in the middle of my pipe() chain, I want to perform an asynchronous task that should just be done once for all my files. My current problem relates to loading a file (and it's important that the file is loaded in the middle of the chain), but it's a problem I find myself stumbling upon all the time.
To solve this problem, I have started to do this:
vfs.src('*.md').pipe((function() {
  var loaded = false;
  return through2.obj(function(file, enc, cb) {
    if (!loaded) {
      fs.readFile('myfile', function(err, data) {
        // use data for something
        loaded = true;
        cb(null, file);
      });
    } else {
      // passthrough
      cb(null, file);
    }
  });
})());
This feels a bit silly. Am I approaching this all wrong, or is this actually an okay thing to do?
After reading a ton of articles about Node streams, it seems that the best implementation for this is to listen to the stream's 'finish' event and then create a new stream based on the files from the former stream. This allows me to do exactly what I want: stream files through the pipeline until a point where I need to access the array of files for some task, and then continue the pipeline stream afterwards.
Here's what that looks like:
var vfs = require('vinyl-fs');
var through = require('through2');
var path = require('path');

// array for storing file objects
var files = [];

// start the stream
var firstStream = vfs.src("*.jpg")
  // pipe it through a function that saves each file to the array
  .pipe(through.obj(function(file, enc, cb) {
    files.push(file);
    console.log('1: ', path.basename(file.path));
    cb(null, file);
  }))
  // when this stream is done
  .on('finish', function() {
    console.log('FINISH');
    // files will now be full of all files from the stream
    // and you can do whatever you want with them.

    // create a new stream
    var secondStream = through.obj();
    // write the files to the new stream
    files.forEach(function(file) {
      secondStream.write(file);
    });
    // end the stream to make sure the finish
    // event triggers
    secondStream.end();
    // now continue piping
    secondStream.pipe(through.obj(function(file, enc, cb) {
        console.log('2: ', path.basename(file.path));
        cb(null, file);
      }))
      .pipe(vfs.dest("tmp"));
  });
In this scenario, I have 5 JPG images next to my scripts, and the console.log will say
1: IMG_1.JPG
1: IMG_2.JPG
1: IMG_3.JPG
1: IMG_4.JPG
1: IMG_5.JPG
FINISH
2: IMG_1.JPG
2: IMG_2.JPG
2: IMG_3.JPG
2: IMG_4.JPG
2: IMG_5.JPG

HTTP POST elastic search event stream bulk

I have a node.js program that is using streams to read a file (nodejs event stream setting a variable per stream).
I would like to use the same program to write this data into Elasticsearch. I wrote up a small write function
var writeFunction = function(data) {
  //console.log(data);
  var client = request.newClient("http://localhost:9200");
  client.post('/newtest3/1', data, function(err, res, body) {
    return console.log(res.statusCode);
  });
};
and hooked this up with the streaming
var processMyFile = function(file) {
  var stream = getStream(file);
  var nodeName = stream.nodeName;
  stream
    .pipe(es.split())
    .on('end', endFunction)
    .pipe(es.map(function(data, cb) {
      processFunction(nodeName, data, cb);
    }))
    .pipe(es.map(function(data, cb) {
      writeFunction(data);
    }));
};
The above works as expected and writes the data asynchronously, except that it takes a long time. It also seems to act as a buffer, since the writes take much longer than the reads (an advantage of using the pipe).
I know there is a bulk interface in Elasticsearch and I can import using that; the shakespeare.json example in the Kibana getting-started guide (http://www.elasticsearch.org/guide/en/kibana/current/using-kibana-for-the-first-time.html) uses it.
But that means I would need to create a file in the format needed by the bulk import and then run a curl command, etc. I would like to avoid creating a temporary file.
Is there an easier way to import data into Elasticsearch faster as part of the streaming process?
elasticsearch-streams will help you use the bulk interface with streaming, without the need to write a JSON file first.
I believe that your code would be more or less like this:
var TransformToBulk = require('elasticsearch-streams').TransformToBulk;
var WritableBulk = require('elasticsearch-streams').WritableBulk;
var client = new require('elasticsearch').Client();

var bulkExec = function(bulkCmds, callback) {
  client.bulk({
    index: 'newtest3',
    type: '1',
    body: bulkCmds
  }, callback);
};

var ws = new WritableBulk(bulkExec);
var toBulk = new TransformToBulk(function getIndexTypeId(doc) { return { _id: doc.id }; });

var processMyFile = function(file) {
  var stream = getStream(file);
  stream
    .pipe(toBulk)
    .pipe(ws)
    .on('close', endFunction)
    .on('error', endFunction);
};
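If you still need the es.split() and processFunction stages from your original pipeline, they should be able to sit in front of the bulk stages, roughly like this (a sketch, assuming processFunction calls back with the parsed document objects):
var processMyFile = function(file) {
  var stream = getStream(file);
  var nodeName = stream.nodeName;
  stream
    .pipe(es.split())                        // split the input into lines, as before
    .pipe(es.map(function(data, cb) {
      processFunction(nodeName, data, cb);   // your existing per-line processing
    }))
    .pipe(toBulk)                            // turn each document into bulk commands
    .pipe(ws)                                // write them to Elasticsearch in batches
    .on('close', endFunction)
    .on('error', endFunction);
};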

Reassembling file chunks produced in a multi-part upload

I'm using the excellent flow.js library to handle file uploads. It's a resumable HTML5 upload that produces a bunch of chunks on the server that must be reassembled. For example, foo.mov might become
timestamp-foomov.1
timestamp-foomov.2
...
timestamp-foomov.n
Uploads are working but I'm having trouble recombining the parts into a single binary. I have the following code from the Node.js server example the library authors provided on Github (https://github.com/flowjs/flow.js/tree/master/samples/Node.js).
$.write = function(identifier, writableStream, options) {
  options = options || {};
  options.end = (typeof options['end'] == 'undefined' ? true : options['end']);

  // Iterate over each chunk
  var pipeChunk = function(number) {
    var chunkFilename = getChunkFilename(number, identifier);
    fs.exists(chunkFilename, function(exists) {
      if (exists) {
        // If the chunk with the current number exists,
        // then create a ReadStream from the file
        // and pipe it to the specified writableStream.
        var sourceStream = fs.createReadStream(chunkFilename);
        sourceStream.pipe(writableStream, {
          end: false
        });
        sourceStream.on('end', function() {
          // When the chunk is fully streamed,
          // jump to the next one
          pipeChunk(number + 1);
        });
      } else {
        // When all the chunks have been piped, end the stream
        if (options.end) writableStream.end();
        if (options.onDone) options.onDone();
      }
    });
  };
  pipeChunk(1);
};
I'm invoking this code with the following route and am expecting it to produce a reassembled binary in the tmp directory (that's where I'm saving my chunks). However nothing is happening. What am I missing?
exports.download = function(req, res, next) {
  switch (req.method) {
    case 'GET':
      var stream = fs.createWriteStream('foobar');
      flow.write(req.params.identifier, res);
      break;
  }
};
Reassembling all the chunks is easy; just call this:
var stream = fs.createWriteStream(filename);
r.write(identifier, stream);
And that is it!
But the other question is: when should this method be called?
Maybe when all chunks are uploaded and present in the tmp folder.
But there is another issue with duplicate calls of 'done'.
This can be solved by creating and locking the output file once all chunks exist.
Then call
r.write(identifier, stream);
Then clean up all the chunks, release the lock and close the file.
The same approach is used in the PHP server-side library: https://github.com/flowjs/flow-php-server/blob/master/src/Flow/File.php#L102
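A rough sketch of that sequence, using a simple in-memory lock per identifier (everything except $.write and its onDone option shown above is an assumption, not part of the flow.js sample):
// Hypothetical wrapper around the sample's $.write: merge only once per identifier,
// then remove the chunks and release the lock.
var merging = {};

function reassemble(identifier, targetPath, done) {
  if (merging[identifier]) return done();   // a merge is already in progress, skip
  merging[identifier] = true;

  var out = fs.createWriteStream(targetPath);
  flow.write(identifier, out, {
    onDone: function() {
      // remove the chunk files for this identifier here, then release the lock
      delete merging[identifier];
      done();
    }
  });
}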

How to use filesystem's createReadStream with Meteor router (NodeJS)

I need to allow the user of my app to download a file with Meteor. Currently, when the user requests to download a file, I insert into a "fileRequests" collection in Mongo a document with the file location and a timestamp of the request, and return the ID of the newly created request. When the client gets the new ID, it immediately goes to mydomain.com/uploads/:id. I then use something like this to intercept the request before Meteor does:
var connect = Npm.require("connect");
var Fiber = Npm.require("fibers");
var path = Npm.require('path');
var fs = Npm.require("fs");
var mime = Npm.require("mime");

__meteor_bootstrap__.app
  .use(connect.query())
  .use(connect.bodyParser()) // I add this for file uploading
  .use(function (req, res, next) {
    Fiber(function() {
      if (req.method == "GET") {
        // get the id here, and stream the file using fs.createReadStream();
      }
      next();
    }).run();
  });
I check to make sure the file request was made less than 5 seconds ago, and I immediately delete the request document after I've queried it.
This works, and is secure (enough) I think. No one can make a request without being logged in, and 5 seconds is a pretty small window for someone to hijack the created request URL, but I just don't feel right about my solution. It feels dirty!
So I attempted to use Meteor-Router to accomplish the same thing. That way I can check whether they're logged in correctly without the 5-second open-to-the-world trickery.
So here's the code I wrote for that:
Meteor.Router.add('/uploads/:id', function(id) {
  var path = Npm.require('path');
  var fs = Npm.require("fs");
  var mime = Npm.require("mime");
  var res = this.response;
  var file = FileSystem.findOne({ _id: id });
  if (typeof file !== "undefined") {
    var filename = path.basename(file.filePath);
    var filePath = '/var/MeteorDMS/uploads/' + filename;
    var stat = fs.statSync(filePath);
    res.setHeader('Content-Disposition', 'attachment; filename=' + filename);
    res.setHeader('Content-Type', mime.lookup(filePath));
    res.setHeader('Content-Length', stat.size);
    var filestream = fs.createReadStream(filePath);
    filestream.pipe(res);
    return;
  }
});
This looks great, fits right in with the rest of the code and is easy to read, no hacking involved, BUT! It doesn't work! The browser spins and spins and never quite knows what to do. I have ZERO error messages coming up. I can keep using the app on other tabs. I don't know what it's doing, it never stops "loading". If I restart the server, I get a 0 byte file with all the correct headers, but I don't get the data.
Any help is greatly appreciated!!
EDIT:
After digging around a bit more, I noticed that trying to turn the response object into a JSON object results in a circular structure error.
Now the interesting thing about this is that when I listen to the filestream for the "data" event and attempt to stringify the response object, I don't get that error. But if I attempt to do the same thing in my first solution (listen to "data" and stringify the response), I get the error again.
So using the Meteor-Router solution something is happening to the response object. I also noticed that on the "data" event response.finished is flagged as true.
filestream.on('data', function(data) {
  fs.writeFile('/var/MeteorDMS/afterData', JSON.stringify(res));
});
The Meteor router installs a middleware to do the routing. All Connect middleware either MUST call next() (exactly once) to indicate that the response is not yet settled or MUST settle the response by calling res.end() or by piping to the response. It is not allowed to do both.
I studied the source code of the middleware (see below). We see that we can return false to tell the middleware to call next(). This means we declare that this route did not settle the response and we would like to let other middleware do their work.
Or we can return a template name, a text, an array [status, text] or an array [status, headers, text], and the middleware will settle the response on our behalf by calling res.end() using the data we returned.
However, by piping to the response, we already settled the response. The Meteor router should not call next() nor res.end().
We solved the problem by forking the Meteor router and making a small change. We replaced the else in line 87 (after if (output === false)) by:
else if (typeof(output)!="undefined") {
See the commit with sha 8d8fc23d9c in my fork.
This way return; in the route method will tell the router to do nothing. Of course you already settled the response by piping to it.
Source code of the middleware as in the commit with sha f910a090ae:
// hook up the serving
__meteor_bootstrap__.app
  .use(connect.query()) // <- XXX: we can probably assume accounts did this
  .use(this._config.requestParser(this._config.bodyParser))
  .use(function(req, res, next) {
    // need to wrap in a fiber in case they do something async
    // (e.g. in the database)
    if (typeof(Fiber) == "undefined") Fiber = Npm.require('fibers');

    Fiber(function() {
      var output = Meteor.Router.match(req, res);

      if (output === false) {
        return next();
      } else {
        // parse out the various type of response we can have

        // array can be
        // [content], [status, content], [status, headers, content]
        if (_.isArray(output)) {
          // copy the array so we aren't actually modifying it!
          output = output.slice(0);

          if (output.length === 3) {
            var headers = output.splice(1, 1)[0];
            _.each(headers, function(value, key) {
              res.setHeader(key, value);
            });
          }

          if (output.length === 2) {
            res.statusCode = output.shift();
          }

          output = output[0];
        }

        if (_.isNumber(output)) {
          res.statusCode = output;
          output = '';
        }

        return res.end(output);
      }
    }).run();
  });
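With that patched router, a route that pipes to the response can settle it itself and simply return nothing, roughly like this (a sketch based on the route from the question; the file lookup is simplified for illustration):
Meteor.Router.add('/uploads/:id', function(id) {
  var fs = Npm.require('fs');
  var res = this.response;
  // Simplified for illustration; the real route resolves the path
  // from the FileSystem collection as shown in the question.
  var filePath = '/var/MeteorDMS/uploads/' + id;
  fs.createReadStream(filePath).pipe(res); // we settle the response ourselves
  return; // undefined: the patched router calls neither next() nor res.end()
});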

Resources