Reassembling file chunks produced in a multi-part upload - node.js

I'm using the excellent flow.js library to handle file uploads. It's a resumable HTML5 upload that produces a bunch of chunks on the server that must be reassembled. For example, foo.mov might become
timestamp-foomov.1
timestamp-foomov.2
...
timestamp-foomov.n
Uploads are working but I'm having trouble recombining the parts into a single binary. I have the following code from the Node.js server example the library authors provided on Github (https://github.com/flowjs/flow.js/tree/master/samples/Node.js).
$.write = function(identifier, writableStream, options) {
  options = options || {};
  options.end = (typeof options['end'] == 'undefined' ? true : options['end']);

  // Iterate over each chunk
  var pipeChunk = function(number) {
    var chunkFilename = getChunkFilename(number, identifier);
    fs.exists(chunkFilename, function(exists) {
      if (exists) {
        // If the chunk with the current number exists,
        // then create a ReadStream from the file
        // and pipe it to the specified writableStream.
        var sourceStream = fs.createReadStream(chunkFilename);
        sourceStream.pipe(writableStream, {
          end: false
        });
        sourceStream.on('end', function() {
          // When the chunk is fully streamed,
          // jump to the next one
          pipeChunk(number + 1);
        });
      } else {
        // When all the chunks have been piped, end the stream
        if (options.end) writableStream.end();
        if (options.onDone) options.onDone();
      }
    });
  };
  pipeChunk(1);
};
I'm invoking this code with the following route and am expecting it to produce a reassembled binary in the tmp directory (that's where I'm saving my chunks). However, nothing is happening. What am I missing?
exports.download = function(req, res, next) {
  switch(req.method) {
    case 'GET':
      var stream = fs.createWriteStream('foobar');
      flow.write(req.params.identifier, res);
      break;
  }
};

Reassembling all the chunks is easy; just call this:
var stream = fs.createWriteStream(filename);
r.write(identifier, stream);
And that is it!
The other question is: when should this method be called?
Probably when all chunks have been uploaded and are present in the tmp folder.
But there is another issue: duplicate "done" calls.
This can be solved by creating and locking a file once all chunks exist.
Then call
r.write(identifier, stream);
Then clean up all the chunks, release the lock and close the file.
The same approach is used in the PHP server-side library: https://github.com/flowjs/flow-php-server/blob/master/src/Flow/File.php#L102
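A minimal sketch of that idea (my own illustration, not part of the flow.js samples) might look like the following. It assumes flow is the module whose write() method is shown in the question, that targetPath is where the reassembled file should go, and that you call it only after your upload handler has confirmed every chunk is present:
var fs = require('fs');

function mergeOnce(identifier, targetPath, done) {
  var lockPath = targetPath + '.lock';

  // 'wx' fails if the file already exists, so only the first caller
  // acquires the lock and performs the merge.
  fs.open(lockPath, 'wx', function(err, lockFd) {
    if (err) return done(null, false); // another request is already merging

    var target = fs.createWriteStream(targetPath);
    flow.write(identifier, target, {
      onDone: function() {
        // The chunks have been piped and the target stream has been ended.
        // Clean up the chunk files here, then release the lock.
        fs.close(lockFd, function() {
          fs.unlink(lockPath, function() {
            done(null, true);
          });
        });
      }
    });
  });
}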

Related

Node Read Streams - How can I limit the number of open files?

I'm running into AggregateError: EMFILE: too many open files while streaming multiple files.
Machine Details:
MacOS Monterey,
MacBook Pro (14-inch, 2021),
Chip Apple M1 Pro,
Memory 16GB,
Node v16.13.0
I've tried increasing the limits with no luck.
Ideally I would like to be able to set the limit of the number of files open at one time or resolve by closing files as soon as they have been used.
Code below. I've tried to remove the unrelated code and replace it with '//...'.
const MultiStream = require('multistream');
const fs = require('fs-extra'); // Also tried graceful-fs and the standard fs
const { fdir } = require("fdir");
// Also have a require for the bz2 and split2 functions but editing from phone right now
//...
let files = [];
//...

(async () => {
  const crawler = await new fdir()
    .filter((path, isDirectory) => path.endsWith(".bz2"))
    .withFullPaths()
    .crawl("Dir/Sub Dir")
    .withPromise();

  for (const file of crawler) {
    files = [...files, fs.createReadStream(file)];
  }

  multi = await new MultiStream(files)
    // Unzip
    .pipe(bz2())
    // Create chunks from lines
    .pipe(split2())
    .on('data', function (obj) {
      // Code to filter data and extract what I need
      //...
    })
    .on("error", function (error) {
      // Handling parsing errors
      //...
    })
    .on('end', function (error) {
      // Output results
      //...
    });
})();
To avoid pre-opening a file handle for every single file in your array, you want to open each file only on demand, when it's that particular file's turn to be streamed. You can do that with multistream.
Per the multistream docs, you can lazily create the readStreams by changing this:
for (const file of crawler) {
  files = [...files, fs.createReadStream(file)];
}
to this:
let files = crawler.map((f) => {
  return function() {
    return fs.createReadStream(f);
  };
});
After reading over the npm page for multistream, I think I have found something that will help. I have also edited the place where you are adding the streams to the files array, as I don't see a need to instantiate a new array and spread the existing elements the way you are doing.
To lazily create the streams, wrap them in a function:
var streams = [
  fs.createReadStream(__dirname + '/numbers/1.txt'),
  function () { // will be executed when the stream is active
    return fs.createReadStream(__dirname + '/numbers/2.txt')
  },
  function () { // same
    return fs.createReadStream(__dirname + '/numbers/3.txt')
  }
]

new MultiStream(streams).pipe(process.stdout) // => 123
With that, we can update your logic to include this functionality by simply wrapping the readStreams in functions; that way the streams will not be created until they are needed, which prevents you from having too many open at once. We can do this by updating your file loop:
for (const file of crawler) {
  files.push(function() {
    return fs.createReadStream(file);
  });
}
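To tie the two answers together, here is a minimal, self-contained sketch of the lazy approach (my own illustration; the directory path is the placeholder from the question, and the bz2/split2 stages are omitted so the example only depends on multistream and fdir):
const MultiStream = require('multistream');
const fs = require('fs');
const { fdir } = require('fdir');

(async () => {
  const crawler = await new fdir()
    .filter((path) => path.endsWith('.bz2'))
    .withFullPaths()
    .crawl('Dir/Sub Dir') // placeholder path, as in the question
    .withPromise();

  // Factories instead of already-open streams: each file is opened only
  // when MultiStream reaches it, so the number of open handles stays small.
  const factories = crawler.map((file) => () => fs.createReadStream(file));

  let total = 0;
  new MultiStream(factories)
    // In the original code this would continue through .pipe(bz2()).pipe(split2())
    .on('data', (chunk) => { total += chunk.length; })
    .on('error', (err) => console.error(err))
    .on('end', () => console.log('read ' + total + ' bytes from ' + crawler.length + ' files'));
})();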

NodeJS - read and write file causes corruption

I'm kinda new to NodeJS and I'm working on a simple file encoder.
I planned to change the very first 20kb of a file and just copy the rest of it.
So I used the following code, but it changed some bytes in the rest of the file.
Here is my code:
var fs = require('fs');
var config = require('./config');

fs.open(config.encodeOutput, 'w', function(err, fw) {
  if(err) {
    console.log(err);
  } else {
    fs.readFile(config.source, function(err, data) {
      var start = 0;
      var buff = readChunk(data, start);
      while(buff.length) {
        if(start < config.encodeSize) {
          var buffer = makeSomeChanges(buff);
          writeChunk(fw, buffer);
        } else {
          writeChunk(fw, buff);
        }
        start += config.ENCODE_BUFFER_SIZE;
        buff = readChunk(data, start);
      }
    });
  }
});

function readChunk(buffer, start) {
  return buffer.slice(start, start + config.ENCODE_BUFFER_SIZE);
}

function writeChunk(fd, chunk) {
  fs.writeFile(fd, chunk, {encoding: 'binary', flag: 'a'});
}
I opened the encoded file and compared it with the original file.
I even commented these parts:
//if(start < config.encodeSize) {
// var buffer = makeSomeChanges(buff);
// writeChunk(fw, buffer);
//} else {
writeChunk(fw, buff);
//}
So my program just copies the file, but it still changes some bytes.
What is wrong?
So I checked the pattern and realized some bytes were not in the right place, and I guessed it was because I'm using an async write function.
I changed fs.writeFile() to fs.writeFileSync() and everything is working fine now.
Since you were using asynchronous I/O, you needed to queue the operations: multiple writes happening at the same time are likely to corrupt your file. This also explains why the issue went away with synchronous I/O, where a subsequent write cannot start before the previous one has completed.
However, using synchronous APIs when asynchronous ones are available is a poor choice, because your program is blocked while it writes to the file. You should stick with async and serialize the writes yourself, waiting for each one to finish before issuing the next.
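For example, a minimal sketch of staying asynchronous while keeping the writes ordered could look like this. It reuses config.source, config.encodeOutput, config.encodeSize, config.ENCODE_BUFFER_SIZE and makeSomeChanges from the question, and writes everything through a single stream so the chunks go out in order:
var fs = require('fs');
var config = require('./config');

fs.readFile(config.source, function(err, data) {
  if (err) return console.log(err);

  var out = fs.createWriteStream(config.encodeOutput);
  var start = 0;

  function writeNext() {
    if (start >= data.length) return out.end();

    var chunk = data.slice(start, start + config.ENCODE_BUFFER_SIZE);
    if (start < config.encodeSize) chunk = makeSomeChanges(chunk);
    start += config.ENCODE_BUFFER_SIZE;

    // write() queues chunks in order; wait for 'drain' when the
    // internal buffer is full, otherwise continue immediately.
    if (out.write(chunk)) {
      writeNext();
    } else {
      out.once('drain', writeNext);
    }
  }

  writeNext();
});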

Stop function from being invoked multiple times

I'm in the process of building a file upload component that allows you to pause/resume file uploads.
The standard way to achieve this seems to be to break the file into chunks on the client machine, then send the chunks along with book-keeping information up to the server which can store the chunks into a staging directory, then merge them together when it has received all of the chunks. So, this is what I am doing.
I am using node/express and I'm able to get the files fine, but I'm running into an issue because my merge_chunks function is being invoked multiple times.
Here's my call stack:
router.post('/api/videos',
  upload.single('file'),
  validate_params,
  rename_uploaded_chunk,
  check_completion_status,
  merge_chunks,
  record_upload_date,
  videos.update,
  send_completion_notice
);
The check_completion_status function is implemented as follows:
/* Recursively check to see if we have every chunk of a file */
var check_completion_status = function (req, res, next) {
  var current_chunk = 1;
  var see_if_chunks_exist = function () {
    fs.exists(get_chunk_file_name(current_chunk, req.file_id), function (exists) {
      if (current_chunk > req.total_chunks) {
        next();
      } else if (exists) {
        current_chunk++;
        see_if_chunks_exist();
      } else {
        res.sendStatus(202);
      }
    });
  };
  see_if_chunks_exist();
};
The file names in the staging directory have the chunk numbers embedded in them, so the idea is to see if we have a file for every chunk number. The function should only next() one time for a given (complete) file.
However, my merge_chunks function is being invoked multiple times (usually between 1 and 4). Logging reveals that it's only invoked after I've received all of the chunks.
With this in mind, my assumption here is that it's the async nature of the fs.exists function that's causing the issue.
Even though the n'th invocation of check_completion_status may occur before I have all of the chunks, by the time we get to the nth call to fs.exists(), x more chunks may have arrived and been processed concurrently, so the function can keep going and in some cases get to the end and next(). However those chunks that arrived concurrently are also going to correspond to invocations of check_completion_status, which are also going to next() because we obviously have all of the files at this point.
This is causing issues because I didn't account for this when I wrote merge_chunks.
For completeness, here's the merge_chunks function:
var merge_chunks = (function () {
  var pipe_chunks = function (args) {
    args.chunk_number = args.chunk_number || 1;
    if (args.chunk_number > args.total_chunks) {
      args.write_stream.end();
      args.next();
    } else {
      var file_name = get_chunk_file_name(args.chunk_number, args.file_id);
      var read_stream = fs.createReadStream(file_name);
      read_stream.pipe(args.write_stream, {end: false});
      read_stream.on('end', function () {
        // once we're done with the chunk we can delete it and move on to the next one.
        fs.unlink(file_name);
        args.chunk_number += 1;
        pipe_chunks(args);
      });
    }
  };

  return function (req, res, next) {
    var out = path.resolve('videos', req.video_id);
    var write_stream = fs.createWriteStream(out);
    pipe_chunks({
      write_stream: write_stream,
      file_id: req.file_id,
      total_chunks: req.total_chunks,
      next: next
    });
  };
}());
Currently, I'm receiving an error because the second invocation of the function is trying to read the chunks that have already been deleted by the first invocation.
What is the typical pattern for handling this type of situation? I'd like to avoid a stateful architecture if possible. Is it possible to cancel pending handlers right before calling next() in check_completion_status?
If you just want to make it work ASAP, I would use a lock (much like a db lock) on the resource so that only one of the requests processes the chunks. Simply create a unique id on the client and send it along with the chunks. Then store that unique id in some sort of data structure and look it up prior to processing. The example below is far from optimal (in fact this map will keep growing, which is bad), but it should demonstrate the concept:
// Create a map (an array would work too) and keep track of the video ids that
// were processed. This map will persist across requests.
var processedVideos = {};

var check_completion_status = function (req, res, next) {
  var current_chunk = 1;
  var see_if_chunks_exist = function () {
    fs.exists(get_chunk_file_name(current_chunk, req.file_id), function (exists) {
      if (processedVideos[req.query.uniqueVideoId]) {
        res.sendStatus(202);
      } else if (current_chunk > req.total_chunks) {
        processedVideos[req.query.uniqueVideoId] = true;
        next();
      } else if (exists) {
        current_chunk++;
        see_if_chunks_exist();
      } else {
        res.sendStatus(202);
      }
    });
  };
  see_if_chunks_exist();
};
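One way to keep that map from growing forever (my own addition, not part of the original answer) is to delete the entry once the rest of the chain has finished with the upload, for example in a final piece of middleware appended after send_completion_notice:
// Hypothetical cleanup middleware, added at the end of the router.post() chain
// shown in the question. By the time it runs the chunks have been merged and
// deleted, so a late duplicate request will fail the fs.exists check and get a
// 202 rather than triggering another merge.
var cleanup_processed_entry = function (req, res, next) {
  delete processedVideos[req.query.uniqueVideoId];
  next();
};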

Running function once inside node stream pipe chain

I'm using vinyl-fs to write a simple pipeline that loads markdown files, converts them to HTML, and saves them to disk. This is all working.
However, in the middle of my pipe() chain, I want to perform an asynchronous task that should be done just once for all my files. My current problem relates to loading a file (and it's important that the file is loaded in the middle of the chain), but it's a problem I find myself stumbling upon all the time.
To solve this problem, I have started to do this:
var fs = require('fs');
var vfs = require('vinyl-fs');
var through2 = require('through2');

vfs.src('*.md').pipe((function() {
  var loaded = false;
  return through2.obj(function(file, enc, cb) {
    if (!loaded) {
      fs.readFile('myfile', function(err, data) {
        // use data for something
        loaded = true;
        cb(null, file);
      });
    } else {
      // passthrough
      cb(null, file);
    }
  });
})());
This feels a bit silly. Am I approaching this all wrong, or is this actually an okay thing to do?
After reading a ton of articles about Node streams, it seems that the best implementation for this is to listen to the stream's finish event, and then create a new stream based on the files from the former stream. This allows me to do exactly what I want: stream files through the pipeline until a point where I need to access the array of files for some task, and then continue the pipeline stream afterwards.
Here's what that looks like:
var vfs = require('vinyl-fs');
var through = require('through2');
var path = require('path');

// array for storing file objects
var files = [];

// start the stream
var firstStream = vfs.src("*.jpg")
  // pipe it through a function that saves each file to the array
  .pipe(through.obj(function(file, enc, cb) {
    files.push(file);
    console.log('1: ', path.basename(file.path));
    cb(null, file);
  }))
  // when this stream is done
  .on('finish', function() {
    console.log('FINISH');
    // files will now be full of all files from the stream
    // and you can do whatever you want with them.

    // create a new stream
    var secondStream = through.obj();
    // write the files to the new stream
    files.forEach(function(file) {
      secondStream.write(file);
    });
    // end the stream to make sure the finish
    // event triggers
    secondStream.end();
    // now continue piping
    secondStream.pipe(through.obj(function(file, enc, cb) {
      console.log('2: ', path.basename(file.path));
      cb(null, file);
    }))
    .pipe(vfs.dest("tmp"));
  });
In this scenario, I have 5 JPG images next to my scripts, and the console.log will say
1: IMG_1.JPG
1: IMG_2.JPG
1: IMG_3.JPG
1: IMG_4.JPG
1: IMG_5.JPG
FINISH
2: IMG_1.JPG
2: IMG_2.JPG
2: IMG_3.JPG
2: IMG_4.JPG
2: IMG_5.JPG

Node js- writing data to the writable stream

In my Node application I'm writing data to a file using the write method of a stream created with createWriteStream. Now I need to find out whether the write for a particular stream is complete or not. How can I find that out?
var stream = fs.createWriteStream('myFile.txt', {flags: 'a'});
var result = stream.write(data);
writeToStream();
function writeToStream() {
  var result = stream.write(data + '\n');
  if (!result) {
    stream.once('drain', writeToStream);
  }
}
I need to call another method every time a write completes. How can I do this?
From the node.js WritableStream.write(...) documentation you can give the "write" method a callback that is called when the written data is flushed:
var stream = fs.createWriteStream('myFile.txt', {flags: 'a'});
var data = "Hello, World!\n";
stream.write(data, function() {
  // Now the data has been written.
});
Note that you probably don't need to actually wait for each call to "write" to complete before queueing the next call. Even if the "write" method returns false you can still call subsequent writes and node will buffer the pending write requests into memory.
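If you do want to avoid buffering an unbounded amount of data in memory, the usual pattern is to pause when write returns false and resume on 'drain'. A small sketch (the data source here is just an illustrative array):
var fs = require('fs');

var stream = fs.createWriteStream('myFile.txt', {flags: 'a'});
var lines = ['first line', 'second line', 'third line']; // placeholder data
var i = 0;

function writeNext() {
  while (i < lines.length) {
    var ok = stream.write(lines[i] + '\n');
    i++;
    if (!ok) {
      // Internal buffer is full: wait for 'drain' before writing more.
      stream.once('drain', writeNext);
      return;
    }
  }
  stream.end(); // 'finish' fires once everything has been flushed
}

writeNext();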
I am using maerics's answer along with error handling. The flag 'a' is used to open the file for appending; the file is created if it does not exist. There are other flags you can use.
var fs = require('fs');

// Create a writable stream and write the data to it (utf8 by default)
var writerStream = fs.createWriteStream('MockData/output.txt', {flags: 'a'})
  .on('finish', function() {
    console.log("Write Finish.");
  })
  .on('error', function(err) {
    console.log(err.stack);
  });

writerStream.write(outPutData, function() {
  // Now the data has been written.
  console.log("Write completed.");
});

// Mark the end of file
writerStream.end();
