untarring files to S3 fails, not sure why - node.js

(new information below)
I am trying to set up a lambda function that reacts to uploaded tgz files by uncompressing them and writing the results back to S3. The unzip and untar work fine, but uploading to S3 fails:
/Users/russell/lambda/gzip/node_modules/aws-sdk/lib/s3/managed_upload.js:350
var buf = self.body.read(self.partSize - self.partBuffer.length) ||
^
TypeError: undefined is not a function
at ManagedUpload.fillStream (/Users/russell/lambda/gzip/node_modules/aws-sdk/lib/s3/managed_upload.js:350:25)
at Entry.<anonymous> (/Users/russell/lambda/gzip/node_modules/aws-sdk/lib/s3/managed_upload.js:167:28)
at Entry.emit (events.js:104:17)
at Entry._read (/Users/russell/lambda/gzip/node_modules/tar/lib/entry.js:123:12)
at Entry.end (/Users/russell/lambda/gzip/node_modules/tar/lib/entry.js:82:8)
at Parse._process (/Users/russell/lambda/gzip/node_modules/tar/lib/parse.js:107:13)
at BlockStream.<anonymous> (/Users/russell/lambda/gzip/node_modules/tar/lib/parse.js:47:8)
at BlockStream.emit (events.js:107:17)
at BlockStream._emitChunk (/Users/russell/lambda/gzip/node_modules/tar/node_modules/block-stream/block-stream.js:145:10)
at BlockStream.write (/Users/russell/lambda/gzip/node_modules/tar/node_modules/block-stream/block-stream.js:45:10)
This error occurs when I write to S3, but if instead I write the files locally to disk it works, so the pipeline is correct.
Here is code that demonstrates the problem:
var aws = require('aws-sdk');
var s3 = new aws.S3({apiVersion: '2006-03-01'});
var zlib = require('zlib');
var tar = require('tar');
var fstream = require('fstream');

fstream.Reader({'path': 'testdata.tar.gz'})
  .pipe(zlib.Unzip())
  .pipe(tar.Parse())
  .on('entry', function(entry) {
    var filename = entry.path;
    console.log('got ' + entry.type + ' ' + filename);
    if (entry.type == 'File') {
      if (1) { // switch between working and nonworking cases
        s3.upload({Bucket: 'my_bucket', Key: 'gunzip-test/' + filename, Body: entry}, {},
          function(err, data) {
            if (err)
              console.log('ERROR!');
            else
              console.log('OK');
          });
      }
      else {
        entry.pipe(fstream.Writer({ 'path': '/tmp/mytest/' + filename }));
      }
    }
  });
If the code is set to write to S3 it fails with the above error; if it writes the extracted files locally it succeeds. entry is a stream, and according to the docs it should be accepted as the upload Body parameter. I put a print statement in ManagedUpload, where the failure occurs, and confirmed that self.body is a stream:
var stream = require('stream');
console.log('is it a stream? ' + ((self.body instanceof stream) ? 'yes' : 'no'));
console.log('self.body.read is ' + self.body.read);
returns
$ got File gunzip.js
is it a stream? yes
self.body.read is undefined
I'm pretty new to AWS and Node.js, so there could be a basic problem with this, but I've spent a day and haven't found it. I did the upload call with unzip instead of gzip and it worked (using Lambda functions to unzip archives in S3 is really sloooooow). Can anyone point me at something I am doing wrong in this code?
Thanks
I think I understand this a little better. I broke the pipeline up into pieces and looked at each one. The problem is that tar.Parse uses fstream and not stream. If I look at the return of the .pipe(tar.Parse()) statement it is a stream, but it is not a stream.Readable or a stream.Writable. fstream does not define a read() method (its reader is based on Stream, it is not a stream.Readable), so tar.Parse, which is based on Stream, does not have one either.
So a refinement of the question is, is this a bug in fstream, or is fstream not intended to be a stream? I think it is a bug - from the README:
"Like FS streams, but with stat on them, and supporting directories and
symbolic links, as well as normal files. Also, you can use this to set
the stats on a file, even if you don't change its contents, or to create
a symlink, etc."

In my case running the stream through stream.PassThrough helped.
var PassThrough = require('stream').PassThrough;

var stream = getStreamSomeHow();
var passthrough = new PassThrough();

stream.pipe(passthrough);
s3.upload({ ..., Body: passthrough });
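
For reference, the same fix applied to the pipeline from the question might look like this (a sketch only, assuming the aws-sdk ManagedUpload and tar.Parse setup shown above; the PassThrough simply gives the SDK a Body with a working read() method):

var aws = require('aws-sdk');
var s3 = new aws.S3({apiVersion: '2006-03-01'});
var zlib = require('zlib');
var tar = require('tar');
var fstream = require('fstream');
var PassThrough = require('stream').PassThrough;

fstream.Reader({'path': 'testdata.tar.gz'})
  .pipe(zlib.Unzip())
  .pipe(tar.Parse())
  .on('entry', function(entry) {
    if (entry.type !== 'File') return;
    // Wrap the old-style tar entry in a modern Readable stream so that
    // ManagedUpload can call read() on the Body.
    var passthrough = new PassThrough();
    entry.pipe(passthrough);
    s3.upload({Bucket: 'my_bucket', Key: 'gunzip-test/' + entry.path, Body: passthrough},
      function(err, data) {
        console.log(err ? 'ERROR uploading ' + entry.path : 'OK ' + entry.path);
      });
  });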

Your body variable is a Stream object, in which case you will need to use .toString()
var aws = require('aws-sdk');
var s3 = new aws.S3({apiVersion: '2006-03-01'});
var zlib = require('zlib');
var tar = require('tar');
var fstream = require('fstream');

fstream.Reader({'path': 'testdata.tar.gz'})
  .pipe(zlib.Unzip())
  .pipe(tar.Parse())
  .on('entry', function(entry) {
    var filename = entry.path;
    console.log('got ' + entry.type + ' ' + filename);
    if (entry.type == 'File') {
      if (1) { // switch between working and nonworking cases
        s3.upload({Bucket: 'my_bucket', Key: 'gunzip-test/' + filename, Body: entry.toString()}, {},
          function(err, data) {
            if (err)
              console.log('ERROR!');
            else
              console.log('OK');
          });
      }
      else {
        entry.pipe(fstream.Writer({ 'path': '/tmp/mytest/' + filename }));
      }
    }
  });

Related

readFileSync not returning buffer when giving dynamic filename but giving result when static filename is given

readFileSync is not returning anything when I pass the filename dynamically, but if I pass it statically it returns a result.
function base64_encode(file) {
  const fs = require('fs');
  let bitmap = fs.readFileSync(file);
  return Buffer.from(bitmap).toString('base64');
}

let data_base64 = await base64_encode(process.cwd() + '/public/receipt/' + data.file_name);
That code is working well for me. I'd suggest adding some logging to try to understand what is going wrong; I'd add the lines below.
const fileName = process.cwd() + '/public/receipt/' + data.file_name;
console.log("File name:", fileName);
console.log("File exists:", require('fs').existsSync(fileName));
console.log("File stat:", require('fs').statSync(fileName));
let data_base64 = await base64_encode(fileName);
console.log("Base64:", data_base64);

nodejs/fs: writing a tar to memory buffer

I need to be able to tar a directory, and send this to a remote endpoint via HTTP PUT.
I could of course create the tar, save it to disk, then read it again and send it.
But I'd rather like to create the tar, then pipe it to some buffer and send it immediately. I haven't been able to achieve this.
Code so far:
var tar = require('tar');
var fs = require("fs");
var path = "/home/me/uploaddir";

function getTar(path, cb) {
  var buf = new Buffer('');
  var wbuf = fs.createWriteStream(buf);
  wbuf.on("finish", function() {
    cb(buf);
  });
  tar.c({file: ""}, [path]).pipe(wbuf);
}

getTar(path, function(tar) {
  // send the tar over http
});
This code results in:
fs.js:575
binding.open(pathModule._makeLong(path),
^
TypeError: path must be a string
at TypeError (native)
at Object.fs.open (fs.js:575:11)
I've also tried using an array as buffer, no joy.
The following solution "creates the tar, then pipes it to some buffer and sends it immediately", and with great speed, thanks to the tar-fs library.
First install the libraries: request, for simplified requests, and tar-fs, which provides filesystem bindings for tar-stream: npm i -S tar-fs request
var tar = require('tar-fs')
var request = require('request')
var fs = require('fs')

// pack specific files in the directory
function packTar (folderName, pathsArr) {
  return tar.pack(folderName, {
    entries: pathsArr
  })
}

// return put stream
function makePutReq (url) {
  return request.put(url)
}

packTar('./testFolder', ['test.txt', 'test1.txt'])
  .pipe(makePutReq('https://www.example.com/put'))
I have deliberately given the functions verbose names.
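
If you actually need the whole archive in an in-memory Buffer rather than piping it straight into the PUT request, a minimal sketch could look like the following (still using tar-fs; collecting the pack stream's chunks into a Buffer is my own addition, not part of the answer above):

var tar = require('tar-fs')

// Collect the tar stream into a single Buffer instead of piping it onward.
function tarToBuffer (folderName, cb) {
  var chunks = []
  tar.pack(folderName)
    .on('data', function (chunk) { chunks.push(chunk) })
    .on('error', cb)
    .on('end', function () { cb(null, Buffer.concat(chunks)) })
}

tarToBuffer('./testFolder', function (err, buf) {
  if (err) throw err
  // buf now holds the whole archive in memory, ready to send over HTTP.
  console.log('tar size:', buf.length, 'bytes')
})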

Create an archive and add files in it before sending to client

I'd like to create an archive with archiver and put some files in it. On the client side, a user will click a button and the archive will be generated on the server side.
I'm using Express.js; this is my server-side code where the archive is generated. I did something like this:
app.get('/export/siteoccupancy', function(req, res){
  if(_.isEmpty(req.query)){
    res.status(404).send('requires a start and end date');
  } else {
    // getting params
    var sDate = req.query.startDate;
    var eDate = req.query.endDate;
  }
  var fs = require('fs');
  var archiver = require('archiver');
  var archive = archiver('zip');
  archive.on('err', function(err){
    res.status(500).send({error: err.message});
  });
  res.on('close', function(){
    console.log('Archive size : %d b', archive.pointer());
    return res.status(200).send('OK').end();
  });
  res.attachment('data-export.zip');
  archive.pipe(res);
  var stream = fs.createWriteStream('data-report.txt');
  stream.once('open', function(fd) {
    stream.write('test1');
    stream.write('\n test2');
    stream.write('\n test3');
    stream.end();
  });
  archive.append(stream);
  archive.finalize();
});
This is totally new to me, and I'd like to understand why the console gives me this error:
Error: append: entry name must be a non-empty string value
at Archiver.append
Regards
You can only append a read stream, because an archive can only take data from readable streams. Use archive.append and pass, as the second argument, an object with a name property to name the file, like so:
archive.append(myReadStream,{ name : 'myTest.txt'});
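
Applied to the route in the question, a minimal sketch might look like this (a hypothetical adaptation on my part; archiver's append also accepts a string or Buffer source, so the intermediate write stream isn't needed at all):

var archiver = require('archiver');

app.get('/export/siteoccupancy', function (req, res) {
  var archive = archiver('zip');
  archive.on('error', function (err) {
    res.status(500).send({ error: err.message });
  });
  res.attachment('data-export.zip');
  archive.pipe(res);
  // A string (or an fs.createReadStream of an existing file) is a valid
  // source; the second argument supplies the required entry name.
  archive.append('test1\n test2\n test3', { name: 'data-report.txt' });
  archive.finalize();
});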

Why append rather than write when using knox / node.js to grab file from Amazon s3

I'm experimenting with the knox module for node.js as a way of managing some small files in an Amazon S3 bucket. Everything works fine stand-alone: I can upload a file, download a file, etc. However, I want to be able to download a file on recurring schedule. When I modify the code to run on an interval, I'm getting the downloaded file appending to the previous instance instead of overwriting.
I'm not sure if I've made a mistake in the file write code or in the knox handling code. I've tried several different write approaches (writeFile, writeStream, etc.) and I've looked at the knox source code. Nothing obvious to me stands out as a problem. Here's the code I'm using:
knox = require('knox');
fs = require('fs');

var downFile = DOWNFILE;
var downTxt = '';
var timer = INTERVAL;
var path = S3PATH + downFile;

setInterval(function()
{
  var s3client = knox.createClient(
  {
    key: '********************',
    secret: '**********************************',
    bucket: '********'
  });
  s3client.get(path).on('response', function(response)
  {
    response.setEncoding('ascii');
    response.on('data', function(chunk)
    {
      downTxt += chunk;
    });
    response.on('end', function()
    {
      fs.writeFileSync(downFile, downTxt, 'ascii');
    });
  }).end();
},
timer);
The problem is with your placement of var downTxt = '';. That is the only place you set downTxt to blank, so every time you retrieve more data, you add it to the data that you got in the previous request because you never clear the data from the previous request. The simplest fix is to move that line to just before the setEncoding line.
However, the way you are processing the data is unnecessarily complicated. Try something like this instead. You don't need to recreate the client every time, and setting the encoding will just break things if you are downloading non-text files while making no difference with text files. Next, you shouldn't manually collect the data; you can start writing it to the file as you receive it. Lastly, since the request is a standard stream, you don't need to monitor the 'data' event because you can just use pipe.
var knox = require('knox'),
    fs = require('fs'),
    downFile = DOWNFILE,
    timer = INTERVAL,
    path = S3PATH + downFile,
    s3client = knox.createClient({
      key: '********************',
      secret: '**********************************',
      bucket: '********'
    });

(function downloadFile() {
  var str = fs.createWriteStream(downFile);
  s3client.get(path).pipe(str);
  str.on('close', function() {
    setTimeout(downloadFile, timer);
  });
})();

How do you upload, stream, and hash a file's contents in Node.js?

I'd like to upload files to my server and name them according to their contents. This should be simple (it is in Python), but I am having a hard time figuring out how to do it in Node.js.
I am using Express and connect-form, which really just uses formidable. I also see that Node has a library called crypto that is very similar to Python's hashlib. Now I just need to understand how to stream the temp file connect-form gives me and hash it.
This is a Python/Flask(ish) implementation of what I'd like to do.
import os
import hashlib
from flask import request

def upload():
    file = request.files['file']
    hash = hashlib.sha256()
    name, ext = file.filename.rsplit('.', 1)
    try:
        for chunk in file.chunks():
            hash.update(chunk)
    finally:
        file.seek(0)
    new_name = "%s.%s" % (hash.hexdigest(), ext)
    file.save(os.path.join(UPLOAD_DIR, new_name))
I have seen a lot of these toy answers that just print out the file's name, but none that actually read and write the data.
Here it is in coffee-script in all its glory.
app.post '/upload', (request, response, next) ->
request.form.complete (error, fields, files) ->
if error
next error
else
file = files.file
[kind, extension] = file.type.split '/'
hash = crypto.createHash 'sha256'
stream = fs.createReadStream file.path,
encoding:'binary'
stream.addListener 'data', (chunk) ->
hash.update chunk
stream.addListener 'close', ->
digest = hash.digest 'hex'
new_filename = "#{digest}.#{extension}"
new_path = "#{UPLOAD_DIR}/#{new_filename}"
fs.rename file.path, new_path
response.end new_filename
Rather than hacking the hash calculation into formidable, which would likely be more efficient but far more complicated, I opted to just re-read the file from its temporary location and hash that. Then instead of pumping it like in other examples, I just renamed the temp file into its new location.
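
For readers not using CoffeeScript, here is a rough plain-JavaScript sketch of the same approach (it assumes connect-form's req.form.complete callback and an UPLOAD_DIR constant, as in the answer above):

var crypto = require('crypto');
var fs = require('fs');

app.post('/upload', function (request, response, next) {
  request.form.complete(function (error, fields, files) {
    if (error) return next(error);
    var file = files.file;
    var extension = file.type.split('/')[1];
    var hash = crypto.createHash('sha256');
    var stream = fs.createReadStream(file.path);
    stream.on('data', function (chunk) { hash.update(chunk); });
    stream.on('end', function () {
      var newFilename = hash.digest('hex') + '.' + extension;
      // Move the temp file to its content-addressed name.
      fs.rename(file.path, UPLOAD_DIR + '/' + newFilename, function (err) {
        if (err) return next(err);
        response.end(newFilename);
      });
    });
  });
});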
you can save a file like this:
var fs = require('fs'),
    util = require('util'),
    crypto = require('crypto');

// ...

req.form.complete(function (err, fields, files) {
  // ...
  var ext = files['content[media]']['filename'].split('.');
  ext = ext[ext.length - 1];
  ext = ext.toLowerCase();

  var newFileName = req['connection']['remoteAddress'] + req['connection']['remotePort'] + Date.now();
  newFileName = crypto.createHash('md5').update(newFileName).digest("hex");
  newFileName += '.' + ext;

  var is = fs.createReadStream(files['content[media]']['path']);
  var os = fs.createWriteStream(app.set('dataDir') + '/' + newFileName);

  // copy file to public folder
  util.pump(is, os, function(error) {
    if (error) {
      console.log("Error copying file to public ... " + error);
      res.redirect("back");
      return;
    }
    else {
      // delete temp file
      fs.unlinkSync(files['content[media]']['path']);
      res.redirect('....');
    }
  });
});