untar/decompress to a stream in node - node.js

I am trying to write an AWS Lambda that will take a tar.gz from an S3 bucket, gunzip it and then unpack it whilst streaming the files back to another S3 bucket.
I have this code:
var AWS = require('aws-sdk');
var fs = require('fs');
var zlib = require('zlib');
var uuid = require('uuid/v4');
var tar = require('tar-stream')
var pack = tar.pack()
var s3 = new AWS.S3();
exports.handler = (event, context, callback) => {
    var bucket = event.Records[0].s3.bucket.name;
    var key = event.Records[0].s3.object.key;
    var file = 'S3://' + bucket + '/' + key;
    console.log(bucket);
    console.log(key);

    var readParams = {
        Bucket: bucket,
        Key: key
    };

    var dataStream = s3.getObject(readParams).createReadStream();

    var extract = tar.extract();

    extract.on('entry', function(header, stream, next) {
        console.log(header.name);
        var writeParams = {
            Bucket: process.env.JOB_PROCESSING_BUCKET,
            Key: uuid() + '-' + header.name,
            Body: stream
        };
        s3.upload(writeParams)
            .on('httpUploadProgress', function(evt) {
                console.log('Progress:', evt.loaded, '/', evt.total);
            })
            .send(function(err, data) {
                if (err) console.log("An error occurred", err);
                console.log("Uploaded the file at", data.Location);
            });
        stream.on('end', function() {
            next(); // ready for next entry
        });
        stream.resume(); // just auto drain the stream
    });

    extract.on('finish', function() {
        // all entries read
    });

    dataStream.pipe(zlib.createGunzip()).pipe(extract);

    callback(null, 'Gunzip Lambda Function');
};
It pulls the file, sorts the gunzipping out, and then I can see each file being extracted on entry. The code then tries to stream the file to S3, which creates a 0 KB file, hangs around as if it is reading the stream, and then continues on to the next entry.
Why can't it seem to read/process the stream body?
Is there a better way of doing this?
Thanks

I don't know if it's the best solution, but the following code works for me.
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
const tar = require('tar-stream');
const zlib = require('zlib');
const stream = require('stream');
const uuid = require('uuid');
exports.get = (event, context) => {
    var params = {
        Bucket: event.Records[0].s3.bucket.name,
        Key: event.Records[0].s3.object.key
    };

    var dataStream = s3.getObject(params).createReadStream();

    var extract = tar.extract();

    extract.on('entry', function(header, inputStream, next) {
        inputStream.pipe(uploadFromStream(s3, header));
        inputStream.on('end', function() {
            next(); // ready for next entry
        });
        inputStream.resume(); // just auto drain the stream
    });

    extract.on('finish', function() {
        // all entries read
    });

    dataStream.pipe(zlib.createGunzip()).pipe(extract);
};

function uploadFromStream(s3, header) {
    var pass = new stream.PassThrough();

    var writeParams = {
        Bucket: process.env.JOB_PROCESSING_BUCKET,
        Key: uuid.v1() + '-' + header.name,
        Body: pass
    };

    s3.upload(writeParams, function(err, data) {
        context.done(err, data);
    });

    return pass;
}

Tried for a couple of hours to get this to work; it turns out the 'end' event has been replaced with 'finish'. So the answer above works great, with just one small change:
inputStream.on('end', function() {
next(); // ready for next entry
});
- Should be -
inputStream.on('finish', function() {
next(); // ready for next entry
});
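If the uploaded objects still come out empty, one further variation (my own sketch, not from the answers above) is to let the S3 upload callback drive next(), so the next tar entry is only requested once the current one has fully uploaded. This reuses s3, extract and the uuid() import from the question:

var stream = require('stream');

extract.on('entry', function (header, inputStream, next) {
    // pipe the tar entry through a PassThrough that s3.upload can consume
    var pass = new stream.PassThrough();
    inputStream.pipe(pass);

    s3.upload({
        Bucket: process.env.JOB_PROCESSING_BUCKET,
        Key: uuid() + '-' + header.name,
        Body: pass
    }, function (err, data) {
        if (err) console.error('Upload failed', err);
        next(); // move on only after the upload has completed
    });
});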

Related

Unable to fetch list of all S3 objects using NodeJs

Kindly excuse my limited knowledge of NodeJs, as I've just started with it. I have the following lambda function, which isn't fetching the list of objects (more than 1000) in S3 and gets stuck in an infinite loop, resulting in the lambda timing out. Not sure what's wrong here.
Code:
console.log('Loading');
const AWS = require('aws-sdk');
var request=true;
const awsOptions = {
region: "us-east-1"
};
const s3 = new AWS.S3(awsOptions);
var list = [];
exports.handler = async (event, context, callback) => {
    const SrcBucket = event.Records[0].s3.bucket.name;
    const trigger_file = event.Records[0].s3.object.key;

    var bucketParams = {
        Bucket: SrcBucket,
        Prefix: 'Test/'
    };

    do {
        s3.listObjects(bucketParams, (err, data) => {
            if (err)
                console.log("Error", err);
            else {
                list.push(data.Contents);
                if (data.IsTruncated)
                    bucketParams.Marker = data.NextMarker;
                else
                    request = false;
            }
        });
    } while (request);

    callback(null, {
        listLen: list.length
    });
};
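For reference, the synchronous do/while above never yields, so the listObjects callbacks never get a chance to run and request never flips to false. A minimal sketch of paging through more than 1,000 keys by awaiting each call and following the continuation token (bucket and prefix values are placeholders, not the poster's code):

const AWS = require('aws-sdk');
const s3 = new AWS.S3({ region: 'us-east-1' });

async function listAllKeys(bucket, prefix) {
    const keys = [];
    const params = { Bucket: bucket, Prefix: prefix };
    let data;
    do {
        data = await s3.listObjectsV2(params).promise(); // wait for each page
        data.Contents.forEach((obj) => keys.push(obj.Key));
        params.ContinuationToken = data.NextContinuationToken;
    } while (data.IsTruncated);
    return keys;
}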

An issue with reading a gzipped file (.gz) with IBM Cloud Function (Action: Node.js 12)

I can read the data.json.gz file on my local machine with the code mentioned below (node --version: v14.15.0). But when I try to use the same in IBM Cloud with an Action (Node.js 12) to read the same file from an Object Store Bucket, I get the below error
["stderr: ERROR: undefined - input_buf.on is not a function"].
I am very new to NodeJS; can someone help identify the issue here?
I do appreciate your support.
Code that works on Local machine (Windows 10):
function decompressFile(filename) {
    var fs = require("fs"),
        zlib = require("zlib");

    var input = fs.createReadStream(filename);
    var data = [];

    input.on('data', function(chunk) {
        data.push(chunk);
    }).on('end', function() {
        var buf = Buffer.concat(data);
        zlib.gunzip(buf, function(err, buffer) {
            if (!err) {
                var dataString = buffer.toString();
                console.log(dataString, dataString + '\n');
                var dataJSON = JSON.parse(dataString.toString('utf8'));
            } else {
                console.log(err);
            }
        });
    });
}

decompressFile("data.json.gz");
Code that does not work on IBM Cloud Function and Object Store Bucket:
// Get file contents of gzipped item
async function getGzippedItem(cosClient, bucketName, itemName) { // <<< async keyword added
    const fs = require('fs');
    const zlib = require('zlib');

    return await cosClient.getObject({ // <<< turned into assignment with await
        Bucket: bucketName,
        Key: itemName
    }).promise()
        .then((instream=fs.createReadStream(itemName)) => {
            if (instream != null) {
                var data = [];
                var input_buf = instream.Body
                input_buf.on('data', function(chunk){
                    data.push(chunk);
                }).on('end', function() {
                    var buf = Buffer.concat(data);
                    zlib.gunzip(buf, function (err, buffer) {
                        if (!err) {
                            var dataString = buffer.toString()
                            var dataJSON = JSON.parse(dataString.toString('utf8'));
                        } else {
                            console.log(err);
                        }
                    });
                });
                return buf
            }
        })
        .catch((e) => {
            console.error(`ERROR: ${e.code} - ${e.message}\n`);
        });
};

async function main(params) {
    bucketName = 'bucket'
    itemName = 'data.json.gz'

    var ibm = require('ibm-cos-sdk');
    var util = require('util');
    var fs = require('fs');

    // Initializing configuration
    const myCOS = require('ibm-cos-sdk');

    var config = {
        endpoint: 'endpoint',
        apiKeyId: 'apiKeyId',
        ibmAuthEndpoint: 'ibmAuthEndpoint',
        serviceInstanceId: 'serviceInstanceId',
    };

    var cosClient = new myCOS.S3(config);

    gzippedItemContent = await getGzippedItem(cosClient, bucketName, itemName) // <<< await keyword added
    console.log(">>>>>>>>>>>>>>>: ", typeof gzippedItemContent, gzippedItemContent )
}
The message is telling you that your input_buf object is not of the type you expect it to be. The result of your createReadStream() call is just a stream:
[Stream] the readable stream object that can be piped or read from (by registering 'data' event listeners).
So you should be able to access the value directly
(not declaring var input_buf = instream.Body):
var getObjectStream = cosClient.getObject({
Bucket: 'BUCKET',
Key: 'KEY'
}).createReadStream();
getObjectStream.on('data', function(c) {
data += c.toString();
});
Have a look at the test section of the ibm-cos-sdk-js project; it describes how to use the API.
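Putting that together with the original goal, a minimal sketch of reading and gunzipping the object via the stream might look like this (bucket and key names are placeholders; it assumes the cosClient from the question exposes the same createReadStream() used above):

const zlib = require('zlib');

const chunks = [];
cosClient.getObject({ Bucket: 'BUCKET', Key: 'KEY' })
    .createReadStream()
    .pipe(zlib.createGunzip())
    .on('data', (chunk) => chunks.push(chunk))
    .on('end', () => {
        const dataJSON = JSON.parse(Buffer.concat(chunks).toString('utf8'));
        console.log(dataJSON);
    });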

Unresolved Promise Assistance Node.js

I have the following code below, which is a lambda function to get content from an S3 object zip file. I know for a fact that I am not resolving the list of promises and need a little direction on how to resolve them. I have read several examples on here but am having a hard time applying them to my code. Any assistance would be greatly appreciated.
// dependencies
const AWS = require('aws-sdk');
var JSZip = require('jszip');
// get reference to S3 client
const s3 = new AWS.S3();
exports.handler = async (event, context, callback) => {
// Read options from the event parameter.
const srcBucket = event.Records[0].s3.bucket.name;
// Object key may have spaces or unicode non-ASCII characters.
const srcKey = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " "));
// Download the file from the S3 source bucket.
try {
const params = {
Bucket: srcBucket,
Key: srcKey
};
const data = await s3.getObject(params).promise();
var zip = JSZip.loadAsync(data.Body).then(function (content){
return content;
});
zip.then(function(result){
var entries = Object.keys(result.files).map(function (name) {
if(name.indexOf("TestStatus") != -1){
return name;
}
}).filter(notUndefined => notUndefined !== undefined);
var listOfPromises = entries.map(function(entry) {
return result.file(entry).async("text").then(function(fileContent){
return fileContent;
});
});
Promise.all(listOfPromises).then((values) =>{
values.forEach(function(value){
console.log(value);
});
});
});
} catch (error) {
context.fail(error);
return;
}
};
Modified/Corrected code
// dependencies
const AWS = require('aws-sdk');
var JSZip = require('jszip');
// get reference to S3 client
const s3 = new AWS.S3();
exports.handler = async (event, context, callback) => {
// Read options from the event parameter.
const srcBucket = event.Records[0].s3.bucket.name;
// Object key may have spaces or unicode non-ASCII characters.
const srcKey = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " "));
// Download the file from the S3 source bucket.
try {
const params = {
Bucket: srcBucket,
Key: srcKey
};
const data = await s3.getObject(params).promise();
var zip = JSZip.loadAsync(data.Body);
return zip.then(function(result){
var entries = Object.keys(result.files).map((name) =>{
if(name.indexOf("TestStatus") != -1){
return result.files[name];
}
}).filter(notUndefined => notUndefined !== undefined);
var listOfPromises = entries.map((entry) => {
return entry.async("text")
.then((u8) => {
return [entry.name, u8];
}).catch(error => console.error(error));
});
var promiseOfList = Promise.all(listOfPromises);
promiseOfList.then(function (list) {
console.log(list.toString());
});
});
} catch (error) {
context.fail(error);
return;
}
};
If you look closely, you are not returning anything; that is why it stays Pending.
const AWS = require('aws-sdk');
var JSZip = require('jszip');
// get reference to S3 client
const s3 = new AWS.S3();
exports.handler = async (event, context, callback) => {
    // Read options from the event parameter.
    const srcBucket = event.Records[0].s3.bucket.name;
    // Object key may have spaces or unicode non-ASCII characters.
    const srcKey = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " "));

    // Download the file from the S3 source bucket.
    try {
        const params = {
            Bucket: srcBucket,
            Key: srcKey
        };
        const data = await s3.getObject(params).promise();

        // here is the problem
        // var zip = JSZip.loadAsync(data.Body).then(function (content){
        //     return content;
        // }
        var zip = await JSZip.loadAsync(data.Body);

        var entries = Object.keys(zip.files).map(function (name) {
            if (name.indexOf("TestStatus") != -1) {
                return name;
            }
        }).filter(notUndefined => notUndefined !== undefined);

        var listOfPromises = entries.map(function (entry) {
            return zip.file(entry).async("text").then(function (fileContent) {
                return fileContent;
            });
        });

        // return the resolved values so the handler's promise is not left pending
        return Promise.all(listOfPromises).then((values) => {
            values.forEach(function (value) {
                console.log(value);
            });
            return values;
        });
    } catch (error) {
        context.fail(error);
        return;
    }
};

stream the contents of an S3 object into hash algorithm node.js

I'm new to node.js and I'm trying to write an AWS lambda function that would stream the content of an S3 object into node's crypto module to create an MD5 checksum value of the S3 object. Not sure why, but every time I run the code it generates different hash values on the console.log. Can anyone point me in the right direction to fix my code? Appreciate the help!
var crypto = require('crypto');
var fs = require('fs');
var AWS = require('aws-sdk');
var s3 = new AWS.S3();
exports.handler = (event, context, callback) => {
var params = {
Bucket: 'bucket_name',
Key: 'key',
};
var hash = crypto.createHash('md5');
var stream = s3.getObject(params, function(err, data) {
if (err){
console.log(err);
return;
}
}).createReadStream();
stream.on('data', function (data) {
hash.update(data, 'utf-8')
})
stream.on('end', function () {
console.log(hash.digest('hex'))
})
};
You were close. You are mixing the "callback" style method signature with a "createReadStream" signature. Try this:
const crypto = require('crypto');
const fs = require('fs');
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
exports.handler = (event, context, callback) => {
let params = {
Bucket: 'bucket_name',
Key: 'key',
};
let hash = crypto.createHash('md5');
let stream = s3.getObject(params).createReadStream();
stream.on('data', (data) => {
hash.update(data);
});
stream.on('end', () => {
let digest = hash.digest('hex');
console.log(digest);
callback(null, digest);
});
};
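As a side note (my own variant, not part of the answer): the Hash object is itself a stream, so the S3 read stream can be piped straight into it and the digest read back from the hash stream. This reuses crypto, s3, params and callback from the answer above:

const hash = crypto.createHash('md5');

hash.on('readable', () => {
    const digest = hash.read(); // a single chunk: the final digest
    if (digest) {
        console.log(digest.toString('hex'));
        callback(null, digest.toString('hex'));
    }
});

s3.getObject(params).createReadStream().pipe(hash);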
Not directly an answer, but you can also add the MD5 hash as an ETag when uploading a file to S3.
const crypt = require('crypto');
const fs = require('fs').promises;
const aws = require('aws-sdk');
async function uploadFileToS3WithMd5Hash(bucket, filename, s3Key = null) {
const data = await fs.readFile(filename);
const md5Base64 = crypt.createHash("md5").update(data).digest('base64');
if (!s3Key) {
s3Key = filename;
}
/** Should you want to get the MD5 in hex format: */
// const md5Hex = Buffer.from(md5Base64, 'base64').toString('hex');
return new Promise((res, rej) => {
const s3 = new aws.S3();
s3.putObject({
Bucket: bucket,
Key: s3Key,
Body: data,
ContentMD5: md5Base64,
}, (err, resp) => err ? rej(err) : res(resp));
})
}
uploadFileToS3WithMd5Hash('your-own-bucket', 'file.txt')
.then(console.log)
.catch(console.error);
So by checking the ETag for an object on S3, you would get the hex string of the file's MD5 hash.
In some cases (see this post by Dennis), the MD5 checksum is computed automatically upon upload.
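To make use of that, here is a small sketch of my own (bucket, key and file names are placeholders) that compares the object's ETag against a locally computed MD5. Note this only holds for single-part, non-KMS uploads, where the ETag is the plain MD5:

const crypto = require('crypto');
const fsp = require('fs').promises;
const AWS = require('aws-sdk');

async function md5MatchesS3(bucket, key, localFile) {
    const s3 = new AWS.S3();
    const head = await s3.headObject({ Bucket: bucket, Key: key }).promise();
    const etag = head.ETag.replace(/"/g, ''); // the ETag comes back wrapped in quotes
    const localMd5 = crypto.createHash('md5')
        .update(await fsp.readFile(localFile))
        .digest('hex');
    return etag === localMd5;
}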

NodeJS How do I Download a file to disk from an aws s3 bucket?

My goal:
Display a dialog box prompting the user to save a file being downloaded from aws.
My problem:
I am currently using awssum-amazon-s3 to create a download stream. However, I've only managed to save the file to my server or stream it to the command line... As you can see from my code, my last attempt was to try and manually set the content-disposition headers, which failed. I cannot use res.download() as the headers have already been set?
How can I achieve my goal?
My code for node:
app.post('/dls/:dlKey', function(req, res, next){
// download the file via aws s3 here
var dlKey = req.param('dlKey');
Dl.findOne({key:dlKey}, function(err, dl){
if (err) return next(err);
var files = dl.dlFile;
var options = {
BucketName : 'xxxx',
ObjectName : files,
};
s3.GetObject(options, { stream : true }, function(err, data) {
// stream this file to stdout
fmt.sep();
data.Headers['Content-Disposition'] = 'attachment';
console.log(data.Headers);
data.Stream.pipe(fs.createWriteStream('test.pdf'));
data.Stream.on('end', function() {
console.log('File Downloaded!');
});
});
});
res.end('Successful Download Post!');
});
My code for angular:
$scope.dlComplete = function (dl) {
$scope.procDownload = true;
$http({
method: 'POST',
url: '/dls/' + dl.dlKey
}).success(function(data/*, status, headers, config*/) {
console.log(data);
$location.path('/#!/success');
}).error(function(/*data, status, headers, config*/) {
console.log('File download failed!');
});
};
The purpose of this code is to let users use a generated key to download a file once.
This is the entire code using streaming on the latest version of aws-sdk
var express = require('express');
var app = express();
var fs = require('fs');
app.get('/', function(req, res, next){
res.send('You did not say the magic word');
});
app.get('/s3Proxy', function(req, res, next){
// download the file via aws s3 here
var fileKey = req.query['fileKey'];
console.log('Trying to download file', fileKey);
var AWS = require('aws-sdk');
AWS.config.update(
{
accessKeyId: "....",
secretAccessKey: "...",
region: 'ap-southeast-1'
}
);
var s3 = new AWS.S3();
var options = {
Bucket : '/bucket-url',
Key : fileKey,
};
res.attachment(fileKey);
var fileStream = s3.getObject(options).createReadStream();
fileStream.pipe(res);
});
var server = app.listen(3000, function () {
var host = server.address().address;
var port = server.address().port;
console.log('S3 Proxy app listening at http://%s:%s', host, port);
});
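One thing worth adding to that proxy (my own note, not part of the answer): attach an error handler to the S3 stream so that a missing key or a permission error doesn't crash the process mid-response. This reuses s3, options and res from the handler above:

var fileStream = s3.getObject(options).createReadStream();
fileStream.on('error', function (err) {
    console.error('S3 download failed', err);
    if (!res.headersSent) {
        res.status(err.statusCode || 500).end();
    }
});
fileStream.pipe(res);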
This code worked for me with the most recent library:
var s3 = new AWS.S3();
var s3Params = {
    Bucket: 'your bucket',
    Key: 'path/to/the/file.ext'
};
// assumes `res` is the Express response of the enclosing route handler
s3.getObject(s3Params, function(err, data) {
    if (err === null) {
        res.attachment('file.ext'); // or whatever your logic needs
        res.send(data.Body);
    } else {
        res.status(500).send(err);
    }
});
Simply create a ReadStream from S3 and a WriteStream to the location where you want to download the file. Find the code below. Works perfectly for me:
var AWS = require('aws-sdk');
var path = require('path');
var fs = require('fs');
AWS.config.loadFromPath(path.resolve(__dirname, 'config.json'));
AWS.config.update({
accessKeyId: AWS.config.credentials.accessKeyId,
secretAccessKey: AWS.config.credentials.secretAccessKey,
region: AWS.config.region
});
var s3 = new AWS.S3();
var params = {
Bucket: '<your-bucket>',
Key: '<path-to-your-file>'
};
let readStream = s3.getObject(params).createReadStream();
let writeStream = fs.createWriteStream(path.join(__dirname, 's3data.txt'));
readStream.pipe(writeStream);
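If you also want to know when the file has fully landed on disk and catch read errors (a small addition of my own, not part of the answer), stream.pipeline can replace the plain pipe call, reusing readStream and writeStream from above:

const { pipeline } = require('stream');

pipeline(readStream, writeStream, (err) => {
    if (err) {
        console.error('Download failed', err);
    } else {
        console.log('Saved to s3data.txt');
    }
});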
You've already figured out what's most important to solve your issue: you can pipe the file stream coming from S3 to any writable stream, be it a filestream… or the response stream that will be sent to the client!
s3.GetObject(options, { stream : true }, function(err, data) {
res.attachment('test.pdf');
data.Stream.pipe(res);
});
Note the use of res.attachment that will set the correct headers. You can also check out this answer regarding streams and S3.
Using AWS SDK v3:
npm install @aws-sdk/client-s3
Download code:
import { GetObjectCommand, S3Client } from "@aws-sdk/client-s3";

const s3Client = new S3Client({}); // region/credentials picked up from the environment

/**
 * download a file from AWS and send to your rest client
 */
app.get('/download', async function(req, res, next) {
    var fileKey = req.query['fileKey'];
    var bucketParams = {
        Bucket: 'my-bucket-name',
        Key: fileKey,
    };
    res.attachment(fileKey);
    var fileStream = await s3Client.send(new GetObjectCommand(bucketParams));
    // for TS you can add: if (fileStream.Body instanceof Readable)
    fileStream.Body.pipe(res);
});
For this I use a React frontend and a Node.js backend. On the frontend I use Axios. I used this to download the file when the button is clicked.
==== Node js backend code (AWS S3) ======
//inside GET method I called this function
public download = (req: Request, res: Response) => {
const keyName = req.query.keyName as string;
if (!keyName) {
throw new Error('key is undefined');
}
const downloadParams: AWS.S3.GetObjectRequest = {
Bucket: this.BUCKET_NAME,
Key: keyName
};
this.s3.getObject(downloadParams, (error, data) => {
if (error) {
return error;
}
res.send(data.Body);
res.end();
});
};
====== React js frontend code ========
//this function handle download button onClick
const downloadHandler = async (keyName: string) => {
const response = await axiosInstance.get( //here use axios interceptors
`papers/paper/download?keyName=${keyName}`,{
responseType:'blob', // very important, don't miss this (otherwise the downloaded file can't be viewed)
}
);
const url = window.URL.createObjectURL(new Blob([response.data]));
const link = document.createElement("a");
link.href = url;
link.setAttribute("download", "file.pdf"); //change "file.pdf" according to saved name you want, give extension according to filetype
document.body.appendChild(link);
link.click();
link.remove();
};
------ OR (if you are using normal axios and not axios interceptors) -----
axios({
url: 'http://localhost:5000/static/example.pdf',
method: 'GET',
responseType: 'blob', // very very important
}).then((response) => {
const url = window.URL.createObjectURL(new Blob([response.data]));
const link = document.createElement('a');
link.href = url;
link.setAttribute('download', 'file.pdf');
document.body.appendChild(link);
link.click();
});
Using express, based on Jushua's answer and https://docs.aws.amazon.com/AmazonS3/latest/userguide/example_s3_GetObject_section.html
public downloadFeedFile = (req: IFeedUrlRequest, res: Response) => {
const downloadParams: GetObjectCommandInput = parseS3Url(req.s3FileUrl.replace(/\s/g, ''));
logger.info("requesting S3 file " + JSON.stringify(downloadParams));
const run = async () => {
try {
const fileStream = await this.s3Client.send(new GetObjectCommand(downloadParams));
if (fileStream.Body instanceof Readable){
fileStream.Body.once('error', err => {
console.error("Error downloading s3 file")
console.error(err);
});
fileStream.Body.pipe(res);
}
} catch (err) {
logger.error("Error", err);
}
};
run();
};
