How to use stream.pipeline in an AWS Node.js Lambda

I am trying to stream data from a MongoDB cursor into an S3 file using a Node.js Lambda.
Following is a snippet of my code.
What I observe is that the Lambda does not wait for the pipeline to complete and exits, so the file is not written to S3.
But the same code works fine if I run it as a standalone Node.js script.
const logger = require('./logger').logger;
let s3Client = require('aws-sdk/clients/s3');
const stream = require('stream');
const util = require('util');
const pipeline = util.promisify(stream.pipeline);
exports.handler = async (event, context) => {
await pipeline(
client.db("somedb").collection("somecollection").aggregate(crtiriaObj).stream({transform: x => `${JSON.stringify(x)}\n`}),
uploadFromStream()
)
};
let uploadFromStream = () => {
let pass = new stream.PassThrough();
let s3 = new s3Client();
let params = {Bucket: "bucketname", Key: "filename", Body: pass};
s3.upload(params, function(err, data) {
if (err) {
logger.error(`Error uploading file ${fileName}`,err);
} else {
logger.info(`Successfully uploaded file: ${fileName}, result: ${JSON.stringify(data)}`);
}
});
return pass;
};

I ended up doing it without async/await.
My code ended up looking like the snippet below.
I have also written a blogpost about it at: https://dev.to/anandsunderraman/copying-over-data-from-mongodb-to-s3-3j4g
const MongoClient = require('mongodb').MongoClient;
let s3Client = require('aws-sdk/clients/s3');
const stream = require('stream');
const pipeline = stream.pipeline;
//streams the aggregation cursor straight into S3 via stream.pipeline
exports.copyData = (event, context, callback) => {
MongoClient.connect(getDBURI(), {
useNewUrlParser: true,
useUnifiedTopology: true
}).then((dbConnection) => {
pipeline(
dbConnection.db("<db-name>").collection("<collection-name>").aggregate(<aggregate-criteria>)
.stream({transform: x => convertToNDJSON(x)}),
uploadDataToS3(callback),
(err) => {
if (err) {
console.log('Pipeline failed.', err);
} else {
console.log('Pipeline succeeded.');
}
}
)
})
}
/**
* Construct the DB URI based on the environment
* @returns {string}
*/
const getDBURI = () => {
//best practice is to fetch the password from AWS Parameter store
return "mongodb://<username>:<password>#<hostname>/<your-db-name>";
};
//converts each db record to ndjson => newline delimited json
let convertToNDJSON = (data) => {
return JSON.stringify(data) + "\n";
};
let uploadDataToS3 = (callback) => {
let env = process.env.NODE_ENV; // assumption: NODE_ENV (or your own env variable) marks local runs against minio
let s3 = null;
let pass = new stream.PassThrough();
if (env === 'local') {
s3 = new s3Client({
accessKeyId: 'minioadmin' ,
secretAccessKey: 'minioadmin' ,
endpoint: 'http://host.docker.internal:9000' ,
s3ForcePathStyle: true, // needed with minio?
signatureVersion: 'v4'
});
} else {
s3 = new s3Client();
}
//using multipart upload to speed up the process
let params = {Bucket: '<your-bucket-name>', Key: '<file-name>', Body: pass};
let opts = {queueSize: 2, partSize: 1024 * 1024 * 10};
s3.upload(params,opts, function(err, data) {
if (err) {
console.log(`Error uploading file ${params.Key}`,err);
} else {
console.log(`Successfully uploaded file: ${params.Key}, result: ${JSON.stringify(data)}`);
}
callback();
});
return pass;
};
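One likely reason the original handler exits early is that only the pipeline was awaited, not the S3 upload itself: the pipeline resolves once the PassThrough has been fully written, while s3.upload may still be in flight when the Lambda returns. For completeness, a rough sketch of an async/await version that awaits both (reusing client, crtiriaObj, and the bucket/key placeholders from the question; SDK v2's upload() exposes a .promise()):
exports.handler = async (event, context) => {
  const pass = new stream.PassThrough();
  const s3 = new s3Client();
  // start the upload first so it consumes the PassThrough while the pipeline writes to it
  const upload = s3.upload({ Bucket: "bucketname", Key: "filename", Body: pass }).promise();
  await Promise.all([
    pipeline(
      client.db("somedb").collection("somecollection")
        .aggregate(crtiriaObj)
        .stream({ transform: x => `${JSON.stringify(x)}\n` }),
      pass
    ),
    upload
  ]);
};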


How to read JSON from S3 by AWS Lambda Node.js 18.x runtime?

@TBA gave the solution.
The root cause is not the runtime; it comes from SDK v3.
Point: do not update several things at once (like the runtime and the SDK version together 🥲).
Thanks again, TBA.
I was using a Lambda on the Node.js 14.x runtime to read a JSON file from S3.
Brief code is below
const AWS = require("aws-sdk");
const s3 = new AWS.S3();
exports.handler = (event) => {
const { bucketName, objKey } = event
const params = {
Bucket: bucketName,
Key: objKey
};
return new Promise((resolve) => {
s3.getObject(params, async (err, data) =>{
if (err) console.log(err, err.stack);
else {
const contents = JSON.parse(data.Body)
resolve(contents);
}
});
})
};
and it returned the JSON data as I expected.
Today I tried to create a new Lambda on the Node.js 18.x runtime, but it returned null or some errors...
Q) Could you give me some advice on how to solve this 🥲?
+) I used the same JSON file for each Lambda.
+) Not sure why, but in my case data.Body.toString() didn't work (I saw some answers on Stack Overflow suggesting that and tried it, but no luck).
Thanks in advance!
Case A (returns null)
import { S3Client, GetObjectCommand } from "@aws-sdk/client-s3";
const s3Client = new S3Client({ region: "ap-northeast-2" });
export const handler = (event) => {
const { objKey, bucketName } = event;
const params={
Bucket: bucketName,
Key: objKey
};
const getObjCommand = new GetObjectCommand(params);
return new Promise((resolve) => {
s3Client.send(getObjCommand, async (err, data) =>{
if (err) console.log(err, err.stack);
else {
const list = JSON.parse(data.Body)
resolve(list);
}
});
})
};
Case B (returns "Unexpected token o in JSON at position 1")
export const handler = async (event) => {
const { objKey, bucketName } = event;
const params={
Bucket: bucketName,
Key: objKey
};
const getObjCommand = new GetObjectCommand(params);
const response = await s3Client.send(getObjCommand)
console.log("JSON.parse(response.Body)", JSON.parse(response.Body))
};
Case C (returns "TypeError: Converting circular structure to JSON")
export const handler = async (event) => {
const { objKey, bucketName } = event;
const params={
Bucket: bucketName,
Key: objKey
};
const getObjCommand = new GetObjectCommand(params);
try {
const response = await s3Client.send(getObjCommand)
return JSON.stringify(response.Body)
} catch(err) {
console.log("error", err)
return err
}
};
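For reference, in SDK v3 response.Body is a readable stream rather than a Buffer, which is why JSON.parse(response.Body) fails in Cases B and C. A minimal sketch of reading the object on the Node.js 18.x runtime (assuming an SDK v3 version recent enough that the Body exposes transformToString()):
import { S3Client, GetObjectCommand } from "@aws-sdk/client-s3";

const s3Client = new S3Client({ region: "ap-northeast-2" });

export const handler = async (event) => {
  const { objKey, bucketName } = event;
  const response = await s3Client.send(new GetObjectCommand({ Bucket: bucketName, Key: objKey }));
  // Body is a stream in SDK v3; collect it into a string before parsing
  const bodyString = await response.Body.transformToString();
  return JSON.parse(bodyString);
};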

Nodejs S3 - Delete Multiple Images

My code responds as successful, but the images are still in my bucket.
I'm trying to delete images from my S3 bucket (upload works perfectly), but the image deletion process is not working properly.
Here's my code:
myRouter
router.delete("/:id", auth, async (req, res) => {
const productSelected = await Product.findById(req.params.id);
const result = await drop(productSelected);
console.log('>>>>>>> ', result);
res.send(result);
});
myS3Class
const aws = require('aws-sdk');
const multer = require('multer');
const multerS3 = require('multer-s3');
const AWS_CONSTANTS = require('../constants');
const config = require('config');
aws.config.update({
secretAccessKey: config.get('AWS_SECRET_ACCESS_KEY'),
accessKeyId: config.get('AWS_ACCESS_KEY_ID'),
region: AWS_CONSTANTS.region
})
const s3 = new aws.S3();
const drop = async (data) => {
let obj = [];
let result = "";
data.images.map((image) => {
obj.push({ Key: `${AWS_CONSTANTS.bucket}/${image}` }); // This is the full path
});
const options = {
Bucket: AWS_CONSTANTS.bucket,
Delete: {
Objects: obj,
Quiet: false,
},
}
try {
await s3.deleteObjects(
options,
function (err, data) {
if (err) console.log('err ==>', err);
console.log('delete successfully', data);
result = data;
}
).promise();
} catch(error) {
return { success: false, data: null }
}
return result;
}
module.exports.drop = drop;
This is the response from my code:
>>>>>>> delete successfully {
Deleted: [ { Key: 'my-bucketfolder/1655743085375Python.png' } ],
Errors: []
}
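One likely cause (not confirmed in the question): deleteObjects expects keys relative to the bucket, so prefixing each Key with the bucket name makes S3 delete nonexistent keys, which it still reports as Deleted because deletes are idempotent. A minimal sketch of drop without the bucket prefix, and without mixing the callback and .promise() styles (assuming images holds plain object keys such as "folder/image.png"):
const drop = async (data) => {
  // Key must be the object key inside the bucket, not "<bucket>/<key>"
  const objects = data.images.map((image) => ({ Key: image }));
  const options = {
    Bucket: AWS_CONSTANTS.bucket,
    Delete: { Objects: objects, Quiet: false },
  };
  try {
    const result = await s3.deleteObjects(options).promise();
    console.log('delete result', result);
    return result;
  } catch (error) {
    console.log('err ==>', error);
    return { success: false, data: null };
  }
};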

AWS Lambda Custom Nodejs Container Shows Runtime Error

I have built an AWS Lambda function with a custom container image. I am trying to convert an Excel file to PDF with LibreOffice: getting the file from S3, saving it to a local file, converting it to PDF, and then uploading it back to S3.
Here is the code.
const fs = require('fs');
const getStream = require('get-stream');
const { Readable } = require('stream')
const { S3Client, GetObjectCommand, PutObjectCommand } = require("@aws-sdk/client-s3");
const libre = require('libreoffice-convert');
const path = require('path');
exports.handler = async (event) => {
const bucket = event.queryStringParameters.bucket;
const file = event.queryStringParameters.file;
const convertedFile = event.queryStringParameters.convertedFile;
if (event.queryStringParameters['warmup'] !== undefined) {
return {
result: true,
message: 'warmed up'
}
}
const client = new S3Client({ region: "ap-south-1" });
const command = new GetObjectCommand({ Bucket: bucket, Key: file });
const response = await client.send(command);
const objectData = response.Body;
const writeStream = fs.createWriteStream("/tmp/sample.xlsx");
objectData.pipe(writeStream);
var end = new Promise((resolve, reject) => {
objectData.on('close', resolve(true));
objectData.on('end', resolve(true));
objectData.on('error', reject(false));
});
let completed = await end;
if (completed) {
const extend = '.pdf'
const outputPath = `/tmp/sample${extend}`;
const enterPath = '/tmp/sample.xlsx';
var readingFile = new Promise((resolve, reject) => {
fs.readFile(enterPath, (err, data)=>{
if (err) {
reject(false);
}
resolve(data);
});
});
var fileData = await readingFile;
var converting = new Promise((resolve, reject) => {
libre.convert(fileData, extend, undefined, (err, done) => {
if (err) {
reject(false)
}
fs.writeFileSync(outputPath, done);
resolve(true)
});
})
var converted = await converting;
if (converted) {
var convertedFileStream = fs.createReadStream(outputPath);
const uploadCommand = new PutObjectCommand({ Bucket: bucket, Key: convertedFile, Body: convertedFileStream });
const lastResponse = await client.send(uploadCommand);
const returnResponse = {
result: true,
message: 'success',
bucket: event.queryStringParameters.bucket,
file: event.queryStringParameters.file,
convertedFile: event.queryStringParameters.convertedFile
};
if (event.queryStringParameters['returnEvent'] !== undefined) {
returnResponse['returnEvent'] = event;
}
return returnResponse;
}
}
return completed;
};
However, I sometimes get this error. Sometimes the function succeeds, but sometimes it throws the error below.
{
"errorType": "Error",
"errorMessage": "false",
"stack": [
"Error: false",
" at _homogeneousError (/function/node_modules/aws-lambda-ric/lib/Runtime/CallbackContext.js:56:16)",
" at postError (/function/node_modules/aws-lambda-ric/lib/Runtime/CallbackContext.js:72:34)",
" at done (/function/node_modules/aws-lambda-ric/lib/Runtime/CallbackContext.js:99:13)",
" at fail (/function/node_modules/aws-lambda-ric/lib/Runtime/CallbackContext.js:113:13)",
" at /function/node_modules/aws-lambda-ric/lib/Runtime/CallbackContext.js:148:24",
" at processTicksAndRejections (internal/process/task_queues.js:97:5)"
]
}
I don't know Node.js in great detail, so perhaps the code is not written the right way. Any ideas what I am doing wrong here?
Like @hoangdv, when I logged errors I realized that saving the file to disk was not correct: objectData.on('close', resolve(true)) calls resolve immediately instead of passing it as a listener, so the promise settles before the file has finished writing. I changed the part of the code that saves the file to the snippet below, and then it worked.
const client = new S3Client({ region: "ap-south-1" });
const command = new GetObjectCommand({ Bucket: bucket, Key: file });
const { Body } = await client.send(command);
await new Promise((resolve, reject) => {
Body.pipe(fs.createWriteStream(filePath))
.on('error', err => reject(err))
.on('close', () => resolve())
})
const excelFile = fs.readFileSync(filePath);

AWS S3 Loading many files

I need to upload a lot of files (about 65,000) split across subdirectories.
I tried to iterate and upload every single file like this:
const fs = require("fs");
const path = require("path");
const async = require("async");
const AWS = require("aws-sdk");
const readdir = require("recursive-readdir");
const slash = require("slash");
const { BUCKET, KEY, SECRET } = process.env;
const rootFolder = path.resolve(__dirname, "./");
const uploadFolder = "./test_files/15";
const s3 = new AWS.S3({
signatureVersion: "v4",
accessKeyId: KEY,
secretAccessKey: SECRET,
});
function getFiles(dirPath) {
return fs.existsSync(dirPath) ? readdir(dirPath) : [];
}
async function deploy(upload) {
if (!BUCKET || !KEY || !SECRET) {
throw new Error("you must provide env. variables: [BUCKET, KEY, SECRET]");
}
const filesToUpload = await getFiles(path.resolve(__dirname, upload));
return new Promise((resolve, reject) => {
async.eachOfLimit(
filesToUpload,
10,
async.asyncify(async (file) => {
const Key = file.replace(rootFolder + path.sep, "");
console.log(`uploading: [${slash(Key)}]`);
var options = { partSize: 5 * 1024 * 1024, queueSize: 4 };
return new Promise((res, rej) => {
s3.upload(
{
Key: slash(Key),
Bucket: BUCKET,
Body: fs.readFileSync(file),
},
(err) => {
if (err) {
return rej(new Error(err));
}
res({ result: true });
}
);
});
}),
(err) => {
if (err) {
return reject(new Error(err));
}
resolve({ result: true });
}
);
});
}
deploy(uploadFolder)
.then(() => {
console.log("task complete");
process.exit(0);
})
.catch((err) => {
console.error(err);
process.exit(1);
});
but after a considerable number of uploads I get this:
Error: Error: NetworkingError: connect ETIMEDOUT IP_S3_AWS
I need to upload this set of files from an EC2 instance (because they are the result of image processing). I see this behavior from my PC; I don't know if EC2 has the same problem.
I have considered zipping everything and uploading the archive, but I need to keep the original directory structure.
I am also open to a different way of solving the problem.
Sorry for my bad English.
It would probably be much simpler to use the AWS CLI aws s3 sync command instead of building this yourself.
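The CLI preserves the directory structure and handles parallelism and retries for you; for example (local folder and bucket name are placeholders):
aws s3 sync ./test_files s3://<your-bucket-name>/test_files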

Create a zip file on S3 from files on S3 using Lambda Node

I need to create a zip file that consists of a selection of files (videos and images) located in my S3 bucket.
The problem at the moment using my code below is that I quickly hit the memory limit on Lambda.
async.eachLimit(files, 10, function(file, next) {
var params = {
Bucket: bucket, // bucket name
Key: file.key
};
s3.getObject(params, function(err, data) {
if (err) {
console.log('file', file.key);
console.log('get image files err',err, err.stack); // an error occurred
} else {
console.log('file', file.key);
zip.file(file.key, data.Body);
next();
}
});
},
function(err) {
if (err) {
console.log('err', err);
} else {
console.log('zip', zip);
content = zip.generateNodeStream({
type: 'nodebuffer',
streamFiles:true
});
var params = {
Bucket: bucket, // name of dest bucket
Key: 'zipped/images.zip',
Body: content
};
s3.upload(params, function(err, data) {
if (err) {
console.log('upload zip to s3 err',err, err.stack); // an error occurred
} else {
console.log(data); // successful response
}
});
}
});
Is this possible using Lambda, or should I look at a different approach?
Is it possible to write to a compressed zip file on the fly, therefore eliminating the memory issue somewhat, or do I need to have the files collected before compression?
Any help would be much appreciated.
Okay, I got to do this today and it works. Direct buffer to stream, no disk involved, so memory or disk limitations won't be an issue here:
'use strict';
const AWS = require("aws-sdk");
AWS.config.update( { region: "eu-west-1" } );
const s3 = new AWS.S3( { apiVersion: '2006-03-01'} );
const _archiver = require('archiver');
//This returns us a stream.. consider it as a real pipe sending fluid to S3 bucket.. Don't forget it
const streamTo = (_bucket, _key) => {
var stream = require('stream');
var _pass = new stream.PassThrough();
s3.upload( { Bucket: _bucket, Key: _key, Body: _pass }, (_err, _data) => { /*...Handle Errors Here*/ } );
return _pass;
};
exports.handler = async (_req, _ctx, _cb) => {
var _keys = ['list of your file keys in s3'];
var _list = await Promise.all(_keys.map(_key => new Promise((_resolve, _reject) => {
s3.getObject({Bucket:'bucket-name', Key:_key}).promise()
.then(_data => _resolve( { data: _data.Body, name: `${_key.split('/').pop()}` } ));
}
))).catch(_err => { throw new Error(_err) } );
await new Promise((_resolve, _reject) => {
var _myStream = streamTo('bucket-name', 'fileName.zip'); //Now we instantiate that pipe...
var _archive = _archiver('zip');
_archive.on('error', err => { throw new Error(err); } );
//Your promise gets resolved when the fluid stops running... so that's when you get to close and resolve
_myStream.on('close', _resolve);
_myStream.on('end', _resolve);
_myStream.on('error', _reject);
_archive.pipe(_myStream); //Pass that pipe to _archive so it can push the fluid straight down to S3 bucket
_list.forEach(_itm => _archive.append(_itm.data, { name: _itm.name } ) ); //And then we start adding files to it
_archive.finalize(); //Tell it, that's all we want to add. Then when it finishes, the promise will resolve in one of those events up there
}).catch(_err => { throw new Error(_err) } );
_cb(null, { } ); //Handle response back to server
};
I formatted the code according to @iocoker.
main entry
// index.js
'use strict';
const S3Zip = require('./s3-zip')
const params = {
files: [
{
fileName: '1.jpg',
key: 'key1.JPG'
},
{
fileName: '2.jpg',
key: 'key2.JPG'
}
],
zippedFileKey: 'zipped-file-key.zip'
}
exports.handler = async event => {
const s3Zip = new S3Zip(params);
await s3Zip.process();
return {
statusCode: 200,
body: JSON.stringify(
{
message: 'Zip file successfully!'
}
)
};
}
Zip file util
// s3-zip.js
'use strict';
const fs = require('fs');
const AWS = require("aws-sdk");
const Archiver = require('archiver');
const Stream = require('stream');
const https = require('https');
const sslAgent = new https.Agent({
keepAlive: true,
rejectUnauthorized: true
});
sslAgent.setMaxListeners(0);
AWS.config.update({
httpOptions: {
agent: sslAgent,
},
region: 'us-east-1'
});
module.exports = class S3Zip {
constructor(params, bucketName = 'default-bucket') {
this.params = params;
this.BucketName = bucketName;
}
async process() {
const { params, BucketName } = this;
const s3 = new AWS.S3({ apiVersion: '2006-03-01', params: { Bucket: BucketName } });
// create readstreams for all the output files and store them
const createReadStream = fs.createReadStream;
const s3FileDwnldStreams = params.files.map(item => {
const stream = s3.getObject({ Key: item.key }).createReadStream();
return {
stream,
fileName: item.fileName
}
});
const streamPassThrough = new Stream.PassThrough();
// Create a zip archive using streamPassThrough style for the linking request in s3bucket
const uploadParams = {
ACL: 'private',
Body: streamPassThrough,
ContentType: 'application/zip',
Key: params.zippedFileKey
};
const s3Upload = s3.upload(uploadParams, (err, data) => {
if (err) {
console.error('upload err', err)
} else {
console.log('upload data', data);
}
});
s3Upload.on('httpUploadProgress', progress => {
// console.log(progress); // { loaded: 4915, total: 192915, part: 1, key: 'foo.jpg' }
});
// create the archiver
const archive = Archiver('zip', {
zlib: { level: 0 }
});
archive.on('error', (error) => {
throw new Error(`${error.name} ${error.code} ${error.message} ${error.path} ${error.stack}`);
});
// connect the archiver to upload streamPassThrough and pipe all the download streams to it
await new Promise((resolve, reject) => {
console.log("Starting upload of the output Files Zip Archive");
streamPassThrough.on('close', resolve);
streamPassThrough.on('end', resolve);
streamPassThrough.on('error', reject);
archive.pipe(streamPassThrough);
s3FileDwnldStreams.forEach((s3FileDwnldStream) => {
archive.append(s3FileDwnldStream.stream, { name: s3FileDwnldStream.fileName })
});
archive.finalize();
}).catch((error) => {
throw new Error(`${error.code} ${error.message} ${error.data}`);
});
// Finally wait for the uploader to finish
await s3Upload.promise();
}
}
The other solutions are great when there are not too many files (fewer than ~60). If they have to handle more files, they just quit silently with no errors, because they open too many streams.
This solution is inspired by https://gist.github.com/amiantos/16bacc9ed742c91151fcf1a41012445e
It is a working solution that works well even with many files (300+) and returns a presigned URL to the zip that contains the files.
Main Lambda:
const AWS = require('aws-sdk');
const S3 = new AWS.S3({
apiVersion: '2006-03-01',
signatureVersion: 'v4',
httpOptions: {
timeout: 300000 // 5min Should Match Lambda function timeout
}
});
const archiver = require('archiver');
import stream from 'stream';
const UPLOAD_BUCKET_NAME = "my-s3-bucket";
const URL_EXPIRE_TIME = 5*60;
export async function getZipSignedUrl(event) {
const prefix = `uploads/id123123`; //replace this with your S3 prefix
let files = ["12314123.png", "56787567.png"] //replace this with your files
if (files.length == 0) {
console.log("No files to zip");
return result(404, "No pictures to download");
}
console.log("Files to zip: ", files);
try {
files = files.map(file => {
return {
fileName: file,
key: prefix + '/' + file,
type: "file"
};
});
const destinationKey = prefix + '/' + 'uploads.zip'
console.log("files: ", files);
console.log("destinationKey: ", destinationKey);
await streamToZipInS3(files, destinationKey);
const presignedUrl = await getSignedUrl(UPLOAD_BUCKET_NAME, destinationKey, URL_EXPIRE_TIME, "uploads.zip");
console.log("presignedUrl: ", presignedUrl);
if (!presignedUrl) {
return result(500, null);
}
return result(200, presignedUrl);
}
catch(error) {
console.error(`Error: ${error}`);
return result(500, null);
}
}
Helper functions:
export function result(code, message) {
return {
statusCode: code,
body: JSON.stringify(
{
message: message
}
)
}
}
export async function streamToZipInS3(files, destinationKey) {
await new Promise(async (resolve, reject) => {
var zipStream = streamTo(UPLOAD_BUCKET_NAME, destinationKey, resolve);
zipStream.on("error", reject);
var archive = archiver("zip");
archive.on("error", err => {
throw new Error(err);
});
archive.pipe(zipStream);
for (const file of files) {
if (file["type"] == "file") {
archive.append(getStream(UPLOAD_BUCKET_NAME, file["key"]), {
name: file["fileName"]
});
}
}
archive.finalize();
})
.catch(err => {
console.log(err);
throw new Error(err);
});
}
function streamTo(bucket, key, resolve) {
var passthrough = new stream.PassThrough();
S3.upload(
{
Bucket: bucket,
Key: key,
Body: passthrough,
ContentType: "application/zip",
ServerSideEncryption: "AES256"
},
(err, data) => {
if (err) {
console.error('Error while uploading zip')
throw new Error(err);
reject(err)
return
}
console.log('Zip uploaded')
resolve()
}
).on("httpUploadProgress", progress => {
console.log(progress)
});
return passthrough;
}
function getStream(bucket, key) {
let streamCreated = false;
const passThroughStream = new stream.PassThrough();
passThroughStream.on("newListener", event => {
if (!streamCreated && event == "data") {
const s3Stream = S3
.getObject({ Bucket: bucket, Key: key })
.createReadStream();
s3Stream
.on("error", err => passThroughStream.emit("error", err))
.pipe(passThroughStream);
streamCreated = true;
}
});
return passThroughStream;
}
export async function getSignedUrl(bucket: string, key: string, expires: number, downloadFilename?: string): Promise<string> {
const exists = await objectExists(bucket, key);
if (!exists) {
console.info(`Object ${bucket}/${key} does not exists`);
return null
}
let params = {
Bucket: bucket,
Key: key,
Expires: expires,
};
if (downloadFilename) {
params['ResponseContentDisposition'] = `inline; filename="${encodeURIComponent(downloadFilename)}"`;
}
try {
const url = S3.getSignedUrl('getObject', params);
return url;
} catch (err) {
console.error(`Unable to get URL for ${bucket}/${key}`, err);
return null;
}
};
Using streams may be tricky, as I'm not sure how you could pipe multiple streams into one object. I've done this several times using standard file objects. It's a multi-step process and it's quite fast. Remember that Lambda runs on Linux, so you have all the usual Linux resources at hand, including the /tmp directory.
Create a sub-directory in /tmp called "transient" or whatever works for you
Use s3.getObject() and write the file objects to /tmp/transient
Use the GLOB package to generate an array[] of paths from /tmp/transient
Loop the array and zip.addLocalFile(array[i]);
zip.writeZip('/tmp/files.zip');
A sketch of this approach is shown below.
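A minimal sketch of that approach, assuming the adm-zip and glob packages and a hypothetical event shape carrying the bucket name and the list of object keys (error handling and key-name collisions are glossed over):
const fs = require('fs');
const path = require('path');
const AWS = require('aws-sdk');
const AdmZip = require('adm-zip');
const glob = require('glob'); // classic glob v7-style sync API

const s3 = new AWS.S3();

exports.handler = async (event) => {
  const bucket = event.bucket; // hypothetical event shape
  const keys = event.keys;     // S3 object keys to include in the zip
  const workDir = '/tmp/transient';
  fs.mkdirSync(workDir, { recursive: true });

  // 1. Download each object into /tmp/transient (flattens paths via basename)
  for (const key of keys) {
    const obj = await s3.getObject({ Bucket: bucket, Key: key }).promise();
    fs.writeFileSync(path.join(workDir, path.basename(key)), obj.Body);
  }

  // 2. Collect the downloaded paths and add them to the archive
  const zip = new AdmZip();
  for (const file of glob.sync(`${workDir}/**/*`, { nodir: true })) {
    zip.addLocalFile(file);
  }

  // 3. Write the archive to /tmp and upload it back to S3
  const zipPath = '/tmp/files.zip';
  zip.writeZip(zipPath);
  await s3.upload({
    Bucket: bucket,
    Key: 'zipped/files.zip',
    Body: fs.createReadStream(zipPath),
  }).promise();

  return { result: true };
};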
I've used a similar approach, but I'm facing the issue that some of the files in the generated ZIP file don't have the correct size (and corresponding data). Is there any limitation on the size of the files this code can manage? In my case I'm zipping large files (a few larger than 1GB) and the overall amount of data may reach 10GB.
I do not get any error/warning message, so it seems it all works fine.
Any idea what may be happening?
