Node.js - testing AWS with Mocha

I'm having trouble writing tests for the following Node.js code, which uses AWS and GraphicsMagick. I have also searched for examples of how to write tests for async's waterfall method, but without any definite results.
// dependencies
var async = require('async');
var AWS = require('aws-sdk');
var gm = require('gm').subClass({ imageMagick: true });
var util = require('util');

// get reference to S3 client
var s3 = new AWS.S3();

exports.AwsHandler = function (event, context) {
    // Read options from the event.
    console.log("Reading options from event:\n", util.inspect(event, { depth: 5 }));
    var srcBucket = event.Records[0].s3.bucket.name;
    var srcKey = event.Records[0].s3.object.key;
    var dstnKey = srcKey;

    // Infer the image type.
    var typeMatch = srcKey.match(/\.([^.]*)$/);
    if (!typeMatch) {
        console.error('unable to infer image type for key ' + srcKey);
        return;
    }
    var imageType = typeMatch[1];
    if (imageType != "jpg" && imageType != "png") {
        console.log('skipping non-image ' + srcKey);
        return;
    }

    // Download the image from S3, transform, and upload to same S3 bucket but different folders.
    async.waterfall([
        function download(next) {
            // Download the image from S3 into a buffer.
            s3.getObject({
                Bucket: srcBucket,
                Key: srcKey
            }, next);
        },
        function transformSave(response, next) {
            var _buffer = null;
            for (var i = 0; i < len; i++) {
                // Transform the image buffer in memory.
                gm(response.Body, srcKey)
                    .resize(_sizesArray[i].width)
                    .toBuffer(imageType, function (err, buffer) {
                        if (err) {
                            next(err);
                        } else {
                            console.log(buffer);
                            _buffer = buffer;
                        }
                    });
                // put newly resized image into respective folder
                s3.putObject({
                    Bucket: srcBucket,
                    Key: "dst/" + _sizesArray[i].destinationPath + "/" + dstnKey,
                    Body: _buffer,
                    ContentType: response.ContentType
                }, next);
            }
        },
    ], function (err) {
        if (err) {
            console.error(
                '---->Unable to resize ' + srcBucket + '/' + srcKey +
                ' and upload to ' + srcBucket + '/dst' +
                ' due to an error: ' + err
            );
        } else {
            console.log(
                '---->Successfully resized ' + srcBucket +
                ' and uploaded to ' + srcBucket + "/dst"
            );
        }
        context.done();
    });
};
My tests for this module so far:
require('blanket')({
    pattern: function (filename) {
        return !/node_modules/.test(filename);
    }
});
// in terminal, type the following command to get code coverage: mocha -R html-cov > coverage.html

var chai = require('chai');
var sinonChai = require("sinon-chai");
var expect = chai.expect;
var sinon = require('sinon');
chai.use(sinonChai);
var sync = require("async");
var proxyquire = require('proxyquire');

describe('Image Resizing module', function () {
    var gmSubclassStub = sinon.stub();
    var getObjectStub = sinon.stub();
    var putObjectSpy = sinon.spy();
    var testedModule = proxyquire('../index', {
        'gm': { subClass: sinon.stub().returns(gmSubclassStub) },
        'AWS': {
            "s3": {
                getObject: sinon.stub().returns(getObjectStub),
                putObject: putObjectSpy
            }
        }
    });

    describe('AwsHandler', function () {
        var event = {
            "Records": [
                {
                    "s3": {
                        "bucket": {
                            "name": "testbucket"
                        },
                        "object": {
                            "key": "test.jpg"
                        }
                    }
                }
            ]
        };

        it("should call gm write with correct files", function () {
            // Arrange
            // Spies are the methods you expect were actually called
            var buffer800Spy = sinon.spy();
            var buffer500Spy = sinon.spy();
            var buffer200Spy = sinon.spy();
            var buffer45Spy = sinon.spy();
            // This is a stub that will return the correct spy for each iteration of the for loop
            var resizeStub = sinon.stub();
            resizeStub.withArgs(800).returns({ toBuffer: buffer800Spy });
            resizeStub.withArgs(500).returns({ toBuffer: buffer500Spy });
            resizeStub.withArgs(200).returns({ toBuffer: buffer200Spy });
            resizeStub.withArgs(45).returns({ toBuffer: buffer45Spy });
            // Stub is used when you just want to simulate a returned value
            var nameStub = sinon.stub().yields({ "name": "testbucket" });
            var keyStub = sinon.stub().yields({ "key": "test.jpg" });
            gmSubclassStub.withArgs(event).returns({ resize: resizeStub });
            getObjectStub.withArgs(event).yields({ name: nameStub }, { key: keyStub });
            // Act - this calls the tested method
            testedModule.AwsHandler(event);
            // Assert
        });
    });
});

It's hard to respond to this kind of question here; it is not very specific, and it's not an open question that can be answered with opinions, thoughts, etc.
Hence, I've created a similar implementation which solves the async.waterfall issue and provides a test that exercises AwsHandler with 100% coverage.
The code is in this gist, because it's more convenient and readable there than here.
I've also written a blog post related to this implementation.

There are a few things that need to be changed:
You want to test the operation of the unit, without testing its implementation. That's why you should ignore async in your tests (as you did);
it is just a way of implementing the method, part of the inner workings of the unit.
What you should test is that under the given conditions the unit produces the expected end result, which in this case is calling s3.putObject.
So you should stub everything that is external (gm and aws), and spy on the s3.putObject method, because that is the expected end result.
In your stubs you used yields, which calls the callback function, but only if it is the first parameter.
If it isn't, as in our case, you need to use callsArgWith(index, ...) with the index of the parameter that is the callback.
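For illustration, here is a minimal sketch of callsArgWith on a hypothetical stub standing in for s3.getObject(params, callback), where the callback sits at argument index 1 (this snippet is not part of the original code):
var sinon = require('sinon');

// The stub calls whatever is passed at argument index 1, with (null, data).
var getObjectStub = sinon.stub();
getObjectStub.callsArgWith(1, null, { Body: "response body" });

getObjectStub({ Bucket: "testbucket", Key: "test.jpg" }, function (err, data) {
    console.log(data.Body); // "response body"
});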
The modules injected through proxyquire have to use exactly the same names as in the code that requires them, so 'AWS' was changed to 'aws-sdk'.
One way to check whether the stubs were injected correctly is to put a watch on the "s3" variable in the debugger and check that it is "function proxy()" and not "function()". You can also print it to the console if you're not using a debugger.
Your module calls next inside the for loop, which causes the waterfall to split into a tree with 36 calls to done(!).
Maybe you should use a different async model such as map/reduce. I fixed it by adding a silly condition, but that's not good code.
As a side note, you can see that the test is becoming awfully complicated.
That can be an indication that the tested code could use some separation of concerns.
For example, moving the gm operations and the s3 operations into two separate modules would help separate things, and also make them easier to test.
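A rough sketch of that split (the file and function names here are hypothetical, not taken from the original code):
// imageOps.js - wraps the gm work so it can be stubbed as one unit
var gm = require('gm').subClass({ imageMagick: true });
exports.resizeToBuffer = function (body, key, width, type, cb) {
    gm(body, key).resize(width).toBuffer(type, cb);
};

// s3Ops.js - wraps the S3 work the same way
var AWS = require('aws-sdk');
var s3 = new AWS.S3();
exports.upload = function (bucket, key, body, contentType, cb) {
    s3.putObject({ Bucket: bucket, Key: key, Body: body, ContentType: contentType }, cb);
};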
Changes in the module itself, to prevent calling next 4*4 times:
function transform(response, next) {
    for (var i = 0; i < len; i++) {
        // Transform the image buffer in memory.
        gm(response.Body, srcKey)
            .resize(_sizesArray[i].width)
            .toBuffer(imageType, function (err, buffer) {
                if (err) {
                    next(err);
                } else {
                    next(null, response.ContentType, buffer, i);
                }
            });
    }
},
function upload(contentType, data, i, next) {
    // Stream the transformed image to a different folder.
    s3.putObject({
        Bucket: srcBucket,
        Key: "dst/" + _sizesArray[i].destinationPath + "/" + dstnKey,
        Body: data,
        ContentType: contentType
    }, function (err) {
        if (i == 3) next(err);
    });
}
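For comparison, a sketch of the map-based alternative mentioned earlier, which avoids the i == 3 workaround by letting async.map report back to the waterfall exactly once (it assumes the same _sizesArray, srcBucket, srcKey, dstnKey and imageType variables):
function transformAndUpload(response, next) {
    async.map(_sizesArray, function (size, done) {
        gm(response.Body, srcKey)
            .resize(size.width)
            .toBuffer(imageType, function (err, buffer) {
                if (err) return done(err);
                s3.putObject({
                    Bucket: srcBucket,
                    Key: "dst/" + size.destinationPath + "/" + dstnKey,
                    Body: buffer,
                    ContentType: response.ContentType
                }, done);
            });
    }, next); // next is called once, after all four uploads have finished
}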
And the test:
describe.only('Image Resizing module', function () {
    var gmSubclassStub = sinon.stub();
    var s3Stub = {};
    var proxyquire = require('proxyquire');
    var testedModule = proxyquire('../index', {
        'gm': { subClass: sinon.stub().returns(gmSubclassStub) },
        'aws-sdk': { "S3": sinon.stub().returns(s3Stub) }
    });

    describe('AwsHandler', function () {
        var event = {};
        // The done callback is used for async testing
        it("should call gm write with correct files", function (done) {
            // Arrange
            var resizeStub = sinon.stub();
            var buffer800Spy = sinon.stub().withArgs("jpg").callsArgWith(1, null, "800 buffer");
            var buffer500Spy = sinon.stub().withArgs("jpg").callsArgWith(1, null, "500 buffer");
            var buffer200Spy = sinon.stub().withArgs("jpg").callsArgWith(1, null, "200 buffer");
            var buffer45Spy = sinon.stub().withArgs("jpg").callsArgWith(1, null, "45 buffer");
            resizeStub.withArgs(800).returns({ toBuffer: buffer800Spy });
            resizeStub.withArgs(500).returns({ toBuffer: buffer500Spy });
            resizeStub.withArgs(200).returns({ toBuffer: buffer200Spy });
            resizeStub.withArgs(45).returns({ toBuffer: buffer45Spy });
            gmSubclassStub.withArgs("response body", "test.jpg").returns({ resize: resizeStub });
            s3Stub.getObject = sinon.stub()
                .withArgs({ name: "testbucket", key: "test.jpg" })
                .callsArgWith(1, null, {
                    Body: "response body",
                    ContentType: "response content type"
                });
            var putObjectMock = sinon.mock();
            s3Stub.putObject = putObjectMock;
            putObjectMock.callsArgWith(1, null, {}); // return behaviour of the mock
            putObjectMock.exactly(4); // sets expectation that it is called 4 times
            // Act - this calls the tested method
            testedModule.AwsHandler(event, {
                done: function () {
                    // Assertions need to be inside callback because it is async
                    assert.deepEqual(putObjectMock.getCall(0).args[0], {
                        Bucket: "testbucket",
                        Key: "dst/large/test.jpg",
                        Body: "800 buffer",
                        ContentType: "response content type"
                    });
                    assert.deepEqual(putObjectMock.getCall(1).args[0], {
                        Bucket: "testbucket",
                        Key: "dst/medium/test.jpg",
                        Body: "500 buffer",
                        ContentType: "response content type"
                    });
                    assert.deepEqual(putObjectMock.getCall(2).args[0], {
                        Bucket: "testbucket",
                        Key: "dst/small/test.jpg",
                        Body: "200 buffer",
                        ContentType: "response content type"
                    });
                    assert.deepEqual(putObjectMock.getCall(3).args[0], {
                        Bucket: "testbucket",
                        Key: "dst/thumbnail/test.jpg",
                        Body: "45 buffer",
                        ContentType: "response content type"
                    });
                    // This ends the async test
                    done();
                }
            });
        });
    });
});

Related

Not going into AWS HttpClient.handleRequest to elasticsearch in lambda, Nodejs

I know this same question was basically asked and answered; however, trying to implement the answer did not get it working. Here is the original question: AWS.HttpClient handleRequest is not working in AWS lambda
I tried putting async/await on multiple different portions of the request, but none of them worked, as mentioned in one of the comments in the linked question.
The situation is that I have a Lambda function that listens for events in the S3 buckets; when an event happens it is supposed to index the documents in the Elasticsearch service. The issue happens when the PUT request is sent to ES.
I have run a test event with an S3 bucket and it WORKS, but for some reason it hangs and never goes into the handleRequest function when an actual event fires on my S3 bucket.
Here is my code:
Index.js
const AWS = require('aws-sdk');
const s3 = new AWS.S3()
const elastic_client = require('elastic.js');
exports.handler = async (event, context) => {
const Bucket = event.Records[0].s3.bucket.name;
const Key = event.Records[0].s3.object.key;
const data = await s3.getObject({ Bucket, Key }).promise();
for (const quote_doc of data.Body) {
elastic_client.indexQuote(quote_doc);
}
}
elastic.js
var AWS = require('aws-sdk');
require('dotenv').config();
var region = process.env.AWS_REGION;
var domain = process.env.AWS_ELASTIC_DOMAIN;
function indexQuote(quote) {
var endpoint = new AWS.Endpoint(domain);
var request = new AWS.HttpRequest(endpoint, region);
var index = 'quotes';
var type = '_doc';
var id = quote.QuoteId;
request.method = 'PUT';
request.path += index + '/' + type + '/' + id;
request.body = JSON.stringify(quote);
request.headers['host'] = domain;
request.headers['Content-Type'] = 'application/json';
request.headers['Content-Length'] = Buffer.byteLength(request.body);
var credentials = new AWS.EnvironmentCredentials('AWS');
credentials.accessKeyId = process.env.AWS_ACCESS_KEY_ID;
credentials.secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY;
var signer = new AWS.Signers.V4(request, 'es');
signer.addAuthorization(credentials, new Date());
var client = new AWS.HttpClient();
client.handleRequest(request, null, function(response) { // Here is where it gets hung up
console.log(response.statusCode + ' ' + response.statusMessage); // Never outputs this
var responseBody = '';
response.on('data', function (chunk) {
responseBody += chunk;
});
response.on('end', function (chunk) {
console.log('Response body: ' + responseBody);
});
}, function(error) {
console.log('Error: ' + error);
});
}
The confusing part for me is that it works fine when I run a test event, and it works fine when I index it locally on my own computer, but it just doesn't go into handleRequest for real events. Any help/direction is appreciated, thank you.
Edit:
package.json
{
"dependencies": {
"aws-sdk": "*",
"aws-xray-sdk": "^3.2.0",
"dotenv": "^8.2.0"
}
}
Try wrapping the handleRequest call inside a Promise. Your indexQuote() function would look almost the same, but at the end it would return a Promise:
function indexQuote(quote) {
...
return new Promise((resolve, reject) => {
client.handleRequest(request, null,
response => {
const { statusCode, statusMessage, headers } = response;
let body = '';
response.on('data', chunk => {
body += chunk;
});
response.on('end', () => {
const data = {
statusCode,
statusMessage,
headers
};
if (body) {
data.body = body;
}
resolve(data);
});
},
err => {
reject(err);
});
});
}
And then you can await and inspect the result:
const result = await indexQuote(quote);
console.log("Index result: " + result);

When piping a movie from S3 the file isn't seekable

We have an application that sometimes serves an MP4 file stored on S3. Since only specific people should be able to see each file, the file is private, and inside our service it is only shown to authorised people.
The movie starts playing correctly (in the browser's built-in video tag); however, if we seek to a point in the movie that hasn't been buffered yet, the player buffers for a bit, then stops playing. Afterwards, clicking Play causes the movie to start from the beginning. If I make the file public and access it directly from S3, seeking to an unbuffered point works correctly.
I created a standalone Node program that reproduces this problem. I tried to make the response headers identical to those that S3 sends, but the problem remains.
const http = require("http");
const AWS = require("aws-sdk");
const proxy = require("proxy-agent");
Object.assign(process.env, {
AWS_ACCESS_KEY_ID: "REDACTED",
AWS_SECRET_ACCESS_KEY: "REDACTED",
AWS_EC2_REGION: "us-west-2"
});
const s3 = new AWS.S3({
s3ForcePathStyle: 'true',
signatureVersion: 'v4',
httpOptions: { timeout: 300000 },
endpoint: 'https://s3.us-west-2.amazonaws.com',
region: 'us-west-2'
});
const objectParams = {
Bucket: 'REDACTED',
Key: 'some-movie.mp4'
};
let request = 0;
function serve(req, res) {
console.log("Handling request", ++request, req.url);
s3.headObject(objectParams, (err, data) => {
if (err)
throw err;
const { ContentType: type, ContentLength: length} = data;
console.log("Got", data);
if (data.ETag)
res.setHeader("ETag", data.ETag);
const range = req.headers.range;
if (range) {
console.log("Serving range", range);
const parts = range.replace("bytes=", "").split("-");
const start = parseInt(parts[0], 10);
const end = parts[1]? parseInt(parts[1], 10): length -1;
let headers = {
"Content-Range": `bytes ${start}-${end}/${length}`,
"Accept-Ranges": "bytes",
"Content-Type": type,
"Content-Length": end - start + 1,
"Last-Modified": data.LastModified,
};
if (req.headers["if-range"]) {
console.log("Setting if-range to", req.headers["if-range"]);
headers["If-Range"] = req.headers["if-range"];
}
res.writeHead(206, headers);
}
else {
console.log("Whole file");
res.setHeader("Accept-Ranges", "bytes");
res.setHeader("Content-Type", type);
res.setHeader("Content-Length", length);
res.setHeader("Last-Modified", data.LastModified);
}
const stream = s3.getObject(objectParams).createReadStream();
stream.on("error", err => console.error("stream error:", err));
stream.pipe(res).on("finish", data => {
console.log("Finished streaming");
});
});
}
http.createServer(serve).listen(1234);
What am I missing?
Here is code with the seek bar working just fine. You can test it by integrating the code below and opening the API URL in the browser.
import mime from 'mime-types';
const key = 'S3_BUCKET KEY';
const params = { Key: key, Bucket: AWS_BUCKET };
//s3 here refers to AWS.S3 object.
s3.headObject(params, function (err, data) {
if (err) {
console.error(err);
return next(err);
}
if (req.headers.range) {
const range = req.headers.range;
const bytes = range.replace(/bytes=/, '').split('-');
const start = parseInt(bytes[0], 10);
const total = data.ContentLength;
const end = bytes[1] ? parseInt(bytes[1], 10) : total - 1;
const chunkSize = end - start + 1;
res.set('Content-Range', 'bytes ' + start + '-' + end + '/' + total);
res.set('Accept-Ranges', 'bytes');
res.set('Content-Length', chunkSize.toString());
params['Range'] = range;
console.log('video buffering - range, total, start, end ,params', range, total, start, end, params);
} else {
res.set('Content-Length', data.ContentLength.toString());
console.log('video buffering - ,params', params);
}
res.status(206);
res.set('Content-Type', mime.lookup(key));
res.set('Last-Modified', data.LastModified.toString());
res.set('ETag', data.ETag);
const stream = s3.getObject(params).createReadStream();
stream.on('error', function error(err) {
return next(err);
});
stream.on('end', () => {
console.log('Served by Amazon S3: ' + key);
});
stream.pipe(res);
});
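The decisive difference from the standalone server in the question appears to be params['Range'] = range: the browser's Range header is forwarded to S3, so getObject streams only the requested bytes instead of always starting at byte 0. A minimal sketch of that change applied to the question's serve() function (same objectParams and s3 client as above):
// Inside the `if (range)` branch of serve():
const rangedParams = Object.assign({}, objectParams, { Range: req.headers.range });
const stream = s3.getObject(rangedParams).createReadStream();
stream.on("error", err => console.error("stream error:", err));
stream.pipe(res);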

Error: Stream yields empty buffer when reducing picture quality with gm on AWS Lambda

I've read all the other topics and tried a few answers, but I can't seem to figure out why I get this error.
My code gets the uploaded picture in an S3 bucket, reduces its quality and puts it in a second bucket. Plain and simple.
With small/medium images everything works just fine, but if I upload something over 2 MB (more or less) I get the error in the title.
My Lambda function has 128 MB of memory and a 3-minute timeout; here is the code:
const gm = require('gm').subClass({imageMagick: true});
const AWS = require('aws-sdk');
const async = require('async');
const S3 = new AWS.S3();
exports.handler = (event, context, callback) => {
var srcBucket = event.Records[0].s3.bucket.name;
var srcKey = event.Records[0].s3.object.key;
var dstBucket = "destinationbucket";
var dstKey = "resized-" + srcKey;
// Infer the image type.
var typeMatch = srcKey.match(/\.([^.]*)$/);
if (!typeMatch) {
callback("Could not determine the image type.");
return;
}
var imageType = typeMatch[1].toLowerCase();
if (imageType != "jpg" && imageType != "png" && imageType != "jpeg") {
callback('Unsupported image type: ${imageType}');
return;
}
async.waterfall([
function download(next) {
S3.getObject({Bucket : srcBucket, Key : srcKey}, next);
},
function transform(response, next) {
var img_quality_reduced = gm(response.Body);
img_quality_reduced.quality(75).toBuffer(function( error, buffer )
{
if( error ) { console.log( error ); return; }
next(null, response.ContentType, buffer);
}
);
},
function upload(contentType, data, next) {
S3.putObject({Bucket: dstBucket, Key: dstKey, Body: data}, next);
},
function ending(next) {
console.log('got to ending');
context.done();
}
], function (err) {
console.log(err);
context.done();
});
};
Any idea why this is happening? I have uploaded async, gm and graphicsmagick to Lambda (as a zip file), all installed through npm.

Promises or Async with Node js

I have this large amount of code which gets an image from an S3 bucket, saves it to a temporary file on Lambda, resizes it to 4 different sizes, saves the results into different folders according to size and then puts the images back into the S3 bucket, also into different folders.
However, when running on Lambda, I have to call context.done() at the end of the whole process, otherwise the context will remain alive until Lambda times out.
So I need to call context.done() when upload returns for the last time.
Looking into the two options, async and promises, which would likely need less refactoring of my code to work?
// dependencies
var AWS = require('aws-sdk');
var gm = require('gm').subClass({ imageMagick: true });
var fs = require("fs");
// get reference to S3 client
var s3 = new AWS.S3();
var _800px = {
width: 800,
destinationPath: "large"
};
var _500px = {
width: 500,
destinationPath: "medium"
};
var _200px = {
width: 200,
destinationPath: "small"
};
var _45px = {
width: 45,
destinationPath: "thumbnail"
};
var _sizesArray = [_800px, _500px, _200px, _45px];
var len = _sizesArray.length;
// module to be exported when in production
exports.AwsHandler = function(event, context) {
// Read options from the event.
var srcBucket = event.Records[0].s3.bucket.name;
var srcKey = event.Records[0].s3.object.key;
var dstnFolder = "/tmp";
// function to determine paths
function _filePath (directory, i) {
if ( directory === false ) {
return "dst/" + _sizesArray[i].destinationPath + "/" + srcKey;
} else if ( directory === true ) {
return dstnFolder + "/" + _sizesArray[i].destinationPath + "/" + srcKey;
}
};
for ( var i = 0; i<len; i++) {
fs.mkdir("/tmp" + "/" + _sizesArray[i].destinationPath, function (err) {
if (err) {
console.log(err);
}
});
};
// Infer the image type.
var typeMatch = srcKey.match(/\.([^.]*)$/);
if (!typeMatch) {
console.error('unable to infer image type for key ' + srcKey);
return;
};
var imageType = typeMatch[1];
if (imageType != "jpg" && imageType != "png") {
console.log('skipping non-image ' + srcKey);
return;
};
function download () {
s3.getObject({
Bucket: srcBucket,
Key: srcKey
},
function (err, response) {
if (err) {
console.error(err);
}
fs.writeFile("/tmp" + "/" + srcKey, response.Body, function (err) {
transform();
})
}
);
};
function transform () {
var _Key,
_Size;
for ( var i = 0; i<len; i++ ) {
// define path for image write
_Key = _filePath (true, i);
// define sizes to resize to
_Size = _sizesArray[i].width;
// resize images
gm("/tmp/" + srcKey)
.resize(_Size)
.write(_Key, function (err) {
if (err) {
return handle(err);
}
if (!err) {
// get the result of write
var readPath = this.outname;
var iniPath = this.outname.slice(4);
var writePath = "dst".concat(iniPath);
read(err, readPath, writePath, upload);
}
});
};
};
function read (err, readPath, writePath, callback) {
// read file from temp directory
fs.readFile(readPath, function (err, data) {
if (err) {
console.log("NO READY FILE FOR YOU!!!");
console.error(err);
}
callback(data, writePath);
});
};
function upload (data, path) {
// upload images to s3 bucket
s3.putObject({
Bucket: srcBucket,
Key: path,
Body: data,
ContentType: data.type
},
function (err) {
if (err) {
console.error(err);
}
console.log("Uploaded with success!");
});
}
download();
Take a look at how they use Q in this example.
Your code will end up looking very similar to:
download()
.then(transform)
.then(read)
.then(upload)
.catch(function (error) {
// Handle any error from all above steps
console.error(error);
})
.done(function() {
console.log('Finished processing image');
context.done();
});
You could also take a look at async and use it as they show in this other example.
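For instance, a rough sketch of how the callback-style download() above could be adapted to return a Q promise so it fits that chain (the other steps would be wrapped the same way; srcBucket and srcKey are the variables already defined in your handler):
var Q = require('q');

function download() {
    var deferred = Q.defer();
    s3.getObject({ Bucket: srcBucket, Key: srcKey }, function (err, response) {
        if (err) return deferred.reject(err);
        fs.writeFile("/tmp/" + srcKey, response.Body, function (writeErr) {
            if (writeErr) return deferred.reject(writeErr);
            deferred.resolve();
        });
    });
    return deferred.promise;
}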

Node.js & Amazon S3: How to iterate through all files in a bucket?

Is there any Amazon S3 client library for Node.js that allows listing all files in an S3 bucket?
The best-known ones, aws2js and knox, don't seem to have this functionality.
Using the official aws-sdk:
var allKeys = [];
function listAllKeys(marker, cb)
{
s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data){
allKeys.push(data.Contents);
if(data.IsTruncated)
listAllKeys(data.NextMarker, cb);
else
cb();
});
}
see s3.listObjects
Edit 2017:
Same basic idea, but listObjectsV2( ... ) is now recommended and uses a ContinuationToken (see s3.listObjectsV2):
var allKeys = [];
function listAllKeys(token, cb)
{
var opts = { Bucket: s3bucket };
if(token) opts.ContinuationToken = token;
s3.listObjectsV2(opts, function(err, data){
allKeys = allKeys.concat(data.Contents);
if(data.IsTruncated)
listAllKeys(data.NextContinuationToken, cb);
else
cb();
});
}
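Example usage of the listObjectsV2 version (s3 and s3bucket are assumed to be set up as in the original answer):
listAllKeys(null, function () {
    console.log('Total objects found:', allKeys.length);
});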
Using AWS SDK v3 and TypeScript
import {
paginateListObjectsV2,
S3Client,
S3ClientConfig,
} from '#aws-sdk/client-s3';
/* // For Deno
import {
paginateListObjectsV2,
S3Client,
S3ClientConfig,
} from "https://deno.land/x/aws_sdk#v3.32.0-1/client-s3/mod.ts"; */
const s3Config: S3ClientConfig = {
credentials: {
accessKeyId: 'accessKeyId',
secretAccessKey: 'secretAccessKey',
},
region: 'us-east-1',
};
const getAllS3Files = async (client: S3Client, s3Opts) => {
const totalFiles = [];
for await (const data of paginateListObjectsV2({ client }, s3Opts)) {
totalFiles.push(...(data.Contents ?? []));
}
return totalFiles;
};
const main = async () => {
const client = new S3Client(s3Config);
const s3Opts = { Bucket: 'bucket-xyz' };
console.log(await getAllS3Files(client, s3Opts));
};
main();
For AWS-SDK v2 Using Async Generator
Import S3
const { S3 } = require('aws-sdk');
const s3 = new S3();
Create a generator function to retrieve the full list of files:
async function* listAllKeys(opts) {
opts = { ...opts };
do {
const data = await s3.listObjectsV2(opts).promise();
opts.ContinuationToken = data.NextContinuationToken;
yield data;
} while (opts.ContinuationToken);
}
Prepare the AWS parameters, based on the API docs:
const opts = {
Bucket: 'bucket-xyz' /* required */,
// ContinuationToken: 'STRING_VALUE',
// Delimiter: 'STRING_VALUE',
// EncodingType: url,
// FetchOwner: true || false,
// MaxKeys: 'NUMBER_VALUE',
// Prefix: 'STRING_VALUE',
// RequestPayer: requester,
// StartAfter: 'STRING_VALUE'
};
Use generator
async function main() {
// using for of await loop
for await (const data of listAllKeys(opts)) {
console.log(data.Contents);
}
}
main();
That's it.
Or Lazy Load
async function main() {
const keys = listAllKeys(opts);
console.log(await keys.next());
// {value: {…}, done: false}
console.log(await keys.next());
// {value: {…}, done: false}
console.log(await keys.next());
// {value: undefined, done: true}
}
main();
Or Use generator to make Observable function
const lister = (opts) => (o$) => {
let needMore = true;
const process = async () => {
for await (const data of listAllKeys(opts)) {
o$.next(data);
if (!needMore) break;
}
o$.complete();
};
process();
return () => (needMore = false);
};
Use this observable function with RxJS:
// Using Rxjs
const { Observable } = require('rxjs');
const { flatMap } = require('rxjs/operators');
function listAll() {
return Observable.create(lister(opts))
.pipe(flatMap((v) => v.Contents))
.subscribe(console.log);
}
listAll();
Or use this observable function with a Node.js EventEmitter:
const EventEmitter = require('events');
const _eve = new EventEmitter();
async function onData(data) {
// will be called for each set of data
console.log(data);
}
async function onError(error) {
// will be called if any error
console.log(error);
}
async function onComplete() {
// will be called when data completely received
}
_eve.on('next', onData);
_eve.on('error', onError);
_eve.on('complete', onComplete);
const stop = lister(opts)({
next: (v) => _eve.emit('next', v),
error: (e) => _eve.emit('error', e),
complete: (v) => _eve.emit('complete', v),
});
Here's Node code I wrote to assemble the S3 objects from truncated lists.
var params = {
Bucket: <yourbucket>,
Prefix: <yourprefix>,
};
var s3DataContents = []; // Single array of all combined S3 data.Contents
function s3Print() {
if (program.al) {
// --al: Print all objects
console.log(JSON.stringify(s3DataContents, null, " "));
} else {
// --b: Print key only, otherwise also print index
var i;
for (i = 0; i < s3DataContents.length; i++) {
var head = !program.b ? (i+1) + ': ' : '';
console.log(head + s3DataContents[i].Key);
}
}
}
function s3ListObjects(params, cb) {
s3.listObjects(params, function(err, data) {
if (err) {
console.log("listS3Objects Error:", err);
} else {
var contents = data.Contents;
s3DataContents = s3DataContents.concat(contents);
if (data.IsTruncated) {
// Set Marker to last returned key
params.Marker = contents[contents.length-1].Key;
s3ListObjects(params, cb);
} else {
cb();
}
}
});
}
s3ListObjects(params, s3Print);
Pay attention to listObjects' documentation of NextMarker, which is NOT always present in the returned data object, so I don't use it at all in the above code:

NextMarker — (String) When the response is truncated (the IsTruncated element value in the response is true), you can use the key name in this field as the marker in the subsequent request to get the next set of objects. Amazon S3 lists objects in alphabetical order. Note: this element is returned only if you have the delimiter request parameter specified. If the response does not include the NextMarker and it is truncated, you can use the value of the last Key in the response as the marker in the subsequent request to get the next set of object keys.
The entire program has now been pushed to https://github.com/kenklin/s3list.
In fact, aws2js supports listing the objects in a bucket at a low level via the s3.get() method call. To do it, one has to pass the prefix parameter, which is documented on the Amazon S3 REST API page:
var s3 = require('aws2js').load('s3', awsAccessKeyId, awsSecretAccessKey);
s3.setBucket(bucketName);
var folder = encodeURI('some/path/to/S3/folder');
var url = '?prefix=' + folder;
s3.get(url, 'xml', function (error, data) {
console.log(error);
console.log(data);
});
The data variable in the above snippet contains a list of all objects in the bucketName bucket.
I published knox-copy when I couldn't find a good existing solution. It wraps all the pagination details of the REST API into a familiar Node stream:
var knoxCopy = require('knox-copy');
var client = knoxCopy.createClient({
key: '<api-key-here>',
secret: '<secret-here>',
bucket: 'mrbucket'
});
client.streamKeys({
// omit the prefix to list the whole bucket
prefix: 'buckets/of/fun'
}).on('data', function(key) {
console.log(key);
});
If you're listing fewer than 1000 files a single page will work:
client.listPageOfKeys({
prefix: 'smaller/bucket/o/fun'
}, function(err, page) {
console.log(page.Contents); // <- Here's your list of files
});
Meekohi provided a very good answer, but the (new) documentation states that NextMarker can be undefined. When this is the case, you should use the last key as the marker.
So his code sample can be changed into:
var allKeys = [];
function listAllKeys(marker, cb) {
s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data){
allKeys.push(data.Contents);
if(data.IsTruncated)
listAllKeys(data.NextMarker || data.Contents[data.Contents.length-1].Key, cb);
else
cb();
});
}
I couldn't comment on the original answer since I don't have the required reputation. Apologies for the bad markup, by the way.
I am using this version with async/await.
This function will return the content as an array.
I'm also using NextContinuationToken instead of Marker.
async function getFilesRecursivelySub(param) {
// Call the function to get list of items from S3.
let result = await s3.listObjectsV2(param).promise();
if(!result.IsTruncated) {
// Recursive terminating condition.
return result.Contents;
} else {
// Recurse it if results are truncated.
param.ContinuationToken = result.NextContinuationToken;
return result.Contents.concat(await getFilesRecursivelySub(param));
}
}
async function getFilesRecursively() {
let param = {
Bucket: 'YOUR_BUCKET_NAME'
// Can add more parameters here.
};
return await getFilesRecursivelySub(param);
}
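Example usage (assuming an initialised s3 client is in scope):
getFilesRecursively()
    .then((contents) => console.log('Found', contents.length, 'objects'))
    .catch(console.error);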
This is an old question and I guess the AWS JS SDK has changed a lot since it was asked. Here's yet another way to do it these days:
s3.listObjects({Bucket:'mybucket', Prefix:'some-pfx'}).
on('success', function handlePage(r) {
//... handle page of contents r.data.Contents
if(r.hasNextPage()) {
// There's another page; handle it
r.nextPage().on('success', handlePage).send();
} else {
// Finished!
}
}).
on('error', function(r) {
// Error!
}).
send();
If you want to get the list of keys only within a specific folder inside an S3 bucket, then this will be useful.
Basically, the listObjects function starts searching from the Marker we set, and it searches up to the maxKeys: 1000 limit, so it walks folder by folder and returns the first 1000 keys it finds across the different folders in the bucket.
Consider that I have many folders inside my bucket with a prefix like prod/some date/, e.g. prod/2017/05/12/, prod/2017/05/13/, etc.
I want to fetch the list of objects (file names) only within the prod/2017/05/12/ folder, so I specify prod/2017/05/12/ as my start and prod/2017/05/13/ [your next folder name] as my end, and in the code I break the loop when I encounter the end.
Each Key in data.Contents will look like this:
{ Key: 'prod/2017/05/13/4bf2c675-a417-4c1f-a0b4-22fc45f99207.jpg',
LastModified: 2017-05-13T00:59:02.000Z,
ETag: '"630b2sdfsdfs49ef392bcc16c833004f94ae850"',
Size: 134236366,
StorageClass: 'STANDARD',
Owner: { }
}
Code:
var list = [];
function listAllKeys(s3bucket, start, end) {
s3.listObjects({
Bucket: s3bucket,
Marker: start,
MaxKeys: 1000,
}, function(err, data) {
if (data.Contents) {
for (var i = 0; i < data.Contents.length; i++) {
var key = data.Contents[i].Key; //See above code for the structure of data.Contents
if (key.substring(0, 19) != end) {
list.push(key);
} else {
break; // break the loop if end arrived
}
}
console.log(list);
console.log('Total - ', list.length);
}
});
}
listAllKeys('BucketName', 'prod/2017/05/12/', 'prod/2017/05/13/');
Output:
[ 'prod/2017/05/12/05/4bf2c675-a417-4c1f-a0b4-22fc45f99207.jpg',
'prod/2017/05/12/05/a36528b9-e071-4b83-a7e6-9b32d6bce6d8.jpg',
'prod/2017/05/12/05/bc4d6d4b-4455-48b3-a548-7a714c489060.jpg',
'prod/2017/05/12/05/f4b8d599-80d0-46fa-a996-e73b8fd0cd6d.jpg',
... 689 more items ]
Total - 692
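For reference, the same folder-scoped listing can usually be expressed with the Prefix parameter instead of comparing key substrings against an end marker; a minimal sketch (the bucket name is a placeholder):
s3.listObjects({
    Bucket: 'BucketName',
    Prefix: 'prod/2017/05/12/', // only keys under this "folder" are returned
    MaxKeys: 1000
}, function (err, data) {
    if (err) return console.error(err);
    console.log(data.Contents.map(function (obj) { return obj.Key; }));
});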
I ended up building a wrapper function around listObjectsV2. It works the same way and takes the same parameters, but runs recursively until IsTruncated is false and returns all the keys found as an array in the second parameter of the callback function.
const AWS = require('aws-sdk')
const s3 = new AWS.S3()
function listAllKeys(params, cb)
{
var keys = []
if(params.data){
keys = keys.concat(params.data)
}
delete params['data']
s3.listObjectsV2(params, function(err, data){
if(err){
cb(err)
} else if (data.IsTruncated) {
params['ContinuationToken'] = data.NextContinuationToken
params['data'] = data.Contents
listAllKeys(params, cb)
} else {
keys = keys.concat(data.Contents)
cb(null,keys)
}
})
}
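Example usage (the bucket name is a placeholder):
listAllKeys({ Bucket: 'my-bucket' }, function (err, keys) {
    if (err) return console.error(err);
    console.log('Found', keys.length, 'objects');
});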
Here's what I came up with based on the other answers.
You can await listAllKeys() without having to use callbacks.
const listAllKeys = () =>
new Promise((resolve, reject) => {
let allKeys = [];
const list = marker => {
s3.listObjects({ Marker: marker }, (err, data) => {
if (err) {
reject(err);
} else if (data.IsTruncated) {
allKeys.push(data.Contents);
list(data.NextMarker || data.Contents[data.Contents.length - 1].Key);
} else {
allKeys.push(data.Contents);
resolve(allKeys);
}
});
};
list();
});
This assumes you've initialized the s3 variable like so
const s3 = new aws.S3({
apiVersion: API_VERSION,
params: { Bucket: BUCKET_NAME }
});
I made it as simple as possible. You can iterate over the uploaded objects using a for loop; it is quite simple, neat and easy to understand.
Packages required: fs, express-fileupload.
server.js:
router.post('/upload', function(req, res){
if(req.files){
var file = req.files.filename;
test(file);
res.render('test');
}
} );
The test() function:
function test(file){
// upload all
if(file.length){
for(var i =0; i < file.length; i++){
fileUP(file[i]);
}
}else{
fileUP(file);
}
// call fileUP() to upload 1 at once
function fileUP(fyl){
var filename = fyl.name;
var tempPath = './temp'+filename;
fyl.mv(tempPath, function(err){
fs.readFile(tempPath, function(err, data){
var params = {
Bucket: 'BUCKET_NAME',
Body: data,
Key: Date.now()+filename
};
s3.upload(params, function (err, data) {
if (data) {
fs.unlink(tempPath, (err) => {
if (err) {
console.error(err)
return
}
else{
console.log("file removed from temp loaction");
}
});
console.log("Uploaded in:", data.Location);
}
});
});
});
}
}
This should work,
// `params` (containing at least the Bucket) is assumed to be defined elsewhere.
var listAllKeys = async function (token) {
    if (token) params.ContinuationToken = token;
    return new Promise((resolve, reject) => {
        s3.listObjectsV2(params, function (err, data) {
            if (err) {
                reject(err);
            }
            resolve(data);
        });
    });
};

var collect_all_files = async function () {
    var allkeys = [];
    var conti = true;
    var token = null;
    while (conti) {
        var data = await listAllKeys(token);
        allkeys = allkeys.concat(data.Contents);
        token = data.NextContinuationToken;
        conti = data.IsTruncated;
    }
    return allkeys;
};
Using the new s3.listObjectsV2 API, the recursive solution is:
S3Dataset.prototype.listFiles = function(params,callback) {
var self=this;
var options = {
};
for (var attrname in params) { options[attrname] = params[attrname]; }
var results=[];
var s3=self.s3Store.GetInstance();
function listAllKeys(token, callback) {
var opt={ Bucket: self._options.s3.Bucket, Prefix: self._options.s3.Key, MaxKeys: 1000 };
if(token) opt.ContinuationToken = token;
s3.listObjectsV2(opt, (error, data) => {
if (error) {
if(self.logger) this.logger.error("listFiles error:", error);
return callback(error);
} else {
for (var index in data.Contents) {
var bucket = data.Contents[index];
if(self.logger) self.logger.debug("listFiles Key: %s LastModified: %s Size: %s", bucket.Key, bucket.LastModified, bucket.Size);
if(bucket.Size>0) {
var Bucket=self._options.s3.Bucket;
var Key=bucket.Key;
var components=bucket.Key.split('/');
var name=components[components.length-1];
results.push({
name: name,
path: bucket.Key,
mtime: bucket.LastModified,
size: bucket.Size,
sizehr: formatSizeUnits(bucket.Size)
});
}
}
if( data.IsTruncated ) { // truncated page
return listAllKeys(data.NextContinuationToken, callback);
} else {
return callback(null,results);
}
}
});
}
return listAllKeys.apply(this,['',callback]);
};
where
function formatSizeUnits(bytes){
if (bytes>=1099511627776) {bytes=(bytes/1099511627776).toFixed(4)+' PB';}
else if (bytes>=1073741824) {bytes=(bytes/1073741824).toFixed(4)+' GB';}
else if (bytes>=1048576) {bytes=(bytes/1048576).toFixed(4)+' MB';}
else if (bytes>=1024) {bytes=(bytes/1024).toFixed(4)+' KB';}
else if (bytes>1) {bytes=bytes+' bytes';}
else if (bytes==1) {bytes=bytes+' byte';}
else {bytes='0 byte';}
return bytes;
}//formatSizeUnits
Although #Meekohi's answer does technically work, I've had enough heartache with the S3 portion of the AWS SDK for Node.js. After all the previous struggles with modules such as aws-sdk, s3 and knox, I decided to install s3cmd via the OS package manager and shell out to it using child_process.
Something like:
var s3cmd = new cmd_exec('s3cmd', ['ls', filepath, 's3://'+inputBucket],
function (me, data) {me.stdout += data.toString();},
function (me) {me.exit = 1;}
);
response.send(s3cmd.stdout);
(Using the cmd_exec implementation from this question)
This approach just works really well - including for other problematic things like file upload.
The cleanest way for me was to execute s3cmd from my Node script like this (the example here deletes files recursively):
var exec = require('child_process').exec;
var child;
var bucket = "myBucket";
var prefix = "myPrefix"; // this parameter is optional
var command = "s3cmd del -r s3://" + bucket + "/" + prefix;
child = exec(command, {maxBuffer: 5000 * 1024}, function (error, stdout, stderr) { // the maxBuffer is here to avoid the maxBuffer node process error
console.log('stdout: ' + stdout);
if (error !== null) {
console.log('exec error: ' + error);
}
});
