Error scanning DynamoDB table - node.js

I have the following code that scans the DynamoDB table and returns a count of the number of people who have the key value test = true. For some reason, this code is not scanning the whole table. Does anyone know why?
var aws = require('aws-sdk');
var config = require('./config.js');
aws.config.update({accessKeyId: config.key, secretAccessKey: config.secret});
aws.config.update({region: 'us-east-1'});
/**
 * Issues one scan against the table named by `config.db` and logs how many
 * of the returned items have a truthy `setRemoveBrandingEmailOptin`.
 * For each such item its `test` attribute is logged first; the total is
 * logged last. A scan error is logged and nothing else happens.
 */
function getItems() {
    var client = new aws.DynamoDB.DocumentClient();
    var request = { TableName: config.db };

    client.scan(request, function onPage(err, data) {
        if (err) {
            console.log(err);
            return;
        }

        var matched = 0;
        for (var key in data.Items) {
            var item = data.Items[key];
            // Skip items that have not opted in.
            if (!item.setRemoveBrandingEmailOptin) {
                continue;
            }
            console.log(item.test);
            matched = matched + 1;
        }
        console.log(matched);
    });
}
getItems();

Per the documentation:
If the total number of scanned items exceeds the maximum data set size
limit of 1 MB, the scan stops and results are returned to the user as
a LastEvaluatedKey value to continue the scan in a subsequent
operation. The results also include the number of items exceeding the
limit.
You shouldn't need to dump the entire table into your application for a simple count anyway. You're doing this in the most inefficient way possible. Try something like this:
// Counts the items whose setRemoveBrandingEmailOptin attribute is true.
// Select: 'COUNT' makes DynamoDB return only a number instead of the items,
// but each response still covers at most 1 MB of scanned data, so the
// per-page Count values are summed until no LastEvaluatedKey comes back.
(function () {
    var scanParams = {
        TableName : config.db,
        Select: 'COUNT',
        FilterExpression: "#emailOptInField = :emailOptInValue",
        ExpressionAttributeNames: {
            "#emailOptInField": "setRemoveBrandingEmailOptin",
        },
        ExpressionAttributeValues: {
            ":emailOptInValue": true
        }
    };
    var count = 0;

    function onPage(err, data) {
        if (err) { console.log(err); return; }
        count += data.Count;
        if (data.LastEvaluatedKey) {
            // More of the table is left: resume right after the last key read.
            scanParams.ExclusiveStartKey = data.LastEvaluatedKey;
            db.scan(scanParams, onPage);
            return;
        }
        console.log(count);
    }

    db.scan(scanParams, onPage);
})();

Related

AWS Lambda NodeJS: Scan with optional filter attributes

I built an AWS Lambda that scans and filters my DynamoDB table and returns the result to my AWS API. There are three possible search parameters (itemname, author and type), and I don't know in advance which of them will be used in a query. At first, I implemented a version in which all search parameters were hard-coded. As a result, I got errors if not all search parameters were defined. In the end I reworked the code to build individual scan params depending on the search parameters that were actually supplied.
The code works fine, but I think that there are better implementations for this problem, maybe you can give me some improvement advice. Otherwise this will help people which face the same issues with their optional search parameters.
var AWS = require('aws-sdk');
var docClient = new AWS.DynamoDB.DocumentClient();
/**
 * Lambda entry point: scans the "blackboard-items" table, optionally filtered
 * by the `author`, `itemname` and `type` query string parameters, and answers
 * with a proxy-style response whose body is the JSON array of ALL matching
 * items (every page of the scan, not just the last one).
 *
 * @param {Object} event    Invocation event; `event.queryStringParameters`
 *                          may be missing or null when no filter is given.
 * @param {Object} context  Lambda context (unused).
 * @param {Function} callback  Called exactly once: `callback(err)` when a
 *                          scan fails, `callback(null, response)` otherwise.
 */
exports.handler = function(event, context, callback)
{
    // A missing or null queryStringParameters simply means "no filters".
    var queryParam = event && event["queryStringParameters"];
    if (!queryParam)
    {
        console.log("NO queryStringParameters FOUND");
        queryParam = {};
    }
    console.log("QUERY PARAMETERS: " + JSON.stringify(queryParam));

    var conditions = [];
    var expressionAttributeNames = {};
    var expressionAttributeValues = {};

    // Registers one filter condition if the caller supplied that parameter.
    function addFilter(attribute, condition)
    {
        if (!Object.prototype.hasOwnProperty.call(queryParam, attribute))
        {
            return;
        }
        console.log("FOUND " + attribute.toUpperCase());
        conditions.push(condition);
        expressionAttributeNames['#' + attribute] = attribute;
        expressionAttributeValues[':' + attribute] = queryParam[attribute];
    }

    addFilter("author", "contains(#author, :author)");
    addFilter("itemname", "contains(#itemname, :itemname)");
    addFilter("type", "#type = :type");

    var params =
    {
        TableName: "blackboard-items",
    };
    // Only attach the expression fields when there is something to filter on;
    // an empty FilterExpression is not a usable expression.
    if (conditions.length > 0)
    {
        params.FilterExpression = conditions.join(" AND ");
        params.ExpressionAttributeNames = expressionAttributeNames;
        params.ExpressionAttributeValues = expressionAttributeValues;
    }

    // Items collected across all pages of the scan.
    var items = [];

    function onScan(err, data)
    {
        if (err)
        {
            console.error("Unable to scan the table. Error JSON:", JSON.stringify(err, null, 2));
            // data is not usable here, so report the failure and stop.
            callback(err);
            return;
        }
        console.log("Scan succeeded.");
        data.Items.forEach(function(itemdata)
        {
            items.push(itemdata);
            console.log("Item :", items.length, JSON.stringify(itemdata));
        });
        // continue scanning if we have more items; respond only after the last page
        if (typeof data.LastEvaluatedKey != "undefined")
        {
            console.log("Scanning for more...");
            params.ExclusiveStartKey = data.LastEvaluatedKey;
            docClient.scan(params, onScan);
            return;
        }
        var response =
        {
            "isBase64Encoded": false,
            "statusCode": "200",
            "headers": { },
            "body": JSON.stringify(items)
        };
        callback(null, response);
    }

    docClient.scan(params, onScan);
};
Note:
The primary key of the DB is "itemname", but I will rework the DB design soon to have a sort key.
DynamoDB is very limited on its query capabilities and, as such, you should avoid scan at ALL costs. Every scan operation will consume your RCUs for every item it reads. If your table has many items, it can use up your RCUs quite quickly.
If you want to query by those 3 attributes, then DynamoDB may not be the best database for your use case. If you can narrow your query down to 1 attribute at a time instead, you can then use Global Secondary Indexes. That way you can query based on the author or type. You can see this answer on how to query for GSIs in DynamoDB

Inconsistent results with query/scan DynamoDB - NodeJS

I have a DynamoDB table that looks like this:
(there are like 1500000 entries like this one with different timestamps)
I have 2 GSI:
I'm trying to retrieve all the rows in the table for a given day.
This is what my code looks like (NodeJS):
// Queries the 'Result-RedeemedAt-index' index of QfGamingTransactionsProd for
// rows whose Result is "SUCCESS" and whose RedeemedAt starts with "2016-10-20",
// then logs the Count reported by that single response.
var AWS = require("aws-sdk");
AWS.config.update({accessKeyId: "", secretAccessKey: ""});
AWS.config.update({region: 'us-east-1'});
var docClient = new AWS.DynamoDB.DocumentClient();
var params = {
TableName: "QfGamingTransactionsProd",
IndexName: 'Result-RedeemedAt-index',
// #rs / #rat are placeholders resolved through ExpressionAttributeNames below.
KeyConditionExpression: "#rs = :result and begins_with (#rat, :Rat)",
ExpressionAttributeNames: {
"#rs": "Result",
"#rat": "RedeemedAt"
},
ExpressionAttributeValues: {
":result": "SUCCESS",
":Rat": "2016-10-20"
}
};
// Only one query call is made: data.LastEvaluatedKey is never inspected, so
// data.Count below describes just the page returned by this one response.
docClient.query(params, function (err, data) {
if (err) {
console.error("Unable to query. Error:", JSON.stringify(err, null, 2));
} else {
console.log("\nQuery succeeded. \n");
console.log("- Total", data.Count);
}
});
It seems to be working, but i'm getting (way) less results than expected. This same code works fine on a smaller Table.
Similar results with "Scan".
What am I missing?
According to the size of each record, the number of records retrieved will change since DynamoDB has a size limitation for query (1MB).
In DynamoDB, a query will return only 1MB of data.
But we can paginate through the results. It may solve your issue.
Data returned by the query will contain a "LastEvaluatedKey", if data that satisfy that query is not fully retrieved. So we have to set the "LastEvaluatedKey" as the "ExclusiveStartKey". Then the query will retrieve the remaining data. By recursively following this method, we will get the complete data.
// Runs the query page by page until no LastEvaluatedKey is returned and
// collects the items of every page into one flat array.
// NOTE(review): `resolve` and `reject` are not defined in this snippet;
// presumably it is meant to run inside a
// `new Promise(function (resolve, reject) { ... })` executor — confirm.
var data = [];
// Must be declared up front: the loop test reads it before any page arrives.
var scanComplete = false;
async.until(function () {
    return scanComplete;
},
function (callback) {
    docClient.query(params, function (err, result) {
        if (err) {
            console.log(err);
            return callback(err);
        }
        // Append the items themselves (not the page array) so that `data`
        // ends up as a flat list of items.
        data = data.concat(result.Items);
        if (typeof (result.LastEvaluatedKey) === 'undefined') {
            // fully retrieved
            scanComplete = true;
        } else {
            // Resume the next query right after the last key of this page.
            params.ExclusiveStartKey = result.LastEvaluatedKey;
        }
        callback(null);
    });
},
// this runs when the loop is complete or returns an error
function (err) {
    if (err) {
        console.log('error in processing scan ');
        console.log(err);
        reject(err);
    } else {
        resolve(data);
    }
});
This is because by default DynamoDB will return only 1mb of data at a time, But there is a way to solve this issue.
You need to change your implementation like following
Step 1: Query the DynamoDB table. It returns the first 1 MB of data and, if there is more, a "LastEvaluatedKey".
Step 2: Query the table again, this time passing that "LastEvaluatedKey" as the "ExclusiveStartKey" to get the next set of data.
Step 3: If the response still contains a "LastEvaluatedKey", repeat step 2; otherwise you have all the data for that key.
Here are the references:
About query limits
Blog on how to implement this code
Hope that helps

DynamoDB javascript SDK batchWriteItem doesn't complete unless I increase write capacity

I'm running a series of unit tests (node.js 4.x, aws-sdk, mocha) which load data into a table before each test then clears the table after the test.
I have two tests that are failing because of a ConditionExpression which triggers a ConditionCheckFailedException. But if I increase the read/write capacity, the tests pass.
It's my understanding that the SDK handles throttling exceptions and retries them for you so why wouldn't my tests just run slower and pass? Instead it seems as though the tests fail to complete the scan -> batchWriteItem process and so there are records still left in the table when a new tests starts.
I'm told by team members that they've seen similar problems and they just increased the throughput to fix the problem. This doesn't sit right with me. Either I'm doing something wrong and there's a race condition with my tests or there should be a pattern I can implement to make sure that my operations complete when being throttled? I should be able to use throttling metrics to inform when I need to increase throughput but I should still be able to keep retrying until I run out of memory.
Has anyone else run into this and what have you done to handle the problem?
After some debugging I noticed the UnprocessedItems response element. After looking up UnprocessedItems in the docs I realize I should have read more closely. The code below will run a retry loop with a delay (exponential back-off):
/**
 * Empties a table: scans it 25 keys at a time and deletes each page with
 * batchWriteItem, re-sending whatever comes back in UnprocessedItems after an
 * exponentially growing delay (50 ms, 100 ms, 200 ms, ...).
 *
 * @param {string} tableName  Table to clear.
 * @param {Object} client     Object exposing `scan(params, cb)` and
 *                            `batchWriteItem(params, cb)`.
 * @param {Function} cleared  Called with an error if any scan or write fails,
 *                            or with `null` once the loop has finished.
 */
var clearEventTable = function (tableName, client, cleared) {
    var exclusiveStartKey = null;
    var retryCount = 0;

    // Scans one page and converts it into a batchWriteItem delete request.
    var read = function (query, callback) {
        client.scan(query, function (err, page) {
            if (err) {
                console.log(err);
                return callback(err);
            }
            // A fresh page starts a fresh back-off sequence.
            retryCount = 0;
            exclusiveStartKey = page.LastEvaluatedKey || null;
            if (page.Count === 0) {
                // Nothing to delete; `write` treats {} as a no-op.
                return callback(null, {});
            }
            if (page.Count < 25 && exclusiveStartKey) {
                console.log("read capacity limit reached: " + JSON.stringify(page, null, 2));
            }
            var keys = _.map(page.Items, function (n) {
                return { DeleteRequest: { Key: n } };
            });
            var batch = {
                RequestItems: {},
                ReturnConsumedCapacity: "INDEXES",
                ReturnItemCollectionMetrics: "SIZE"
            };
            batch.RequestItems[tableName] = keys;
            callback(null, batch);
        });
    };

    // Sends one batch and keeps retrying its unprocessed remainder.
    var write = function (batch, callback) {
        if (!batch || !batch.RequestItems) {
            return callback(null);
        }
        client.batchWriteItem(batch, function (err, result) {
            if (err) {
                console.log(err);
                return callback(err);
            }
            var unprocessed = result.UnprocessedItems || {};
            if (Object.keys(unprocessed).length !== 0) {
                console.log("Retry batchWriteItem: " + JSON.stringify(result, null, 2));
                retryCount++;
                var retry = {
                    RequestItems: unprocessed,
                    ReturnConsumedCapacity: "INDEXES",
                    ReturnItemCollectionMetrics: "SIZE"
                };
                // retry with exponential backoff
                var delay = 50 * Math.pow(2, retryCount - 1);
                // Pass a function: calling write(...) here would retry at once
                // and hand its undefined return value to setTimeout.
                setTimeout(function () {
                    write(retry, callback);
                }, delay);
                return;
            }
            callback(null, result);
        });
    };

    var params = {
        TableName: tableName,
        ProjectionExpression: "aggregateId,id",
        Limit: 25, // max 25 per batchWriteItem
        ConsistentRead: false,
        ReturnConsumedCapacity: "TOTAL"
    };

    async.doWhilst(function (next) {
        // retrieve entities
        if (exclusiveStartKey)
            params.ExclusiveStartKey = exclusiveStartKey;
        async.compose(write, read)(params, function (err, result) {
            if (err) next(err);
            else next(null, result);
        });
    }, function () {
        // test if we need to load more
        return exclusiveStartKey !== null;
    }, function (err, r) {
        // return results
        if (err) {
            console.log(err);
            return cleared(err);
        }
        return cleared(null);
    });
};
Also take a look at the amount of memory provisioned for the Lambda. Might be too low and hitting the max leads to unpredictable results IMX.

Get item from dynamodb table using separate index table with Node.js

I am creating a weather station using the Particle Electron and AWS. I have managed to get the returned data sent to a DynamoDB table "weather" which contains all of the weather data with the following schema (with included sample values):
Item{13}
deviceId: 540056000a51343334363138 (String) (Primary Partition Key)
tm: 1458754711 (Number) (Primary Sort Key)
batSoC: 89 (String)
batV: 4.01 (String)
hum: 27.9 (String)
lat: 41.2083 (String)
lon: -73.3439 (String)
pres: 968.4 (String)
temp: 19.8 (String)
uvI: 0.1 (String)
wDir: 0 (String)
wGst: 0.0 (String)
wSpd: 0.0 (String)
as well as a separate "weather_index" table which contains only the deviceId and tm attributes for the most recent data that was written to the main table (kind of like an atomic counter but for a periodically updated unix timestamp value). So if the "weather" item above was the most recent entry, the item in the "weather_index" table would look like this:
Item{2}
deviceIdString: 540056000a51343334363138 (String) (Primary Partition Key)
tmNumber: 1458754711 (Number)
I am currently trying to write a very basic web frontend in Node.js (which, prior to this project, I have had no experience with, so I am still learning) and can't figure out how to:
Perform a DynamoDB getItem which contains a parameter retrieved via a previous getItem. Like:
latestTime = getItem(weather_index, deviceId) // Gets the time "tm" of the most recent weather observation and stores it in "latestTime"
// Where "weather_index" is the table name
currentWeather = getItem(deviceId, tm) // Gets the weather observation for the specified "tm" value and stores it in "currentWeather"
// Where "tm" is the unix time-stamp of the most recent observation
I then want to be able to print the individual values to the terminal/webpage/carrier pigeon/etc... (Something along the lines of currentWeather.deviceId, currentWeather.tm, currentWeather.batSoC, etc...
I have the following code that I can't really make work properly:
/*
* Module dependencies
*/
// Script outline: read the weather_index entry for one device, then read an
// item from the weather table. Both getItem calls are started back to back;
// neither waits for the other's callback.
var AWS = require('aws-sdk')
// weathermon_dev credentials
AWS.config.update({accessKeyId: 'REDACTED for obvious reasons', secretAccessKey: 'This bit too'});
// Select AWS region
AWS.config.update({region: 'us-east-1'});
var db = new AWS.DynamoDB();
// db.listTables(function(err,data) {
// console.log(data.TableNames);
// });
// Current unix time in whole seconds; not used by the requests below.
var time = Date.now() / 1000;
time = Math.round(time);
//console.log("Time: ");
//console.log(time);
time = Math.round(time);
var deviceId = "540056000a51343334363138"
// Key for the weather_index lookup (partition key only).
var params = {
Key: {
deviceId: {S: deviceId}
},
TableName: 'weather_index'
};
var timeJson;
db.getItem(params, function(err,data) {
if (err) console.log(err); // an error occurred
else console.log(data); // successful response
// This `var` declares a new timeJson local to the callback, so the outer
// timeJson declared above is never assigned. This line also runs when err is set.
// NOTE(review): data is presumably already an object rather than a JSON
// string, in which case JSON.parse is not appropriate here — confirm.
var timeJson = JSON.parse(data);
})
// var timeJson = JSON.parse(data);
// var itemTime = timeJson.item;
// Executes right after getItem is started, not after its callback, and reads
// the outer timeJson, which nothing has assigned.
console.log("timeJSON: " + timeJson);
// console.log("itemTime: " + itemTime);
// Redeclares params for the second lookup. The sort key is sent under the
// attribute name `time` with the literal string 'tm' as its value; the value
// fetched from weather_index is not used.
var params = {
Key: {
deviceId: {S: deviceId},
time: {N: 'tm'}
},
TableName: 'weather'
};
db.getItem(params, function(err, data) {
if (err) console.log(err); // an error occurred
else console.log(data); // successful response
})
Any help would be greatly appreciated.
You need to look into how NodeJS asynchronous calls work. You would need to wait until the callback from the first getItem() is called before you perform the second getItem().
I've rewritten the relevant part of your code here, to show you what I'm talking about, but I recommend you try to understand why the code needs to be written in this way instead of just copy/pasting it.
// Two-step lookup: read the device's entry in weather_index to learn the
// timestamp (tm) of its latest observation, then, inside that callback, read
// the matching item from the weather table.
var deviceId = "540056000a51343334363138"
var params = {
    Key: {
        deviceId: {S: deviceId}
    },
    TableName: 'weather_index'
};
var timeJson;
db.getItem(params, function(err, data) {
    if (err) {
        console.log(err); // an error occurred
        return;
    }
    console.log(data); // successful response
    // The response is already an object, so there is nothing to JSON.parse.
    // Assign the outer timeJson instead of shadowing it with a new `var`.
    timeJson = data.Item;
    console.log("timeJSON: " + JSON.stringify(timeJson));
    if (!timeJson || !timeJson.tm) {
        console.log("No weather_index entry with a tm value for device " + deviceId);
        return;
    }
    // Inside this callback we have the weather_index tm value,
    // so query the weather table here. The sort key attribute is `tm`, and
    // its value is the one just fetched (number attributes arrive as {N: "..."}).
    var weatherParams = {
        Key: {
            deviceId: {S: deviceId},
            tm: {N: timeJson.tm.N}
        },
        TableName: 'weather'
    };
    db.getItem(weatherParams, function(err, data) {
        if (err) console.log(err); // an error occurred
        else {
            console.log(data); // successful response
            // TODO: Use the database response data here
        }
    });
});

node.js : For each over rows and update asynchronously?

I need to query rows from a database, process some information per row, and then update each row with the result.
This is my example code where the intention is to loop over each row and update the label:
// Selects every id from testTable and issues one UPDATE per row that sets
// label to "Label_<id>".
var mysql = require('mysql');
var db = mysql.createConnection(config.database);
db.connect(function() {
db.query('SELECT id FROM testTable', function (err, rows) {
if (err) {
console.log(err);
} else {
if (rows.length) {
for (var i = 0, len = rows.length; i < len; i++) {
// `var` is function-scoped: every iteration reuses the same `row` binding.
var row = rows[i];
console.log(row);
var label = "Label_"+row.id;
// label and row.id are passed as values here, so each UPDATE gets its own row's data.
db.query('UPDATE testTable SET label = ? WHERE id = ?', [label, row.id], function(err, result) {
if (err) {
console.log(err);
} else {
// The callback reads the shared `row` when it runs; in the output shown
// for this code every callback logs the last row's id.
console.log("Set label on row %s", row.id);
}
})
}
}
}
})
});
The output of this is:
{ id: 1 }
{ id: 2 }
{ id: 3 }
{ id: 4 }
Set label on row 4
Set label on row 4
Set label on row 4
Set label on row 4
So, as you can see, I've updated row 4 four times instead of four rows once each. Whilst I knew the queries would be non-blocking, I thought the values would change for each one.
I know I can change my code to use rows.forEach(function(){...}) and that then executes each UPDATE one after the other and that would be ok. But to help my understanding I would like to know how I can correctly execute the updates asynchronously.
Your row variable is a closure in the callback function. The callback function doesn't get called until you've looped through all your results list. The sql queries are correct, but printing out the value of row.id in each callback just gives you the last iteration of the for loop each time because that is the state of the closure for every callback.
You can avoid this by using the underscore module. It can also help in making your logic simpler.
npm install underscore
Then your code would look like this:
var mysql = require('mysql');
var _ = require('underscore');
var db = mysql.createConnection(config.database);
// Labels every row of testTable. _.each calls labelRow once per row, so each
// UPDATE callback closes over its own `entry` and logs the right id.
db.connect(function onConnected() {
    var selectSql = 'SELECT id FROM testTable';
    var updateSql = 'UPDATE testTable SET label = ? WHERE id = ?';

    // Logs the row, then writes "Label_<id>" into its label column.
    function labelRow(entry) {
        console.log(entry);
        var values = ["Label_" + entry.id, entry.id];
        db.query(updateSql, values, function onUpdated(updateErr, result) {
            if (updateErr) {
                console.log(updateErr);
            } else {
                console.log("Set label on row %s", entry.id);
            }
        });
    }

    db.query(selectSql, function onSelected(selectErr, rows) {
        if (selectErr) {
            console.log(selectErr);
            return;
        }
        _.each(rows, labelRow);
    });
});

Resources