I am trying to use the Elasticsearch bulk API to insert multiple records into an index.
I am appending a newline (\n) at the end of the document, but I'm still getting the newline error:
Error: {
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "The bulk request must be terminated by a newline [\n]"
}
],
"type": "illegal_argument_exception",
"reason": "The bulk request must be terminated by a newline [\n]"
},
"status": 400
}
Based on my previous answer and https://stackoverflow.com/a/50754789/8160318:
const AWS = require('aws-sdk');
const creds = new AWS.EnvironmentCredentials('AWS');
const INDEX_NAME = 'index_name';
const esDomain = {
region: 'us-east-1',
endpoint: 'yoursearchdomain.region.amazonaws.com',
index: 'myindex',
doctype: 'mytype'
};
const endpoint = new AWS.Endpoint(esDomain.endpoint);
const req = new AWS.HttpRequest(endpoint);
const docs_as_body_params = JSON.parse(
'[' +
`{"index":{}} {"tags":["ab","cd"],"question":"test this","answer":"answer first"} {"index":{}} {"tags":["de","fg"],"question":"test second","answer":"answer second"}`.split(
/(\s?{"index":{}} )/g
)
.filter(match => match.length)
.filter((_, index) => index % 2 !== 0) +
']'
);
const bulk_body = [];
docs_as_body_params.map((doc) => {
bulk_body.push({
index: {
_index: INDEX_NAME,
_id: doc.id || null
}
});
bulk_body.push(doc);
});
/// THE MOST IMPORTANT PART -- getting to a valid ndjson
const ndjson_payload = bulk_body.map(JSON.stringify).join('\n') + '\n'
req.method = 'POST';
req.path = '/_bulk';
req.region = esDomain.region;
req.headers['presigned-expires'] = false;
req.headers['Host'] = endpoint.host;
req.headers['Content-Type'] = 'application/json';
req.body = ndjson_payload;
var signer = new AWS.Signers.V4(req, 'es');
signer.addAuthorization(creds, new Date());
var send = new AWS.NodeHttpClient();
send.handleRequest(req, null, function (httpResp) {
var respBody = '';
httpResp.on('data', function (chunk) {
respBody += chunk;
});
httpResp.on('end', function (chunk) {
console.log('Response: ' + respBody);
context.succeed('Lambda added documents');
});
}, function (err) {
console.log('Error: ' + err);
context.fail('Lambda failed with error ' + err);
});
Your JSON was newline-delimited JSON (NDJSON) at some point but looks all messed up now, so we'll have to do some cleanup beforehand.
Initialize:
const {
Client
} = require("@elastic/elasticsearch");
const client = new Client({
node: 'http://localhost:9200'
});
const INDEX_NAME = 'index_name';
Convert the would-be NDJSON into a consumable array of objects:
const docs_as_body_params = JSON.parse(
'[' +
`{"index":{}} {"tags":["ab","cd"],"question":"test this","answer":"answer first"} {"index":{}} {"tags":["de","fg"],"question":"test second","answer":"answer second"}`.split(
/(\s?{"index":{}} )/g
)
// filter out empty strings
.filter(match => match.length)
// take every odd member (skipping `{"index":{}}`)
.filter((_, index) => index % 2 !== 0) +
']'
);
Construct the bulk body
const bulk_body = [];
docs_as_body_params.map((doc) => {
bulk_body.push({
index: {
_index: INDEX_NAME,
_id: doc.id || null
}
});
bulk_body.push(doc);
});
Perform bulk indexing:
client.bulk({
body: bulk_body
},
(err, resp) => {
if (err || resp.body.errors) {
console.error(err || resp.body.errors);
return;
}
console.info(resp.body.items);
}
);
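For reference, the NDJSON that ends up being sent is just one JSON object per line: an action line, then its document, and the whole body must end with a trailing \n (which is exactly what the join('\n') + '\n' step guarantees). For the two sample documents it looks roughly like this:
{"index":{"_index":"index_name"}}
{"tags":["ab","cd"],"question":"test this","answer":"answer first"}
{"index":{"_index":"index_name"}}
{"tags":["de","fg"],"question":"test second","answer":"answer second"}
The error in the question simply means that final trailing newline was missing.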
I am trying to query an autogenerated Amplify API using Postman. I'm banging my head against the wall on something that should be simple. Can someone explain why this query URL doesn't return a JSON object? The data exists in DynamoDB, but Postman returns an empty array (and a 200 status):
POSTMAN (this is what I expected to work):
https://xxxxx.execute-api.us-east-1.amazonaws.com/staging/api/getShipContainer?location=fl
UPDATE: after staring at the code for longer, I see that req.params[partitionKeyName] is somehow evaluating to getShipContainer, which would explain my issue. But how do I fix this, and why did it happen:
condition[partitionKeyName]['AttributeValueList'] = [ convertUrlType(req.params[partitionKeyName], partitionKeyType) ];
This syntax works (returns dynamo object) but is very clearly incorrect (location is a dynamo column, and fl is the filter param): https://xxxxxx.execute-api.us-east-1.amazonaws.com/staging/api/fl?location
Query method:
const userIdPresent = false; // TODO: update in case is required to use that definition
const partitionKeyName = "location";
const partitionKeyType = "S";
const sortKeyName = "containerId";
const sortKeyType = "S";
const hasSortKey = sortKeyName !== "";
const path = "/api";
const UNAUTH = 'UNAUTH';
const hashKeyPath = '/:' + partitionKeyName;
const sortKeyPath = hasSortKey ? '/:' + sortKeyName : '';
// declare a new express app
var app = express()
app.use(bodyParser.json())
app.use(awsServerlessExpressMiddleware.eventContext())
//Enable CORS for all methods
app.use(function(req, res, next) {
res.header("Access-Control-Allow-Origin", "*")
res.header("Access-Control-Allow-Headers", "*")
next()
});
// convert url string param to expected Type
const convertUrlType = (param, type) => {
switch(type) {
case "N":
return Number.parseInt(param);
default:
return param;
}
}
/********************************
* HTTP Get method for list objects *
********************************/
//api/:location
app.get(path + hashKeyPath, function(req, res) {
var condition = {}
condition[partitionKeyName] = {
ComparisonOperator: 'EQ'
}
if (userIdPresent && req.apiGateway) {
condition[partitionKeyName]['AttributeValueList'] = [req.apiGateway.event.requestContext.identity.cognitoIdentityId || UNAUTH ];
} else {
try {
condition[partitionKeyName]['AttributeValueList'] = [ convertUrlType(req.params[partitionKeyName], partitionKeyType) ];
} catch(err) {
res.statusCode = 500;
res.json({error: 'Wrong column type ' + err});
}
}
let queryParams = {
TableName: tableName,
KeyConditions: condition
}
console.log(`req gg cond::`,JSON.stringify(condition),`params`,Object.entries(req.params).map(([i,k])=>i+' '+k).join(','))
dynamodb.query(queryParams, (err, data) => {
if (err) {
res.statusCode = 500;
res.json({error: 'Could not load items: ' + err});
} else {
res.json(data.Items);
}
});
});
results of the console.log I put in to debug:
req gg cond::
{
"location": {
"ComparisonOperator": "EQ",
"AttributeValueList": [
"getShipContainer"
]
}
}
params location getShipContainer
Shouldn't the expected query be using location and ignoring "getShipContainer" completely? I'm very confused because the code was auto-generated; getShipContainer is the name of the Lambda function that is being called.
I also tested this in the API Gateway test console with the same result, and the data is clearly present in my DynamoDB table.
The issue is mentioned in this GitHub issue. Because the Express route is defined as path + hashKeyPath (i.e. /api/:location), whatever path segment follows /api/ is captured as the location parameter, so calling /api/getShipContainer?location=fl puts "getShipContainer" into req.params.location and the query string is never read.
Change your handler function to this, and https://xxxxx.execute-api.us-east-1.amazonaws.com/staging/api/location will return the list of items.
app.get(path + hashKeyPath, function (req, res) {
let scanParams = {
TableName: tableName,
};
dynamodb.scan(scanParams, (err, data) => {
if (err) {
res.statusCode = 500;
res.json({ error: "Could not load items: " + err });
} else {
res.json(data.Items);
}
});
});
And here's the code for filtering if the parameter is specified in the URL.
Location is one of the DynamoDB reserved words, so I've used attribute name mapping.
https://xxxxx.execute-api.us-east-1.amazonaws.com/staging/api/location?location=fl
app.get(path + hashKeyPath, function (req, res) {
var filterParams = {};
const location = req.query[partitionKeyName] || "";
if (location) {
filterParams = {
FilterExpression: "#loc = :loc",
ExpressionAttributeNames: {
"#loc": "location",
},
ExpressionAttributeValues: {
":loc": location,
},
};
}
let scanParams = {
TableName: tableName,
...filterParams,
};
dynamodb.scan(scanParams, (err, data) => {
if (err) {
res.statusCode = 500;
res.json({ error: "Could not load items: " + err });
} else {
res.json(data.Items);
}
});
});
Usually I use a DynamoDB Global Secondary Index (GSI) to query all items, but it depends on the use case. For example, I've used one to query a list of items by site ID.
The table would be something like this, with attributes: itemID, itemName, siteID.
router.get("/:siteID", function (req, res) {
const sortKeyName = "siteID";
let queryParams = {
TableName: tableName,
IndexName: "siteIDGSI",
KeyConditionExpression: "siteID = :site_id",
ExpressionAttributeValues: {
":site_id": req.params[sortKeyName],
},
};
dynamodb.query(queryParams, (err, data) => {
if (err) {
res.statusCode = 500;
res.json({ error: "Could not load items: " + err });
} else {
res.json({
statusCode: 200,
message: "List of Items in " + req.params[sortKeyName],
items: data.Items,
});
}
});
});
So you can create a GSI for containerId and query it the same way.
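For illustration, a containerId lookup could look something like the sketch below. The index name (containerIdGSI) and the route are assumptions, not something Amplify generates for you; you would create the GSI with containerId as its hash key first.
// Hypothetical route: query a GSI whose hash key is containerId.
// "containerIdGSI" is an assumed index name -- create it on the table first.
app.get(path + '/container/:containerId', function (req, res) {
  const queryParams = {
    TableName: tableName,
    IndexName: 'containerIdGSI',
    KeyConditionExpression: 'containerId = :cid',
    ExpressionAttributeValues: { ':cid': req.params['containerId'] },
  };
  dynamodb.query(queryParams, (err, data) => {
    if (err) {
      res.statusCode = 500;
      res.json({ error: 'Could not load items: ' + err });
    } else {
      res.json(data.Items);
    }
  });
});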
I am creating an Excel file on the Node.js side and returning base64 data to ReactJS to download the file. On the Node.js side I use Promise.all, fetch data from a server in chunks, and append the data to the worksheet with
worksheet.addRows(data);
For around 20-30k rows it works fine, but for around 100k rows Node.js fails with a heap out of memory error. I have also increased the memory allocated to Node.js, but I get the same error:
node --max_old_space_size=5000 app.js
What am I doing wrong? Any suggestions?
Node.js code:
const axios = require('axios');
var excel = require("exceljs");
const workbook = new excel.Workbook();
const worksheet = workbook.addWorksheet("My Sheet");
worksheet.columns = [
{ header: "TicketId", key: "ticketId" },
{ header: "Email", key: 'user_email' },
{ header: "User", key : 'user_name' },
{ header: "Subject", key: "subject" },
...//many more headers
];
exports.getTicketData = async (req, res, next) => {
res.connection.setTimeout(0);
const { body } = req;
const token = body.token;
const organization_id = body.organization_id;
const server = body.server;
const sideFilter = body.sideFilter;
let baseurl = 'url for server end to fetch data';
if (baseurl) {
let data = new Array();
let limit = 3000;
const promises = [];
try {
let count = await getCount(token,limit, organization_id, baseurl, sideFilter);
for(var i = 1;i<=count;i++) {
promises.push(getData(i,limit,organization_id,token, baseurl, sideFilter));
}
await Promise.all(promises).then((results) => {
}).catch((e) => {
throw e;
});
var base64File = await writeExcelAndUpload(workbook);
return res.status(200).json({ file:base64File });
} catch (err) {
return res.status(400).json({ type:'error', msg:'File not generated please contact support staff' });
}
} else {
return res.status(400).json({ type:'error', msg:'please define server name' });
}
};
let getData = (page,limit, organization_id,token, baseurl, sideFilter) =>{
return new Promise((resolve, reject) => {
axios.post(baseurl+`/v2/get-export`, {
page:page,
organization_id:organization_id,
per_page:limit,
filter: "",
sorted:"",
...sideFilter
},{ headers: {"Authorization" : `Bearer ${token}`} }).then(function (response) {
let dataTemp = response.data.data.data.map((t,i)=>{
return {
...t,
name:t.name,
...//many more columns like 70
}
});
worksheet.addRows(dataTemp);
resolve(true);
}).catch(function (error) {
reject(error);
});
});
}
let getCount = (token,limit, organization_id, baseurl, sideFilter) => {
// run an api and return count against limit
}
let writeExcelAndUpload = async (workbook) => {
const fileBuffer = await workbook.xlsx.writeBuffer();
let base64File = Buffer.from(fileBuffer).toString('base64');
base64File = 'data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,'+base64File;
return base64File;
}
Client side (ReactJS):
exportLink = () => {
postData ={
...
};
return axios.post(`${baseurl}/api/ticketing/get-ticket`, postData).then(function (response) {
const downloadLink = document.createElement("a");
const fileName = "export.xlsx";
downloadLink.href = response.data.file;
downloadLink.download = fileName;
downloadLink.click();
}).catch(function(error){
throw error;
});
}
Well, it is kind of expected that you may run out of heap memory when working with that many entries (around 100k).
I would suggest using pagination: instead of fetching e.g. 100k entries at once, fetch 1k entries, do what you need with them, then fetch the next 1k, and repeat until you have processed all entries.
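A minimal sketch of that sequential approach, reusing the getCount / getData / writeExcelAndUpload helpers from the question (untested, and the page size of 1000 is just an assumption):
// Sequential (paginated) version: each page is fetched and appended to the
// worksheet before the next request starts, so only one page of rows is
// held in memory at a time instead of all 100k at once.
exports.getTicketData = async (req, res, next) => {
  const { token, organization_id, sideFilter } = req.body;
  const baseurl = 'url for server end to fetch data';
  const limit = 1000; // smaller pages keep peak memory lower
  try {
    const count = await getCount(token, limit, organization_id, baseurl, sideFilter);
    for (let page = 1; page <= count; page++) {
      // getData already calls worksheet.addRows() internally
      await getData(page, limit, organization_id, token, baseurl, sideFilter);
    }
    const base64File = await writeExcelAndUpload(workbook);
    return res.status(200).json({ file: base64File });
  } catch (err) {
    return res.status(400).json({ type: 'error', msg: 'File not generated please contact support staff' });
  }
};
If the finished workbook itself is still too big to hold in memory, exceljs also provides a streaming writer (new excel.stream.xlsx.WorkbookWriter({ filename })) that commits rows to disk as you add them, so the whole sheet never has to live in RAM at once.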
I am new to using Cloud Functions for Firebase with an HTTP-triggered function, and I am confused about how to properly terminate the function. I'm not sure if I should be using res.sendStatus, returning a promise, or both.
The goal of my function is to loop through several documents in the collection 'communities'. Each community has a collection of documents where I query the document with the highest value of 'hotScore'. I then send an iOS push notification containing that document to a topic (all users in that given community).
Unfortunately, I am getting several errors when the code is run, such as Error [ERR_HTTP_HEADERS_SENT]: Cannot set headers after they are sent to the client and Unhandled rejection. I'm pretty sure this is due to my negligence in handling function termination, although I have been confused by the online resources I have looked at so far. Would someone mind taking a look at my code and pointing me in the right direction? Thank you so much!
exports.sendNotificationTrendingPost = functions.https.onRequest(async (req, res) => {
//Get communities collection from Firestore
return admin.firestore().collection('communities').get().then((communities) => {
var communityPromises = [];
//Loop through each community
communities.forEach((community) => {
let communityID = community.get('communityID');
let communityName = community.get('name');
//Get the post with the highest hotScore
let communityPromise = admin.firestore().collection('communities').doc(communityID).collection('posts').orderBy('hotScore', 'desc').limit(1).get().then((posts) => {
let hottestPost = posts[0];
let postID = hottestPost.get('postID');
let postText = hottestPost.get('text');
let currentDate = Date.now() / 1000;
var message;
//Verify that the hottest post was posted in the past 24 hours
if (hottestPost.get('date') > (currentDate - 86400)) {
//Build the notification text (shortening if too long)
let shortenedPostText = postText.substring(0,60);
var textEnd = '';
if (postText.length > 60) {
textEnd = '...';
}
let notificationText = 'Trending post on ' + communityName + ': ' + shortenedPostText + textEnd;
//Build the push notification
message = {
apns: {
headers: {
'apns-push-type': 'alert'
},
payload: {
aps: {
alert: {
body: notificationText,
},
},
postID: postID,
},
},
topic: communityID
}
}
//Send the message and return the promise
if (message === null) {
return null;
} else {
return admin.messaging().send(message);
}
})
.catch(error => {
console.log(error);
res.status(500).send(error);
})
if (communityPromise !== null) {
communityPromises.push(communityPromise);
}
})
res.sendStatus(200);
return Promise.all(communityPromises);
})
.catch(error => {
console.log(error);
res.status(500).send(error);
})
})
As samthecodingman advised, it is much better to use async/await in your case, as it will simplify the code and will make it much easier to read.
The following changes should do the trick (untested). Note how we use arrays of community names and IDs in order to pass them from one loop to the other. This works because, with Promise.all(), the returned values are in the same order as the Promises passed in, regardless of completion order.
exports.sendNotificationTrendingPost = functions.https.onRequest(async (req, res) => {
try {
const db = admin.firestore();
const communitiesQuerySnap = await db.collection('communities').get();
const communityPromises = [];
const communityNames = [];
const communityIDs = []; // keep the IDs too, so they can be reused when building the messages below
communitiesQuerySnap.forEach((community) => {
let communityID = community.get('communityID');
let communityName = community.get('name');
communityNames.push(communityName);
communityIDs.push(communityID);
communityPromises.push(db.collection('communities').doc(communityID).collection('posts').orderBy('hotScore', 'desc').limit(1).get())
});
const postsQuerySnapArray = await Promise.all(communityPromises);
const messagePromises = [];
postsQuerySnapArray.forEach((postsQuerySnap, index) => {
const hottestPost = postsQuerySnap.docs[0];
const postID = hottestPost.get('postID');
const postText = hottestPost.get('text');
const currentDate = Date.now() / 1000;
let message;
if (hottestPost.get('date') > (currentDate - 86400)) {
//Build the notification text (shortening if too long)
let shortenedPostText = postText.substring(0, 60);
var textEnd = '';
if (postText.length > 60) {
textEnd = '...';
}
const communityName = communityNames[index]; // The two Arrays postsQuerySnapArray and communityName have the same order, because Promise.all keeps the order.
let notificationText = 'Trending post on ' + communityName + ': ' + shortenedPostText + textEnd;
//Build the push notification
message = {
apns: {
headers: {
'apns-push-type': 'alert'
},
payload: {
aps: {
alert: {
body: notificationText,
},
},
postID: postID,
},
},
topic: communityIDs[index] // same ordering argument as communityNames
}
messagePromises.push(admin.messaging().send(message));
}
})
await Promise.all(messagePromises);
res.status(200).send({ result: "completed" }); // Or res.end()
} catch (error) {
console.log(error);
res.status(500).send(error);
}
});
I'm very new to Node, and I'm trying to pull a list of IDs from an API, iterate through that list saving the output for each ID, and ultimately rename each generated file. The code below is the closest I've come, and while it works sometimes, it frequently fails, as I believe one function isn't waiting for the other to complete (e.g. it tries to read before a write has finished), but I'm sure I have other issues going on.
const apiKey = inputData.apiKey
var https = require('https');
var sync = require('sync');
var fs = require('fs');
var JSONfileloc = "./pdfs/file.json"
var queryurl = 'https://intakeq.com/api/v1/intakes/summary?startDate=2018-01-01'
var authHeaders = { 'X-Auth-Key': apiKey }
var queryOpts = { method: 'GET', headers: authHeaders}
function handleFile (error, file)
{
if (error) return console.error('Ran into a problem here', error)
}
fetch(queryurl, queryOpts)
.then
(function findAPI(res, err)
{
if( err )
{ console.log('I cant find the API '+err) }
return res.json()
{console.log('found the API!')}
}
)
.then (function itID(res, err)
{
if( err )
{ console.log('I cant iterate the API '+err) }
for(var i = 0; i < res.length; i++)
{
var intakeID=res[i].Id;
var APIoptions={ host:"intakeq.com", path:"/api/v1/intakes/"+ intakeID, headers: authHeaders };
var PDFoptions={ host:"intakeq.com", path:"/api/v1/intakes/"+ intakeID+'/pdf', headers: authHeaders };
console.log('Working on ID:'+intakeID)
var JSONrequest = https.get(APIoptions, writeJSON)
}})
//READ JSON FUNCTION
function readJSON (err, data)
{
if (err) throw err;
if(data.indexOf('New Patient Forms') >= 0)
var contents = fs.readFileSync(JSONfileloc, handleFile);
var jsonContent = JSON.parse(contents)
//pull PT Name
pName = (jsonContent.ClientName);
console.log('The Patient Name Is ' + jsonContent.ClientName)
//pull PT DOB
pDob = (jsonContent.Questions[3].Answer)
console.log('Patient DOB Is ' + jsonContent.Questions[3].Answer)
//pull Form Type
pForm = (jsonContent.QuestionnaireName)
console.log('The Form Submitted is ' + jsonContent.QuestionnaireName)
//rename and move JSON
fs.rename("./pdfs/file.json", './JSONLogs/'+pName+' '+pForm+' '+Date.now()+'.json', function(err) {
if ( err ) console.log('Problem renaming! ' + err)
else console.log('Copying & Renaming JSON File!');
})
};
//WRITE JSON FUNCTION
function writeJSON(response, err)
{
var JSONfile = fs.createWriteStream(JSONfileloc, handleFile);
if (err) throw err;
response.pipe(JSONfile);
console.log('JSON Created')
fs.readFile(JSONfileloc, readJSON)
}
The research I've done leads me to believe that async.forEach is probably the right approach here, but I've been having a hard time getting that to work properly. Thanks in advance and any suggestions are much appreciated.
const apiKey = inputData.apiKey
var https = require('https');
var sync = require('sync');
var fs = require('fs');
var JSONfileloc = "./pdfs/file.json"
var queryurl = 'https://intakeq.com/api/v1/intakes/summary?startDate=2018-01-01'
var authHeaders = {
'X-Auth-Key': apiKey
}
var queryOpts = {
method: 'GET',
headers: authHeaders
}
function handleFile(error, file) {
if (error) return console.error('Ran into a problem here', error)
}
fetch(queryurl, queryOpts)
.then(function findAPI(res) {
return res.json();
})
.then(function itID(res) {
const JSONRequests = [];
for (var i = 0; i < res.length; i++) {
var intakeID = res[i].Id;
var APIoptions = {
host: "intakeq.com",
path: "/api/v1/intakes/" + intakeID,
headers: authHeaders
};
var PDFoptions = {
host: "intakeq.com",
path: "/api/v1/intakes/" + intakeID + '/pdf',
headers: authHeaders
};
// https.get has response as a stream and not a promise
// This `httpsGet` function converts it to a promise
JSONRequests.push(httpsGet(APIoptions, i));
}
return Promise.all(JSONRequests);
})
function httpsGet(options, filename) {
return new Promise((resolve, reject) => {
https.get(options, (response) => {
// The WriteJSON function, just for brewity
// Otherwise pass resolve to the seperate writeJSON and call it in there
var JSONfile = fs.createWriteStream(filename + ".json");
response.pipe(JSONfile);
JSONfile.on('close', () => {
readJSON(filename + ".json").then(() => {
resolve();
})
})
})
})
}
//READ JSON FUNCTION
function readJSON(filename) {
// if (err) throw err;
var contents = fs.readFileSync(filename, 'utf-8'); // removed handleFile as readFileSync does not allow callbacks, added format
var jsonContent = JSON.parse(contents)
// Make your conditional checks here with the jsonContents
//pull PT Name
pName = (jsonContent.ClientName);
console.log('The Patient Name Is ' + jsonContent.ClientName)
//pull PT DOB
pDob = (jsonContent.Questions[3].Answer)
console.log('Patient DOB Is ' + jsonContent.Questions[3].Answer)
//pull Form Type
pForm = (jsonContent.QuestionnaireName)
console.log('The Form Submitted is ' + jsonContent.QuestionnaireName)
//rename and move JSON
return new Promise((resolve, reject) => {
fs.rename(filename, './JSONLogs/' + pName + ' ' + pForm + ' ' + Date.now() + '.json', function (err) { // rename the per-request file, not the old hard-coded path
if (err) {
console.log('Problem renaming! ' + err);
reject(err);
} else {
console.log('Copying & Renaming JSON File!');
resolve();
}
})
})
};
Updated to convert the https.get response stream into a returned Promise, which can be handled much more cleanly.
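One small addition worth considering (a sketch, reusing fetch, queryurl, queryOpts, authHeaders and httpsGet from above): ending the fetch chain with a final then/catch so the overall completion or failure of the run is surfaced in one place.
// Sketch: end the chain with a single completion/failure handler.
fetch(queryurl, queryOpts)
  .then((res) => res.json())
  .then((intakes) => Promise.all(
    intakes.map((intake, i) => httpsGet({
      host: 'intakeq.com',
      path: '/api/v1/intakes/' + intake.Id,
      headers: authHeaders
    }, i))
  ))
  .then(() => console.log('All intakes processed'))
  .catch((err) => console.error('Processing run failed:', err));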
I am getting an error that "The provided key element does not match the schema". uuid is my primary partition key. I also have a primary sort key for version. I figured I can use batchWrite (docs) to delete all items with same uuid.
My ES6 code is as follows:
delete(uuid) {
const promise = new Promise();
const params = {
RequestItems: {
[this.TABLE]: [
{
DeleteRequest: {
Key: { uuid: uuid }
}
}
]
}
};
// this._client references the DocumentClient
this._client.batchWrite(params, function(err, data) {
if (err) {
// this gets hit with error
console.log(err);
return promise.reject(err);
}
console.log(result);
return promise.resolve(result);
});
return promise;
}
Not sure why it is erroring on the key that is the primary. I have seen posts about needing other indexes for times when I am searching by something that isn't a key. But I don't believe that's the case here.
Here is a batch write delete request sample. This code has been tested and works fine. If you adapt it to your requirements, it should work.
Table Definition:-
Bag - Table Name
bag - Hash Key
No sort key in 'Bag' table
Batch Write Code:-
var AWS = require("aws-sdk");
AWS.config.update({
region : "us-west-2",
endpoint : "http://localhost:8000"
});
var documentclient = new AWS.DynamoDB.DocumentClient();
var itemsArray = [];
var item1 = {
DeleteRequest : {
Key : {
'bag' : 'b1'
}
}
};
itemsArray.push(item1);
var item2 = {
DeleteRequest : {
Key : {
'bag' : 'b2'
}
}
};
itemsArray.push(item2);
var params = {
RequestItems : {
'Bag' : itemsArray
}
};
documentclient.batchWrite(params, function(err, data) {
if (err) {
console.log('Batch delete unsuccessful ...');
console.log(err, err.stack); // an error occurred
} else {
console.log('Batch delete successful ...');
console.log(data); // successful response
}
});
Output:-
Batch delete successful ...
{ UnprocessedItems: {} }
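One more thing worth noting about that { UnprocessedItems: {} } response: BatchWriteItem can hand back items it did not process (for example when the request is throttled), and those should be retried. A minimal sketch on top of the documentclient above:
// Sketch: resubmit whatever DynamoDB reports back as unprocessed.
function batchWriteWithRetry(params) {
  documentclient.batchWrite(params, function (err, data) {
    if (err) {
      console.log('Batch delete unsuccessful ...');
      console.log(err, err.stack);
    } else if (data.UnprocessedItems && Object.keys(data.UnprocessedItems).length > 0) {
      // feed the unprocessed items straight back in as a new batch
      batchWriteWithRetry({ RequestItems: data.UnprocessedItems });
    } else {
      console.log('Batch delete successful ...');
    }
  });
}
batchWriteWithRetry(params) can then be called in place of the direct documentclient.batchWrite call.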
This is doable with a Node Lambda, but there are a few things you need to consider to address concurrency while processing large tables:
Handle paging while querying all of the matching elements from a secondary index
Split the work into chunks of 25 requests, as per the BatchWriteItem requirements: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_BatchWriteItem.html
Above roughly 40,000 matches you might need a 1 second delay between cycles (see the sleep helper sketched after the snippet): https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html
Here is a snippet that I wrote:
const AWS = require("aws-sdk");
const dynamodb = new AWS.DynamoDB.DocumentClient();
const log = console.log;
exports.handler = async (event) => {
log(event);
let TableName = event.tableName;
let params = {
TableName,
FilterExpression: "userId = :uid",
ExpressionAttributeValues: {
":uid": event.userId,
},
};
let getItems = async (lastKey, items) => {
if (lastKey) params.ExclusiveStartKey = lastKey;
let resp = await dynamodb.scan(params).promise();
items = resp.Items.length
? items.concat(resp.Items.map((x) => x.id))
: items;
if (resp.LastEvaluatedKey)
return await getItems(resp.LastEvaluatedKey, items);
else return items;
};
let ids = await getItems(null, []);
let idGroups = [];
for (let i = 0; i < ids.length; i += 25) {
idGroups.push(ids.slice(i, i + 25));
}
for (const gs of idGroups) {
let delReqs = [];
for (let id of gs) {
delReqs.push({ DeleteRequest: { Key: { id } } });
}
let RequestItems = {};
RequestItems[TableName] = delReqs;
let d = await dynamodb
.batchWrite({ RequestItems })
.promise().catch((e) => log(e));
}
log(ids.length + " items processed");
return {};
};
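Regarding the 1 second delay mentioned above, a tiny helper is enough; it could be called at the end of each idGroups iteration (e.g. await sleep(1000); right after the batchWrite):
// Sketch: pause between batch cycles to stay under throughput limits.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));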
Not sure why nobody provided a proper answer.
Here's a Lambda I wrote in Node.js. It performs a full scan on the table, then batch deletes every 25 items per request.
Remember to change TABLE_NAME.
const AWS = require('aws-sdk');
const docClient = new AWS.DynamoDB.DocumentClient({ apiVersion: '2012-08-10' });
//const { TABLE_NAME } = process.env;
const TABLE_NAME = "CHANGE ME PLEASE";
exports.handler = async (event) => {
let params = {
TableName: TABLE_NAME,
};
let items = [];
let data = await docClient.scan(params).promise();
items = [...items, ...data.Items];
while (typeof data.LastEvaluatedKey != 'undefined') {
params.ExclusiveStartKey = data.LastEvaluatedKey;
data = await docClient.scan(params).promise();
items = [...items, ...data.Items];
}
let leftItems = items.length;
let group = [];
let groupNumber = 0;
console.log('Total items to be deleted', leftItems);
for (const i of items) {
const deleteReq = {
DeleteRequest: {
Key: {
id: i.id,
},
},
};
group.push(deleteReq);
leftItems--;
if (group.length === 25 || leftItems < 1) {
groupNumber++;
console.log(`Batch ${groupNumber} to be deleted.`);
const params = {
RequestItems: {
[TABLE_NAME]: group,
},
};
await docClient.batchWrite(params).promise();
console.log(
`Batch ${groupNumber} processed. Left items: ${leftItems}`
);
// reset
group = [];
}
}
const response = {
statusCode: 200,
// Uncomment below to enable CORS requests
// headers: {
// "Access-Control-Allow-Origin": "*"
// },
body: JSON.stringify('Hello from Lambda!'),
};
return response;
};
Be aware that you need to follow the instructions in the docs (src: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_BatchWriteItem.html):
DeleteRequest - Perform a DeleteItem operation on the specified item. The item to be deleted is identified by a Key subelement:
Key - A map of primary key attribute values that uniquely identify the item. Each entry in this map consists of an attribute name and an attribute value. For each primary key, you must provide all of the key attributes. For example, with a simple primary key, you only need to provide a value for the partition key. For a composite primary key, you must provide values for both the partition key and the sort key.
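Applied to the table in the question (uuid as the partition key and version as the sort key), each DeleteRequest therefore has to carry both attributes. A sketch (the version values are placeholders and must match existing items):
// Sketch: composite-key table, so every Key needs uuid AND version.
const params = {
  RequestItems: {
    [this.TABLE]: [
      { DeleteRequest: { Key: { uuid: uuid, version: 1 } } },
      { DeleteRequest: { Key: { uuid: uuid, version: 2 } } }
    ]
  }
};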
For batch delete, we can use batchWrite with DeleteRequest. Here is an example in which we provide the tableName whose data is to be deleted; the payload is an array of ids we need to remove.
A single request can delete up to 25 items.
const AWS = require('aws-sdk');
const dynamodb= new AWS.DynamoDB.DocumentClient({ apiVersion: '2012-08-10' });
const tableName = "PlayerData";
const payload = [{ id: 101 }, { id: 105 }, { id: 106 }];
const deleteBatchData = async (tableName, payload, dynamodb) => {
try {
await dynamodb.batchWrite({
RequestItems: {
[tableName]: payload.map(item => {
return {
DeleteRequest: {
Key: {
id: item.id
}
}
};
})
}
}).
promise().
then((response) => {
return response;
})
.catch((err) => {
console.log("err ::", JSON.stringify(err))
});
} catch (err) {
console.log('Error in deleteBatchData ', err);
}
}
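Usage could then look like this (from inside an async function, with the tableName and payload defined above):
await deleteBatchData(tableName, payload, dynamodb);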
Why not use PartiQL? This approach is much more readable. (This too has a limit of 25 items per request, just like BatchWriteItem.)
// Import required AWS SDK clients and commands for Node.js.
import { BatchExecuteStatementCommand } from "@aws-sdk/client-dynamodb";
import { ddbDocClient } from "../libs/ddbDocClient.js";
const tableName = process.argv[2];
const movieYear1 = process.argv[3];
const movieTitle1 = process.argv[4];
const movieYear2 = process.argv[5];
const movieTitle2 = process.argv[6];
export const run = async (
tableName,
movieYear1,
movieTitle1,
movieYear2,
movieTitle2
) => {
try {
const params = {
Statements: [
{
Statement: "DELETE FROM " + tableName + " where year=? and title=?",
Parameters: [{ N: movieYear1 }, { S: movieTitle1 }],
},
{
Statement: "DELETE FROM " + tableName + " where year=? and title=?",
Parameters: [{ N: movieYear2 }, { S: movieTitle2 }],
},
],
};
const data = await ddbDocClient.send(
new BatchExecuteStatementCommand(params)
);
console.log("Success. Items deleted.", data);
return "Run successfully"; // For unit tests.
} catch (err) {
console.error(err);
}
};
run(tableName, movieYear1, movieTitle1, movieYear2, movieTitle2);