unable to read lines of log file - node.js

I have an AWS S3 bucket with a bunch of log files in it. I have a Lambda function using the Node.js 8.10 runtime that reads each line of a log file. This is what I have:
const AWS = require('aws-sdk');
const S3 = new AWS.S3();
const readline = require('readline');

exports.handler = async (event) => {
    try {
        let bucket = event.Records[0].s3.bucket.name;
        let key = event.Records[0].s3.object.key;
        // documentation for this method:
        // https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html#getObject-property
        let readStream = S3.getObject({ Bucket: bucket, Key: key }).createReadStream(); // 1
        const rl = readline.createInterface({
            input: readStream
        });
        rl.on('line', async (line) => {
            console.log(line);
            // process line as needed
        });
    } catch (err) {
        console.log(err);
        return err;
    }
};
In the snippet above I'm printing each line of the log file to the console just for testing, but I don't see any output.
But if I refactor the code like this, it works:
const AWS = require('aws-sdk');
const S3 = new AWS.S3();
const stream = require('stream');
const bufferStream = new stream.PassThrough();
const readline = require('readline');

exports.handler = async (event) => {
    try {
        // retrieving first record only just for testing
        let bucket = event.Records[0].s3.bucket.name;
        let key = event.Records[0].s3.object.key;
        // data.Body is a Buffer
        let data = await S3.getObject({ Bucket: bucket, Key: key }).promise();
        bufferStream.end(data.Body); // 2
        const rl = readline.createInterface({
            input: bufferStream
        });
        rl.on('line', (line) => {
            console.log(line);
            // process line as needed
        });
    } catch (err) {
        console.log(err);
        return err;
    }
};
For the line marked 2, getObject returns the whole object body as a Buffer, which is then converted to a stream.
Is it possible to do this without using a buffer? My thinking is that if the log file is very large, it is inefficient to load it into a buffer and then convert that to a stream. I'm wondering if I can read from the stream directly, as in the line marked 1.
EDIT:
I did some more testing and got it to work, but only without an async Lambda handler. Here it is:
exports.handler = function (event, context, callback) {
    // for testing I'm looking at the first record
    let bucket = event.Records[0].s3.bucket.name;
    let key = event.Records[0].s3.object.key;
    const readStream = S3.getObject({ Bucket: bucket, Key: key }).createReadStream();
    const rl = readline.createInterface({
        input: readStream,
        crlfDelay: Infinity
    });
    rl.on('line', (line) => {
        console.log(line);
    });
};
Does anyone know why this refactored code works, but the version with the async Lambda handler does not?
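A minimal sketch of how this is often handled (not from the original question; it reuses the S3 client and readline require from the snippets above): an async handler returns as soon as its body finishes, before the readline events ever fire, so the handler has to wait on a promise that resolves when the stream has been fully read:

exports.handler = async (event) => {
    const bucket = event.Records[0].s3.bucket.name;
    const key = event.Records[0].s3.object.key;
    const readStream = S3.getObject({ Bucket: bucket, Key: key }).createReadStream();
    const rl = readline.createInterface({ input: readStream, crlfDelay: Infinity });

    // Keep the handler alive until readline has consumed the whole stream.
    await new Promise((resolve, reject) => {
        rl.on('line', (line) => console.log(line));
        rl.on('error', reject);
        rl.on('close', resolve);
    });
    return 'done';
};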

Related

Is using an await sleep(ms) function the best way to add a delay between a write file and read file function in nodejs?

I am trying to implement the following flow in my code:
1. A GET request comes in with a filename in the parameters.
2. Node.js makes a call to the AWS S3 bucket to retrieve the requested file.
3. The file is retrieved and saved.
4. The file is read and sent back in the response.
Here is my /get endpoint:
router.get('/getPDF/:fileName', async (req, res, next) => {
    const key = req.params.fileName;
    console.log(key);
    console.log(key.toString());
    const goodKey = key + '.pdf';
    console.log(goodKey);
    const saveToFile = 'currentPDF.pdf';
    getFileAndSave(goodKey, saveToFile);
    await sleep(1000);
    const data = await fs.readFileSync('./s3Storage/currentPDF.pdf');
    res.contentType("application/pdf");
    res.send(data);
});
This is the getFileAndSave function:
const getFileAndSave = async (key, filename) => {
    var params = {
        Bucket: BUCKET_NAME,
        Key: key
    };
    console.log(key);
    let readStream = await s3.getObject(params).createReadStream();
    let writeStream = await fs.createWriteStream(path.join(__dirname, filename));
    await readStream.pipe(writeStream);
    return key;
};
I am currently using a sleep function to add a delay, but there must be a better way to do this.
router.get('/getPDF/:fileName', async (req, res, next) => {
    const key = req.params.fileName;
    console.log(key);
    console.log(key.toString());
    const goodKey = key + '.pdf';
    console.log(goodKey);
    const saveToFile = 'currentPDF.pdf';
    let file = await getFileAndSave(goodKey, saveToFile);
    const data = fs.readFileSync('./s3Storage/currentPDF.pdf');
    res.contentType("application/pdf");
    res.send(data);
});
const getFileAndSave = (key, filename) => {
    return new Promise((resolve, reject) => {
        const params = {
            Bucket: BUCKET_NAME,
            Key: key
        };
        console.log(key);
        const readStream = s3.getObject(params).createReadStream();
        const writeStream = fs.createWriteStream(path.join(__dirname, filename));
        readStream.on('error', reject);
        writeStream.on('error', reject);
        writeStream.on('finish', () => resolve(key));
        readStream.pipe(writeStream);
    });
};
As aymcg31 advised, wrap the download in a Promise that resolves only once the write stream has finished, then await it instead of sleeping.
Check the modified code above. Please test it and mark the answer appropriately, thanks.
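As an aside (a minimal sketch, not part of the answer above; it reuses the router, s3 client and BUCKET_NAME from the snippets above): if the PDF only needs to be returned to the client, the S3 read stream can be piped straight into the Express response, avoiding both the temporary file and any artificial delay:

router.get('/getPDF/:fileName', (req, res, next) => {
    const params = { Bucket: BUCKET_NAME, Key: req.params.fileName + '.pdf' };
    res.contentType('application/pdf');
    s3.getObject(params)
        .createReadStream()
        .on('error', next) // hand S3/stream errors to the Express error handler
        .pipe(res);        // stream the PDF directly to the client
});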

How to run a binary from a Lambda properly?

I'm trying to get the sox audio binary to run from a Lambda. I followed this guide: http://marcelog.github.io/articles/static_sox_transcoding_lambda_mp3.html
So I built the sox binary using Docker with the latest Amazon Linux version and deployed the Lambda as a zip package with this structure:
sox
index.js
Lambda code:
const child_process = require('child_process');

exports.handler = async (event) => {
    initializeContext();
    try {
        const object = await getFile("test.mp3");
        const fs = require('fs');
        fs.writeFileSync("/tmp/test.mp3", object.Body);
        let result = child_process.execFileSync('sox', ['/tmp/test.mp3', '/tmp/testOutput.mp3', ... <filter params here>], {
            encoding: 'ascii'
            // shell: true
        });
        const file = fs.readFileSync("/tmp/testOutput.mp3");
        await putFile("testOutput.mp3", file);
    }
    catch (err) {
        try {
            await log("error", err);
        }
        catch (err) {}
    }
};
let getFile = async function (fileName) {
    const params = {
        Bucket: bucket,
        Key: fileName
    };
    return await s3.getObject(params).promise();
};

let putFile = async function (fileName, body) {
    const params = {
        Bucket: bucket,
        Key: fileName,
        Body: body
    };
    await s3.putObject(params).promise();
};
So, I need to get a file from S3, process it, and upload the result. The S3 loading is tested and works. However, I get an "EACCES" error when trying to start the "sox" process.
What could I be missing? Could it come from the fact that I built the binary on the latest Amazon Linux image while Lambda uses an older version?
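A minimal sketch of one common workaround, offered as an assumption rather than a confirmed fix for this case (the paths are illustrative): EACCES from execFileSync usually points to the bundled binary having lost its execute permission, so copying it into /tmp and restoring the bit before spawning it is worth trying:

const fs = require('fs');
const path = require('path');
const child_process = require('child_process');

// Copy the bundled binary into /tmp (the only writable location) and make it executable.
const soxSource = path.join(process.env.LAMBDA_TASK_ROOT, 'sox');
const soxBinary = '/tmp/sox';
fs.copyFileSync(soxSource, soxBinary);
fs.chmodSync(soxBinary, 0o755);

const result = child_process.execFileSync(soxBinary, ['/tmp/test.mp3', '/tmp/testOutput.mp3'], {
    encoding: 'ascii'
});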
I was struggling with the same thing for processing audio files. This npm package came in handy:
https://github.com/stojanovic/lambda-audio
If your command is,
sox input.mp3 -c 1 output.wav
your Node code will look like the following:
const lambdaAudio = require('lambda-audio');

lambdaAudio.sox('./input.mp3 -c 1 /tmp/output.wav')
    .then(response => {
        // Do something when the file was converted
    })
    .catch(errorResponse => {
        console.log('Error from the sox command:', errorResponse);
    });
and leave all the complex issues to the npm package.
Hope it helps.
Finally, I found that to ensure correct file permissions I have to create the zip package inside the Amazon Linux Docker image. So you need to run
npm i lambda-audio
there, then zip node_modules along with your index.js.
My Lambda package's zip root folder is:
node_modules // result of npm i lambda-audio
index.js
Working solution for the Node 8.10 runtime, with memory consumption optimizations:
process.env["PATH"] = process.env["PATH"] + ":" + process.env["LAMBDA_TASK_ROOT"];
const AWS = require("aws-sdk");
const s3 = new AWS.S3();
const lambdaAudio = require("lambda-audio");
let bucket;
exports.handler = async (event) => {
try {
bucket = event.bucket;
const { inputFileName, outputFileName } = event;
const fs = require("fs");
const inputStream = fs.createWriteStream("/tmp/input.mp3");
await getFileToStream(inputFileName, inputStream);
await lambdaAudio.sox("/tmp/input.mp3 /tmp/output.mp3 <YOUR FILTERS HERE>);
fs.unlinkSync("/tmp/input.mp3"); // removing unused input file
const outputStream = fs.createReadStream("/tmp/output.mp3");
await uploadFileFromStream(outputFileName, outputStream);
}
catch(err) {
// Logging
}
};
let getFileToStream = async function(fileName, writeStream) {
const params = {
Bucket: bucket,
Key: fileName
};
const readStream = s3.getObject(params).createReadStream();
const end = new Promise((resolve, reject) => {
writeStream.on('close', () => resolve());
writeStream.on('error', (err) => reject(err));
readStream.on('error', (err) => reject(err));
});
readStream.pipe(writeStream);
return end;
};
let uploadFileFromStream = async function(fileName, readStream) {
const params = {
Bucket: bucket,
Key: fileName,
Body: readStream
};
return await s3.upload(params).promise();
};

Read a file line by line using Lambda / S3

I want to read a file located on S3 line by line. I tried the following code, which I found searching online, but the Lambda function exits without invoking any of the readline callbacks. What am I doing wrong?
const aws = require('aws-sdk');
const s3 = new aws.S3({ apiVersion: '2006-03-01' });
const readline = require('readline');

exports.handler = async (event, context, callback) => {
    const bucket = event.Records[0].s3.bucket.name;
    const key = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, ' '));
    const params = {
        Bucket: bucket,
        Key: key,
    };
    const s3ReadStream = s3.getObject(params).createReadStream();
    const rl = readline.createInterface({
        input: s3ReadStream,
        terminal: false
    });
    rl.on('line', (line) => {
        console.log(`Line from file: ${line}`);
    });
    rl.on('error', () => {
        console.log('error');
    });
    rl.on('close', function () {
        console.log('closed');
        context.succeed();
    });
    console.log('done');
};
I've found the problem. It's been a while since I coded on Lambda, and I thought it would only exit when context was called. With an async handler it exits as soon as the handler returns, so I'm now waiting for the promise to be resolved (or rejected, which I'll implement later).
const aws = require('aws-sdk');
const s3 = new aws.S3({ apiVersion: '2006-03-01' });
const readline = require('readline');

exports.handler = async (event, context, callback) => {
    const bucket = event.Records[0].s3.bucket.name;
    const key = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, ' '));
    const params = {
        Bucket: bucket,
        Key: key,
    };
    const s3ReadStream = s3.getObject(params).createReadStream();
    const rl = readline.createInterface({
        input: s3ReadStream,
        terminal: false
    });
    let myReadPromise = new Promise((resolve, reject) => {
        rl.on('line', (line) => {
            console.log(`Line from file: ${line}`);
        });
        rl.on('error', () => {
            console.log('error');
        });
        rl.on('close', function () {
            console.log('closed');
            resolve();
        });
    });
    try { await myReadPromise; }
    catch (err) {
        console.log('an error has occurred');
    }
    console.log('done reading!');
};
getObject doesn't just return the object that was stored in S3. It returns a response object whose Body field holds the contents of the object stored in S3. See also the Response part of the documentation here.
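For example (a minimal sketch inside an async handler, reusing the s3 client and params from above), the object's contents have to be pulled out of the Body field:

const data = await s3.getObject(params).promise();
// data.Body is a Buffer holding the raw bytes of the S3 object
const text = data.Body.toString('utf-8');
console.log(`read ${text.split('\n').length} lines`);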

untar/decompress to a stream in node

I am trying to write an AWS Lambda that will take a tar.gz file from an S3 bucket, inflate it, and then unpack it while streaming the files back to another S3 bucket.
I have this code:
var AWS = require('aws-sdk');
var fs = require('fs');
var zlib = require('zlib');
var uuid = require('uuid/v4');
var tar = require('tar-stream');
var pack = tar.pack();
var s3 = new AWS.S3();

exports.handler = (event, context, callback) => {
    var bucket = event.Records[0].s3.bucket.name;
    var key = event.Records[0].s3.object.key;
    var file = 'S3://' + bucket + '/' + key;
    console.log(bucket);
    console.log(key);
    var readParams = {
        Bucket: bucket,
        Key: key
    };
    var dataStream = s3.getObject(readParams).createReadStream();
    var extract = tar.extract();
    extract.on('entry', function (header, stream, next) {
        console.log(header.name);
        var writeParams = {
            Bucket: process.env.JOB_PROCESSING_BUCKET,
            Key: uuid() + '-' + header.name,
            Body: stream
        };
        s3.upload(writeParams).
            on('httpUploadProgress', function (evt) {
                console.log('Progress:', evt.loaded, '/', evt.total);
            }).
            send(function (err, data) {
                if (err) console.log("An error occurred", err);
                console.log("Uploaded the file at", data.Location);
            });
        stream.on('end', function () {
            next(); // ready for next entry
        });
        stream.resume(); // just auto drain the stream
    });
    extract.on('finish', function () {
        // all entries read
    });
    dataStream.pipe(zlib.createGunzip()).pipe(extract);
    callback(null, 'Gunzip Lambda Function');
};
It pulls the file, handles the gunzipping, and then I can see each file being extracted on entry. The code then tries to stream each file to S3, which creates a 0 KB file, hangs around as if it's reading the stream, then continues on to the next entry.
Why can't it seem to read/process the stream body?
Is there a better way of doing this?
Thanks
I don't know if it's the best solution, but the following code works for me.
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
const tar = require('tar-stream');
const zlib = require('zlib');
const stream = require('stream');
const uuid = require('uuid');

exports.get = (event, context) => {
    var params = {
        Bucket: event.Records[0].s3.bucket.name,
        Key: event.Records[0].s3.object.key
    };
    var dataStream = s3.getObject(params).createReadStream();
    var extract = tar.extract();
    extract.on('entry', function (header, inputStream, next) {
        inputStream.pipe(uploadFromStream(s3, header));
        inputStream.on('end', function () {
            next(); // ready for next entry
        });
        inputStream.resume(); // just auto drain the stream
    });
    extract.on('finish', function () {
        // all entries read
    });
    dataStream.pipe(zlib.createGunzip()).pipe(extract);
};

function uploadFromStream(s3, header) {
    var pass = new stream.PassThrough();
    var writeParams = {
        Bucket: process.env.JOB_PROCESSING_BUCKET,
        Key: uuid.v1() + '-' + header.name,
        Body: pass
    };
    s3.upload(writeParams, function (err, data) {
        context.done(err, data);
    });
    return pass;
}
I tried for a couple of hours to get this to work; it turns out the 'end' event has been replaced with 'finish'. So the answer above works great, with just one small change:
inputStream.on('end', function () {
    next(); // ready for next entry
});
should be:
inputStream.on('finish', function () {
    next(); // ready for next entry
});

stream the contents of an S3 object into hash algorithm node.js

I'm new to node.js and I'm trying to write an AWS Lambda function that streams the content of an S3 object into Node's crypto module to create an MD5 checksum of the object. I'm not sure why, but every time I run the code it generates a different hash value in the console.log. Can anyone point me in the right direction to fix my code? I appreciate the help!
var crypto = require('crypto');
var fs = require('fs');
var AWS = require('aws-sdk');
var s3 = new AWS.S3();

exports.handler = (event, context, callback) => {
    var params = {
        Bucket: 'bucket_name',
        Key: 'key',
    };
    var hash = crypto.createHash('md5');
    var stream = s3.getObject(params, function (err, data) {
        if (err) {
            console.log(err);
            return;
        }
    }).createReadStream();
    stream.on('data', function (data) {
        hash.update(data, 'utf-8');
    });
    stream.on('end', function () {
        console.log(hash.digest('hex'));
    });
};
You were close. You are mixing the callback-style method signature with the createReadStream style. Try this:
const crypto = require('crypto');
const fs = require('fs');
const AWS = require('aws-sdk');
const s3 = new AWS.S3();

exports.handler = (event, context, callback) => {
    let params = {
        Bucket: 'bucket_name',
        Key: 'key',
    };
    let hash = crypto.createHash('md5');
    let stream = s3.getObject(params).createReadStream();
    stream.on('data', (data) => {
        hash.update(data);
    });
    stream.on('end', () => {
        let digest = hash.digest('hex');
        console.log(digest);
        callback(null, digest);
    });
};
Not directly an answer, but you can also get the MD5 hash stored as the ETag when uploading a file to S3.
const crypt = require('crypto');
const fs = require('fs').promises;
const aws = require('aws-sdk');

async function uploadFileToS3WithMd5Hash(bucket, filename, s3Key = null) {
    const data = await fs.readFile(filename);
    const md5Base64 = crypt.createHash("md5").update(data).digest('base64');
    if (!s3Key) {
        s3Key = filename;
    }
    /** Should you want to get the MD5 in hex format: */
    // const md5Hex = Buffer.from(md5Base64, 'base64').toString('hex');
    return new Promise((res, rej) => {
        const s3 = new aws.S3();
        s3.putObject({
            Bucket: bucket,
            Key: s3Key,
            Body: data,
            ContentMD5: md5Base64,
        }, (err, resp) => err ? rej(err) : res(resp));
    });
}

uploadFileToS3WithMd5Hash('your-own-bucket', 'file.txt')
    .then(console.log)
    .catch(console.error);
So by checking the ETag of an object on S3, you get the hex string of the file's MD5 hash.
In some cases (see this post by Dennis), the MD5 checksum is computed automatically upon upload.
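For instance (a minimal sketch inside an async function, assuming a single-part, non-KMS-encrypted upload and reusing the bucket and key names from the example above), the ETag can be read back with headObject and compared against a locally computed MD5:

const s3 = new aws.S3();
const head = await s3.headObject({ Bucket: 'your-own-bucket', Key: 'file.txt' }).promise();
const etagHex = head.ETag.replace(/"/g, ''); // the ETag comes back wrapped in double quotes
console.log('ETag (MD5 hex for single-part uploads):', etagHex);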

Resources