I am trying to extract images from a CSV file by doing the following:
Parsing/streaming in a large CSV file using csv-parse and the fs createReadStream method
Grabbing each line for processing using stream-transform
Extracting the image and other row data for processing using the async waterfall method
Downloading and writing the image to the server using request and the fs createWriteStream method
For some reason, after the data gets piped into createWriteStream, there is some case in which an async callback never gets called. I have run this same code using only request, without piping to createWriteStream, and it works. I've also run createWriteStream with a drain event, and then somehow it works. Can anyone explain this to me?
In the code below, request tries to pipe 14,970 images, but the createWriteStream close or finish events only fire 14,895 times, with error firing 0 times. Could this be a draining issue? Could highWaterMark be exceeded and a write failure occur undetected?
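For reference, this is roughly what manual backpressure handling looks like when writing to a stream yourself (a sketch only; pipe() is supposed to take care of this for you, and the function and variable names here are made up for illustration):
var fs = require('fs');

// Write an array of chunks, pausing whenever write() signals that the
// internal buffer is above highWaterMark, and resuming on 'drain'.
function writeChunks(chunks, filePath, done) {
  var ws = fs.createWriteStream(filePath);
  var i = 0;
  function writeNext() {
    while (i < chunks.length) {
      var ok = ws.write(chunks[i++]);
      if (!ok) {
        ws.once('drain', writeNext); // wait for the buffer to flush
        return;
      }
    }
    ws.end(done); // all chunks queued; done runs once 'finish' fires
  }
  writeNext();
}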
Here is my CSV line-reading code:
var first = true;
var parser = parse();
var transformer = transform((line, complete) => {
  if (!first)
    extractData(line, complete);
  else {
    first = false;
    complete(null);
  }
}, () => {
  console.log('Done: parseFile');
});
fs.createReadStream(this.upload.location).pipe(parser).pipe(transformer);
Here is the extractData function, whose required async callback doesn't always get called:
extractData(line, complete) {
  var now = new Date();
  var image = {
    createdAt: now,
    updatedAt: now
  };
  async.waterfall([
    next => { // Data Extraction
      async.forEachOf(line, (data, i, complete) => {
        if (i === 2) image.src = data;
        if (i === 3) image.importSrc = data;
        complete(null);
      }, err => {
        if (err) throw err;
        next(null);
      });
    },
    next => { // Download Image
      var file = fs.createWriteStream('public/' + image.src);
      var sendReq = request.get(image.importSrc);
      sendReq.on('response', response => {
        if (response.statusCode !== 200) {
          this.upload.report.image.errors++;
          return next(null);
        }
      });
      sendReq.on('error', err => {
        this.upload.report.image.errors++;
        next(null);
      });
      sendReq.pipe(file);
      file.on('finish', () => {
        this.upload.report.image.inserts++;
        file.close(next); // Close file and callback
      });
      file.on('error', err => {
        this.upload.report.image.errors++;
        next(null);
      });
    }
  ], err => {
    if (err) throw err;
    complete(null);
  });
}
As suggested by @mscdex below, I've also tried switching out finish for his replacement close approach.
file.close(next); is unnecessary as the file stream is closed automatically by default. What you can do instead is to listen for the close event to know when the file descriptor for the stream has been closed. So replace the entire finish event handler with:
file.on('close', () => {
  this.upload.report.image.inserts++;
  next(null);
});
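One more sketch that may be relevant here (an assumption about the failure mode, not a confirmed diagnosis): in the download step above, both the non-200 response branch and the file's finish/close handler can end up calling next, and async does not tolerate a step calling its callback twice. A small once-guard avoids that:
// Wrap next so it can only ever fire once, whichever event happens first.
function once(fn) {
  var called = false;
  return function () {
    if (!called) {
      called = true;
      fn.apply(null, arguments);
    }
  };
}

// usage inside the download step (illustrative):
// var done = once(next);
// sendReq.on('error', function () { done(null); });
// file.on('close', function () { done(null); });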
Related
I am trying to create a read stream, pipe the contents of a Word document's XML file to a write stream, and then read from that finished write stream. The problem I am running into is that on the first read-write-read sequence I get an [Error: ENOENT: no such file or directory, open] error. However, once the file has been created by that first attempt, the code runs smoothly and returns the pageCount value as expected.
I have tried to read from the completed file and return the pageCount value inside the 'finish' event, but that just leaves me with an undefined return value. As such, I am not sure what to do.
Any help would be appreciated for this struggling junior.
Update: the following code worked for me.
console.log("unzipping");
const createWordOutput = new Promise((resolve, reject) => {
console.log("end is firing");
fs.createReadStream(data)
.pipe(unzipper.Parse())
.on("entry", async function (entry) {
const fileName = entry.path;
const type = entry.type;
const size = entry.vars.uncompressedSize;
//docProps has the meta data of the document, the word/document.xml is the actual document
if (fileName === "docProps/app.xml") {
// console.log(fileName);
entry.pipe(fs.createWriteStream("./wordOutput")).on("finish", () => {
console.log("finished writing the file");
console.log("resolving");
return resolve();
});
//once the piping is completed and the XML structure is fully writen a 'finish' event is emitted. This event accepts a callback. Here I put the cb and call readTheFile on the ./output file. This successfully reads the metadata of each file
} else {
entry.autodrain();
}
});
});
await createWordOutput;
const pageCount = await readWordFile("./wordOutput");
if (pageCount === undefined) {
console.log("PAGECOUNT IS UNDEFINED");
}
console.log("logging page count in unzip the file");
console.log(pageCount);
return pageCount;
};```
The error is coming from readWordFile, because it runs before the stream is done.
You need to move the read into the 'finish' handler.
Try this:
console.log("unzipping");
let pageCount = "";
fs.createReadStream(data)
.pipe(unzipper.Parse())
.on("entry", function(entry) {
const fileName = entry.path;
const type = entry.type;
const size = entry.vars.uncompressedSize;
if (fileName === "docProps/app.xml") {
// console.log(fileName);
entry.pipe(fs.createWriteStream("./wordOutput")).on("finish", async() => {
console.log("finished writing the file");
// finished writing, do everything here, and return
pageCount = await readWordFile("./wordOutput");
if (pageCount === undefined) {
console.log("PAGECOUNT IS UNDEFINED");
}
console.log("logging page count in unzip the file");
console.log(pageCount);
return pageCount;
});
} else {
entry.autodrain();
}
});
};
const readWordFile = (data) => {
  return new Promise(async (resolve, reject) => {
    console.log("this is the data that readWordFile received");
    console.log(data);
    console.log("reading word file");
    const XMLData = await fsp.readFile(data, {
      encoding: "utf-8"
    });
    console.log(XMLData);
    const pageCount = XMLData.split("<Pages>")
      .join(",")
      .split("</Pages>")
      .join(",")
      .split(",")[1];
    console.log("getting page count from read word file");
    console.log(pageCount);
    resolve(pageCount);
  });
};
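As a side note, the Promise-constructor wrapper around an async arrow function isn't needed, since an async function already returns a promise. A functionally equivalent sketch, keeping the same split logic and assuming fsp is the promise-based fs API used above:
// Same behavior as readWordFile above, without the extra Promise wrapper.
const readWordFile = async (path) => {
  const XMLData = await fsp.readFile(path, { encoding: "utf-8" });
  // <Pages>...</Pages> in docProps/app.xml holds the page count
  return XMLData.split("<Pages>").join(",").split("</Pages>").join(",").split(",")[1];
};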
I updated the function to create the CSV file but now I'm getting an error:
In upload function
internal/streams/legacy.js:57
throw er; // Unhandled stream error in pipe.
^
Error: ENOENT: no such file or directory, open 'C:\Users\shiv\WebstormProjects\slackAPIProject\billingData\CSV\1548963844106output.csv'
var csvFilePath = '';
var JSONFilePath = '';

function sendBillingData() {
  var message = '';
  axios.get(url, {
    params: {
      token: myToken
    }
  }).then(function (response) {
    message = response.data;
    fields = billingDataFields;
    // saveFiles(message, fields, 'billingData/');
    saveFilesNew(message, fields, 'billingData/');
    var file = fs.createReadStream(__dirname + '/' + csvFilePath); // <-- make sure this path is correct
    console.log(__dirname + '/' + csvFilePath);
    uploadFile(file);
  })
  .catch(function (error) {
    console.log(error);
  });
}
The saveFilesNew function is:
function saveFilesNew(message, options, folder) {
  try {
    const passedData = message;
    var relevantData = '';
    if (folder == 'accessLogs/') {
      const loginsJSON = message.logins;
      relevantData = loginsJSON;
      console.log(loginsJSON);
    }
    if (folder == 'billingData/') {
      relevantData = passedData.members;
      const profile = passedData.members[0].profile;
    }
    // Save JSON to the output folder
    var date = Date.now();
    var directoryPath = folder + 'JSON/' + date + "output";
    JSONFilePath = directoryPath + '.json';
    fs.writeFileSync(JSONFilePath, JSON.stringify(message, null, 4), function (err) {
      if (err) {
        console.log(err);
      }
    });
    // parse JSON onto the CSV
    const json2csvParser = new Json2csvParser({ fields });
    const csv = json2csvParser.parse(relevantData);
    // console.log(csv);
    // function to process the CSV onto the file
    var directoryPath = folder + 'CSV/' + date + "output";
    csvFilePath = directoryPath + '.csv';
    let data = [];
    let columns = {
      real_name: 'real_name',
      display_name: 'display_name',
      email: 'email',
      account_type: 'account_type'
    };
    var id = passedData.members[0].real_name;
    console.log(id);
    console.log("messageLength is " + Object.keys(message.members).length);
    for (var i = 0; i < Object.keys(message.members).length; i++) {
      console.log("value of i is " + i);
      var display_name = passedData.members[i].profile.display_name;
      var real_name = passedData.members[i].profile.real_name_normalized;
      var email = passedData.members[i].profile.email;
      var account_type = 'undefined';
      console.log("name: " + real_name);
      if (passedData.members[i].is_owner) {
        account_type = 'Org Owner';
      }
      else if (passedData.members[i].is_admin) {
        account_type = 'Org Admin';
      }
      else if (passedData.members[i].is_bot) {
        account_type = 'Bot';
      }
      else account_type = 'User';
      data.push([real_name, display_name, email, account_type]);
    }
    console.log(data);
    stringify(data, { header: true, columns: columns }, (err, output) => {
      if (err) throw err;
      fs.writeFileSync(csvFilePath, output, function (err) {
        console.log(output);
        if (err) {
          console.log(err);
        }
        console.log('my.csv saved.');
      });
    });
  } catch (err) {
    console.error(err);
  }
}
The upload file function is:
function uploadFile(file) {
  console.log("In upload function");
  const form = new FormData();
  form.append('token', botToken);
  form.append('channels', 'testing');
  form.append('file', file);
  axios.post('https://slack.com/api/files.upload', form, {
    headers: form.getHeaders()
  }).then(function (response) {
    var serverMessage = response.data;
    console.log(serverMessage);
  });
}
So I think the error is caused by Node trying to upload the file before it has been created. I suspect this has something to do with the asynchronous nature of Node.js, but I can't work out how to rectify the code. Please let me know how to correct this, and mention any improvements to the code structure/design too.
Thanks!
You don't wait for the callback provided to stringify to be executed, and that's where you create the file. (Assuming this stringify function really does accept a callback.)
Using callbacks (you can make this cleaner with promises and these neat async/await controls, but let's just stick to callbacks here), it should be more like:
function sendBillingData() {
  ...
  // this callback we'll use to know when the file writing is done, and to get the file path
  saveFilesNew(message, fields, 'billingData/', function (err, csvFilePathArgument) {
    // this we will execute when saveFilesNew calls it, not when saveFilesNew returns, see below
    uploadFile(fs.createReadStream(__dirname + '/' + csvFilePathArgument));
  });
}
// let's name this callback... "callback".
function saveFilesNew(message, options, folder, callback) {
  ...
  var csvFilePath = ...; // local variable only instead of your global
  ...
  stringify(data, { header: true, columns: columns }, (err, output) => {
    if (err) throw err; // or return callback(err);
    fs.writeFile(csvFilePath, output, function (err) { // NOT writeFileSync, or no callback needed
      console.log(output);
      if (err) {
        console.log(err);
        // callback(err); may be a useful approach for error-handling at a higher level
      }
      console.log('my.csv saved.'); // yes, NOW the CSV is saved, not before this executes! Hence:
      callback(null, csvFilePath); // no error, clean process, pass the file path
    });
  });
  console.log("This line is executed before stringify's callback is called!");
  return; // implicitly, yes, yet still synchronous and that's why your version crashes
}
Using callbacks that are called only when the expected events happen (a file is done writing, a buffer/string is done transforming...) allows JS to keep executing code in the meantime. And it does keep executing code, so when you need data from an async operation, you have to tell JS that it must be done before your piece runs.
Also, since you can pass data when calling back (it's just a function), here I could avoid relying on a global csvFilePath. Using higher-level variables makes things monolithic: for example, you could not move saveFilesNew to a dedicated file where you keep your toolkit of file-related functions.
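If you do want the promise/async-await style mentioned earlier, a minimal sketch is to wrap the callback version shown above in a promise (the helper name here is made up for illustration):
// Wrap the callback-style saveFilesNew in a promise.
function saveFilesNewAsync(message, options, folder) {
  return new Promise((resolve, reject) => {
    saveFilesNew(message, options, folder, (err, csvFilePath) => {
      if (err) reject(err);
      else resolve(csvFilePath);
    });
  });
}

// Then, inside an async sendBillingData:
// const csvFilePath = await saveFilesNewAsync(message, fields, 'billingData/');
// uploadFile(fs.createReadStream(__dirname + '/' + csvFilePath));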
Finally, if your global process is like:
function aDayAtTheOffice() {
  sendBillingData();
  getCoffee();
}
then you don't need to wait for the billing data to be processed before starting to make coffee. However, if your boss told you that you could NOT get a coffee until the billing data was settled, then your process would look like:
function aDayAtTheOffice() {
  sendBillingData(function (err) {
    // if (err) let's do nothing here: you wanted a coffee anyway, right?
    getCoffee();
  });
}
(Note that having the potential error as the first arg and the data as the second arg of a callback is a convention, nothing mandatory.)
IMHO you should read about scope (the argument callback could be accessed at a time when the call to saveFilesNew was already done and forgotten!), and about the asynchronous nature of No... JavaScript. ;) (Sorry, probably not the best links, but they contain the meaningful keywords, and then Google is your buddy, your friend, your Big Brother.)
I've been using node-oracledb for a few months and I've managed to achieve what I have needed to so far.
I'm currently working on a search app that could potentially return about 2m rows of data from a single call. To ensure I don't get a disconnect between the browser and the server, I thought I would try queryStream so that there is a constant flow of data back to the client.
I implemented the queryStream example as-is, and this worked fine for a few hundred thousand rows. However, when the returned rows number more than one million, Node runs out of memory. By logging and watching both client and server log events, I can see that the client is way behind the server in terms of rows sent and received. So, it looks like Node is falling over because it's buffering so much data.
It's worth noting that at this point, my selectstream implementation is within a req/res function called via Express.
To return the data, I do something like....
stream.on('data', function (data) {
  rowcount++;
  let obj = new myObjectConstructor(data);
  res.write(JSON.stringify(obj.getJson()));
});
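(For reference, manual flow control at this level would look roughly like the sketch below, assuming the query stream supports the standard readable pause()/resume() API; the pipe-based approaches in the answers below avoid doing this by hand.)
stream.on('data', function (data) {
  rowcount++;
  let obj = new myObjectConstructor(data);
  // respect the response's buffer: pause the source when write() returns false
  if (!res.write(JSON.stringify(obj.getJson()))) {
    stream.pause();
    res.once('drain', function () {
      stream.resume();
    });
  }
});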
I've been reading about how streams and pipe can help with flow control, so what I'd like to do is pipe the results from the query to a) help with flow and b) pass the results through other functions before sending them back to the client.
E.g.
function getData(req, res) {
  var stream = myQueryStream(connection, query);
  stream
    .pipe(toSomeOtherFunction)
    .pipe(yetAnotherFunction)
    .pipe(res);
}
I've spent a few hours trying to find a solution or example that allows me to pipe results, but I'm stuck and need some help.
Apologies if I'm missing something obvious, but I'm still getting to grips with Node and especially streams.
Thanks in advance.
There's a bit of an impedance mismatch here. The queryStream API emits rows of JavaScript objects, but what you want to stream to the client is a JSON array. You basically have to add an open bracket to the beginning, a comma after each row, and a close bracket to the end.
I'll show you how to do this in a controller that uses the driver directly as you have done, instead of using separate database modules as I advocate in this series.
const oracledb = require('oracledb');

async function get(req, res, next) {
  try {
    const conn = await oracledb.getConnection();
    const stream = await conn.queryStream('select * from employees', [], {outFormat: oracledb.OBJECT});

    res.writeHead(200, {'Content-Type': 'application/json'});
    res.write('[');

    stream.on('data', (row) => {
      res.write(JSON.stringify(row));
      res.write(',');
    });

    stream.on('end', () => {
      res.end(']');
    });

    stream.on('close', async () => {
      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });

    stream.on('error', async (err) => {
      next(err);
      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });
  } catch (err) {
    next(err);
  }
}

module.exports.get = get;
Once you get the concepts, you can simplify things a bit with a reusable Transform class which allows you to use pipe in the controller logic:
const oracledb = require('oracledb');
const { Transform } = require('stream');

class ToJSONArray extends Transform {
  constructor() {
    super({objectMode: true});
    this.push('[');
  }

  _transform(row, encoding, callback) {
    if (this._prevRow) {
      this.push(JSON.stringify(this._prevRow));
      this.push(',');
    }
    this._prevRow = row;
    callback(null);
  }

  _flush(done) {
    if (this._prevRow) {
      this.push(JSON.stringify(this._prevRow));
    }
    this.push(']');
    delete this._prevRow;
    done();
  }
}

async function get(req, res, next) {
  try {
    const toJSONArray = new ToJSONArray();
    const conn = await oracledb.getConnection();
    const stream = await conn.queryStream('select * from employees', [], {outFormat: oracledb.OBJECT});

    res.writeHead(200, {'Content-Type': 'application/json'});

    stream.pipe(toJSONArray).pipe(res);

    stream.on('close', async () => {
      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });

    stream.on('error', async (err) => {
      next(err);
      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });
  } catch (err) {
    next(err);
  }
}

module.exports.get = get;
Rather than writing your own logic to create a JSON stream, you can use JSONStream to convert an object stream to (stringified) JSON before piping it to its destination (res, process.stdout, etc.). This saves the need to muck around with .on('data', ...) events.
In the example below, I've used pipeline from Node's stream module rather than the .pipe method: the effect is similar (with better error handling, I think). To get objects from oracledb.queryStream, you can specify the option {outFormat: oracledb.OUT_FORMAT_OBJECT} (docs). Then you can make arbitrary modifications to the stream of objects produced. This can be done using a transform stream, made perhaps using through2-map, or, if you need to drop or split rows, through2. Below, the stream is sent to process.stdout after being stringified as JSON, but you could equally send it to Express's res.
require('dotenv').config() // config from .env file
const JSONStream = require('JSONStream')
const oracledb = require('oracledb')
const { pipeline } = require('stream')
const map = require('through2-map') // see https://www.npmjs.com/package/through2-map

oracledb.getConnection({
  user: process.env.DB_USER,
  password: process.env.DB_PASSWORD,
  connectString: process.env.CONNECT_STRING
}).then(connection => {
  pipeline(
    connection.queryStream(`
      select dual.*,'test' as col1 from dual
      union select dual.*, :someboundvalue as col1 from dual
      `
      , {"someboundvalue": "test5"} // binds
      , {
        prefetchRows: 150,   // for tuning
        fetchArraySize: 150, // for tuning
        outFormat: oracledb.OUT_FORMAT_OBJECT
      }
    )
    , map.obj((row, index) => {
      row.arbitraryModification = index
      return row
    })
    , JSONStream.stringify() // false gives ndjson
    , process.stdout // or send to express's res
    , (err) => { if (err) console.error(err) }
  )
})
// [
// {"DUMMY":"X","COL1":"test","arbitraryModification":0}
// ,
// {"DUMMY":"X","COL1":"test5","arbitraryModification":1}
// ]
I am making use of "socket.io-client" and "socket.io-stream" to make a request and then stream some data. I have the following code that handles this logic.
Client Server Logic
router.get('/writeData', function(req, res) {
  var io = req.app.get('socketio');
  var nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
  var nameNodeData = {};

  async.waterfall([
    checkForDataNodes,
    readFileFromS3
  ], function(err, result) {
    if (err !== null) {
      res.json(err);
    } else {
      res.json("Finished Writing to DN's");
    }
  });

  function checkForDataNodes(cb) {
    nameNodeSocket.on('nameNodeData', function(data) {
      nameNodeData = data;
      console.log(nameNodeData);
      cb(null, nameNodeData);
    });
    if (nameNodeData.numDataNodes === 0) {
      cb("No datanodes found");
    }
  }

  function readFileFromS3(nameNodeData, cb) {
    for (var i in nameNodeData['blockToDataNodes']) {
      var IP = nameNodeData['blockToDataNodes'][i]['ipValue'];
      var dataNodeSocket = io.connect('http://' + IP + ":5000");
      var ss = require("socket.io-stream");
      var stream = ss.createStream();
      var byteStartRange = nameNodeData['blockToDataNodes'][i]['byteStart'];
      var byteStopRange = nameNodeData['blockToDataNodes'][i]['byteStop'];
      paramsWithRange['Range'] = "bytes=" + byteStartRange.toString() + "-" + byteStopRange.toString();
      //var file = require('fs').createWriteStream('testFile' + i + '.txt');
      var getFileName = nameNodeData['blockToDataNodes'][i]['key'].split('/');
      var fileData = {
        'mainFile': paramsWithRange['Key'].split('/')[1],
        'blockName': getFileName[1]
      };
      ss(dataNodeSocket).emit('sendData', stream, fileData);
      s3.getObject(paramsWithRange).createReadStream().pipe(stream);
      //dataNodeSocket.disconnect();
    }
    cb(null);
  }
});
Server Logic (that gets the data)
var dataNodeIO = require('socket.io')(server);
var ss = require("socket.io-stream");

dataNodeIO.on('connection', function(socket) {
  console.log("Successfully connected!");
  ss(socket).on('sendData', function(stream, data) {
    var IP = data['ipValue'];
    var blockName = data['blockName'];
    var mainFile = data['mainFile'];

    dataNode.makeDir(mainFile);
    dataNode.addToReport(mainFile, blockName);
    stream.pipe(fs.createWriteStream(mainFile + '/' + blockName));
  });
});
How can I properly disconnect the connections in the readFileFromS3 function? I have noticed that calling dataNodeSocket.disconnect() at the end does not work, as I cannot verify the data was received on the second server. But if I comment it out, I can see the data being streamed to the second server.
My objective is to close the connections on the client server side.
It appears that the main problem with closing the socket is that you weren't waiting for the stream to be done writing before trying to close the socket. So, because the writing is all asynchronous and finishes sometime later, you were trying to close the socket before the data had been written.
Also, because you were putting asynchronous operations inside a for loop, you were running all your operations in parallel, which may not be exactly what you want: it makes error handling more difficult and puts more load on the server.
Here's the code I would suggest that does the following:
1. Create a function streamFileFromS3() that streams a single file and returns a promise that will notify when it's done.
2. Use await in a for loop with streamFileFromS3() to serialize the operations. You don't have to serialize them, but then you would have to change your error handling to cope with one operation failing while the others are still running, and you'd have to be more careful about concurrency issues.
3. Use try/catch to catch any errors from streamFileFromS3().
4. Add error handling on the stream.
5. Change all occurrences of data['propertyName'] to data.propertyName. The only time you need brackets is if the property name contains a character that is not allowed in a JavaScript identifier, or if the property name is in a variable. Otherwise, dot notation is preferred.
6. Add socket.io connection error handling logic for both socket.io connections.
7. Set the returned status to 500 when there's an error processing the request.
So, here's the code for that:
const ss = require("socket.io-stream");

router.get('/writeData', function(req, res) {
  const io = req.app.get('socketio');

  function streamFileFromS3(ip, data) {
    return new Promise((resolve, reject) => {
      const dataNodeSocket = io.connect(`http://${ip}:5000`);
      dataNodeSocket.on('connect_error', reject);
      dataNodeSocket.on('connect_timeout', () => {
        reject(new Error(`timeout connecting to http://${ip}:5000`));
      });
      dataNodeSocket.on('connect', () => {
        // dataNodeSocket connected now
        const stream = ss.createStream().on('error', reject);
        paramsWithRange.Range = `bytes=${data.byteStart}-${data.byteStop}`;
        const filename = data.key.split('/')[1];
        const fileData = {
          'mainFile': paramsWithRange.Key.split('/')[1],
          'blockName': filename
        };
        ss(dataNodeSocket).emit('sendData', stream, fileData);
        // get S3 data and pipe it to the socket.io stream
        s3.getObject(paramsWithRange).createReadStream().on('error', reject).pipe(stream);
        stream.on('close', () => {
          dataNodeSocket.disconnect();
          resolve();
        });
      });
    });
  }

  function connectError(msg) {
    res.status(500).send(`Error connecting to ${NAMENODE_ADDRESS}`);
  }

  const nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
  nameNodeSocket.on('connect_error', connectError).on('connect_timeout', connectError);

  nameNodeSocket.on('nameNodeData', async (nameNodeData) => {
    try {
      for (let item of nameNodeData.blockToDataNodes) {
        await streamFileFromS3(item.ipValue, item);
      }
      res.json("Finished Writing to DN's");
    } catch(e) {
      res.status(500).json(e);
    }
  });
});
Other notes:
I don't know what paramsWithRange is, as it is not declared here. When you were doing everything in parallel, it was getting shared among all the connections, which is asking for a concurrency issue. In my serialized implementation it's probably safe to share, but the way it is now bothers me, as it's a concurrency issue waiting to happen.
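One way to sidestep that (a sketch; the property names simply follow the snippets above) is to build the request params per call rather than mutating a shared object:
// Build a fresh params object for each block instead of mutating paramsWithRange.
function makeRangeParams(baseParams, item) {
  return Object.assign({}, baseParams, {
    Range: `bytes=${item.byteStart}-${item.byteStop}`
  });
}

// inside streamFileFromS3:
// const params = makeRangeParams(baseParams, data);
// s3.getObject(params).createReadStream().on('error', reject).pipe(stream);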
This feels like an obvious question but it's perplexing me: I want a Node function that downloads a resource at a URI. I need it to work for several different content types without the user needing to specify which type it is.
I know how to pipe request to fs.createWriteStream when you know it's going to be an image, but not how to handle it when you've already invoked the callback from request. Here's where I am:
var request = require('request'),
    fs = require('graceful-fs');

function cacheURI(uri, cache_path, cb) {
  request(uri, function(err, resp, body) {
    var content_type = resp.headers['content-type'].toLowerCase().split("; ")[0],
        type = content_type.split("/")[0],
        sub_type = content_type.split("/")[1];

    if (sub_type == "json") {
      body = JSON.parse(body);
    }

    if (type == "image") {
      // this is where the trouble starts
      var ws = fs.createWriteStream(cache_path);
      ws.write(body);
      ws.on('close', function() {
        console.log('image done');
        console.log(resp.socket.bytesRead);
        ws.end();
        cb();
      });
    } else {
      // this works fine for text resources
      fs.writeFile(cache_path, body, cb);
    }
  });
}
This answer to a previous question suggests the following:
request.get({url: 'https://someurl/somefile.torrent', encoding: 'binary'}, function (err, response, body) {
  fs.writeFile("/tmp/test.torrent", body, 'binary', function(err) {
    if (err)
      console.log(err);
    else
      console.log("The file was saved!");
  });
});
But I can't pass "binary" to request if I don't yet know the type of response I'll get.
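For what it's worth, one common workaround (a sketch only, using request's documented encoding: null option so the body always arrives as a Buffer, which works for both text and binary) looks like this:
function cacheURI(uri, cache_path, cb) {
  // encoding: null keeps the body as a Buffer instead of a utf-8 string
  request({ uri: uri, encoding: null }, function (err, resp, body) {
    if (err) return cb(err);
    var content_type = resp.headers['content-type'].toLowerCase().split("; ")[0];
    if (content_type.split("/")[0] === "image") {
      fs.writeFile(cache_path, body, cb); // body is a Buffer, written as-is
    } else {
      fs.writeFile(cache_path, body.toString("utf8"), cb); // text/JSON resources
    }
  });
}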
UPDATE
Per the suggested answer, changing "close" to "finish" in the event handler does fire the callback:
if (opts.image) {
  var ws = fs.createWriteStream(opts.path);
  ws.on('finish', function() {
    console.log('image done');
    console.log(resp.socket.bytesRead);
  });
  // tried as buffer as well
  // ws.write(new Buffer(body));
  ws.write(body);
  ws.end();
}
This does write the image file, but not correctly.
As suggested here, try using the finish event (if you have Node >= v0.10):
ws.on('finish', function() {
  console.log('image done');
  console.log(resp.socket.bytesRead);
  ws.end();
  cb();
});