Im' working with a pelias project and using package wof-admin-lookup to handle data that is read from a file.
There is a case, there is no valid data for being pushed to stream. The wof-admin-lookup will never end.
Here is my code:
const stream = recordStream.create(filePath)
.pipe(blacklistStream())
.pipe(adminLookup.create())
.pipe(model.createDocumentMapperStream())
.pipe(peliasDbclient())
stream
.on('data', data => {
count++
})
.on('finish', () => {
console.log(`Imported ${count} addresses`)
resolve()
})
.on('error', (e) => {
reject(e)
})
Here is the code in wof-admin-lookup:
module.exports = function(pipResolver, config) {
if (!pipResolver) {
throw new Error('valid pipResolver required to be passed in as the first parameter');
}
// pelias 'imports.adminLookup' config section
config = config || {};
const pipResolverStream = createPipResolverStream(pipResolver, config);
const end = createPipResolverEnd(pipResolver);
const stream = parallelTransform(config.maxConcurrentReqs || 1, pipResolverStream);
stream.on('end', end);
return stream;
};
Although the console logged "Imported 0 addresses" but the pipResolverStream will stay forever if I do not shut down it manually by Ctrl+C.
Update, this case only happens if there is no data passed through stream.
"the end event will never trigger without something like < /dev/null to generate that EOF. Otherwise the program waits for the terminal to send a ^D."
node.js: How to detect an empty stdin stream?
Related
i was trying to read content of one file and write it to another using nodeJS.everything works if i do not add fileStream.end(). Below is the code i have written:
const fs= require ('fs');
const readLine = require('readline');
//read file
const myInterface = readLine.createInterface({input:fs.createReadStream('./input/file.txt'),output: fs.createWriteStream('output.txt')});
//print to output
const fileStream=fs.createWriteStream('output1.txt');
//function to copy
let copyData=(line)=>{
//console.log(line);
fileStream.write(`copied: ${line}\n`);
}
//copy file
myInterface.on('line',copyData);
fileStream.end();
//print to console
fs.readFile('output1.txt','utf-8',(err,data)=>{
if(err)
console.log(`ERROR: ${err}`)
else
console.log(`readFile: ${data}`)
})
Thanks in advance! added the picture of the terminal output
You're calling filestream.end() synchronously before any of the reading and writing has completed.
When you do this:
myInterface.on('line',copyData);
you're just setting up an event handler and sometime IN THE FUTURE, the function copyData will be called with each line from the file. But, before any of that even happens, you call:
filestream.end()
which shuts down the writestream before anything has been written to it.
Similarly, you're also calling fs.readFile() before the reading/writing has completed.
This is all an event driven system. When you set up asynchronous operations like you have here and you want to do something upon completion of the operation, then you can't just call that code synchronously like you are here because the asynchronous events won't have finished yet. Instead, you need to register for completion events and trigger your "after" work on the completion events.
In this case, you can listen for the close event on the input stream.
const fs = require('fs');
const readLine = require('readline');
//read file
const myInterface = readLine.createInterface({ input: fs.createReadStream('./input/file.txt')});
//print to output
const fileStream = fs.createWriteStream('output1.txt');
//function to copy
let copyData = (line) => {
//console.log(line);
fileStream.write(`copied: ${line}\n`);
}
//copy file
myInterface.on('line', copyData);
// see when the read stream is done
myInterface.on('close', () => {
fileStream.end();
//print to console
fs.readFile('output1.txt', 'utf-8', (err, data) => {
if (err)
console.log(`ERROR: ${err}`)
else
console.log(`readFile: ${data}`)
})
});
// listen for errors too
myInterface.on('error', err => {
console.log(err);
});
Running NodeJS (v8.16.2) locally on the command line. I scrape an e-commerce website, gather the relevant information into a data-structure, then try to write it into a plain-text CSV file (my records don't have a fixed set of fields) manually by creating a write stream. This last step isn't working.
// Other stuff
const exitHandler = function(options, exitCode) {
if (exitCode || exitCode !== 0) console.log(exitCode);
// Other stuff
writeToCsv();
if (options.exit) process.exit();
}
const writeToCsv = function() {
let ws = fs.createWriteStream('./final-data.csv');
const crlf = '\n\r'; // might need to reverse this
// Please ignore the weird layout
for (let seller in finalData.sellers) {
ws.write('Seller:,' + seller + crlf + ',Brands:');
for (let brand of finalData.sellers[seller].brands) {
ws.write(',' + brand);
}
ws.write(crlf + ',Addresses:');
for (let addr of finalData.sellers[seller].addrs) {
ws.write(',"' + addr + '"');
}
ws.write(crlf);
}
ws.on('finish', () => {
console.log('Wrote all data'); // never prints this
});
ws.end();
}
process.on('exit', exitHandler.bind(null,{cleanup:true}));
I suspect this is because NodeJS exits before the data has been flushed to disk, but can't figure out a way to make NodeJS flush the data synchronously.
PS: new to NodeJS
please check out below example and integrate it
as per your comment i updated code
async function writeDataInCSV(filePath, dynamicHeader, data) {
const csvWriter = createCsvWriter({
path: filePath,
header: dynamicHeader
});
await csvWriter
.writeRecords(data)
.then(()=> console.log('The CSV file was written successfully'));
}
writeDataInCSV('out.csv',dynamicHeader, Data)
here set array of header and make data and pass to writeDataInCSV method
I've code similar to the following stripped down example
const req = request('http://www.my.url.here.com/file.bin') // 80 MB file
const decripher = .... // decipher from nodejs's crypto module
const output = fs.createWriteStream('result.zip');
const archive = archiver('zip', {zlib: { level: 9 } });
archive.pipe(output);
const stream = req
.pipe(decipher)
.on('error', (error) => {
console.error(`Error deciphering file`)
req.abort() // Does nothing
decipher.unpipe() // Unpiping to prevent the next step producing a [ERR_STREAM_WRITE_AFTER_END] error
stream.end() // End the stream as an error does not end it automatically
})
archive.append(stream, { name: 'file.bin' });
Once an error occurs deciphering the file I don't want to download any more data. But I've noticed that in these scenarios a req.abort() does nothing.
In the end I have a file partially decrypted in the archive but it's still ~80 MBs. i.e. The entire file has been downloaded despite the error (which I setup to fire near the start of the file).
Why would this occur? How can I prevent the entire file from downloading?
You can destroy the underlying socket. You can get the socket in socket or response event.
const req = request(options);
req.on('response', function(response) {
....
res.socket.end(); // or res.socket.destroy();
....
});
req.pipe(...);
Maybe in your case, modify a bit, this is basically a theory but you can do:
const req = request(options);
let sock = null;
req.on('socket', function(socket) {
sock = socket;
}).on('error', ()=>{
sock.destroy()//or end();
});
req.pipe(...);
I am making use of "socket.io-client" and "socket.io stream" to make a request and then stream some data. I have the following code that handles this logic
Client Server Logic
router.get('/writeData', function(req, res) {
var io = req.app.get('socketio');
var nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
var nameNodeData = {};
async.waterfall([
checkForDataNodes,
readFileFromS3
], function(err, result) {
if (err !== null) {
res.json(err);
}else{
res.json("Finished Writing to DN's");
}
});
function checkForDataNodes(cb) {
nameNodeSocket.on('nameNodeData', function(data) {
nameNodeData = data;
console.log(nameNodeData);
cb(null, nameNodeData);
});
if (nameNodeData.numDataNodes === 0) {
cb("No datanodes found");
}
}
function readFileFromS3(nameNodeData, cb) {
for (var i in nameNodeData['blockToDataNodes']) {
var IP = nameNodeData['blockToDataNodes'][i]['ipValue'];
var dataNodeSocket = io.connect('http://'+ IP +":5000");
var ss = require("socket.io-stream");
var stream = ss.createStream();
var byteStartRange = nameNodeData['blockToDataNodes'][i]['byteStart'];
var byteStopRange = nameNodeData['blockToDataNodes'][i]['byteStop'];
paramsWithRange['Range'] = "bytes=" + byteStartRange.toString() + "-" + byteStopRange.toString();
//var file = require('fs').createWriteStream('testFile' + i + '.txt');
var getFileName = nameNodeData['blockToDataNodes'][i]['key'].split('/');
var fileData = {
'mainFile': paramsWithRange['Key'].split('/')[1],
'blockName': getFileName[1]
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
s3.getObject(paramsWithRange).createReadStream().pipe(stream);
//dataNodeSocket.disconnect();
}
cb(null);
}
});
Server Logic (that gets the data)
var dataNodeIO = require('socket.io')(server);
var ss = require("socket.io-stream");
dataNodeIO.on('connection', function(socket) {
console.log("Succesfully connected!");
ss(socket).on('sendData', function(stream, data) {
var IP = data['ipValue'];
var blockName = data['blockName'];
var mainFile = data['mainFile'];
dataNode.makeDir(mainFile);
dataNode.addToReport(mainFile, blockName);
stream.pipe(fs.createWriteStream(mainFile + '/' + blockName));
});
});
How can I properly disconnect the connections in function readFileFromS3. I have noticed using dataNodeSocket.disconnect() at the end does not work as I cannot verify the data was received on the 2nd server. But if I comment it out, I can see the data being streamed to the second server.
My objective is to close the connections in Client Server side
It appears that the main problem with closing the socket is that you weren't waiting for the stream to be done writing before trying to close the socket. So, because the writing is all asynchronous and finishes sometime later, you were trying to close the socket before the data had been written.
Also because you were putting asynchronous operations inside a for loop, you were also running all your operations in parallel which may not be exactly what you want as it makes error handling more difficult and server load more difficult.
Here's the code I would suggest that does the following:
Create a function streamFileFromS3() that streams a single file and returns a promise that will notify when it's done.
Use await in a for loop with that streamFileFromS3() to serialize the operations. You don't have to serialize them, but then you would have to change your error handling to figure out what to do if one errors while the others are already running and you'd have to be more careful about concurrency issues.
Use try/catch to catch any errors from streamFileFromS3().
Add error handling on the stream.
Change all occurrences of data['propertyName'] to data.propertyName. The only time you need to use brackets is if the property name contains a character that is not allowed in a Javascript identifier or if the property name is in a variable. Otherwise, the dot notation is preferred.
Add socket.io connection error handling logic for both socket.io connections.
Set returned status to 500 when there's an error processing the request
So, here's the code for that:
const ss = require("socket.io-stream");
router.get('/writeData', function(req, res) {
const io = req.app.get('socketio');
function streamFileFromS3(ip, data) {
return new Promise((resolve, reject) => {
const dataNodeSocket = io.connect(`http://${ip}:5000`);
dataNodeSocket.on('connect_error', reject);
dataNodeSocket.on('connect_timeout', () {
reject(new Error(`timeout connecting to http://${ip}:5000`));
});
dataNodeSocket.on('connection', () => {
// dataNodeSocket connected now
const stream = ss.createStream().on('error', reject);
paramsWithRange.Range = `bytes=${data.byteStart}-${data.byteStop}`;
const filename = data.key.split('/')[1];
const fileData = {
'mainFile': paramsWithRange.Key.split('/')[1],
'blockName': filename
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
// get S3 data and pipe it to the socket.io stream
s3.getObject(paramsWithRange).createReadStream().on('error', reject).pipe(stream);
stream.on('close', () => {
dataNodeSocket.disconnect();
resolve();
});
});
});
}
function connectError(msg) {
res.status(500).send(`Error connecting to ${NAMENODE_ADDRESS}`);
}
const nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
nameNodeSocket.on('connect_error', connectError).on('connect_timeout', connectError);
nameNodeSocket.on('nameNodeData', async (nameNodeData) => {
try {
for (let item of nameNodeData.blockToDataNodes) {
await streamFileFromS3(item.ipValue, item);
}
res.json("Finished Writing to DN's");
} catch(e) {
res.status(500).json(e);
}
});
});
Other notes:
I don't know what paramsWithRange is as it is not declared here and when you were doing everything in parallel, it was getting shared among all the connections which is asking for a concurrency issue. In my serialized implementation, it's probably safe to share it, but the way it is now bothers me as it's a concurrency issue waiting to happen.
I am trying to extract images from a csv file by doing the following:
Parsing/streaming in a large csv file using csv-parse and the fs createReadStream method
Grabbing each line for processing using stream-transform
Extraction of image and other row data for processing using the async waterfall method.
Download and write image to server using request and the fs createWriteStream method
For some reason after the data gets piped into createWriteStream, there is some event in which an async callback never gets called. I have run this same code only using request, without piping to createWriteStream, and it works. I've also run createWriteStream w/ a drain event, and then some how it works? Can anyone explain this to me?
In the code below, request is trying to pipe 14,970 images, but the createWriteStream close or finish events only fire 14,895 times, with error firing 0 times. Could this be a draining issue? Could highWaterMark be exceeded and a write fail could be occurring undetected?
Here is my csv line getting code:
var first = true;
var parser = parse();
var transformer = transform( (line, complete) => {
if(!first)
extractData(line,complete)
else {
first = false;
complete(null);
}
},
() => {
console.log('Done: parseFile');
});
fs.createReadStream(this.upload.location).pipe(parser).pipe(transformer);
extractData function that doesn't always do a required async callback:
extractData(line,complete){
var now = new Date();
var image = {
createdAt: now,
updatedAt: now
};
async.waterfall([
next => { // Data Extraction
async.forEachOf(line, (data, i, complete) => {
if(i === 2) image.src = data;
if(i === 3) image.importSrc = data;
complete(null);
}, err => {
if(err) throw err;
next(null);
});
},
next => { // Download Image
var file = fs.createWriteStream('public/'+image.src);
var sendReq = request.get(image.importSrc);
sendReq.on('response', response => {
if (response.statusCode !== 200) {
this.upload.report.image.errors++;
return next(null);
}
});
sendReq.on('error', err => {
this.upload.report.image.errors++;
next(null);
});
sendReq.pipe(file);
file.on('finish', () => {
this.upload.report.image.inserts++;
file.close(next); // Close file and callback
});
file.on('error', err => {
this.upload.report.image.errors++;
next(null);
});
}
], err => {
if(err) throw err;
complete(null);
});
}
As suggested by #mscdex, I've also tried switching out finish for his replacement close approach.
file.close(next); is unnecessary as the file stream is closed automatically by default. What you can do instead is to listen for the close event to know when the file descriptor for the stream has been closed. So replace the entire finish event handler with:
file.on('close', () => {
this.upload.report.image.inserts++;
next(null);
});