Getting CSV headers without reading whole Stream - node.js

I'm using the csv-parser node module to capture the headers of streaming csv files; there's a 'headers' event emitter I'm using that's built into the module.
I'm looping through an array of csv readstreams that I'm passing through my getHeaders() function. Right after I capture the headers of the streaming csv, I want to close that stream, then pass the next csv stream into the function so I can get those headers, then close the stream, and so on...
What is the best way to do this? I've tried a few different methods, including .destroy(), but I haven't found a solution that works. Maybe I am attempting to close the stream in the wrong spot? Here's what I have:
function getHeaders(dataStream) {
return new Promise((resolve, reject) => {
dataStream
.pipe(csv())
.on('headers', (headers) => {
let headerArray = Array.prototype.slice.call(headers);
resolve(headerArray);
dataStream.destroy();
})
.on('error', (err) => {
reject(`err from readingFile ${err}`)
})
});
}

How about reading first 2 lines of stream and then piping array to csv as readable stream?
Try this:
const lineReader = require('readline');
const Stream = require('stream')
function getHeaders(dataStream) {
return new Promise((resolve, reject) => {
let wantedLines = [];
lineReader.createInterface({
input: dataStream
});
lineReader.on('line', (line) => {
if(wantedLines.length >= 2) {
lineReader.close();
}
});
lineReader.on('close', () => {
let resolved = false;
const readableStream = new Stream.Readable();
readableStream
.pipe(csv())
.on('headers', (headers) => {
if (!resolved) resolved = true;
resolve(headers);
})
.on('error', reject);
.on('close', () => {
if (!resolved) resolve([]);
});
wantedLines.forEach(line => readableStream.push(line));
readableStream.destroy();
});
});
}
Have not checked, comment if something wrong.

Related

NodeJS cannot return data from busboy finish event

I am currently trying to develop a google cloud function to parse multipart files (excel format or csv) in order to populate the firestore database.
I am using busboy in a helper function to parse the file, convert it to json and return it to the main function.
Everything goes well until I am trying to return the parsed data. I thought the most logic way of doing was to return the data from the busboy 'finish' event but it seems not to return the data as once back in the main function it is undefined. I first thought of some issue related to asynchronous code execution but when I tried to only print the data in the busboy finish event it worked properly.
I've tried to find some related content online but unfortunately didnt success. Here is my helper function :
// Takes a multipart request and sends back redable data
const processRequest = (req) => {
const busboy = Busboy({headers: req.headers});
formats = ['application/vnd.ms-excel', 'text/csv', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'];
var finalData;
// fieldname is the request key name of the file
// file is the stream
// fname is the name of the fileq
busboy.on('file', (fieldname, file, fname) => {
// Checks if file is right format
if(!formats.includes(fname.mimeType)) throw new FileFormatError('File must be excel or csv');
bytes = [];
// Checks that the request key is the right one
if(fieldname == 'file') {
// Data is the actual bytes, adds it to the buffer each time received
file.on('data', (data) => {
bytes.push(data);
});
// Concatenates the bytes into a buffer and reads data given mimetype
file.on('end', async () => {
buffer = Buffer.concat(bytes);
if(fname.mimeType === 'application/vnd.ms-excel' ||
fname.mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet') {
workbook = XLSX.read(buffer, {type: 'buffer'});
json = excelToJson(workbook);
console.log(json);
finalData = json;
}
if (fname.mimeType === 'text/csv') {
var csv = [];
const stream = Readable.from(buffer.toString());
stream.pipe(CSV.parse({delimiter: ','}))
.on('error', (err) => {
console.log('csv parsing error');
console.log(err.message);
}).on('data', (row) => {
csv.push(row);
}).on('end', () => {
console.log('csv file properly processed');
console.log(csv);
// CSV PARSING LOGIC TO COME, JUST TESTING RIGHT NOW
finalData = csv;
});
}
});
}
});
busboy.on('finish', () => {
console.log('busboy finished');
return finalData;
// WHEN ONLY PRINTED THE DATA IS PRESENT AND DISPLAYS PROPERLY HERE
})
// Processes request body bytes
busboy.end(req.rawBody);
}
There must be something I am misunderstanding but as of yet I cannot point out what.
Thanks in advance for your time :)
You're not waiting for your CSV parsing to actually finish.
It would be better to refactor your async code to use async/await.
Since you're using libraries that might only support callback-style async, you'll need to do some new Promise wrapping yourself.
Understandably, I haven't tested the below code, but something like this...
/**
* Parse the given buffer as a CSV, return a promise of rows
*/
function parseCSV(buffer) {
return new Promise((resolve, reject) => {
const csv = [];
const stream = Readable.from(buffer.toString());
stream
.pipe("text/csv".parse({ delimiter: "," }))
.on("error", reject)
.on("data", (row) => csv.push(row))
.on("end", () => resolve(csv));
});
}
/**
* Parse the given buffer as a spreadsheet, return a promise
*/
async function parseSpreadsheet(mimeType, buffer) {
if (
mimeType === "application/vnd.ms-excel" ||
mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
) {
const workbook = XLSX.read(buffer, { type: "buffer" });
return excelToJson(workbook);
}
if (mimeType === "text/csv") {
return parseCSV(buffer);
}
throw new Error(`Unknown mime type ${mimeType}`);
}
/**
* Get the bytes of the field `fieldName` in the request.
*/
function getFileFromRequest(req, fieldName) {
return new Promise((resolve, reject) => {
const busboy = Busboy({ headers: req.headers });
busboy.on("file", (name, file, info) => {
// Only process the field we care about
if (name != fieldName) {
return;
}
const bytes = [];
file.on("data", (data) => bytes.push(data));
file.on("end", () =>
resolve({
info,
buffer: Buffer.concat(bytes),
}),
);
file.on("error", reject);
});
busboy.end(req.rawBody);
});
}
async function parseRequest(req) {
// (1) Get the file as a buffer
const { info, buffer } = await getFileFromRequest(req, "file");
// (2) Try parsing it as a spreadsheet
const data = await parseSpreadsheet(info.mimeType, buffer);
// (3) Do something with the data?
return data;
}

Working with createReadStream in node.js using async/await

After tried TFischer's response, I am still having difficulty with using fs.creadReadStream to process my csv file asynchronously:
return new Promise((resolve, reject) => {
const promises = [];
fs.createReadStream(inputFile)
.pipe(csv())
.on("data", row => promises.push(processData(row, myRepository)))
.on("error", reject)
.on("end", async () => {
await Promise.all(promises);
resolve();
});
});
async function processData(row, myRepository) {
console.log('Current row: ', row); // logs all CSV rows
const record = await myRepository.findOne({where: {id: row.id}});
console.log(record); // row.id is undefined here
return record;
}
I want to be able to use row argument inside processData(row, myRepository) function as the current row of the actual CSV file being parsed but it doesn't seem to work.
Can someone explain what is happening there?
If you just want to read chunks, createReadStream is your guy as it is already an async iterable:
async function main() {
const reader = createReadStream(join(__dirname, "index.html"), "utf8");
for await (const chunk of reader) {
console.log(chunk);
}
}

nodejs async await inside createReadStream

I am reading a CSV file line by line and inserting/updating in MongoDB. The expected output will be
1. console.log(row);
2. console.log(cursor);
3.console.log("stream");
But getting output like
1. console.log(row);
console.log(row); console.log(row); console.log(row); console.log(row); ............ ............
2. console.log(cursor);
3.console.log("stream");
Please let me know what i am missing here.
const csv = require('csv-parser');
const fs = require('fs');
var mongodb = require("mongodb");
var client = mongodb.MongoClient;
var url = "mongodb://localhost:27017/";
var collection;
client.connect(url,{ useUnifiedTopology: true }, function (err, client) {
var db = client.db("UKCompanies");
collection = db.collection("company");
startRead();
});
var cursor={};
async function insertRec(row){
console.log(row);
cursor = await collection.update({CompanyNumber:23}, row, {upsert: true});
if(cursor){
console.log(cursor);
}else{
console.log('not exist')
}
console.log("stream");
}
async function startRead() {
fs.createReadStream('./data/inside/6.csv')
.pipe(csv())
.on('data', async (row) => {
await insertRec(row);
})
.on('end', () => {
console.log('CSV file successfully processed');
});
}
In your startRead() function, the await insertRec() does not stop more data events from flowing while the insertRec() is processing. So, if you don't want the next data event to run until the insertRec() is done, you need to pause, then resume the stream.
async function startRead() {
const stream = fs.createReadStream('./data/inside/6.csv')
.pipe(csv())
.on('data', async (row) => {
try {
stream.pause();
await insertRec(row);
} finally {
stream.resume();
}
})
.on('end', () => {
console.log('CSV file successfully processed');
});
}
FYI, you also need some error handling if insertRec() fails.
That is expected behavior in this case because your on data listener triggers the insertRec asynchronously as and when data is available in stream. So that is why your first line of insert method is getting executed kind of in parallel. If you want to control this behavior you can use highWaterMark (https://nodejs.org/api/stream.html#stream_readable_readablehighwatermark) property while creating the read stream. This way you will get 1 record at a time but I am not sure what your use case is.
something like this
fs.createReadStream(`somefile.csv`, {
"highWaterMark": 1
})
Also you are not awaiting your startRead method. I would wrap it inside the promise and resolve it in end listener else you will not know when the processing got finished. Something like
function startRead() {
return new Promise((resolve, reject) => {
fs.createReadStream(`somepath`)
.pipe(csv())
.on("data", async row => {
await insertRec(row);
})
.on("error", err => {
reject(err);
})
.on("end", () => {
console.log("CSV file successfully processed");
resolve();
});
});
}
From Node 10+ ReadableStream got property Symbol.asyncIterator and is's allow processing stream using for-await-of
async function startRead() {
const readStream = fs.createReadStream('./data/inside/6.csv');
for await (const row of readStream.pipe(csv())) {
await insertRec(row);
}
console.log('CSV file successfully processed');
}

Node.js: Return promises in a defined order

I have some hundreds of JSON files that I need to process in a defined sequence and write back the content as CSV in the same order as in the JSON files:
Write a CSV file with header
Collect an array of JSON files to process
Read the file and return an array with the required information
Append the CSV file, created under #1, with the information
Continue with the next JSON file at step #3
'use strict';
const glob = require('glob');
const fs = require('fs');
const fastcsv = require('fast-csv');
const readFile = require('util').promisify(fs.readFile);
function writeHeader(fileName) {
return new Promise((resolve, reject) => {
fastcsv
.writeToStream(fs.createWriteStream(fileName), [['id', 'aa', 'bb']], {headers: true})
.on('error', (err) => reject(err))
.on('finish', () => resolve(true));
});
}
function buildFileList(globPattern) {
return new Promise((resolve, reject) => {
glob(globPattern, (err, files) => {
if (err) {
reject(err);
} else {
resolve(files);
}
});
});
}
function readFromFile(file) {
return new Promise((resolve, reject) => {
readFile(file, 'utf8', (err, data) => {
if (err) {
reject(err);
} else {
const obj = JSON.parse(data);
const key = Object.keys(obj['776'])[0];
const solarValues = [];
obj['776'][key].map((item, i) => solarValues.push([i, item[0], item[1][0][0]]));
resolve(solarValues);
}
});
});
}
function csvAppend(fileName, rows = []) {
return new Promise((resolve, reject) => {
const csvFile = fs.createWriteStream(fileName, {flags: 'a'});
csvFile.write('\n');
fastcsv
.writeToStream(csvFile, rows, {headers: false})
.on('error', (err) => reject(err))
.on('finish', () => resolve(true));
});
}
writeHeader('test.csv')
.then(() => buildFileList('data/*.json'))
.then(fileList => Promise.all(fileList.map(item => readFromFile(item))))
.then(result => Promise.all(result.map(item => csvAppend('test.csv', item))))
.catch(err => console.log(err.message));
JSON examples:
https://gist.github.com/Sineos/a40718c13ad0834b4a0056091e3ac4ca
https://gist.github.com/Sineos/d626c3087074c23a073379ecef84a55c
Question
While the code basically works, my problem is that the CSV is not written back in a defined order but mixed up like in an asynchronous process.
I tried various combinations with and without Promise.all resulting in either pending promises or mixed up CSV file.
This is my first take on Node.js Promises so every input on how to do it correctly is greatly appreciated. Many thanks in advance.
This code should process your files in order, we'll use async/await and for .. of to loop in sequence:
async function processJsonFiles() {
try {
await writeHeader('test.csv');
let fileList = await buildFileList('data/*.json');
for(let file of fileList) {
let rows = await readFromFile(file);
await csvAppend('test.csv', rows);
}
} catch (err) {
console.error(err.message);
}
}
processJsonFiles();

Node JS - Cannot get stream data without pipe on promise-ftp

Hi guys I'm facing problem with my Node.js api with Express when I'm trying to get files from FTP and then send then over my API as base64.
I'm using -> promise-ftp (https://www.npmjs.com/package/promise-ftp).
This is how endpoint looks like:
getData = (req, res, next) => {
const ftp = new PromiseFtp();
let data = [];
ftp.connect({host: 'xxxl',user: 'xxx',password: 'xxx'})
.then(() => {
return ftp.get('xxx.pdf');
}).then((stream) => {
return new Promise((resolve, reject) => {
stream.once('close', resolve);
stream.once('error', reject);
stream.pipe(fs.createReadStream('test.pdf'));
stream
.on('error', (err) => {
return res.send({errorMessage: err});
})
.on('data', (chunk) => data.push(chunk))
.on('end', () => {
const buffer = Buffer.concat(data);
label = buffer.toString('base64');
return res.send(label);
});
});
}).then(() => {
return ftp.end();
});
}
The problem is that I don't want to save this file localy next to api files and when I remove line stream.pipe(fs.createReadStream('test.pdf')); it doesn't work.
I'm not sure what pipe is doing here.
May you please help me?
readable.pipe(writable) is part of Node's Stream API, which transparently writes the data that is read from the readable into the writable stream, handling backpressure for you. Piping the data to the filesystem is unnecessary, and Express Response object implements the writable stream interface so you could just pipe the stream returned from the FTP promise directly to the res object.
getData = async (req, res) => {
const ftp = new PromiseFtp();
try {
await ftp.connect({host: 'xxxl',user: 'xxx',password: 'xxx'});
const stream = await ftp.get('xxx.pdf');
res.type('pdf');
await new Promise((resolve, reject) => {
res.on('finish', resolve);
stream.once('error', reject);
stream.pipe(res);
});
} catch(e) {
console.error(e);
} finally {
await ftp.end();
}
}
If you don't have a Node version that supports async/await, here's a Promise-only version:
getData = (req, res) => {
const ftp = new PromiseFtp();
ftp
.connect({host: 'xxxl',user: 'xxx',password: 'xxx'})
.then(() => ftp.get('xxx.pdf'))
.then(stream => {
res.type('pdf');
return new Promise((resolve, reject) => {
res.on('finish', resolve);
stream.once('error', reject);
stream.pipe(res);
});
})
.catch(e => {
console.error(e);
})
.finally(() => ftp.end());
}
Here you have a good use-case for using a Promise's finally()-method or a try/catch/finally block, which will ensure that ftp.end() is called even if an error occurs or not.
Note that I've deliberately left out sending the error back to clients as doing such things could possibly leak sensitive information. A better solution is to setup proper server-side logging with request context.

Resources