Working with createReadStream in node.js using async/await - node.js

After trying TFischer's answer, I am still having difficulty using fs.createReadStream to process my CSV file asynchronously:
return new Promise((resolve, reject) => {
  const promises = [];
  fs.createReadStream(inputFile)
    .pipe(csv())
    .on("data", row => promises.push(processData(row, myRepository)))
    .on("error", reject)
    .on("end", async () => {
      await Promise.all(promises);
      resolve();
    });
});

async function processData(row, myRepository) {
  console.log('Current row: ', row); // logs all CSV rows
  const record = await myRepository.findOne({where: {id: row.id}});
  console.log(record); // row.id is undefined here
  return record;
}
I want the row argument inside the processData(row, myRepository) function to be the current row of the CSV file being parsed, but it doesn't seem to work.
Can someone explain what is happening here?

If you just want to read chunks, createReadStream is your guy as it is already an async iterable:
const { createReadStream } = require("fs");
const { join } = require("path");

async function main() {
  const reader = createReadStream(join(__dirname, "index.html"), "utf8");
  for await (const chunk of reader) {
    console.log(chunk);
  }
}
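The same async-iterable property applies to the CSV pipeline from the question above, since a stream piped through csv() is itself a readable stream. Below is a minimal sketch under that assumption, reusing the question's processData and myRepository; processCsvFile is an assumed name, and csv() is assumed to be csv-parser:
const fs = require("fs");
const csv = require("csv-parser"); // assumption: csv() in the question is csv-parser

async function processCsvFile(inputFile, myRepository) {
  const records = [];
  // rows arrive one at a time, and each await finishes before the next row is read
  for await (const row of fs.createReadStream(inputFile).pipe(csv())) {
    records.push(await processData(row, myRepository));
  }
  return records;
}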

Related

How to break out of promise after number of requests-then chain in nodejs

I was trying to read a CSV file (900,000 rows) and pass it to a POST API. I am unable to pass the complete data at once because it is huge, so I want to pause the stream after every 100k rows, make the axios call, and then resume, to reduce the load on the server.
Please correct me if something is wrong, as I am very new to Node.js.
async function readcsv(path) {
  return new Promise((resolve, reject) => {
    let csvdata = [];
    fs.createReadStream(path)
      .on('error', () => {
        // handle error
      })
      .pipe(csv())
      .on('data', (row) => {
        csvdata.push([evname, uid, data])
        console.log(csvdata)
      })
      .on('end', async () => {
        resolve(csvdata)
      })
  })
}

readcsv('filename.csv').then(data => {
  let token = jwt.sign({payload: data}, secrete_key)
  axios.get(`${URL}?c=${comp_id}&jwt=${token}`).then(function (res) {
    console.log(res.data)
  }).catch(e => {
    console.log(e)
  })
})
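A hedged sketch of that pause/resume batching idea (not an answer from the thread): readCsvInBatches, sendBatch, and BATCH_SIZE below are assumed names, and the actual POST call is left to the caller.
const fs = require('fs');
const csv = require('csv-parser');

const BATCH_SIZE = 100000; // assumption: "pause after every 100k rows" from the question

// sendBatch is a hypothetical async function that POSTs one batch (e.g. via axios)
function readCsvInBatches(path, sendBatch) {
  return new Promise((resolve, reject) => {
    let batch = [];
    const stream = fs.createReadStream(path)
      .pipe(csv())
      .on('data', async (row) => {
        batch.push(row);
        if (batch.length >= BATCH_SIZE) {
          stream.pause();            // stop 'data' events while this batch is sent
          try {
            await sendBatch(batch);
            batch = [];
            stream.resume();
          } catch (err) {
            stream.destroy();
            reject(err);
          }
        }
      })
      .on('error', reject)
      .on('end', async () => {
        try {
          if (batch.length) await sendBatch(batch); // flush the last partial batch
          resolve();
        } catch (err) {
          reject(err);
        }
      });
  });
}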

nodejs async await inside createReadStream

I am reading a CSV file line by line and inserting/updating the rows in MongoDB. The expected output order is:
1. console.log(row);
2. console.log(cursor);
3. console.log("stream");
But I am getting output like:
1. console.log(row); console.log(row); console.log(row); console.log(row); ...
2. console.log(cursor);
3. console.log("stream");
Please let me know what I am missing here.
const csv = require('csv-parser');
const fs = require('fs');
var mongodb = require("mongodb");
var client = mongodb.MongoClient;
var url = "mongodb://localhost:27017/";
var collection;

client.connect(url, { useUnifiedTopology: true }, function (err, client) {
  var db = client.db("UKCompanies");
  collection = db.collection("company");
  startRead();
});

var cursor = {};

async function insertRec(row) {
  console.log(row);
  cursor = await collection.update({CompanyNumber: 23}, row, {upsert: true});
  if (cursor) {
    console.log(cursor);
  } else {
    console.log('not exist');
  }
  console.log("stream");
}

async function startRead() {
  fs.createReadStream('./data/inside/6.csv')
    .pipe(csv())
    .on('data', async (row) => {
      await insertRec(row);
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
}
In your startRead() function, the await insertRec() does not stop more data events from flowing while the insertRec() is processing. So, if you don't want the next data event to run until the insertRec() is done, you need to pause, then resume the stream.
async function startRead() {
  const stream = fs.createReadStream('./data/inside/6.csv')
    .pipe(csv())
    .on('data', async (row) => {
      try {
        stream.pause();
        await insertRec(row);
      } finally {
        stream.resume();
      }
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
}
FYI, you also need some error handling if insertRec() fails.
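For example, a minimal sketch of that error handling, assuming you want to log a failed row and keep going rather than abort the whole stream:
async function startRead() {
  const stream = fs.createReadStream('./data/inside/6.csv')
    .pipe(csv())
    .on('data', async (row) => {
      try {
        stream.pause();
        await insertRec(row);
      } catch (err) {
        // assumption: log and skip the failed row instead of stopping the stream
        console.error('insertRec failed for row', row, err);
      } finally {
        stream.resume();
      }
    })
    .on('error', (err) => {
      console.error('stream error', err);
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
}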
That is expected behavior in this case, because your data listener triggers insertRec asynchronously whenever data is available in the stream, which is why the first line of the insert method runs more or less in parallel for many rows. If you want to control this behavior, you can use the highWaterMark property (https://nodejs.org/api/stream.html#stream_readable_readablehighwatermark) when creating the read stream. This way you will get 1 record at a time, but I am not sure what your use case is.
Something like this:
fs.createReadStream(`somefile.csv`, {
  "highWaterMark": 1
})
Also, you are not awaiting your startRead method. I would wrap it in a promise and resolve it in the end listener, otherwise you will not know when the processing has finished. Something like:
function startRead() {
  return new Promise((resolve, reject) => {
    fs.createReadStream(`somepath`)
      .pipe(csv())
      .on("data", async row => {
        await insertRec(row);
      })
      .on("error", err => {
        reject(err);
      })
      .on("end", () => {
        console.log("CSV file successfully processed");
        resolve();
      });
  });
}
From Node 10+, a readable stream has the Symbol.asyncIterator property, which allows processing the stream with for await...of:
async function startRead() {
  const readStream = fs.createReadStream('./data/inside/6.csv');
  for await (const row of readStream.pipe(csv())) {
    await insertRec(row);
  }
  console.log('CSV file successfully processed');
}

Node.js: Return promises in a defined order

I have some hundreds of JSON files that I need to process in a defined sequence and write back the content as CSV in the same order as in the JSON files:
1. Write a CSV file with header
2. Collect an array of JSON files to process
3. Read the file and return an array with the required information
4. Append the CSV file, created under #1, with the information
5. Continue with the next JSON file at step #3
'use strict';

const glob = require('glob');
const fs = require('fs');
const fastcsv = require('fast-csv');
const readFile = require('util').promisify(fs.readFile);

function writeHeader(fileName) {
  return new Promise((resolve, reject) => {
    fastcsv
      .writeToStream(fs.createWriteStream(fileName), [['id', 'aa', 'bb']], {headers: true})
      .on('error', (err) => reject(err))
      .on('finish', () => resolve(true));
  });
}

function buildFileList(globPattern) {
  return new Promise((resolve, reject) => {
    glob(globPattern, (err, files) => {
      if (err) {
        reject(err);
      } else {
        resolve(files);
      }
    });
  });
}

function readFromFile(file) {
  return new Promise((resolve, reject) => {
    readFile(file, 'utf8', (err, data) => {
      if (err) {
        reject(err);
      } else {
        const obj = JSON.parse(data);
        const key = Object.keys(obj['776'])[0];
        const solarValues = [];
        obj['776'][key].map((item, i) => solarValues.push([i, item[0], item[1][0][0]]));
        resolve(solarValues);
      }
    });
  });
}

function csvAppend(fileName, rows = []) {
  return new Promise((resolve, reject) => {
    const csvFile = fs.createWriteStream(fileName, {flags: 'a'});
    csvFile.write('\n');
    fastcsv
      .writeToStream(csvFile, rows, {headers: false})
      .on('error', (err) => reject(err))
      .on('finish', () => resolve(true));
  });
}

writeHeader('test.csv')
  .then(() => buildFileList('data/*.json'))
  .then(fileList => Promise.all(fileList.map(item => readFromFile(item))))
  .then(result => Promise.all(result.map(item => csvAppend('test.csv', item))))
  .catch(err => console.log(err.message));
JSON examples:
https://gist.github.com/Sineos/a40718c13ad0834b4a0056091e3ac4ca
https://gist.github.com/Sineos/d626c3087074c23a073379ecef84a55c
Question
While the code basically works, my problem is that the CSV is not written back in a defined order but mixed up, as in an asynchronous process.
I tried various combinations with and without Promise.all, resulting in either pending promises or a mixed-up CSV file.
This is my first take on Node.js promises, so any input on how to do this correctly is greatly appreciated. Many thanks in advance.
This code should process your files in order. The second Promise.all starts all the csvAppend calls at once, so the appends race each other; using async/await with for...of runs them strictly in sequence instead:
async function processJsonFiles() {
  try {
    await writeHeader('test.csv');
    let fileList = await buildFileList('data/*.json');
    for (let file of fileList) {
      let rows = await readFromFile(file);
      await csvAppend('test.csv', rows);
    }
  } catch (err) {
    console.error(err.message);
  }
}

processJsonFiles();

awaiting completion of a highland stream

I am writing a small script to stream-download and process a number of sequentially named files from a URL. I am using highlandjs and have it working perfectly one file at a time. I am attempting to refactor this into a loop where I await the completion of one highland stream before starting another:
// my attempt to abstract the highland into a promise I can await
const processFile = async (url, path) => {
  const writeStream = fs.createWriteStream(path);
  return hl(request(url))
    .split()
    // various working transforms
    .map(splitDelim)
    .filter(filterSchedAOnly)
    .map(appendIdentity)
    .filter(filterOnlyIdentified)
    .map(convertToCSVsafeString)
    // end result should write to a file
    .pipe(writeStream)
    .toPromise();
  // also tried this
  // return new Promise((resolve, reject) => {
  //   writeStream.on("end", () => resolve());
  //   writeStream.on("error", () => reject());
  // });
};

(async () => {
  let i = 1;
  // infinite loop
  while (true) {
    const url = `http://.../${i}.txt`;
    const filePath = `${i}.processed.csv`;
    try {
      // does not work!
      await processFile(url, filePath);
    } catch (err) {
      console.log(err);
    }
    i++;
  }
})();
How should I wrap my processFile function so that I can await its completion before moving on to the next iteration?
This appears to be working:
function streamToPromise(stream) {
  return new Promise(function (resolve, reject) {
    // 'finish' fires on the writable stream once all data has been flushed
    // ('end' is the readable-side counterpart, which is why it is not used here)
    stream.on("finish", () => resolve());
    // stream.on("end", () => resolve());
    stream.on("error", () => reject());
  });
}

const processFile = async (url, path, i) => {
  const stream = hl(request(url))
    .split()
    // ......
    .pipe(fs.createWriteStream(path));
  return streamToPromise(stream);
};
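As a hedged usage sketch (not part of the answer), the calling loop could then await each file and, since the files are numbered sequentially, stop once a request fails. The break-on-error behavior is an assumption; the elided host from the question is kept as-is:
(async () => {
  let i = 1;
  while (true) {
    const url = `http://.../${i}.txt`;      // elided host kept as in the question
    const filePath = `${i}.processed.csv`;
    try {
      await processFile(url, filePath, i);  // resolves only after the file is fully written
    } catch (err) {
      console.log(`stopping at file ${i}:`, err.message);
      break;                                // assumption: a failure means there are no more files
    }
    i++;
  }
})();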

Getting CSV headers without reading whole Stream

I'm using the csv-parser node module to capture the headers of streaming CSV files; there's a 'headers' event built into the module that I'm using.
I'm looping through an array of CSV read streams that I pass to my getHeaders() function. Right after I capture the headers of a streaming CSV, I want to close that stream, then pass the next CSV stream into the function to get its headers, close it, and so on...
What is the best way to do this? I've tried a few different methods, including .destroy(), but I haven't found a solution that works. Maybe I am attempting to close the stream in the wrong spot? Here's what I have:
function getHeaders(dataStream) {
  return new Promise((resolve, reject) => {
    dataStream
      .pipe(csv())
      .on('headers', (headers) => {
        let headerArray = Array.prototype.slice.call(headers);
        resolve(headerArray);
        dataStream.destroy();
      })
      .on('error', (err) => {
        reject(`err from readingFile ${err}`);
      });
  });
}
How about reading the first two lines of the stream and then piping them into csv() as a readable stream?
Try this:
const lineReader = require('readline');
const Stream = require('stream');

function getHeaders(dataStream) {
  return new Promise((resolve, reject) => {
    let wantedLines = [];
    const rl = lineReader.createInterface({
      input: dataStream
    });
    rl.on('line', (line) => {
      if (wantedLines.length < 2) {
        wantedLines.push(line); // keep the header line plus one data line
      }
      if (wantedLines.length >= 2) {
        rl.close();
      }
    });
    rl.on('close', () => {
      let resolved = false;
      // a readable stream fed only with the captured lines
      const readableStream = new Stream.Readable({ read() {} });
      readableStream
        .pipe(csv())
        .on('headers', (headers) => {
          resolved = true;
          resolve(headers);
        })
        .on('error', reject)
        .on('close', () => {
          if (!resolved) resolve([]);
        });
      wantedLines.forEach(line => readableStream.push(line + '\n'));
      readableStream.push(null); // end the stream rather than destroying it mid-parse
    });
  });
}
I have not checked this, so comment if something is wrong.
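Since the question mentions looping through an array of CSV read streams, here is a minimal usage sketch under that assumption (collectAllHeaders and streams are hypothetical names, not from the question):
async function collectAllHeaders(streams) {
  const allHeaders = [];
  for (const dataStream of streams) {
    // each stream is fully handled before the next one starts
    allHeaders.push(await getHeaders(dataStream));
  }
  return allHeaders;
}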
