nodejs async await inside createReadStream - node.js

I am reading a CSV file line by line and inserting/updating records in MongoDB. The expected output for each row is:
1. console.log(row);
2. console.log(cursor);
3. console.log("stream");
But I am getting output like:
1. console.log(row); console.log(row); console.log(row); console.log(row); ............
2. console.log(cursor);
3. console.log("stream");
That is, all the rows are logged first, and only then the cursor and "stream" lines appear. Please let me know what I am missing here.
const csv = require('csv-parser');
const fs = require('fs');
var mongodb = require("mongodb");
var client = mongodb.MongoClient;
var url = "mongodb://localhost:27017/";
var collection;

client.connect(url, { useUnifiedTopology: true }, function (err, client) {
  var db = client.db("UKCompanies");
  collection = db.collection("company");
  startRead();
});

var cursor = {};

async function insertRec(row) {
  console.log(row);
  cursor = await collection.update({ CompanyNumber: 23 }, row, { upsert: true });
  if (cursor) {
    console.log(cursor);
  } else {
    console.log('not exist');
  }
  console.log("stream");
}

async function startRead() {
  fs.createReadStream('./data/inside/6.csv')
    .pipe(csv())
    .on('data', async (row) => {
      await insertRec(row);
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
}

In your startRead() function, the await insertRec() does not stop more data events from flowing while insertRec() is processing. So, if you don't want the next data event to run until insertRec() is done, you need to pause the stream and then resume it:
async function startRead() {
  const stream = fs.createReadStream('./data/inside/6.csv')
    .pipe(csv())
    .on('data', async (row) => {
      try {
        stream.pause();
        await insertRec(row);
      } finally {
        stream.resume();
      }
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
}
FYI, you also need some error handling if insertRec() fails.
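For example, a minimal sketch of the same startRead() with basic error handling added (the log messages are illustrative, not from the original); destroying the stream after a failed insert stops further data and end events:
async function startRead() {
  const stream = fs.createReadStream('./data/inside/6.csv')
    .pipe(csv())
    .on('data', async (row) => {
      stream.pause();
      try {
        await insertRec(row);
        stream.resume(); // only keep reading once this row was inserted successfully
      } catch (err) {
        console.error('insertRec failed', err); // message text is illustrative
        stream.destroy(err); // stop emitting further data/end events
      }
    })
    .on('error', (err) => {
      console.error('CSV processing aborted', err);
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
}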

That is expected behavior in this case, because your data listener triggers insertRec asynchronously as and when data becomes available on the stream. That is why the first line of your insert method keeps getting executed, effectively in parallel. If you want to control this behavior you can use the highWaterMark property (https://nodejs.org/api/stream.html#stream_readable_readablehighwatermark) when creating the read stream. Note that for file streams highWaterMark is measured in bytes, so it limits how much data is buffered at a time rather than guaranteeing one record per data event; I am not sure what your use case is.
Something like this:
fs.createReadStream(`somefile.csv`, {
  highWaterMark: 1
})
Also, you are not awaiting your startRead method. I would wrap it inside a promise and resolve it in the end listener, otherwise you will not know when the processing has finished. Something like:
function startRead() {
  return new Promise((resolve, reject) => {
    fs.createReadStream(`somepath`)
      .pipe(csv())
      .on("data", async row => {
        await insertRec(row);
      })
      .on("error", err => {
        reject(err);
      })
      .on("end", () => {
        console.log("CSV file successfully processed");
        resolve();
      });
  });
}

Since Node 10+, readable streams have a Symbol.asyncIterator property, which allows processing a stream with for-await-of:
async function startRead() {
  const readStream = fs.createReadStream('./data/inside/6.csv');
  for await (const row of readStream.pipe(csv())) {
    await insertRec(row);
  }
  console.log('CSV file successfully processed');
}
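For completeness, a small sketch of calling it (the log messages are illustrative); with this version any error from the stream or from insertRec() surfaces as a rejected promise:
startRead()
  .then(() => console.log('all rows inserted'))
  .catch((err) => console.error('processing failed', err));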

Related

how to access a local variable value outside the scope of its function in nodejs

I want to compare the data of two files, and for that I'm reading a file using the fs module. Since I want to compare the values, I thought I would store the value in an external variable, but when I do console.log(budget_details) I get nothing in the console. Please someone help. Please point out if my approach is wrong and whether we even need to do it this way in Node.js. I'm new to Node.js.
import csv from 'csv-parser'
import fs from 'fs';

let budget_details

const budgetProcessing = (budget_file_path) => {
  try {
    fs.createReadStream(budget_file_path)
      .pipe(csv())
      .on('data', (row) => {
        budget_details = row
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
      });
  } catch (error) {
    console.log(error)
  }
}

budgetProcessing('budget.csv')
console.log(budget_details)
Let's first explain why you don't get the expected result; it actually doesn't have anything to do with scope.
fs.createReadStream is not itself exactly asynchronous, but we then pipe the returned stream to csv-parser, which does event-based parsing. So even though you call budgetProcessing before console.log(budget_details), the stream reading has most likely not run yet and budget_details is still undefined.
To fix this, you could move the console.log(budget_details) to where it is set, like so:
let budget_details

const budgetProcessing = (budget_file_path) => {
  try {
    fs.createReadStream(budget_file_path)
      .pipe(csv())
      .on('data', (row) => {
        budget_details = row
        console.log(budget_details)
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
      });
  } catch (error) {
    console.log(error)
  }
}

budgetProcessing('budget.csv')
But then the variable itself wouldn't serve any real purpose, so instead you could do this:
const budgetProcessing = (budget_file_path, callback) => {
  try {
    fs.createReadStream(budget_file_path)
      .pipe(csv())
      .on('data', (row) => {
        callback(row)
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
      });
  } catch (error) {
    console.log(error)
  }
}

budgetProcessing('budget.csv', (budget_details) => {
  console.log(budget_details) // or anything with budget_details
})
Lastly, I want to make clear that the callback will be called for each row of the CSV, as specified in csv-parser's documentation.
Your code doesn't wait for the asynchronous work: anything with 'on' that takes a function indicates it is event-driven. You need something like:
import csv from 'csv-parser'
import fs from 'fs';

let budget_details

const budgetProcessing = (budget_file_path) => new Promise((resolve, reject) => {
  try {
    fs.createReadStream(budget_file_path)
      .pipe(csv())
      .on('data', (row) => {
        budget_details = row
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
        resolve()
      });
  } catch (error) {
    console.log(error)
    reject(error)
  }
})

budgetProcessing('budget.csv')
  .then(() => console.log(budget_details))
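Note that budget_details ends up holding only the last parsed row. If the goal is to compare all the values from the file, a variant could resolve with an accumulated array instead; a sketch (the rows name is mine, not from the original):
const budgetProcessing = (budget_file_path) => new Promise((resolve, reject) => {
  const rows = []; // illustrative accumulator, not in the original code
  fs.createReadStream(budget_file_path)
    .pipe(csv())
    .on('data', (row) => rows.push(row)) // collect every parsed row
    .on('error', reject)
    .on('end', () => resolve(rows));     // hand back the whole file at once
});

budgetProcessing('budget.csv')
  .then((budget_details) => console.log(budget_details))
  .catch((err) => console.log(err))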

In node.js, why is my data not getting passed back after a asynchronous file read using a Promise

I know for sure that my pullData module is getting the data back from the file read but the function calling it, though it has an await, is not getting the data.
This is the module (./initialise.js) that reads the data:
const fs = require('fs');

const getData = () => {
  return new Promise((resolve, reject) => {
    fs.readFile('./Sybernika.txt',
      { encoding: 'utf8', flag: 'r' },
      function (err, data) {
        if (err)
          reject(err);
        else
          resolve(data);
      });
  });
};

module.exports = { getData };
And this is where it gets called (app.js):
const init = require('./initialise');

const pullData = async () => {
  init.getData().then((data) => {
    return data;
  }).catch((err) => {
    console.log(err);
  });
};

const start = async () => {
  let data = await pullData();
  console.log(data);
}

start();
Putting console.log(data) just before return data in the resolve part of the call shows the data, so I know it's being read OK. However, that final console.log shows my data variable as being undefined.
Any suggestions?
It's either:
const pullData = async () => {
  return init.getData().then((data) => {
    return data;
  }).catch((err) => {
    console.log(err);
  });
};
or:
const pullData = async () =>
  init.getData().then((data) => {
    return data;
  }).catch((err) => {
    console.log(err);
  });
Both versions make sure the promise returned by the then/catch chain is passed back to the caller. Without that return, pullData() resolves with undefined, which is why the final console.log prints undefined.
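Equivalently, the same fix written with await instead of a then/catch chain (a sketch, not the original poster's code):
const pullData = async () => {
  try {
    // Returning the awaited value hands the file contents back to the caller.
    return await init.getData();
  } catch (err) {
    console.log(err);
  }
};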

Waiting for a write stream to complete before returning a function

I'm trying to grab several JSON files from Google Drive, then export them into objects within my code. I am using a write stream to create the file locally and then parse it with JSON.parse(). This is my current implementation:
function setDB(dbId, dbName) {
  let database = {};
  drive.files.get({
    fileId: dbId,
    alt: 'media'
  }, { responseType: 'stream' }).then(res => {
    console.log(`Writing to ${dbName}.`);
    const db = fs.createWriteStream(dbName);
    res.data.on('end', () => {
      console.log(`Done downloading file ${dbName}.`);
    }).on('error', err => {
      console.error(`Error downloading file ${dbName}.`);
      throw err;
    }).pipe(db).on('finish', () => {
      database = JSON.parse(fs.readFileSync(`./${dbName}`));
    });
  });
  return database;
}

let variable1 = setDB('variable1Id', 'variable1.json');
let variable2 = setDB('variable2Id', 'variable2.json');
let variable3 = setDB('variable3Id', 'variable3.json');
The issue here is that setDB returns its value before the file finishes downloading and the stream finishes writing to the file, so the caller never sees the parsed data. I understand that I should be using some kind of async/await, but I couldn't find how or where to place those. This might also not be the best way to go about this, and I'll gladly take any advice about making a better implementation, or making the code cleaner or more elegant. However, the main issue here is: how do I make it so that setDB returns JSON.parse(fs.readFileSync(`./${dbName}`))?
I rewrote your code to return a promise that resolves on the stream's finish event and rejects on a stream error. Your original code doesn't wait for the promise.
async function setDB(dbId, dbName) {
  const db = fs.createWriteStream(dbName);
  const res = await drive.files.get({
    alt: 'media',
    fileId: dbId,
  }, { responseType: 'stream' });
  console.log(`Writing to ${dbName}.`);
  return new Promise((resolve, reject) => {
    res.data
      .on('end', () => {
        console.log(`Done downloading file ${dbName}.`);
      })
      .on('error', (err) => {
        console.error(`Error downloading file ${dbName}.`);
        reject(err);
      })
      .pipe(db)
      .on('finish', async () => {
        try {
          // fs.promises.readFile returns a promise, unlike the callback-based fs.readFile
          const fileContent = await fs.promises.readFile(`./${dbName}`);
          resolve(JSON.parse(fileContent));
        } catch (err) {
          reject(err);
        }
      });
  });
}
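Since this version resolves with the parsed object, the call sites need to await it as well, for example (a sketch, assuming it runs inside an async function):
const variable1 = await setDB('variable1Id', 'variable1.json');
const variable2 = await setDB('variable2Id', 'variable2.json');
const variable3 = await setDB('variable3Id', 'variable3.json');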

Working with createReadStream in node.js using async/await

After trying TFischer's response, I am still having difficulty using fs.createReadStream to process my CSV file asynchronously:
return new Promise((resolve, reject) => {
  const promises = [];
  fs.createReadStream(inputFile)
    .pipe(csv())
    .on("data", row => promises.push(processData(row, myRepository)))
    .on("error", reject)
    .on("end", async () => {
      await Promise.all(promises);
      resolve();
    });
});

async function processData(row, myRepository) {
  console.log('Current row: ', row); // logs all CSV rows
  const record = await myRepository.findOne({ where: { id: row.id } });
  console.log(record); // row.id is undefined here
  return record;
}
I want to be able to use the row argument inside the processData(row, myRepository) function as the current row of the CSV file being parsed, but it doesn't seem to work.
Can someone explain what is happening there?
If you just want to read chunks, createReadStream is your guy as it is already an async iterable:
const { createReadStream } = require('fs');
const { join } = require('path');

async function main() {
  const reader = createReadStream(join(__dirname, "index.html"), "utf8");
  for await (const chunk of reader) {
    console.log(chunk);
  }
}
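The same pattern applies to the piped csv() stream from the question, so each parsed row reaches processData() one at a time (a sketch assuming csv() is csv-parser and reusing the question's processData/myRepository; the importCsv name is mine):
const fs = require('fs');
const csv = require('csv-parser'); // assumed parser, matching the earlier question

async function importCsv(inputFile, myRepository) {
  for await (const row of fs.createReadStream(inputFile).pipe(csv())) {
    await processData(row, myRepository); // the next row is not read until this resolves
  }
}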

Wait for callback of async function in last stream.on('data') event

I am using fast-csv to iterate over a CSV file using a stream. For each row of the CSV file, I want to create a job in Redis, for which I use kue. Parsing a row is a synchronous function. The whole thing looks like this:
var csvStream = fastCsv(_config.csvOptions)
  .on('data', function(data) {
    var stream = this;
    stream.pause();
    var payload = parseRow(data, _config);
    console.log(payload); // the payload is always printed to the console
    var job = kue.create('csv-row', {
      payload: payload
    })
    .save(function(err) {
      if (!err) console.log('Enqueued at ' + job.id);
      else console.log('Redis error ' + JSON.stringify(err));
      stream.resume();
    });
  })
  .on('end', function() {
    callback(); // this ends my parsing
  });
The simple console.log(payload); shows up for every row of my CSV file; however, the job is not created. I.e., none of the outputs in the callback of save are being printed, and the job is not in my Redis.
I assume, because it's the last row of the CSV file, the stream already emits end and so the last kue.create() cannot be executed before the process terminates?
Is there a way to halt the stream's end until kue is done?
You can solve this problem using the async library. You can use the pattern below for any stream.
var AsyncLib = require('async');

var worker = function (payload, cb) {
  // do something with the payload and call the callback when finished
  return cb();
};

var concurrency = 5;
var streamQueue = AsyncLib.queue(worker, concurrency);

var stream; // some readable stream

stream
  .on('data', function (data) {
    // no need to pause and resume
    var payload = data; // or whatever payload you build from the row
    streamQueue.push(payload);
  })
  .on('end', function () {
    // register the drain event on end and call back once the queue is empty
    streamQueue.drain = function () {
      callback();
    };
  });
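In this question the worker would be the place to create the kue job; a sketch using the kue.create() call from the question (everything else is taken from the question's code):
var worker = function (payload, cb) {
  var job = kue.create('csv-row', { payload: payload })
    .save(function (err) {
      if (!err) console.log('Enqueued at ' + job.id);
      else console.log('Redis error ' + JSON.stringify(err));
      cb(err); // tell the queue this item is finished
    });
};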
I have just found myself in the same situation, and I resolved it with a pattern similar to sync.WaitGroup from the Go language.
In its simplest form it looks like this (the function returns a Promise):
function process() {
  const locker = {
    items: 0,
    resolve: null,
    lock: function() { this.items++ },
    unlock: function() {
      if (this.items > 0) this.items--;
      if (this.items === 0) {
        this.resolve()
        this.resolve = null
      }
    },
  };

  return new Promise(function(resolve, reject) {
    locker.resolve = resolve;
    locker.lock();

    fastCsv(_config.csvOptions)
      .on('end', function() { locker.unlock(); })
      .on('data', function(data) {
        locker.lock();
        const payload = parseRow(data, _config);
        const job = kue.create('csv-row', { payload: payload })
          .save(function(err) {
            if (!err) console.log('Enqueued at ' + job.id);
            else console.log('Redis error ' + JSON.stringify(err));
            locker.unlock();
          });
      });
  });
}

process().then(function () {
  console.log('Now ALL data processed!');
});
In a real application you can extract the locker into a distinct class/module, add error handling, etc. But the principle is the same: wait until not only the stream is empty but also all non-blocking operations created in it have finished.
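A minimal sketch of what such an extracted helper could look like (the WaitGroup name and API are mine, modeled on Go's sync.WaitGroup, not part of the original answer):
// Hypothetical helper, modeled on Go's sync.WaitGroup.
class WaitGroup {
  constructor() {
    this.count = 0;
    this.resolvers = [];
  }
  add() { this.count++; }
  done() {
    if (this.count > 0) this.count--;
    if (this.count === 0) {
      // Wake up everyone waiting for the group to drain.
      this.resolvers.forEach((resolve) => resolve());
      this.resolvers = [];
    }
  }
  wait() {
    // Resolves immediately if nothing is pending, otherwise once the count drops to zero.
    if (this.count === 0) return Promise.resolve();
    return new Promise((resolve) => this.resolvers.push(resolve));
  }
}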
