Only process 500 lines/rows at a time with createReadStream - node.js

I have to read a really large CSV file, so I searched through Google and learned about createReadStream. I am writing a program that reads the CSV file data and inserts it into MongoDB.
The process I am following:
1. Read the data using createReadStream (I think it reads the file line by line).
2. Store the data in an array.
3. Insert the data into MongoDB using insertMany.
Now the problem is that the whole file is first stored in an array and only then inserted into the database.
What I think would be a better approach is to store only the first 500 lines/rows in an array, insert them into the DB, and then repeat the same steps for the next 500 records.
Is it possible to achieve this? And is it the right way to do it?
My program:
const test = async () => {
    const stream = fs.createReadStream(workerData)
        .pipe(parse())
        .on('data', async function(csvrow) {
            try {
                stream.pause()
                if (!authorName.includes(csvrow.author)) {
                    const author = new Author({ author: csvrow.author })
                    authorId = author._id
                    authorName.push(author.author)
                    authorData.push(author)
                }
                if (!companyName.includes(csvrow.company_name)) {
                    const company = new Company({ companyName: csvrow.company_name })
                    companyID = company._id
                    companyName.push(company.companyName)
                    companyData.push(company)
                }
                users = new User({
                    name: csvrow.firstname,
                    dob: csvrow.dob,
                    address: csvrow.address,
                    phone: csvrow.phone,
                    state: csvrow.state,
                    zip: csvrow.zip,
                    email: csvrow.email,
                    gender: csvrow.gender,
                    userType: csvrow.userType
                })
                userData.push(users)
                book = new Book({
                    book_number: csvrow.book_number,
                    book_name: csvrow.book_name,
                    book_desc: csvrow.book_desc,
                    user_id: users._id,
                    author_id: authorId
                })
                bookData.push(book)
                relationalData.push({
                    username: users.name,
                    author_id: authorId,
                    book_id: book._id,
                    company_id: companyID
                })
            } finally {
                stream.resume()
            }
        })
        .on('end', async function() {
            try {
                Author.insertMany(authorData)
                User.insertMany(userData)
                Book.insertMany(bookData)
                Company.insertMany(companyData)
                await Relational.insertMany(relationalData)
                parentPort.postMessage("true")
            } catch (e) {
                console.log(e)
                parentPort.postMessage("false")
            }
        })
}
test()
This program works fine and inserts the data into the DB, but I am looking for something like this:
const stream = fs.createReadStream(workerData)
    .pipe(parse())
    .on('data', async function(csvrow, maxLineToRead: 500) {
        // whole code/logic of insert data into DB
    })
so maxLineToRead is my imaginary term.
Basically, my point is that I want to process 500 rows at a time, insert them into the DB, and repeat this process until the end of the file.

You can create a higher-scoped array variable where you accumulate rows of data as they arrive in the data event. When you get to 500 rows, fire off your database operation to insert them and clear the array. If you're not yet at 500 rows, just add the next row to the array and wait for more data events.
Then, in the end event, insert any remaining rows still in the higher-scoped array.
In this way, you will insert 500 at a time and then however many are left at the end. This has an advantage over inserting them all at the end: you spread out the database load over the time you are parsing.
Here's an attempt to implement that type of processing. There are some unknowns (documented with comments) based on an incomplete description of exactly what you're trying to accomplish in some circumstances:
const test = () => {
    return new Promise((resolve, reject) => {
        const accumulatedRows = [];

        async function processRows(rows) {
            // initialize data arrays that we will insert
            const authorData = [],
                companyData = [],
                userData = [],
                bookData = [],
                relationalData = [];

            // this code still has a problem that I don't have enough context
            // to know how to solve
            // If authorName contains csvrow.author, then the variable
            // authorId is not initialized, but is used later in the code
            // This is a problem that needs to be fixed.
            // The same issue occurs for companyID
            for (let csvrow of rows) {
                let authorId, companyID;
                if (!authorName.includes(csvrow.author)) {
                    const author = new Author({ author: csvrow.author })
                    authorId = author._id
                    authorName.push(author.author)
                    authorData.push(author)
                }
                if (!companyName.includes(csvrow.company_name)) {
                    const company = new Company({ companyName: csvrow.company_name })
                    companyID = company._id
                    companyName.push(company.companyName)
                    companyData.push(company)
                }
                let users = new User({
                    name: csvrow.firstname,
                    dob: csvrow.dob,
                    address: csvrow.address,
                    phone: csvrow.phone,
                    state: csvrow.state,
                    zip: csvrow.zip,
                    email: csvrow.email,
                    gender: csvrow.gender,
                    userType: csvrow.userType
                });
                userData.push(users)
                let book = new Book({
                    book_number: csvrow.book_number,
                    book_name: csvrow.book_name,
                    book_desc: csvrow.book_desc,
                    user_id: users._id,
                    author_id: authorId
                });
                bookData.push(book)
                relationalData.push({
                    username: users.name,
                    author_id: authorId,
                    book_id: book._id,
                    company_id: companyID
                });
            }
            // all local arrays of data are populated now for this batch
            // so add this data to the database
            await Author.insertMany(authorData);
            await User.insertMany(userData);
            await Book.insertMany(bookData);
            await Company.insertMany(companyData);
            await Relational.insertMany(relationalData);
        }

        const batchSize = 500;
        const stream = fs.createReadStream(workerData)
            .pipe(parse())
            .on('data', async function(csvrow) {
                try {
                    accumulatedRows.push(csvrow);
                    if (accumulatedRows.length >= batchSize) {
                        stream.pause();
                        await processRows(accumulatedRows);
                        // clear out the rows we just processed
                        accumulatedRows.length = 0;
                        stream.resume();
                    }
                } catch (e) {
                    // calling destroy(e) will prevent leaking a stream
                    // and will trigger the error event to be called with that error
                    stream.destroy(e);
                }
            }).on('end', async function() {
                try {
                    // insert whatever rows are still left in the accumulator
                    await processRows(accumulatedRows);
                    resolve();
                } catch (e) {
                    reject(e);
                }
            }).on('error', (e) => {
                reject(e);
            });
    });
}

test().then(() => {
    parentPort.postMessage("true");
}).catch(err => {
    console.log(err);
    parentPort.postMessage("false");
});
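If you're on Node 10 or newer, the parsed stream is also async-iterable, so the same batching can be written without manual pause()/resume(). Here is a minimal sketch under that assumption, reusing the same parse() stream and a processRows() helper like the one above:

// A minimal sketch, assuming Node >= 10, the same parse() stream as above,
// and a processRows() helper like the one shown earlier.
// Async iteration applies backpressure automatically: the next row is not read
// until the awaited insert for the current batch has finished.
async function importInBatches(filePath, batchSize = 500) {
    let batch = [];
    for await (const csvrow of fs.createReadStream(filePath).pipe(parse())) {
        batch.push(csvrow);
        if (batch.length >= batchSize) {
            await processRows(batch);   // insert this batch before reading further
            batch = [];
        }
    }
    if (batch.length) {
        await processRows(batch);       // flush whatever is left at the end
    }
}

You would call it as importInBatches(workerData) and post the worker message when the returned promise settles.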

Related

Return search result while the user is typing with MongoDB

Create a feature where the user can see results based on the text in the input field.
I want to create a feature where the user can type in an input field and see results as they type. I implemented it and the code is below. Now, I know this code is not at all scalable, or even efficient (I am sending a DB request on every keystroke). I want to know how I can make this more efficient and possibly scalable.
const search = async (req, res) => {
    try {
        const input = req.query.text.replace(/ /g, '');
        if (!input) { return res.end() }
        const result = await Promise.all([
            User.find({ username: new RegExp(input, "i") }),
            Post.find({ description: new RegExp(input, "i") }),
        ]);
        // const users = await User.find({ username: new RegExp(input, 'i') });
        res.json(result)
    } catch (error) {
        console.log(error);
        res.status(500).json(error);
    }
};
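The usual first steps for this kind of type-ahead are to debounce on the client, so only one request is sent after the user pauses typing, and to cap the result size on the server by chaining .limit() onto each find(). A minimal sketch of the client-side debounce, where searchInput and renderResults are hypothetical names:

// A minimal client-side sketch (searchInput and renderResults are hypothetical names).
// Only one request is sent once the user has stopped typing for 300 ms.
let debounceTimer;
searchInput.addEventListener('input', (event) => {
    clearTimeout(debounceTimer);
    debounceTimer = setTimeout(() => {
        fetch('/search?text=' + encodeURIComponent(event.target.value))
            .then((res) => res.json())
            .then(renderResults);   // render whatever the server sent back
    }, 300);
});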

Node JS mongoose insert data at the beginning

I'm experimenting with MongoDB using Mongoose in NodeJS.
I have a simple web app where a user can create a post.
On creation this post is saved to MongoDB.
On the web page, I have a scroll event listener which checks whether the user is at the bottom of the page. If they are at the bottom, it fetches more posts from the backend.
I want these to be retrieved from the DB from newest to oldest, but the model.save() method of Mongoose always inserts a new document at the end of the collection.
So when a new post is created the backend does this right now:
const post = new Post({
    text: req.body.text,
    author: {
        userid: user._id,
        name: user.username,
        picPath: user.picPath
    },
    images: images
});
post.save(function (err) {
    let status = true;
    let message = "Post mentve." // Hungarian for "Post saved."
    if (err) { status = false; message = err; console.log(err) }
    return res.send({ status: status, msg: message });
})
This way, a new post is pushed onto the end of the collection, not unshifted onto the front.
When the client wants new posts the backend does this:
app.get('/dynamicPostLoad/:offset/:limit', async (req, res) => {
    let offset = req.params.offset;
    let limit = req.params.limit;
    let response = {
        status: true,
        posts: [],
        message: "Fetched"
    };
    await Post.find({}).skip(offset).limit(limit).then(products => {
        response.posts = products;
    }).catch((err) => {
        response.status = false;
        response.message = err;
    });
    return res.send(response);
});
So Mongoose will fetch from oldest to newest, since every new post is inserted at the end of the collection.
That way, the user sees the oldest post first and, as they scroll, keeps getting old posts instead of the newest ones.
I was thinking of three ways:
Either the Post.find({}) method should crawl the documents from the end of the collection, or the Post.save() method should unshift the document instead of pushing it, or I could find all the posts in the collection and reverse them (the last one would be painfully slow).
EDIT: Every post contains a creation date, so it could be sorted.
How can I achieve this?
I solved it with sort. (I still don't understand why I can't insert a document at the beginning of a collection.)
Here is my solution:
app.get('/dynamicPostLoad/:offset/:limit', async (req, res) => {
    let offset = req.params.offset;
    let limit = req.params.limit;
    let response = {
        status: true,
        posts: [],
        message: "Fetched"
    };
    // Sort every find by created date before limiting.
    await Post.find({}).sort({ created: -1 }).skip(offset).limit(limit).then(products => {
        response.posts = products;
    }).catch((err) => {
        response.status = false;
        response.message = err;
    });
    return res.send(response);
});
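Since every request now sorts by created, an index on that field keeps the sort from scanning the whole collection as the feed grows. A minimal sketch, assuming the Post model's schema is named postSchema:

// A minimal sketch (assumes the Post model's schema is named postSchema).
// With this index, sort({ created: -1 }) can walk the index instead of
// sorting all documents in memory on every request.
postSchema.index({ created: -1 });

Note also that skip() gets slower as the offset grows, so for very long feeds the usual alternative is a cursor-style query such as find({ created: { $lt: lastSeenDate } }).limit(limit).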

mongoose filter by multiple conditions and execute to update data

I am wondering what would be the best approach to writing schema functions using Mongoose. I have never used it, so the way I think about it is somewhat limited; the same goes for searching the docs, which is not very efficient without knowing what's available.
Through the docs I found that using findOneAndUpdate might solve the problem, but there are some constraints.
Here is the code I am planning to run:
models/Bookmark.js
const mongoose = require('mongoose')

const bookmarkItemSchema = new mongoose.Schema({
    restaurantId: String,
    cachedAttr: {
        name: String,
        latitude: Number,
        longitude: Number,
    },
})

const bookmarkListSchema = new mongoose.Schema({
    listName: String,
    items: [bookmarkItemSchema],
})

const bookmarkSchema = new mongoose.Schema({
    userId: {
        type: mongoose.Schema.Types.ObjectId,
        ref: 'User',
    },
    lists: [bookmarkListSchema],
})

// const add = (lists, userId) => {
//     let bookmark = Bookmark.findOne({userId})
//     bookmark.lists.listName === lists.listName //current, new
//         ? bookmark.lists.items.push(lists.items)
//         : bookmark.lists.push(lists)
//     return bookmark
// }

mongoose.model('Bookmark', bookmarkSchema)
Routes/bookmark.js
router.post('/bookmarks', async (req, res) => {
    const { lists } = req.body
    console.log(lists)
    if (!lists) {
        return res.status(422).send({ error: 'You must provide lists' })
    }
    let bookmark = Bookmark.findOne({ "userId": req.user._id })
    if (bookmark.lists.listName === lists.listName) {
        let item = lists.items
        bookmark.lists.items.push(item)
        await bookmark.save()
        res.send(bookmark)
    }
    try {
        // const bookmark = Bookmark.add(lists, req.user._id, obj)
        // await bookmark.save()
        // res.send(bookmark)
        let bookmark = Bookmark.findOne({ "userId": req.user._id })
        if (bookmark.lists.listName === lists.listName) { // THIS IS UNDEFINED. How to get this object?
            let item = lists.items
            bookmark.lists.items.push(item)
            await bookmark.save()
            res.send(bookmark)
        }
    } catch (e) {
        res.status(422).send({ error: e.message })
    }
})
The req.body looks like this:
{
    "lists": {
        "listName": "My Saved List",
        "items": {
            "restaurantId": "abcdefg",
            "cachedAttr": {
                "name": "abcdefg",
                "latitude": 200,
                "longitude": 200
            }
        }
    }
}
Basically, what I commented out in the models/Bookmark.js file is what I would really like to do.
If the user's list name already exists, then I would like to just add an item to that list.
Otherwise, I would like to add a new list to the object.
What is the best approach for doing this? Is there a straightforward Mongoose API that I could use for this problem, or do I need to make two separate functions that handle each case, turn them into schema methods, and handle it in the routes file?
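One straightforward way is to stay with findOne() plus save() rather than findOneAndUpdate(). Here is a hedged sketch based on the commented-out add() logic above, assuming the schemas shown earlier and that a bookmark document already exists for the user; the two key fixes are that findOne() must be awaited, and that lists is an array, so the matching list has to be looked up by name:

// A hedged sketch (assumes the Bookmark model above and an existing bookmark
// document for the user). findOne() is awaited, and the matching list is
// found by name inside the lists array before deciding what to push.
router.post('/bookmarks', async (req, res) => {
    const { lists } = req.body
    if (!lists) {
        return res.status(422).send({ error: 'You must provide lists' })
    }
    try {
        const bookmark = await Bookmark.findOne({ userId: req.user._id })
        const existingList = bookmark.lists.find(l => l.listName === lists.listName)
        if (existingList) {
            existingList.items.push(lists.items)   // list exists: just add the item
        } else {
            bookmark.lists.push(lists)             // otherwise add the whole new list
        }
        await bookmark.save()
        res.send(bookmark)
    } catch (e) {
        res.status(422).send({ error: e.message })
    }
})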

Node.js mssql module transaction not working with async await

config = {
    user: process.env.PROD_USER,
    password: process.env.PROD_PASSWORD,
    server: process.env.PROD_SERVER,
    database: process.env.PROD_DATABASE,
    options: {
        abortTransactionOnError: true, // <-- SET XACT_ABORT ON
    },
}
const pool = await sql.connect(config);
const transaction = new sql.Transaction();
await transaction.begin();
const result = await this.get_sp_data(
    data[0],
    sp.InsertTransactionMaster,
    res
);
const masterId = result.recordset[0].MasterId || 0;
if (masterId) {
    asyncForeach(req.body.TransactionDetail, async (value) => {
        const detailData = {
            MasterId: masterId,
            SubServiceId: value.SubServiceId,
            Rate: value.Rate,
            Quantity: value.Quantity,
            Amount: value.Amount,
            CreatedBy: req.user.id || 0,
            MemberId: value.MemberId,
            SubMemberIddd: value.SubMemberId || null,
        };
        await this.get_sp_data(detailData, sp.InsertTransactionDetail, res);
    });
}
await transaction.commit();
console.dir('Transaction commited.');
My custom code sits between begin and commit: it executes a stored procedure to insert the master data, the insert returns the master id, and I then use that id to insert multiple detail rows in a loop before committing.
I have deliberately placed erroneous code in the detail insert, but the transaction does not roll back; as a result the master data is inserted and only the detail insert reports an error.
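A hedged sketch of the usual node-mssql pattern follows (get_sp_data is not shown in the question, so plain sql.Request calls stand in for it, and the stored-procedure inputs are abbreviated). The points that matter: every request has to be created from the transaction object so it actually runs inside the transaction, the detail loop has to be awaited before commit(), and a failure has to trigger rollback():

// A hedged sketch (assumes the same config, sp names and req.body shape as above;
// get_sp_data is not shown, so plain sql.Request calls stand in for it).
const pool = await sql.connect(config);
const transaction = new sql.Transaction(pool);
await transaction.begin();
try {
    // requests created from the transaction run inside it
    const masterResult = await new sql.Request(transaction)
        .execute(sp.InsertTransactionMaster);           // master inputs omitted here
    const masterId = masterResult.recordset[0].MasterId || 0;

    // a plain for...of loop so every detail insert is awaited before commit()
    for (const value of req.body.TransactionDetail) {
        await new sql.Request(transaction)
            .input('MasterId', masterId)
            .input('SubServiceId', value.SubServiceId)   // remaining inputs omitted
            .execute(sp.InsertTransactionDetail);
    }

    await transaction.commit();
} catch (err) {
    // undo everything that ran on this transaction if any insert fails
    await transaction.rollback();
    throw err;
}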

How to add two records in a row?

When I want to add two records in sequence, only one record is added; on the second it throws an error because it cannot create a field with that data:
"NOTES_ID is required", "key: NOTES_ID, value: undefined, is not a number"
How do I create entries for two related tables sequentially, first for the main table and then for the one that has the foreign key on it?
module.exports.create = async function (req, res) {
    const stateMatrix = await StateMatrix.select().exec()
    const noteObj = {
        DATE: req.body.DATE,
        TITLE: req.body.TITLE,
        CONTENT: req.body.CONTENT
    };
    const noteStateObj = {
        STATE_DATE: new Date().toLocaleDateString("en-US"),
        STATES_ID: stateMatrix[0]._props.STATES_ID_CURR,
        NOTES_ID: req.body.NOTE_ID,
        USERS_ID: req.decoded.user_id
    };
    try {
        await Notes.create(noteObj);
        await NoteStates.create(noteStateObj);
        res.status(201).json(noteObj, noteStateObj);
    } catch (e) {
        errorHandler(res, e);
    }
};
Probably NoteStates is related to Notes through the NOTES_ID field, which cannot be empty (I guess it's a foreign key). That means you should set it from the newly created note before saving noteStateObj:
// Something like this
const newNote = await Notes.create(noteObj);
noteStateObj.NOTES_ID = newNote.ID;
await NoteStates.create(noteStateObj);
