I'm reading CSV data and then writing it out as binary. However, the process of reading in a CSV file, converting each data point to a float, and then writing that data to a file takes an insane amount of time. What am I doing wrong, and how can I speed this up?
Is this a limitation of Node? Should I be using a different language?
var csvStream = csv.createWriteStream({headers: true});
var writeStream = fs.createWriteStream(csvPath);
csvStream.pipe(writeStream);
var binaryWriteStream = fs.createWriteStream(binaryPath, {'flags': 'a'});

Object.keys(dataSources).map(function (dataSource) {
    firstPromise = firstPromise.then(function () {
        return new Promise(function (resolve, reject) {
            var allTheData = [];
            var readStream = fs.createReadStream(dataSource);
            //stream contents into csv reader
            csv.fromStream(readStream, {delimiter: delimiterCharacter}).on("data", function (data) {
                //get all data
                allTheData.push(data);
            }).on("end", function () {
                var promise = Promise.resolve();
                //for each row
                allTheData.map(function (someData) {
                    promise = promise.then(function () {
                        return new Promise(function (resolve, reject) {
                            var someOtherPromise = Promise.resolve();
                            //preallocate buffer
                            var buf = Buffer.allocUnsafe(4 * someData.length);
                            var counter = 0;
                            //for each individual data sample
                            someData.map(function (data) {
                                someOtherPromise = someOtherPromise.then(function () {
                                    return new Promise(function (resolve, reject) {
                                        //convert to float
                                        buf.writeFloatLE(data, counter * 4);
                                        counter++;
                                        resolve();
                                    });
                                });
                            });
                            someOtherPromise.then(function () {
                                //write buffer to file
                                binaryWriteStream.write(buf);
                                resolve();
                            });
                            someOtherPromise.catch(function (err) {
                                console.log(err);
                                reject(err);
                            });
                        });
                    });
                });
                promise.then(function () {
                    resolve();
                });
                promise.catch(function (err) {
                    console.log(err);
                    reject(err);
                });
            });
        });
    });
});
TL;DR:
reading in CSV data from files in sequence
allocating and converting the CSV data into binary line by line
writing it to a file
With this approach it takes me five hours to read 614 files and then allocate, convert, and write their 99 million samples to a single unified binary file.
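In other words, per row the work boils down to something like this (a sketch reusing the variable names from the snippet above; the parseFloat call is an assumption, since the CSV parser hands each field back as a string):

// One parsed CSV row -> one buffer of little-endian floats, appended to the binary file
var buf = Buffer.allocUnsafe(4 * someData.length);
someData.forEach(function (value, i) {
    buf.writeFloatLE(parseFloat(value), i * 4);
});
binaryWriteStream.write(buf);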
I am currently trying to develop a Google Cloud Function to parse multipart files (Excel format or CSV) in order to populate the Firestore database.
I am using busboy in a helper function to parse the file, convert it to JSON, and return it to the main function.
Everything goes well until I try to return the parsed data. I thought the most logical way of doing this was to return the data from busboy's 'finish' event, but it doesn't seem to work: back in the main function the data is undefined. I first suspected some issue related to asynchronous code execution, but when I only print the data inside the busboy 'finish' event, it displays properly.
I've tried to find some related content online but unfortunately didn't succeed. Here is my helper function:
// Takes a multipart request and sends back readable data
const processRequest = (req) => {
    const busboy = Busboy({headers: req.headers});
    formats = ['application/vnd.ms-excel', 'text/csv', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'];
    var finalData;
    // fieldname is the request key name of the file
    // file is the stream
    // fname is the name of the file
    busboy.on('file', (fieldname, file, fname) => {
        // Checks if file is right format
        if(!formats.includes(fname.mimeType)) throw new FileFormatError('File must be excel or csv');
        bytes = [];
        // Checks that the request key is the right one
        if(fieldname == 'file') {
            // Data is the actual bytes, adds it to the buffer each time received
            file.on('data', (data) => {
                bytes.push(data);
            });
            // Concatenates the bytes into a buffer and reads data given mimetype
            file.on('end', async () => {
                buffer = Buffer.concat(bytes);
                if(fname.mimeType === 'application/vnd.ms-excel' ||
                   fname.mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet') {
                    workbook = XLSX.read(buffer, {type: 'buffer'});
                    json = excelToJson(workbook);
                    console.log(json);
                    finalData = json;
                }
                if (fname.mimeType === 'text/csv') {
                    var csv = [];
                    const stream = Readable.from(buffer.toString());
                    stream.pipe(CSV.parse({delimiter: ','}))
                        .on('error', (err) => {
                            console.log('csv parsing error');
                            console.log(err.message);
                        }).on('data', (row) => {
                            csv.push(row);
                        }).on('end', () => {
                            console.log('csv file properly processed');
                            console.log(csv);
                            // CSV PARSING LOGIC TO COME, JUST TESTING RIGHT NOW
                            finalData = csv;
                        });
                }
            });
        }
    });
    busboy.on('finish', () => {
        console.log('busboy finished');
        return finalData;
        // WHEN ONLY PRINTED THE DATA IS PRESENT AND DISPLAYS PROPERLY HERE
    });
    // Processes request body bytes
    busboy.end(req.rawBody);
}
There must be something I am misunderstanding, but so far I cannot pin down what.
Thanks in advance for your time :)
You're not waiting for your CSV parsing to actually finish.
It would be better to refactor your async code to use async/await.
Since you're using libraries that might only support callback-style async, you'll need to do some new Promise wrapping yourself.
Admittedly, I haven't tested the code below, but it should look something like this...
/**
 * Parse the given buffer as a CSV, return a promise of rows
 */
function parseCSV(buffer) {
    return new Promise((resolve, reject) => {
        const csv = [];
        const stream = Readable.from(buffer.toString());
        stream
            .pipe(CSV.parse({ delimiter: "," }))
            .on("error", reject)
            .on("data", (row) => csv.push(row))
            .on("end", () => resolve(csv));
    });
}

/**
 * Parse the given buffer as a spreadsheet, return a promise of the parsed data
 */
async function parseSpreadsheet(mimeType, buffer) {
    if (
        mimeType === "application/vnd.ms-excel" ||
        mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ) {
        const workbook = XLSX.read(buffer, { type: "buffer" });
        return excelToJson(workbook);
    }
    if (mimeType === "text/csv") {
        return parseCSV(buffer);
    }
    throw new Error(`Unknown mime type ${mimeType}`);
}

/**
 * Get the bytes of the field `fieldName` in the request.
 */
function getFileFromRequest(req, fieldName) {
    return new Promise((resolve, reject) => {
        const busboy = Busboy({ headers: req.headers });
        busboy.on("file", (name, file, info) => {
            // Only process the field we care about
            if (name != fieldName) {
                return;
            }
            const bytes = [];
            file.on("data", (data) => bytes.push(data));
            file.on("end", () =>
                resolve({
                    info,
                    buffer: Buffer.concat(bytes),
                }),
            );
            file.on("error", reject);
        });
        busboy.end(req.rawBody);
    });
}

async function parseRequest(req) {
    // (1) Get the file as a buffer
    const { info, buffer } = await getFileFromRequest(req, "file");
    // (2) Try parsing it as a spreadsheet
    const data = await parseSpreadsheet(info.mimeType, buffer);
    // (3) Do something with the data?
    return data;
}
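And, for completeness, a rough sketch of wiring parseRequest into the Cloud Function entry point (again untested; the firebase-functions HTTPS trigger and the export name are just assumptions based on your description):

const functions = require('firebase-functions');

// Example HTTPS-triggered function; replace the Firestore write with your own logic
exports.uploadFile = functions.https.onRequest(async (req, res) => {
    try {
        const data = await parseRequest(req);
        // e.g. write `data` to Firestore here, then acknowledge
        res.status(200).json({ ok: true });
    } catch (err) {
        console.error(err);
        res.status(400).send(err.message);
    }
});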
The function in the following code will always read test.txt, so it is not a shared, reusable function. How can I make it a shared function?
var readFile = new Promise(function (resolve, reject) {
    fs.readFile('test.txt', 'utf-8', function (error, data) {
        if (error) {
            reject(error);
        } else {
            resolve(data);
        }
    });
});
module.exports.readFile = readFile;
You can do it like this:
readFileFunction.js
var fs = require('fs');

var ReadFile = function (fileName) {
    return new Promise(function (resolve, reject) {
        fs.readFile(fileName, 'utf-8', function (error, data) {
            if (error) {
                reject(error);
            } else {
                resolve(data);
            }
        });
    });
};
module.exports = ReadFile;
To use this function
someOtherFile.js
var ReadFile = require('./readFileFunction.js') // correct path to readFile
ReadFile(fileName).then(function(data) {
/* stuff */
})
For your code
-nodeapp
--helper.js
--main.js
--text.txt
--readFileFunction.js
Say you need to read files in main.js.
In main.js:
var ReadFile = require('./readFileFunction.js')
// other code
...
// you need to read text.txt
var fileName1= 'text.txt' //change name according to your wish here
var fileName2= 'text2.txt' //change name according to your wish here
ReadFile(fileName1).then(function (dataOfFile1) {
    ReadFile(fileName2).then(function (dataOfFile2) {
        /* your file stuff */
        console.log(dataOfFile1); // all your file data
        console.log(dataOfFile2); // all your file data
    });
});
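As a side note, if the two reads don't depend on each other, you can also run them in parallel with Promise.all instead of nesting (untested sketch, using the same ReadFile helper):

Promise.all([ReadFile(fileName1), ReadFile(fileName2)]).then(function (results) {
    var dataOfFile1 = results[0];
    var dataOfFile2 = results[1];
    console.log(dataOfFile1);
    console.log(dataOfFile2);
});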
Hope this will help
Pardon me as the code is messy. I'm still learning.
I need to download images from URLs scanned from a CSV file. However, I have 2000+ URLs on the same domain, and I don't think the server will let me pull everything in one go, so I always get errors after some of the images. Problems I need to solve: 1) How do I make sure each image is downloaded completely before the code moves on to the next URL? 2) How do I write better code?
Your help is appreciated. Thank you.
var csv = require('fast-csv');
var Promise = require('bluebird');
var fs = require('fs');
var request = require('request');

var path = "test.csv";

var promiseCSV = Promise.method(function (path, options) {
    return new Promise(function (resolve, reject) {
        var records = [];
        csv
            .fromPath(path, options)
            .on('data', function (record) {
                records.push(record);
            })
            .on('end', function () {
                resolve(records);
                console.log('done');
            });
    });
});

var download = function (uri, filename, callback) {
    request.head(uri, function (err, res, body) {
        request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
    });
};

promiseCSV(path).then(function (records) {
    for (var i = 0; i < records.length; i++) {
        download(records[i][0], 'img/' + records[i][1], function () {
        });
    }
});
This will throttle your requests to one at a time. Another option is to use throttled-request to limit by requests per unit time.
var i = 0;
promiseCSV(path).then(function (records) {
    next();
    function next() {
        download(records[i][0], 'img/' + records[i][1], function () {
            i++;
            if (i < records.length) next();
        });
    }
});
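If you prefer promises over the callback recursion, the same one-at-a-time behaviour can be sketched by chaining one promise per record (untested; it assumes the download(uri, filename, callback) helper from your question):

promiseCSV(path).then(function (records) {
    // build a chain that downloads each image only after the previous one closes
    return records.reduce(function (chain, record) {
        return chain.then(function () {
            return new Promise(function (resolve) {
                download(record[0], 'img/' + record[1], resolve);
            });
        });
    }, Promise.resolve());
}).then(function () {
    console.log('all downloads finished');
});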
Also, your records variable is out of scope; you need to move it out in order to access it:
var records = []; // move out to global scope to access from elsewhere

var promiseCSV = Promise.method(function (path, options) {
    return new Promise(function (resolve, reject) {
        csv
            .fromPath(path, options)
            .on('data', function (record) {
                records.push(record);
            })
            .on('end', function () {
                resolve(records);
                console.log('done');
            });
    });
});
I'm trying to parse large .csv files and save the data in MongoDB, keeping the values as strings. So I'm trying to pipe the .csv file data through a parser and then write the data to MongoDB.
I tried parsing the .csv into a JSON file and using mongoimport to upload it to MongoDB, which worked fine, but the values weren't kept as strings, and you can't set values when using mongoimport.
I also don't want to raise the memory limit for Node; I'm trying to use as little memory as possible.
My problem at the moment is that the program runs out of memory and throws:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - process out of memory
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
var MongoClient = require('mongodb').MongoClient; // added: the code below uses MongoClient

var queue, stream;
var headers = fileData.subText.meta.fields;

MongoClient.connect(url, function (err, db) {
    if (err) throw err;
    var collection = db.collection(fileData.collectionName);
    var parser = parse({columns: fileData.subText.meta.fields, delimiter: fileData.delimiter});
    stream = fs.createReadStream("filepath" + fileData.name).pipe(parser);
    var data;
    queue = async.queue(function (task, next) {
        data = task.data;
        collection.insert(data, function (err, result) {
            if (err) {
                db.close();
                console.log(err);
            } else {
                next();
            }
        });
    }, 50);
    stream.on('data', function (data) {
        stream.pause();
        queue.push({
            data: data
        });
    });
    queue.drain = function () {
        stream.resume();
    };
    stream.on('end', function () {
        return queue.drain = function () {
            db.close();
            return console.log('Process Done');
        };
    });
});
I got the idea from this link: https://bassnutz.wordpress.com/2012/09/09/processing-large-files-with-nodejs/
Any help would be appreciated.
This may not be possible, but I am trying to return a buffer object of an image on Rackspace using the pkgcloud module, without having to write to the filesystem. I've seen this done before; however, both examples show piping the download to the file system.
function get() {
    return new Promise(function (resolve, reject) {
        _this._RackClient.download(options, function (err, results) {
            if (err !== null) {
                console.log("Error Downloading:", err);
                return reject(err);
            }
            resolve(buffer);
        });
    });
}

return get();
This is ideally how I would like it to work, but there is currently no body present in the request. Can I use a stream.PassThrough() and return that, similar to uploading a buffer?
.download() returns a Readable stream, so it should just be a matter of buffering that output. For example:
var stream = _this._RackClient.download(options);
var buf = [];
var nb = 0;
var hadErr = false;
stream.on('data', function (chunk) {
    buf.push(chunk);
    nb += chunk.length;
}).on('end', function () {
    if (hadErr)
        return;
    switch (buf.length) {
        case 0:
            return resolve(new Buffer(0));
        case 1:
            return resolve(buf[0]);
        default:
            return resolve(Buffer.concat(buf, nb));
    }
}).on('error', function (err) {
    hadErr = true;
    reject(err);
});
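To tie that back to your get() wrapper, the buffering above would live inside the Promise executor, roughly like this (untested sketch; it assumes .download(options) returns the readable stream as described):

function get() {
    return new Promise(function (resolve, reject) {
        var stream = _this._RackClient.download(options);
        var buf = [];
        var nb = 0;
        stream.on('data', function (chunk) {
            buf.push(chunk);
            nb += chunk.length;
        }).on('end', function () {
            // a promise settles only once, so an extra hadErr flag isn't needed here
            resolve(Buffer.concat(buf, nb));
        }).on('error', reject);
    });
}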