Pause Node csv-parser when uploading data - node.js

I am working with Node csv-parser, and my read stream comes from stdin. I am piping that to the parser. Inside the parser's 'readable' handler, I make an async HTTP call to upload the data. During this time, I want the parser to pause reading until the async call has finished executing.
// Question code: reads CSV records from the parser and hands each one to an upload helper.
var parse = require('csv-parse');
// Serialized records: one newline-terminated JSON buffer per CSV row.
var output = [];
// Create the parser
var parser = parse({delimiter: ',', columns: true, trim: true});
// Each 'readable' event drains everything the parser currently has buffered.
parser.on('readable', function () {
// `record` is assigned without a declaration; the loop ends when read() returns a falsy value.
while (record = parser.read()) {
console.log('Still reading');
// convertIpToInt and indexName are not defined in this snippet (presumably declared elsewhere).
var jsonRec = convertIpToInt(record);
var jsonData = JSON.stringify(jsonRec);
output.push(new Buffer(jsonData + "\n"));
// pause()/resume() are commented out, so the loop keeps reading without waiting for the upload callback.
//parser.pause();
var assertPromise = uploadBatches1(indexName, function() {
//parser.resume();
console.log('Returned');
});
}
});
// Placeholder for the asynchronous upload; `cb` is the completion callback supplied by the caller.
function uploadBatches1(indexToAssert, cb) {
// uploads the data received from the parser
}

Since csv-parse exposes a readable stream, you can pause the stream with parser.pause()
Likewise, when your update is complete, you can call parser.resume(). You can also check if you need to resume beforehand using parser.isPaused().
You can read more about readable streams and what the pause method does here:
https://nodejs.org/api/stream.html#stream_readable_pause
Based on the above, you should be implementing the parser.on('data') readable stream event, instead of the while (record = parser.read()) loop. Pausing the stream does nothing if you are actively calling the read method yourself.
Consider restructuring your code like so, and then use parser.pipe from a filesystem read:
var parse = require('csv-parse');
// Serialized records: one newline-terminated JSON buffer per CSV row.
var output = [];
// Create the parser
var parser = parse({delimiter: ',', columns: true, trim: true});
// In flowing mode the parser delivers one record per 'data' event, so
// pause()/resume() really do stop and restart delivery.
parser.on('data', function (record) {
  console.log('Still reading');
  var jsonRec = convertIpToInt(record);
  var jsonData = JSON.stringify(jsonRec);
  // Buffer.from replaces the deprecated `new Buffer(string)` constructor.
  output.push(Buffer.from(jsonData + "\n"));
  // Stop delivering records until the upload has completed.
  parser.pause();
  uploadBatches1(indexName, function() {
    parser.resume();
    console.log('Returned');
  });
});
/**
 * Placeholder for the asynchronous upload.
 * @param {*} indexToAssert - value forwarded by the caller (indexName above).
 * @param {Function} cb - must be invoked once the upload has finished;
 *   otherwise the parser is never resumed.
 */
function uploadBatches1(indexToAssert, cb) {
  // uploads the data received from the parser
}

Related

How can i save a file i download using fetch with fs

I try downloading files with the fetch() function from github.
Then i try to save the fetched file Stream as a file with the fs-module.
When doing it, i get this error:
TypeError [ERR_INVALID_ARG_TYPE]: The "transform.writable" property must be an instance of WritableStream. Received an instance of WriteStream
My problem is, that i don't know the difference between WriteStream and WritableStream or how to convert them.
This is the code i run:
// Question code: fetches `link` and tries to store the response body at "./<filename>".
async function downloadFile(link, filename = "download") {
var response = await fetch(link);
// response.body is a property, not a promise; the await is not needed.
var body = await response.body;
var filepath = "./" + filename;
// fs.createWriteStream returns a Node.js WriteStream.
var download_write_stream = fs.createWriteStream(filepath);
console.log(download_write_stream.writable);
// This call raises the ERR_INVALID_ARG_TYPE quoted above: pipeTo receives a Node WriteStream, not a WritableStream.
await body.pipeTo(download_write_stream);
}
Node.js: v18.7.0
You can use Readable.fromWeb to convert body, which is a ReadableStream from the web streams API, into a NodeJS Readable stream that can be used with the fs methods.
Note that readable.pipe returns another stream instantly. To wait for it to finish, you can use the promise version of stream.finished to convert it into a Promise, or else you could add listeners for the 'finish' and 'error' events to detect success or failure.
const fs = require('fs');
const { Readable } = require('stream');
const { finished } = require('stream/promises');
/**
 * Downloads `link` and writes the response body to `filepath`.
 *
 * @param {string} link - URL to fetch.
 * @param {string} [filepath='./download'] - destination file path.
 * @returns {Promise<void>} resolves once the file has been completely written.
 * @throws {Error} when the server answers with a non-2xx status or no body,
 *   or when reading/writing fails.
 */
async function downloadFile(link, filepath = './download') {
  const response = await fetch(link);
  // Do not save an HTTP error page as if it were the requested file.
  if (!response.ok || !response.body) {
    throw new Error(`Download of ${link} failed with HTTP status ${response.status}`);
  }
  // Convert the web ReadableStream into a Node.js Readable usable with fs streams.
  const body = Readable.fromWeb(response.body);
  const download_write_stream = fs.createWriteStream(filepath);
  // pipe() does not forward source errors; destroy the file stream so that
  // finished() rejects instead of waiting forever.
  body.on('error', (err) => download_write_stream.destroy(err));
  await finished(body.pipe(download_write_stream));
}
Good question. Web streams are something new, and they are different way of handling streams. WritableStream tells us that we can create WritableStreams as follows:
// Minimal web WritableStream: the sink's write() receives every chunk written to the stream.
import {
WritableStream
} from 'node:stream/web';
const stream = new WritableStream({
// Called once per chunk; here the chunk is only logged.
write(chunk) {
console.log(chunk);
}
});
Then, you could create a custom stream that writes each chunk to disk. An easy way could be:
// File stream that receives every chunk written to the web stream below.
const download_write_stream = fs.createWriteStream('./the_path');
// Web WritableStream adapter around the Node.js file stream.
const stream = new WritableStream({
  // Returning a promise makes pipeTo() wait until the chunk has been handed
  // to the file stream, which gives backpressure and surfaces write errors.
  write(chunk) {
    return new Promise((resolve, reject) => {
      download_write_stream.write(chunk, (err) => (err ? reject(err) : resolve()));
    });
  },
  // Called when the source is exhausted: end the file so buffered data is flushed.
  close() {
    return new Promise((resolve, reject) => {
      download_write_stream.end((err) => (err ? reject(err) : resolve()));
    });
  },
  // Called when the pipe fails: release the file descriptor.
  abort(reason) {
    download_write_stream.destroy(reason instanceof Error ? reason : new Error(String(reason)));
  },
});
/**
 * Fetches `link` and pipes the response body into `stream`.
 *
 * @param {string} link - URL to fetch.
 * @param {string} [filename='download'] - unused; the target path is fixed by
 *   `download_write_stream` above.
 * @returns {Promise<void>} resolves once the body has been written and the file closed.
 */
async function downloadFile(link, filename = 'download') {
  const response = await fetch(link);
  const body = response.body;
  await body.pipeTo(stream);
}

Read csv files in stream and store them in database

I have a few huge csv files, what I need to store in a mongo database. Because these files are too big, I need to use stream. I pause the stream while the data writing into the database.
// Question code: reads a large CSV with a raw file stream and pauses it while writing to MongoDB.
var fs = require('fs');
var csv = require('csv');
var mongo = require('mongodb');
// Connection call is elided in the original post.
var db = mongo.MongoClient.connect...
var readStream = fs.createReadStream('hugefile.csv');
// Each 'data' chunk comes straight from the file stream and is parsed on its own.
readStream.on('data', function(data) {
// Stop reading until the insert below has finished.
readStream.pause();
csv.parse(data.toString(), { delimiter: ','}, function(err, output) {
// NOTE(review): the raw `data` chunk is inserted rather than the parsed `output` - confirm intent.
// `coll` is not defined in this snippet.
db.collection(coll).insert(data, function(err) {
readStream.resume();
});
});
});
readStream.on('end', function() {
// `logger` is not defined in this snippet.
logger.info('file stored');
});
But csv.parse throws an error, because I need to read the file line by line to handle it as CSV and convert it to JSON for MongoDB. Maybe I should not pause the stream but use an interface instead. I haven't found a solution for this yet.
Any help would be appreciated!
I think you might want to create a stream of lines from your raw data stream.
Here is an example from the split package. https://www.npmjs.com/package/split
// Pipes the file through split() so that each 'data' event carries one line.
// `fs`, `split` and `file` are assumed to be in scope (not shown in this snippet).
fs.createReadStream(file)
.pipe(split())
.on('data', function (line) {
// each chunk is now a separate line!
})
Adapted to your example it might look like this
var readStream = fs.createReadStream('hugefile.csv');
var lineStream = readStream.pipe(split());
lineStream.on('data', function(data) {
//remaining code unmodified
I'm unsure if bulk() was a thing back in '15, but whosoever is trying to import items from large sources should consider using them.
var fs = require('fs');
var csv = require('fast-csv');
var mongoose = require('mongoose');
var db = mongoose.connect...
var counter = 0; // number of rows queued in the current bulk operation
const BULK_SIZE = 1000;
// `Item` is the mongoose model being imported into (defined outside this snippet).
var bulkItem = Item.collection.initializeUnorderedBulkOp();
var readStream = fs.createReadStream('hugefile.csv');
const csvStream = csv.fromStream(readStream, { headers: true });

/**
 * Executes the queued bulk operation and starts a fresh one.
 * @param {Function} done - called after the execute callback has run.
 */
function flushBulk(done) {
  var pending = bulkItem;
  // Start a new batch right away so rows never land in an executed operation.
  bulkItem = Item.collection.initializeUnorderedBulkOp();
  counter = 0;
  pending.execute((err, result) => {
    if (err) console.log(err);
    done();
  });
}

csvStream.on('data', data => {
  counter++;
  bulkItem.insert(data);
  if (counter === BULK_SIZE) {
    // Stop reading rows while the batch is being written.
    csvStream.pause();
    flushBulk(() => csvStream.resume());
  }
});

// Rows left over after the last full batch would otherwise never be written.
csvStream.on('end', () => {
  // Executing a bulk operation with no queued writes is an error, hence the guard.
  if (counter > 0) {
    flushBulk(() => {});
  }
});

One symbol at time stream with node

I'm trying implement a stream which returns one symbol from file on each 'data' event.
I ended up with the code below:
// Question code: a Readable wrapper meant to emit one character of a file per 'data' event.
var util = require('util'),
fs = require('fs'),
Readable = require('stream').Readable;
// The next two declarations repeat the requires above.
var util = require('util');
var Readable = require('stream').Readable;
var SymbolReadStream = function(filename, options) {
Readable.call(this);
// Underlying file stream, created in the paused state.
this._readable = fs.createReadStream(filename, options).pause();
// `self` is assigned without a declaration.
self = this;
this._readable.on('readable', function() {
var chunk;
// The character read here is stored in `chunk` but never pushed or emitted on the outer stream.
chunk = self._readable.read(1);
// I believe the problem is here
self._readable.pause();
});
};
util.inherits(SymbolReadStream, Readable); // inherit the prototype methods
// Called by the Readable machinery when the consumer wants data; only resumes the file stream.
SymbolReadStream.prototype._read = function() {
this._readable.resume();
};
var r = new SymbolReadStream("test.txt", {
encoding: 'utf8',
});
r.on('data', function(el) {
console.log(el);
});
but this code doesn't work. Please help.
Is there an easier way to achieve the behavior?
This post give a great clue how to answer your question.
Also, you should take a look at pipe, which would be a cleaner way to accomplish what you're trying to do: pipe an adapter onto the file stream instead of wrapping it.
That said, personaly I wont reinvent the wheel here, and would just search for modules that can accomplish that. Especially "split" modules, making them split on every char, instead on new lines. As an example, event-stream has a split method that "takes the same arguments as string.split except it defaults to '\n' instead of ','". So the logic would be to try myStream.pipe(es.split('')) but the modules takes this like myStream.pipe(es.split()) which breaks on lines. So here's my solution, using a regex to say "break on each char"
// Splits the file stream with a regex so that every chunk is a single character.
// `filename` and `options` are assumed to be defined by the caller.
var es = require('event-stream');
var fs = require('fs');
// /(?!$)/ is the "break on each char" pattern described above.
var symbolStream = fs.createReadStream(filename, options).pipe(es.split(/(?!$)/));
EDIT: event-stream seems to use split module internally, so you can even try
var split = require('split');
var fs = require('fs');
var symbolStream = fs.createReadStream(filename, options).pipe(split(/(?!$)/));
(this loose test is responsible of converting '' to \r\n)
Your stream implementation never emits a 'data' event to the handler, which is why console.log is never called. Once the events are emitted, the data is streamed symbol by symbol. Example below:
var util = require('util'),
fs = require('fs'),
Readable = require('stream').Readable;
/**
 * Readable stream that delivers the content of a file one symbol at a time.
 *
 * Runs in object mode so that every pushed symbol becomes exactly one 'data'
 * event and is delivered unchanged (a string when `options.encoding` is set).
 *
 * @param {string} filename - path of the file to read.
 * @param {Object} [options] - options forwarded to fs.createReadStream.
 */
function SymbolReadStream(filename, options) {
  if (!(this instanceof SymbolReadStream)) {
    // Allow calling without `new`; forward the real arguments.
    return new SymbolReadStream(filename, options);
  }
  Readable.call(this, { objectMode: true });
  var self = this;
  this._readable = fs.createReadStream(filename, options);
  // Listeners are attached once here, not on every _read() call.
  this._readable.on('readable', function() {
    self._pump();
  });
  this._readable.on('end', function() {
    // Signal end-of-stream through the Readable API so 'end' fires after the last symbol.
    self.push(null);
  });
  this._readable.on('error', function(err) {
    self.destroy(err);
  });
}
util.inherits(SymbolReadStream, Readable); // inherit the prototype methods

// Moves symbols from the file stream into this stream until either side has nothing more to give.
SymbolReadStream.prototype._pump = function() {
  var chunk;
  while (null !== (chunk = this._readable.read(1))) {
    if (!this.push(chunk)) {
      // Consumer is saturated; _read() is called again when it wants more.
      return;
    }
  }
};

// Called by the Readable machinery whenever the consumer wants more data.
SymbolReadStream.prototype._read = function() {
  this._pump();
};

// Release the underlying file stream when this stream is destroyed.
SymbolReadStream.prototype._destroy = function(err, callback) {
  this._readable.destroy();
  callback(err);
};
// Demo: print every symbol of test.txt, then report completion.
var readOptions = { encoding: 'utf8' };
var r = new SymbolReadStream("test.txt", readOptions);

// Logs each symbol as it arrives.
function printSymbol(el) {
  console.log(el);
}

// Logs once the stream has ended.
function reportDone() {
  console.log('done');
}

r.on('data', printSymbol);
r.on('end', reportDone);

Collect data from a readable stream into a variable

I'm trying to implement a writable stream that will save the data that is written to it into a variable. This is the implementation of the writable stream:
var util = require('util');
var Writable = require('stream').Writable;
/**
 * Object-mode Writable that stores every chunk written to it in `entities`.
 */
function Collector() {
  var streamOptions = { objectMode: true };
  Writable.call(this, streamOptions);
  // Chunks received so far, in arrival order.
  this.entities = [];
}

util.inherits(Collector, Writable);

/**
 * Stores the chunk and acknowledges it immediately.
 * @param {*} chunk - value written to the stream.
 * @param {string} encoding - unused.
 * @param {Function} callback - invoked once the chunk has been stored.
 */
Collector.prototype._write = function (chunk, encoding, callback) {
  var received = this.entities;
  received[received.length] = chunk;
  callback();
};
module.exports = Collector;
and this is how I'm trying to test it it out:
// Question code: builds the pipeline and logs the collected entities right away.
var fs = require('fs');
var Tokenizer = require('./tokenizer');
var Processor = require('../parser');
var Collector = require('./collector.js');
var tokenizer = new Tokenizer();
var processor = new Processor();
var collector = new Collector();
var readable = fs.createReadStream('./test/fixtures/test.dxf');
// NOTE(review): `parser` is not defined in this snippet, and `collector` is never piped into.
readable.pipe(tokenizer)
.pipe(parser)
.pipe(processor); // if this is piped to stdout, lots of data
// Runs immediately after the pipes are set up.
console.log(collector.entities); // logs an empty array
I'm not sure why, but the entities property is empty after all it has been piped. If I console log this.entities within the _write function, the data is available.
Ultimately I want to be to call a function that returns an array whose elements are made up of data chunks from Processor. Collector was some hacking to see how I could do it, but I haven't gotten very far.
How can I store chunks from a readable stream into a variable and return them from a function?
It returns an empty array because your streaming has not finished yet. You should listen for the 'finish' event in order to properly get your entities array:
var fs = require('fs');
var Tokenizer = require('./tokenizer');
var Processor = require('../parser');
var Collector = require('./collector.js');
var tokenizer = new Tokenizer();
var processor = new Processor();
var collector = new Collector();
var readable = fs.createReadStream('./test/fixtures/test.dxf');
// NOTE(review): `parser` is not defined in this snippet (it is carried over
// from the question) - confirm which stream it refers to.
readable.pipe(tokenizer)
  .pipe(parser)
  .pipe(processor)
  // The collector must be the last stage, otherwise nothing is ever written to it.
  .pipe(collector)
  // 'finish' fires on the collector once every chunk has been written to it.
  .on('finish', function() {
    console.log(collector.entities);
  });

Parsing huge logfiles in Node.js - read in line-by-line

I need to do some parsing of large (5-10 Gb)logfiles in Javascript/Node.js (I'm using Cube).
The logline looks something like:
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
We need to read each line, do some parsing (e.g. strip out 5, 7 and SUCCESS), then pump this data into Cube (https://github.com/square/cube) using their JS client.
Firstly, what is the canonical way in Node to read in a file, line by line?
It seems to be fairly common question online:
http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
Read a file one line at a time in node.js?
A lot of the answers seem to point to a bunch of third-party modules:
https://github.com/nickewing/line-reader
https://github.com/jahewson/node-byline
https://github.com/pkrumins/node-lazy
https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task - surely, there's a simple way within the stdlib to read in a textfile, line-by-line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximising throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly - I'm guessing using string splits, and the JS equivalent of contains (IndexOf != -1?) will be a lot faster than regexes? Has anybody had much experience in parsing massive amounts of text data in Node.js?
I searched for a solution to parse very large files (gbs) line by line using a stream. All the third-party libraries and examples did not suit my needs since they processed the files not line by line (like 1 , 2 , 3 , 4 ..) or read the entire file to memory
The following solution can parse very large files, line by line using stream & pipe. For testing I used a 2.1 gb file with 17.000.000 records. Ram usage did not exceed 60 mb.
First, install the event-stream package:
npm install event-stream
Then:
// Reads a large file line by line: file stream -> es.split() -> es.mapSync(handler).
var fs = require('fs')
, es = require('event-stream');
// Number of lines handled so far.
var lineNr = 0;
// `s` is the value returned by the last .pipe() call.
var s = fs.createReadStream('very-large-file.csv')
.pipe(es.split())
.pipe(es.mapSync(function(line){
// pause the readstream
s.pause();
lineNr += 1;
// process line here and call s.resume() when rdy
// function below was for logging memory usage
// (logMemoryUsage is not defined in this snippet)
logMemoryUsage(lineNr);
// resume the readstream, possibly from a callback
s.resume();
})
// The handlers below are attached to the stream returned by es.mapSync.
.on('error', function(err){
console.log('Error while reading file.', err);
})
.on('end', function(){
console.log('Read entire file.')
})
);
Please let me know how it goes!
You can use the inbuilt readline package, see docs here. I use stream to create a new output stream.
// Reads a file line by line with the built-in readline module.
var fs = require('fs'),
readline = require('readline'),
stream = require('stream');
var instream = fs.createReadStream('/path/to/file');
// Bare stream object from the 'stream' module, flagged readable and writable by hand.
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;
var rl = readline.createInterface({
input: instream,
output: outstream,
terminal: false
});
// Invoked once per line of the input.
rl.on('line', function(line) {
console.log(line);
//Do your stuff ...
//Then write to output stream
// NOTE(review): rl.write() is called on the readline interface, not on `outstream` - confirm this writes where intended.
rl.write(line);
});
Large files will take some time to process. Do tell if it works.
I really liked @gerard's answer, which actually deserves to be the accepted answer here. I made some improvements:
Code is in a class (modular)
Parsing is included
Ability to resume is given to the outside in case there is an asynchronous job is chained to reading the CSV like inserting to DB, or a HTTP request
Reading in chunks/batche sizes that
user can declare. I took care of encoding in the stream too, in case
you have files in different encoding.
Here's the code:
'use strict'
const fs = require('fs'),
util = require('util'),
stream = require('stream'),
es = require('event-stream'),
parse = require("csv-parse"),
iconv = require('iconv-lite');
/**
 * Reads a delimited text file line by line and hands the parsed rows to a
 * callback in batches. The caller resumes reading with continue().
 */
class CSVReader {
  /**
   * @param {string} filename - path of the file to read.
   * @param {number} [batchSize=1000] - number of lines per batch.
   * @param {*} [columns] - currently unused.
   */
  constructor(filename, batchSize, columns) {
    // Decode the raw bytes before splitting into lines.
    this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
    this.batchSize = batchSize || 1000
    this.lineNumber = 0
    this.data = []
    this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
  }

  /**
   * Starts reading. `callback(rows)` is invoked after every `batchSize` lines
   * and once more at the end of the file with the remaining rows.
   * @param {Function} callback - receives the rows collected so far.
   */
  read(callback) {
    this.reader
      .pipe(es.split())
      .pipe(es.mapSync(line => {
        ++this.lineNumber

        parse(line, this.parseOptions, (err, d) => {
          if (err) {
            // A malformed line must not crash the whole import.
            console.log('Error while parsing line ' + this.lineNumber + '.', err)
            return
          }
          if (d && d.length > 0) {
            this.data.push(d[0])
          }
        })

        if (this.lineNumber % this.batchSize === 0) {
          // Stop pulling from the source until the caller calls continue().
          // NOTE(review): lines already buffered downstream may still arrive
          // before the pause takes effect - confirm against event-stream behavior.
          this.reader.pause()
          callback(this.data)
        }
      })
      .on('error', function(err){
          console.log('Error while reading file.', err)
      })
      .on('end', () => {
          // Deliver the final, partial batch instead of dropping it.
          if (this.data.length > 0) {
            callback(this.data)
          }
          console.log('Read entirefile.')
      }))
  }

  /** Clears the current batch and resumes reading. */
  continue () {
    this.data = []
    this.reader.resume()
  }
}
module.exports = CSVReader
So basically, here is how you will use it:
// CSVReader is a class, so it must be instantiated with `new`.
let reader = new CSVReader('path_to_file.csv')
// Handle each batch here, then ask the reader for the next one.
reader.read(() => reader.continue())
I tested this with a 35GB CSV file and it worked for me and that's why I chose to build it on #gerard's answer, feedbacks are welcomed.
I used https://www.npmjs.com/package/line-by-line for reading more than 1 000 000 lines from a text file. In this case, an occupied capacity of RAM was about 50-60 megabyte.
const LineByLineReader = require('line-by-line');
const lr = new LineByLineReader('big_file.txt');
// Delay standing in for asynchronous per-line work.
const PROCESSING_DELAY_MS = 100;

lr.on('error', (err) => {
  // `err` holds the error object
});

lr.on('line', (line) => {
  // Stop line emission while this line is being processed...
  lr.pause();
  // ...and start it again once the simulated async work is done.
  setTimeout(() => {
    lr.resume();
  }, PROCESSING_DELAY_MS);
});

lr.on('end', () => {
  // Every line has been read and the file is closed.
});
The Node.js Documentation offers a very elegant example using the Readline module.
Example: Read File Stream Line-by-Line
const { once } = require('node:events');
const fs = require('fs');
const readline = require('readline');

// `await` is not allowed at the top level of a CommonJS script, so the
// example runs inside an async function.
(async function processLineByLine() {
  try {
    const rl = readline.createInterface({
      input: fs.createReadStream('sample.txt'),
      crlfDelay: Infinity
    });

    rl.on('line', (line) => {
      console.log(`Line from file: ${line}`);
    });

    // Resolves once the whole file has been read and the interface is closed.
    await once(rl, 'close');
  } catch (err) {
    console.error(err);
  }
})();
Note: we use the crlfDelay option to recognize all instances of CR LF ('\r\n') as a single line break.
Apart from read the big file line by line, you also can read it chunk by chunk. For more refer to this article
/**
 * Reads a file synchronously in fixed-size chunks and returns its lines.
 *
 * Bytes of an unfinished line are carried over to the next chunk, so lines
 * (and multi-byte characters) that straddle a chunk boundary stay intact.
 *
 * @param {string} filepath - file to read.
 * @param {number} [chunkSize=2048] - bytes read per call.
 * @returns {string[]} the lines of the file, without the '\n' terminators.
 */
function readLinesInChunks(filepath, chunkSize) {
  chunkSize = chunkSize || 2048;
  var NEWLINE = 0x0a;
  var lines = [];
  var chunkBuffer = Buffer.alloc(chunkSize);
  var pending = Buffer.alloc(0); // bytes of the line that is not complete yet
  var fp = fs.openSync(filepath, 'r');
  try {
    var bytesRead;
    // position = null reads sequentially from the current file position.
    while ((bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, null)) > 0) {
      // concat copies, so `pending` never aliases the reused chunk buffer.
      var data = Buffer.concat([pending, chunkBuffer.slice(0, bytesRead)]);
      var start = 0;
      var end;
      while ((end = data.indexOf(NEWLINE, start)) !== -1) {
        lines.push(data.toString('utf8', start, end));
        start = end + 1;
      }
      pending = data.slice(start);
    }
    // Last line of a file that does not end with a newline.
    if (pending.length > 0) {
      lines.push(pending.toString('utf8'));
    }
  } finally {
    fs.closeSync(fp);
  }
  return lines;
}

var lines = readLinesInChunks('filepath', 2048);
console.log(lines);
I had the same problem. After comparing several modules that seem to offer this feature, I decided to do it myself; it's simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... }); //lines{array} start{int} lines[0] No.
It keeps the opened file in a closure; the fetchBlock() function it returns fetches a block from the file and splits it into an array of lines (carrying over the partial segment left from the previous fetch).
I've set the block size to 1024 for each read operation. This may have bugs, but code logic is obvious, try it yourself.
Reading / Writing files using stream with the native nodejs modules (fs, readline):
const fs = require('fs');
const readline = require('readline');
// Copy input.json to output.json line by line, echoing each line to the console.
const source = fs.createReadStream('input.json');
const sink = fs.createWriteStream('output.json');
const rl = readline.createInterface({ input: source, output: sink });

rl.on('line', (line) => {
  console.log(line);
  // Do any 'line' processing if you want and then write to the output file
  sink.write(`${line}\n`);
});

rl.on('close', () => {
  console.log(`Created "${sink.path}"`);
});
Based on this questions answer I implemented a class you can use to read a file synchronously line-by-line with fs.readSync(). You can make this "pause" and "resume" by using a Q promise (jQuery seems to require a DOM so cant run it with nodejs):
var fs = require('fs');
var Q = require('q');

/**
 * Placeholder for the asynchronous per-line work.
 * @param {string} line - the line to process.
 * @returns {Promise} settles when the work for this line is done.
 */
function complexLineTransformation(line) {
  var deferred = Q.defer();
  // ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
  return deferred.promise;
}

/**
 * Reads the next line and processes it; continues with the following line
 * only after the previous one has been handled successfully.
 */
function workOnLine() {
  var line = lr.readNextLine();
  if (line === undefined) {
    // No line left: stop instead of processing `undefined` forever.
    return;
  }
  promise = complexLineTransformation(line).then(
    function() {console.log('ok');workOnLine();},
    function() {console.log('error');}
  );
}

// Declared as functions above, so both exist before the first call below.
var lr = new LineReader(filenameToLoad);
lr.open();
var promise;
workOnLine();
/**
 * Synchronous line-by-line file reader built on fs.readSync().
 *
 * Usage: `var lr = new LineReader(path); lr.open();` then call
 * `lr.readNextLine()` until it returns `undefined`, and finally `lr.close()`.
 *
 * Known limitation: each read is decoded on its own, so a multi-byte UTF-8
 * character that straddles two reads is decoded incorrectly.
 *
 * @param {string} filename - path of the file to read.
 */
function LineReader (filename) {
  this.moreLinesAvailable = true; // becomes false once readNextLine() finds nothing left
  this.fd = undefined;
  this.bufferSize = 1024*1024;
  this.buffer = Buffer.alloc(this.bufferSize);
  this.leftOver = '';             // text after the last newline seen so far
  this.read = undefined;          // byte count of the most recent readSync()
  this.idxStart = undefined;
  this.idx = undefined;
  this.lineNumber = 0;            // number of lines returned so far
  this._bundleOfLines = [];       // complete lines waiting to be returned
  this._eof = false;              // true once the file has been read to the end

  /** Opens the file for reading. */
  this.open = function() {
    this.fd = fs.openSync(filename, 'r');
  };

  /** Closes the file if it is open. */
  this.close = function() {
    if (this.fd !== undefined) {
      fs.closeSync(this.fd);
      this.fd = undefined;
    }
  };

  /**
   * @returns {string|undefined} the next line without its '\n', or
   *   `undefined` when the file is exhausted.
   */
  this.readNextLine = function () {
    if (this._bundleOfLines.length === 0 && !this._eof) {
      this._readNextBundleOfLines();
    }
    if (this._bundleOfLines.length === 0) {
      this.moreLinesAvailable = false;
      return undefined;
    }
    this.lineNumber++;
    return this._bundleOfLines.shift();
  };

  /** @returns {number} how many lines have been returned so far. */
  this.getLineNumber = function() {
    return this.lineNumber;
  };

  // Reads from the file until at least one complete line is available or the end is reached.
  this._readNextBundleOfLines = function() {
    while ((this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null)) !== 0) { // read next bytes until end of file
      this.leftOver += this.buffer.toString('utf8', 0, this.read); // append to leftOver
      this.idxStart = 0;
      while ((this.idx = this.leftOver.indexOf("\n", this.idxStart)) !== -1) { // as long as there is a newline-char in leftOver
        this._bundleOfLines.push(this.leftOver.substring(this.idxStart, this.idx));
        this.idxStart = this.idx + 1;
      }
      this.leftOver = this.leftOver.substring(this.idxStart);
      if (this._bundleOfLines.length > 0) {
        return;
      }
    }
    // End of file: a final line without a trailing newline is still a line.
    this._eof = true;
    if (this.leftOver !== '') {
      this._bundleOfLines.push(this.leftOver);
      this.leftOver = '';
    }
  };
}
node-byline uses streams, so i would prefer that one for your huge files.
for your date-conversions i would use moment.js.
for maximising your throughput you could think about using a software-cluster. there are some nice-modules which wrap the node-native cluster-module quite well. i like cluster-master from isaacs. e.g. you could create a cluster of x workers which all compute a file.
for benchmarking splits vs regexes use benchmark.js. i havent tested it until now. benchmark.js is available as a node-module
import * as csv from 'fast-csv';
import * as fs from 'fs';
// One parsed CSV row: column name -> cell text.
interface Row {
[s: string]: string;
}
// Invoked once per row with the row and its 1-based index; its result is collected by CSVReader.read().
type RowCallBack = (data: Row, index: number) => object;
/**
 * Reads a CSV file with fast-csv and maps every row through a callback.
 */
export class CSVReader {
protected file: string;
// Defaults; values passed to the constructor override them.
protected csvOptions = {
delimiter: ',',
headers: true,
ignoreEmpty: true,
trim: true
};
/**
 * @param file path of the CSV file
 * @param csvOptions parser options merged over the defaults
 * @throws Error when the file does not exist
 */
constructor(file: string, csvOptions = {}) {
if (!fs.existsSync(file)) {
throw new Error(`File ${file} not found.`);
}
this.file = file;
this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
}
/**
 * Parses the file and calls `callback(row, index)` for every row
 * (index starts at 1).
 *
 * @returns a promise resolving to the callback results in row order; it
 *          rejects when reading, parsing or a callback fails
 */
public read(callback: RowCallBack): Promise < Array < object >> {
return new Promise < Array < object >> ((resolve, reject) => {
const readStream = fs.createReadStream(this.file);
const pending: Array < Promise < object >> = [];
let index = 0;
const csvStream = csv.parse(this.csvOptions).on('data', (data: Row) => {
index++;
const rowIndex = index;
// Wrapping in a promise captures synchronous throws and async results alike.
const result = new Promise < object > (done => done(callback(data, rowIndex)));
// Fail fast; this also keeps the rejection from being reported as unhandled.
result.catch(reject);
pending.push(result);
}).on('error', (err: Error) => {
console.error(err.message);
// Throwing inside an event handler cannot reach the caller; reject instead.
reject(err);
}).on('end', () => {
// 'end' can fire before asynchronous callbacks have settled, so wait for all of them.
Promise.all(pending).then(resolve, reject);
});
readStream.on('error', reject);
readStream.pipe(csvStream);
});
}
}
// Usage example: load users.csv and map each row to a user object.
import { CSVReader } from '../src/helpers/CSVReader';
(async () => {
const reader = new CSVReader('./database/migrations/csv/users.csv');
// The callback receives one row and returns the object collected into `users`.
const users = await reader.read(async data => {
return {
username: data.username,
name: data.name,
email: data.email,
cellPhone: data.cell_phone,
homePhone: data.home_phone,
roleId: data.role_id,
description: data.description,
state: data.state,
};
});
console.log(users);
})();
I have made a node module to read large file asynchronously text or JSON.
Tested on large files.
var fs = require('fs')
, util = require('util')
, stream = require('stream')
, es = require('event-stream');
module.exports = FileReader;
/**
 * Reads text or JSON files asynchronously through a line-splitting stream.
 */
function FileReader(){
}

/**
 * Reads a text file and passes its content to `callback`.
 *
 * @param {string} pathToFile - file to read.
 * @param {Function} callback - called with the file content as one string.
 * @param {Function} [onError] - called with the error when reading fails.
 */
FileReader.prototype.read = function(pathToFile, callback, onError){
    var lines = [];
    var failed = false;

    function fail(err){
        if (failed) { return; }
        failed = true;
        console.log('Error while reading file.');
        if (typeof onError === 'function') { onError(err); }
    }

    fs.createReadStream(pathToFile)
        // pipe() does not forward errors, so a missing file must be handled on the source.
        .on('error', fail)
        .pipe(es.split())
        .pipe(es.mapSync(function(line){
            //console.log('reading line: '+line);
            lines.push(line);
        })
        .on('error', fail)
        .on('end', function(){
            if (failed) { return; }
            console.log('Read entire file.');
            // es.split() strips the line breaks; put them back so text keeps its lines.
            callback(lines.join('\n'));
        })
    );
};

/**
 * Reads a file and passes the parsed JSON value to `callback`.
 *
 * @param {string} pathToFile - JSON file to read.
 * @param {Function} callback - called with the parsed value.
 * @param {Function} [onError] - called with the error when reading or parsing
 *   fails; without it an invalid file throws from the stream's 'end' handler.
 */
FileReader.prototype.readJSON = function(pathToFile, callback, onError){
    this.read(pathToFile, function(txt){
        var parsed;
        // Parsing happens asynchronously, so the try/catch must live inside the callback.
        try{
            parsed = JSON.parse(txt);
        }
        catch(err){
            var invalid = new Error('json file is not valid! '+err.stack);
            if (typeof onError === 'function') {
                onError(invalid);
                return;
            }
            throw invalid;
        }
        callback(parsed);
    }, onError);
};
Just save the file as file-reader.js, and use it like this:
// Usage example: load the module saved as file-reader.js and read a JSON file next to this script.
var FileReader = require('./file-reader');
var fileReader = new FileReader();
// The callback receives the parsed JSON object.
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj){/*callback logic here*/});

Resources