Using csv-parse with highlandjs - node.js

I would like to do a bit of parsing on CSV files to convert them to JSON and extract data from them. I'm using highland as a stream processing library. I am creating an array of CSV parsing streams using:
import { readdir as readdirCb, createReadStream } from 'fs';
import { promisify } from 'util';
import _ from 'highland';
import parse from 'csv-parse';
const readdir = promisify(readdirCb);
const LOGS_DIR = './logs';
const options = '-maxdepth 1';
async function main() {
const files = await readdir(LOGS_DIR)
const stream = _(files)
.map(filename => createReadStream(`${LOGS_DIR}/${filename}`))
.map(parse)
}
main();
I have tried to use the stream like this:
const stream = _(files)
.map(filename => createReadStream(`${LOGS_DIR}/${filename}`))
.map(parse)
.each(stream => {
stream.on('parseable', () => {
let record
while (record = stream.read()) { console.log(record) }
})
})
This does not produce any records. I am not sure how to proceed in order to receive the JSON for each row of each CSV file.
EDIT:
Writing a function like this works for an individual file:
import parse from 'csv-parse';
import transform from 'stream-transform';
import { createReadStream } from 'fs';
export default function retrieveApplicationIds(filename) {
console.log('Parsing file', filename);
return createReadStream(filename).pipe(parser).pipe(getApplicationId).pipe(recordUniqueId);
}
Edit 2:
I have tried using the concat streams approach:
import { PassThrough } from 'stream';
const LOGS_DIR = './logs';
function concatStreams(streamArray, streamCounter = streamArray.length) {
return streamArray.reduce((mergedStream, stream) => {
// pipe each stream of the array into the merged stream
// prevent the automated 'end' event from firing
mergedStream = stream.pipe(mergedStream, { end: false });
// rewrite the 'end' event handler
// Every time one of the streams ends, the counter is decremented.
// Once the counter reaches 0, the merged stream can emit its 'end' event.
stream.once('end', () => --streamCounter === 0 && mergedStream.emit('end'));
return mergedStream;
}, new PassThrough());
}
async function main() {
const files = await readdir(LOGS_DIR)
const streams = files.map(parseFile);
const combinedStream = concatStreams(streams);
combinedStream.pipe(process.stdout);
}
main();
When I use this, I get the following warning:
(node:1050) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 unpipe listeners added to [Transformer]. Use emitter.setMaxListeners() to increase limit
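For reference, here is a minimal sketch of one possible way to get a flat stream of parsed records with highland: wrap each file's parser output in a highland stream and merge them. It assumes the csv-parse v4-style API, where parse(options) returns a Transform stream; columns: true is only an illustrative option to get one object per row.
import { readdir as readdirCb, createReadStream } from 'fs';
import { promisify } from 'util';
import _ from 'highland';
import parse from 'csv-parse';

const readdir = promisify(readdirCb);
const LOGS_DIR = './logs';

async function main() {
  const files = await readdir(LOGS_DIR);
  _(files)
    // wrap each file's parsed output in a highland stream
    .map(filename =>
      _(createReadStream(`${LOGS_DIR}/${filename}`).pipe(parse({ columns: true })))
    )
    // flatten the stream of streams into a single stream of records
    .merge()
    .each(record => console.log(record));
}

main();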

Related

How to update a redux state from a node js function

I'm creating a program that graphs, in real time, the data sent over a serial port.
I'm using Electron, so in the main.js file (where the window for the app is created) I have declared the handler for the data event.
const { SerialPort, ReadlineParser } = require('serialport');
const port = new SerialPort({
path: 'COM5',
baudRate: 9600,
});
const parser = new ReadlineParser({ delimiter: '\r\n' });
port.pipe(parser);
let anterior = '';
let cont = 0;
// on data print the complete data from the serial port
parser.on('data', (line) => {
let value = line.substring(5, 15);
value = parseFloat(value.trim());
if (value > 0.0 && !line.includes('?') && line != anterior) {
console.log(`> ${line}`);
anterior = line;
updateStateFromNode(value, cont);
cont += 1;
}
});
The function I call is this:
import { store } from '../store/store'; // my actual store
import { addValue, addLabel } from '../store/data';
function updateStateFromNode(newValue, newLabel) {
store.dispatch(addValue(newValue));
store.dispatch(addLabel(newLabel));
}
I console.log inside the addValue function to check whether it is reached, and it is...
I have also tried with a custom hook, but that didn't work either.
Does anyone know how I could achieve this?
The main and renderer processes are isolated from each other, which means that dispatching from main will not update the store in your renderer. To communicate between the two processes, you can use IPC. Assuming you use a preload file, you can do something like this:
Main
function updateStateFromNode(newValue, newLabel) {
// We will use a channel named "updateState"
mainWindow.webContents.send("updateState", newValue, newLabel);
}
Preload
const { contextBridge, ipcRenderer } = require('electron');
contextBridge.exposeInMainWorld("electronAPI", {
onUpdateState: (callback) => {
const channel = "updateState";
const subscription = (_event, ...args) => callback(...args);
// Sets the listener on the channel "updateState"
ipcRenderer.on(channel, subscription);
// Returns a function to remove the listener
return () => {
ipcRenderer.removeListener(channel, subscription);
};
}
})
Renderer
import { useEffect } from "react";
import { useDispatch } from "react-redux";
import { addValue, addLabel } from "../store/data";
const { electronAPI } = window; // `electronAPI` is exposed by the preload script
const MyComponent = () => {
const dispatch = useDispatch();
useEffect(() => {
// Calls the function from preload to set the listener on the channel "updateState"
const removeUpdateStateListener = electronAPI.onUpdateState((newValue, newLabel) => {
dispatch(addValue(newValue));
dispatch(addLabel(newLabel));
});
return removeUpdateStateListener; // Removes the listener on unmount
}, [dispatch]);
return (...);
}

IPFS streams not buffers

I'm trying to create an API GET URL that can be used in <img src='...'/> and that will actually load the image from IPFS.
I'm getting the file from IPFS, and I can send it as a buffer via Fastify, but I can't send it as a stream.
Here's the working buffer version using ipfs.cat:
import { concat as uint8ArrayConcat } from "uint8arrays/concat";
import all from "it-all";
fastify.get(
"/v1/files/:username/:cid",
async function (request: any, reply: any) {
const { cid }: { cid: string } = request.params;
const ipfs = create();
const data = uint8ArrayConcat(await all(ipfs.cat(cid)));
reply.type("image/png").send(data);
}
);
Docs for ipfs cat
Docs for fastify reply buffers
I also tried sending it as a stream, to avoid loading the whole file into the server's memory...
import { concat as uint8ArrayConcat } from "uint8arrays/concat";
import all from "it-all";
import { Readable } from "stream";
...
fastify.get(
"/v1/files/:username/:cid",
async function (request: any, reply: any) {
const { cid }: { cid: string } = request.params;
const ipfs = create();
const bufferToStream = async (buffer: any) => {
const readable = new Readable({
read() {
this.push(buffer);
this.push(null);
},
});
return readable;
};
const data = uint8ArrayConcat(await all(ipfs.cat(cid)));
const str = await bufferToStream(data);
reply.send(str);
}
);
This fails with a new error:
Error [ERR_STREAM_WRITE_AFTER_END]: write after end
Here I'm trying to push chunks into the stream:
import { concat as uint8ArrayConcat } from "uint8arrays/concat";
import all from "it-all";
import { Readable } from "stream";
fastify.get(
"/v1/files/:username/:cid",
async function (request: any, reply: any) {
const { cid }: { cid: string } = request.params;
const ipfs = create();
const myStream = new Readable();
myStream._read = () => {};
const pushChunks = async () => {
for await (const chunk of ipfs.cat(cid)) {
myStream.push(chunk);
}
};
pushChunks();
reply.send(myStream);
}
);
The error now is:
INFO (9617): stream closed prematurely
And here I'm trying to dump it all into the stream at once:
import { concat as uint8ArrayConcat } from "uint8arrays/concat";
import all from "it-all";
import { Readable } from "stream";
fastify.get(
"/v1/files/:username/:cid",
async function (request: any, reply: any) {
const { cid }: { cid: string } = request.params;
const ipfs = create();
var myStream = new Readable();
myStream._read = () => {};
myStream.push(uint8ArrayConcat(await all(ipfs.cat(cid))));
myStream.push(null);
reply.send(myStream);
}
);
which fails with:
WARN (14295): response terminated with an error with headers already sent
Is there any benefit to converting it to a stream? Hasn't IPFS already loaded it into memory?
The ipfs module returns the file as many chunks, each one a byte array; the file is the concatenation of these chunks.
If you push all of those chunks into an array and then call uint8ArrayConcat, every chunk is held in your server's memory at once. So if the file is 10 GB, your server keeps a 10 GB byte array in memory.
Since this is surely unwanted, you should instead push each chunk of the IPFS file straight to the response. That way a chunk only passes transiently through the server's memory and is not retained, so at any moment you hold only a tiny slice of the 10 GB file.
Since ipfs.cat returns an async iterator, you can handle it manually or use something like async-iterator-to-stream to write:
const ipfsStream = asyncIteratorToStream(ipfs.cat(cid))
return reply.send(ipfsStream)
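Alternatively, Node's built-in stream.Readable.from accepts an async iterable directly, so a rough sketch without the extra dependency (reusing the same hypothetical route as above) could be:
import { Readable } from "stream";

fastify.get("/v1/files/:username/:cid", async function (request, reply) {
  const { cid } = request.params;
  const ipfs = create();
  // Readable.from wraps the async iterator returned by ipfs.cat,
  // so chunks are streamed to the client as they arrive
  return reply.type("image/png").send(Readable.from(ipfs.cat(cid)));
});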
As a follow-up, I recommend this resource about Node.js streams and buffers.

Transform stream stops processing early

I'm trying to extend the Node.js Transform stream twice: once to split stdin into lines, and again to alternate lines between lowercase and uppercase.
SplitLines is working as intended, but AlternateUppercase is not.
index.js
const { Transform } = require('stream')
class SplitLines extends Transform {
_transform(chunk, encoding, callback) {
const parsed = chunk.toString().trim()
const results = parsed.split('\n')
results.forEach((line) => {
this.push(`${line}\n`)
})
}
}
class AlternateUppercase extends Transform {
constructor(options) {
super(options)
this.isEven = false
}
_transform(chunk, encoding, callback) {
const line = chunk.toString()
const altered = this.isEven ? line.toUpperCase() : line
this.push(`${altered}\n`)
this.isEven = !this.isEven
}
}
process.stdin
.pipe(new SplitLines())
.pipe(new AlternateUppercase())
.pipe(process.stdout)
How I'm testing
echo -e 'one\ntwo\nthree' | node index.js
What I'm seeing in the terminal
one
What I would expect to see
one
TWO
three
Am I doing something wrong that is causing the stream to stop processing early?
As the Node documentation states, you are required to call the callback function in order to receive the next chunk. This is not apparent with SplitLines because the whole input arrived as a single chunk. However, because SplitLines pushes repeatedly, it sends multiple chunks downstream, so AlternateUppercase must call the callback to keep receiving them.
const { Transform } = require('stream')
class SplitLines extends Transform {
_transform(chunk, encoding, callback) {
const parsed = chunk.toString().trim()
const results = parsed.split('\n')
results.forEach((line) => {
this.push(`${line}\n`)
})
}
}
class AlternateUppercase extends Transform {
constructor(options) {
super(options)
this.isEven = false
}
_transform(chunk, encoding, callback) {
const line = chunk.toString()
const altered = this.isEven ? line.toUpperCase() : line
this.push(`${altered}\n`)
callback() //must call callback to receive next chunk
this.isEven = !this.isEven
}
}
process.stdin
.pipe(new SplitLines())
.pipe(new AlternateUppercase())
.pipe(process.stdout)
The output would be:
one
TWO
three
The extra blank lines appear because you append \n in both transforms. To fix this, add the \n in only one place, i.e. in either SplitLines or AlternateUppercase, but not both.
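For example, keeping the newline in SplitLines only, AlternateUppercase can push the line through unchanged apart from the case change:
class AlternateUppercase extends Transform {
  constructor(options) {
    super(options)
    this.isEven = false
  }
  _transform(chunk, encoding, callback) {
    const line = chunk.toString()
    // SplitLines already appended the newline, so don't add another one
    this.push(this.isEven ? line.toUpperCase() : line)
    this.isEven = !this.isEven
    callback()
  }
}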

How to use readline in NodeJs on an event (wait stream close)

I'm creating a bot that, when it receives a message, starts reading a text file and replies to the message with the contents of the file.
Unfortunately, I can't get out of this asynchronous hell, and I only get errors, undefined, or a pending Promise.
The last experiment was this:
const fs = require('fs');
const readline = require('readline');
// bot.listen("message").reply(responseText())
function readFile(file) {
var text = '';
var readInterface = readline.createInterface({
input: fs.createReadStream(file),
terminal: false
});
readInterface.on('line', function(line) {
linea = line.trim();
console.log(linea);
text += linea;
}).on('close', function() {
return text;
});
}
async function responseText() {
var content = await readFile("file.txt");
content.then(function(data) {
return data;
})
}
What I would like is to delay the response until I have the contents of the file.
I know that Node is asynchronous by design, but I can't figure out how to handle this!
Thanks, all.
If you want to use async/await, you need to create a Promise and return it:
function readFile(file) {
return new Promise((res, rej) => {
try {
var text = '';
var readInterface = readline.createInterface({
input: fs.createReadStream(file),
terminal: false
});
readInterface
.on('line', function (line) {
linea = line.trim();
text += linea;
})
.on('close', function () {
res(text);
});
} catch(err){
rej(err)
}
});
}
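With readFile now returning a Promise, the calling code from the question can simply await it, e.g.:
async function responseText() {
  const content = await readFile('file.txt');
  return content; // the concatenated lines of the file
}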
If you're using Express.js, or any framework built on top of it, you can simply pipe the read stream to the response, since Express responses are streams to begin with:
const es = require('event-stream')
const { inspect } = require('util')
...
let getFileStream = path => (
fs.createReadStream(path)
.pipe(es.split())
.pipe(es.map(function (data, cb) {
cb(null, inspect(JSON.parse(data)))
}))
);
router.get('/message', function (req, res, next) {
let file$ = getFileStream(yourFilePath)
file$.on('error', next).pipe(res)
})
If you need to transform the file content, you can use a transform stream or, as shown in the example above, a synchronous event-stream mapping. The idea is to always work with the file content at the stream level, so you avoid loading the entire file into memory.
You don't really want to buffer the whole file content in memory; that can quickly become a problem with huge files on a busy day. What you need is to pipe the file stream directly to the browser, and the same principle applies to any other kind of consumer.
Of course, if the mechanism is entirely internal, you should only pass the file path (or the actual stream) along until you need to open the file and do something with its content. At that point you reach back into your stream toolbox, whether that is the native Node.js stream API, the event-stream package, or some observable library like RxJS.
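As an illustration, a native transform stream piped between the file and the response might look like the sketch below; the uppercasing is just a stand-in for whatever per-chunk processing you need, and router and yourFilePath are the same hypothetical names as in the snippet above.
const fs = require('fs')
const { Transform } = require('stream')

router.get('/message', function (req, res, next) {
  // stand-in transform: uppercases each chunk as it flows through
  const upperCase = new Transform({
    transform(chunk, encoding, callback) {
      callback(null, chunk.toString().toUpperCase())
    }
  })
  fs.createReadStream(yourFilePath)
    .on('error', next)
    .pipe(upperCase)
    .pipe(res)
})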
I had a similar issue in an app that watches a directory for new files, reads the file(s), and returns derived data based on the file content. My Reader function is based on this async example from the Node.js docs. I return options, which contains the context, only after the file has been read completely.
const { createReadStream } = require('fs')
const { createInterface } = require('readline')
const { once } = require('events')
// Reader.js
async function Reader (options) {
let { file, cb } = options
let fileStream = createReadStream(file)
const readInterface = createInterface({
input: fileStream,
crlfDelay: Infinity
})
readInterface.on('line', (line) => {
cb(line)
})
await once(readInterface, 'close')
return options
}
module.exports = Reader
I then have a file which imports my Reader and defines how to use it. I define a callback function to pass to the line event listener. I bind the callback to the options object that I pass to my Reader function. In the readFile function I make sure to return the call to Reader, which is a Promise.
/**
* @desc callback to instruct what to do with each line read
*
* @param {*} line
*/
const readFileLine = function (line) {
const linea = line.trim();
console.log(linea);
this.context += linea
}
/**
* @desc once the added file is ready to be processed, read the file line by line
* @listens {Event} listens for `process` event
*/
const readFile = (options) => {
return Reader(options)
}
/**
* @desc Call the file reader and do what you need with the response
*
*/
const getResponseFromFiles = (file) => {
const opts = {}
opts.cb = readFileLine.bind(opts)
opts.context = ''
opts.file = file
readFile(opts)
.then(data => {
process.exitCode = 0
console.log(data)
return data
})
.catch(err => {
process.exitCode = 1
console.log(err.message)
})
}

What is the "reactive" way to read file line-by-line

I'm learning reactive programming with RxJS and have encountered a case where I need to read a file line by line. I actually solved it using a solution like this one:
https://gist.github.com/yvele/447555b1c5060952a279
It works, but I need to use some plain JS code to transform the stream of Buffers into a stream of lines (using the "readline" module in the example above).
I wonder if there are other ways to transform an Observable of Buffers into an Observable of lines using RxJS operators, like the example below.
var Rx = require('rx');
var fs = require('fs');
var lines = Rx.Observable
.fromEvent(rl, 'data') // emits buffers over time
// some transforms ...
.subscribe(
(line) => console.log(line), // emit string line by line
err => console.log("Error: %s", err),
() => console.log("Completed")
);
You can probably achieve something pretty close to what you want with scan and concatMap.
Something like:
bufferSource
.concat(Rx.Observable.of("\n")) // to make sure we don't miss the last line!
.scan(({ buffer }, b) => {
const splitted = buffer.concat(b).split("\n");
const rest = splitted.pop();
return { buffer: rest, items: splitted };
}, { buffer: "", items: [] })
// Each item here is a pair { buffer: string, items: string[] }
// such that buffer contains the remaining input text that has no newline
// and items contains the lines that have been produced by the last buffer
.concatMap(({ items }) => items)
// we flatten this into a sequence of items (strings)
.subscribe(
item => console.log(item),
err => console.log(err),
() => console.log("Done with this buffer source"),
);
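For completeness, bufferSource above is assumed to be an Observable of strings. One hypothetical way to build it from a file read stream (ignoring the 'end' and 'error' events, which you would still need to wire up) would be:
const fs = require('fs');
const Rx = require('rx');

// emits the file's contents as decoded string chunks over time
const bufferSource = Rx.Observable
  .fromEvent(fs.createReadStream('myfile', 'utf8'), 'data');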
You can use the following class:
'use strict'
const lineReader = require('line-reader');
const Rx = require('rxjs');
const RxOp = require('rxjs/operators');
class CSVReader {
constructor(filepath) {
this.filepath = filepath;
}
readByLines()
{
const source = new Rx.Subject();
lineReader.open(this.filepath, (err, reader)=> {
Rx.of(0).pipe(
RxOp.expand(val => {
reader.nextLine((err2, line) => source.next(line));
return Rx.of(1 + val);
}),
RxOp.takeWhile(_=> {
let has = reader.hasNextLine();
if(!has) source.complete();
return has;
})
).subscribe(_=>_);
})
return source;
}
}
module.exports = CSVReader
and use it as follows
const { bufferCount } = require('rxjs/operators');
let reader = new CSVReader('path/to/file');
reader.readByLines()
.pipe(bufferCount(2)) // chunk size
.subscribe(chunk=> {
console.log({chunk});
});
I would do it like this:
const readline = require('readline');
const fs = require('fs');
const path = require('path');
const {fromEvent, race, Observable} = require('rxjs');
const {tap, takeUntil, take, map} = require('rxjs/operators');
const rl = readline.createInterface({
input: fs.createReadStream(path.resolve('./', 'myfile'))
});
let obs = new Observable(observer=>{
rl.on('line', val => observer.next(val)),
rl.on('error', err => observer.error(err)),
rl.on('close', complete => observer.complete(complete))
})
.pipe(tap(line=>console.log(`line: ${line}`)))
obs.subscribe(()=>{},
(e)=>console.log(`Error reading file: ${e}`),
()=>console.log("Read complete"))
An alternative for creating the observable could be:
let obs = fromEvent(rl, 'line')
.pipe(
takeUntil(race(
fromEvent(rl, 'close').pipe(take(1)) ,
fromEvent(rl, 'error').pipe(map((err)=>{throw err}))
)))
Ideally, RxJS could provide an operator like fromEvent(emitter, nextEvent, errorEvent, completeEvent) to keep the above code even simpler.
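A small hand-rolled helper along those lines (just a sketch wrapping the same three listeners used above) could look like:
const { Observable } = require('rxjs');

// turns next/error/complete events from an EventEmitter into a single Observable
function fromEvents(emitter, nextEvent, errorEvent, completeEvent) {
  return new Observable(observer => {
    emitter.on(nextEvent, value => observer.next(value));
    emitter.on(errorEvent, err => observer.error(err));
    emitter.on(completeEvent, () => observer.complete());
  });
}

// usage with the readline interface from above:
// const obs = fromEvents(rl, 'line', 'error', 'close');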
I tried a bunch of the above answers and built my own ugly version. Then I poked around the RxJS code on GitHub and found that it handles stream-like objects, so there's no point in mucking around with events. Just pass a ReadStream to from; it checks it against ReadableStreamLike and then turns it into an AsyncGenerator.
import * as fs from 'node:fs';
import * as readline from 'node:readline';
import { from } from 'rxjs';
const file = fs.createReadStream(fileName);
const line = readline.createInterface({ input: file });
const line$ = from(line).subscribe({
next: (dat) => { ... },
error: (err) => { ... },
complete: () => { ... }
});
