I am basically trying to:
open a csv file as a stream
make some operation on each line
stream the result into a second csv file
in node.js.
Here is my code:
var fs = require("fs");
var csv = require("csv");
var readStream = fs.createReadStream("input.csv");
var writeStream = fs.createWriteStream("output.csv");
var csvStream = csv
  .parse()
  .on("data", function(data){
    //do some stuff with data
    return(JSON.stringify(data));
  })
  .on("end", function(){
    console.log("done");
  })
  .on("error", function(error){
    console.log(error)
  });
(readStream.pipe(csvStream)).pipe(writeStream);
I am getting "TypeError: Invalid non-string/buffer chunk". What am I doing wrong? I am totally new to node.js, so please detail your answer.
You are reading the data correctly. However, using return is not the right way to transform your data: the CSV stream cannot at the same time output untransformed records (which you are reading in your data event handler) and transformed data that you would pipe to writeStream.
To use pipe with the writeStream, you would need a readable stream that outputs your transformed data. That would mean wrapping your transform function in a read/write (Transform) stream and piping fileReader > csvReader > transformStream > writeStream.
It is way simpler to attach a function to the data event of the csv reader like you did, but you then need to write to the file manually.
The corrected code may be clearer this way:
var fs = require("fs");
var csv = require("csv");

var readStream = fs.createReadStream("input.csv"); // readStream is a read-only stream with the raw text content of the CSV file
var writeStream = fs.createWriteStream("output.csv"); // writeStream is a write-only stream used to write to the disk

var csvStream = csv.parse(); // csvStream is a read and write stream: it reads raw CSV text and outputs untransformed records

csvStream.on("data", function(data) {
  //console.log(data)
  writeStream.write(JSON.stringify(data));
})
.on("end", function(){
  console.log("done");
})
.on("error", function(error){
  console.log(error)
});
readStream.pipe(csvStream)
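For reference, here is a minimal sketch of the fully piped alternative mentioned above (fileReader > csvReader > transformStream > writeStream), using Node's built-in stream.Transform; the transform step just JSON-stringifies each record like the original handler, so adapt it to your real transformation:

var fs = require("fs");
var csv = require("csv");
var Transform = require("stream").Transform;

var readStream = fs.createReadStream("input.csv");
var writeStream = fs.createWriteStream("output.csv");
var csvStream = csv.parse();

// Transform stream with an object-mode writable side: it receives one parsed
// record at a time and pushes the transformed text downstream
var transformStream = new Transform({
  writableObjectMode: true,
  transform: function(record, encoding, callback) {
    // do some stuff with record, then emit a string chunk
    callback(null, JSON.stringify(record) + "\n");
  }
});

readStream.pipe(csvStream).pipe(transformStream).pipe(writeStream);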
I have basically done the following:
parsed (reads csv strings and writes objects and arrays)
stringified (reads objects and arrays and writes csv strings)
piped all that to the writeStream for my file
function readingAppendingAndWritingToCSVFile(readStream, writeStream) {
  const results = [];
  const p = new Promise((resolve, reject) => {
    readStream.pipe(csv.parse({ columns: true }))
      .on('data', (data) => {
        console.log('data --------------', data);
        data.name = 'somename'; // will add new column with same data in all rows
        console.log('data after pushing ----------', data);
        results.push(data);
      })
      .on('error', (err) => {
        console.log('error ------------', err);
        reject(err);
      })
      .on('finish', () => {
        console.log('all the csv strings parsed to objects -------------', results);
      })
      .pipe(csv.stringify({ header: true }))
      .pipe(writeStream);

    // resolve only once everything has been flushed to the output file
    writeStream.on('finish', () => resolve(results));
  });
  return p;
}
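A quick usage sketch (the file names are just placeholders):

const fs = require('fs');
const csv = require('csv');

const readStream = fs.createReadStream('input.csv');
const writeStream = fs.createWriteStream('output.csv');

readingAppendingAndWritingToCSVFile(readStream, writeStream)
  .then((rows) => console.log(`wrote ${rows.length} rows`))
  .catch((err) => console.error(err));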
I'm learning how to use the csv-parse module for nodejs. I wrote this code and it works perfectly:
var fs = require('fs');
const fileName = './spreadsheet.csv';
const assert = require('assert');
const { parse } = require('csv-parse');

const records = [];

// Initialize the parser
const parser = parse({
  delimiter: ','
});

// Use the readable stream api to consume records
parser.on('readable', function(){
  let record;
  while ((record = parser.read()) !== null) {
    records.push(record);
  }
});

// Catch any error
parser.on('error', function(err){
  console.error(err.message);
});

fs.readFile(fileName, 'utf8', function (err, f) {
  if (err) {
    return console.error(err);
  }
  const rows = f.split("\r\n");
  for (let x in rows) {
    parser.write(rows[x] + "\n");
  }
  parser.end();
  console.log(records);
});
But right now, I depend on the fs module and fs.readFile to consume my csv file. Does csv-parse have an option to read from a file? I ask because, as you can see in my code, I have to specify my own line-break characters, which could differ between csv files. I thought maybe the csv-parse module would have something that can more readily address such a situation?
The parser object will do most of the work for you. It expects the data to arrive on its stream interface and it will do everything else. All you have to do is open a stream and then pipe it to the parser, like this:
fs.createReadStream(fileName).pipe(parser);
And, here it is combined with your code:
const fs = require('fs');
const fileName = './spreadsheet.csv';
const { parse } = require('csv-parse');

const records = [];

// Initialize the parser
const parser = parse({
  delimiter: ','
});

// Use the readable stream api to consume records
parser.on('readable', function(){
  let record;
  while ((record = parser.read()) !== null) {
    records.push(record);
  }
});

// Catch any error
parser.on('error', function(err){
  console.error(err.message);
});

parser.on('end', function() {
  console.log(records);
});

// open the file and pipe it into the parser
fs.createReadStream(fileName).pipe(parser);
P.S. It's amazing that such a simple example of getting the CSV data from a file is not shown in the documentation (at least not anywhere I could find it). I'm also surprised they don't offer an option to collect the records from the stream automatically, instead of requiring you to implement the readable event handler. Odd, for such an otherwise complete package.
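For what it's worth, because the parser is a standard Node stream, the records can also be consumed with for await...of instead of a 'readable' handler. A minimal sketch, assuming a Node version where readable streams are async iterable (10+):

const fs = require('fs');
const { parse } = require('csv-parse');

async function readRecords(fileName) {
  const parser = fs.createReadStream(fileName).pipe(parse({ delimiter: ',' }));
  const records = [];
  // each iteration waits for the next parsed record coming out of the stream
  for await (const record of parser) {
    records.push(record);
  }
  return records;
}

readRecords('./spreadsheet.csv').then((records) => console.log(records));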
I'm learning how to do large file manipulation with Node and streams. I'm stuck in the middle of a file change: when I pass the result down to another module, I think the file is still being processed in memory when it reaches that module.
I get a zip from an s3 bucket locally and unzip the contents:
try {
  const stream = fs.createReadStream(zipFile).pipe(unzipper.Extract({ path }))
  stream.on('error', err => console.error(err))
  stream.on('close', async () => {
    fs.removeSync(zipFile)
    try {
      const neededFile = await dir(path) // delete files not needed from zip, rename and return named file
      await mod1(neededFile) // review file, edit and return info
      await mod2(neededFile, data) // pass down data for further changes
      return
    } catch (err) {
      console.log('error')
    }
  })
} catch (err) {
  console.log('stream error')
}
During the initial unzip I learned that there is a difference between the stream 'close' and 'finish' events, because I could pass the file to the first module and start the manipulation, but (I guess due to the size) the output and the file never matched. After cleaning up the files I don't need, I pass the renamed file to mod1 for changes and run a writeFileSync:
mod1.js:
const fs = require('fs-extra')
module.exports = file => {
  fs.readFile(file, 'utf8', (err, data) => {
    if (err) return console.log(err)
    try {
      const result = data.replace(/: /gm, `:`).replace(/(?<=location:")foobar(?=")/gm, '')
      fs.writeFileSync(file, result)
    } catch (err) {
      console.log(err)
      return err
    }
  })
}
when I tried to do the above with:
const readStream = fs.createReadStream(file)
const writeStream = fs.createWriteStream(file)
readStream.on('data', chunk => {
  const data = chunk.toString().replace(/: /gm, `:`).replace(/(?<=location:")foobar(?=")/gm, '')
  writeStream.write(data)
})
readStream.on('end', () => {
  writeStream.close()
})
the file would always be blank. After writeFileSync I proceed with the next module to search for a line ref:
mod2.js:
const fs = require('fs-extra')
module.exports = (file, data) => {
  const parseFile = fs.readFileSync(file, 'utf8')
  parseFile.split(/\r?\n/).map((line, idx) => {
    if (line.includes(data)) console.log(idx + 1)
  })
}
but the line number returned is that of the initially unzipped file, not the file that was modified by the first module. Since I thought the sync write would operate on the file on disk, it would appear the file being referenced is still in memory? These are the resources I found while learning about streams:
Working with Node.js Stream API
Stream
How to use stream.pipe
Understanding Streams in Node.js
Node.js Streams: Everything you need to know
Streams, Piping, and Their Error Handling in Node.js
Writing to Files in Node.js
Error handling with node.js streams
Node.js Readable file stream not getting data
Node.js stream 'end' event not firing
NodeJS streams not awaiting async
stream-handbook
How should a file be manipulated after an unzip stream, and why does the second module see the file as it was after unzipping rather than after it was modified? Is it possible to write multiple streams synchronously?
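One detail that may explain the mismatch: mod1 starts an asynchronous fs.readFile and returns immediately, so the await mod1(neededFile) in the unzip handler does not actually wait for the rewrite before mod2 reads the file. A minimal sketch of mod1 returning a promise so the await takes effect (an illustration, not the original code):

// mod1.js (sketch): return a promise so callers can await the rewrite
const fs = require('fs-extra')

module.exports = file =>
  new Promise((resolve, reject) => {
    fs.readFile(file, 'utf8', (err, data) => {
      if (err) return reject(err)
      const result = data.replace(/: /gm, `:`).replace(/(?<=location:")foobar(?=")/gm, '')
      // resolve only once the modified content is actually on disk
      fs.writeFile(file, result, err => (err ? reject(err) : resolve(file)))
    })
  })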
I have a set of videos. I want to take a screenshot from each of them, then do some processing on the generated images, and finally store them.
To be able to do the processing I need to get the screenshots as buffers.
This is my code:
ffmpeg(videoFilePath)
  .screenshots({
    count: 1,
    timestamps: ['5%'],
    folder: DestinationFolderPath,
    size: thumbnailWidth + 'x' + thumbnailHeight,
  })
  .on('error', function (error) {
    console.log(error)
  });
As you can see, the output is stored directly in DestinationFolderPath. Instead of that, I want to get the output as a buffer.
I'm not sure how to do that directly, but the screenshot is saved in a folder in your file system, so you could read the file from there and convert it into a buffer.
const thumbnailStream = createReadStream(thumbnailPath)
const thumbnailBuffer = await stream2buffer(thumbnailStream)
There are a lot of ways of transforming a stream into a buffer; you can check them out in this question.
e.g. from this answer
function stream2buffer(stream) {
  return new Promise((resolve, reject) => {
    const _buf = [];
    stream.on("data", (chunk) => _buf.push(chunk));
    stream.on("end", () => resolve(Buffer.concat(_buf)));
    stream.on("error", (err) => reject(err));
  });
}
const thumbnailStream = createReadStream(thumbnailPath)
const thumbnailBuffer = await stream2buffer(thumbnailStream)
And createReadStream is imported from fs
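Putting the two pieces together, a sketch that assumes fluent-ffmpeg's 'filenames' and 'end' events and reuses the stream2buffer helper above together with the question's variables:

const path = require('path');
const { createReadStream } = require('fs');

let generatedNames = [];

ffmpeg(videoFilePath)
  .on('filenames', (filenames) => {
    // fluent-ffmpeg reports the names of the files it is about to generate
    generatedNames = filenames;
  })
  .on('end', async () => {
    // by now the screenshot exists on disk, so it can be read back as a buffer
    const thumbnailPath = path.join(DestinationFolderPath, generatedNames[0]);
    const thumbnailBuffer = await stream2buffer(createReadStream(thumbnailPath));
    // ...do the image processing on thumbnailBuffer here
  })
  .on('error', (error) => console.log(error))
  .screenshots({
    count: 1,
    timestamps: ['5%'],
    folder: DestinationFolderPath,
    size: thumbnailWidth + 'x' + thumbnailHeight,
  });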
I currently have a csv file that is 1.3 million lines long. I'm trying to parse this file line by line and run a process on each line. The issue I am running into is that I run out of heap memory. I've read online and tried a bunch of solutions to avoid storing the entire file in memory, but it seems nothing is working. Here is my current code:
const readLine = createInterface({
  input: createReadStream(file),
  crlfDelay: Infinity
});

readLine.on('line', async (line) => {
  let record = parse2(`${line}`, {
    delimiter: ',',
    skip_empty_lines: true,
    skip_lines_with_empty_values: false
  });
  // Do something with record
  index++;
  if (index % 1000 === 0) {
    console.log(index);
  }
});

// halts process until all lines have been processed
await once(readLine, 'close');
This starts off strong, but slowly the heap gets filled, and I run out of memory and the program crashes. I'm using a readstream, so I don't understand why the file is filling the heap.
Try using the csv-parser library: https://www.npmjs.com/package/csv-parser
const csv = require('csv-parser');
const fs = require('fs');
fs.createReadStream('data.csv')
  .pipe(csv())
  .on('data', (row) => {
    console.log(row);
  })
  .on('end', () => {
    console.log('CSV file successfully processed');
  });
Taken from: https://stackabuse.com/reading-and-writing-csv-files-with-node-js/
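If the per-row processing is asynchronous, it can also help to pause the stream while each row is handled so parsed rows do not pile up on the heap. A sketch, where processRow is a hypothetical async function standing in for the per-line work:

const csv = require('csv-parser');
const fs = require('fs');

const stream = fs.createReadStream('data.csv').pipe(csv());

stream.on('data', async (row) => {
  // stop the flow of 'data' events while this row is being processed
  stream.pause();
  try {
    await processRow(row); // hypothetical async per-row work
  } finally {
    stream.resume();
  }
});

stream.on('end', () => console.log('CSV file successfully processed'));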
I had tried something similar for a file of ~2 GB and it worked without any issue using EventStream:
var fs = require('fs');
var eventStream = require('event-stream');
fs
  .createReadStream('veryLargeFile.txt')
  .pipe(eventStream.split())
  .pipe(
    eventStream
      .mapSync(function(line) {
        // Do something with record `line`
      })
      .on('error', function(err) {
        console.log('Error while reading file.', err);
      })
      .on('end', function() {
        // On End
      })
  )
Please try and let me know if it helps
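For reference, the readline approach from the question can also apply backpressure if the lines are consumed with for await...of instead of an async 'line' handler, so each line finishes processing before the next one is read. A sketch, assuming Node 12+ and csv-parse's synchronous API standing in for parse2:

const { createInterface } = require('readline');
const { createReadStream } = require('fs');
const { parse } = require('csv-parse/sync'); // assumes a csv-parse version that ships the sync entry point

async function processFile(file) {
  const readLine = createInterface({
    input: createReadStream(file),
    crlfDelay: Infinity
  });

  let index = 0;
  // the loop only pulls the next line once the previous iteration has finished,
  // so slow per-line work no longer lets lines accumulate on the heap
  for await (const line of readLine) {
    const [record] = parse(line, { delimiter: ',', skip_empty_lines: true });
    // Do something with record
    index++;
    if (index % 1000 === 0) console.log(index);
  }
}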
The following lines will download an image file from a specified url variable:
var filename = path.join(__dirname, url.replace(/^.*[\\\/]/, ''));
request(url).pipe(fs.createWriteStream(filename));
And these lines will take that image and save to MongoDB GridFS:
var gfs = Grid(mongoose.connection.db, mongoose.mongo);
var writestream = gfs.createWriteStream({ filename: filename });
fs.createReadStream(filename).pipe(writestream);
Chaining pipe like this throws Error: 500 Cannot Pipe. Not Pipeable.
request(url).pipe(fs.createWriteStream(filename)).pipe(writestream);
This happens because the image file is not ready to be read yet, right? What should I do to get around this problem?
Using the following: Node.js 0.10.10, mongoose, request and gridfs-stream libraries.
request(url).pipe(fs.createWriteStream(filename)).pipe(writestream);
is the same as this:
var fileStream = fs.createWriteStream(filename);
request(url).pipe(fileStream);
fileStream.pipe(writestream);
So the issue is that you are attempting to .pipe one WriteStream into another WriteStream.
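A sketch of two common ways around it: pipe the same download into both destinations, or wait until the file has finished writing before reading it back for GridFS:

// Option 1: pipe the same download into both the file and GridFS
var download = request(url);
download.pipe(fs.createWriteStream(filename));
download.pipe(writestream);

// Option 2: finish writing the file first, then read it back into GridFS
request(url)
  .pipe(fs.createWriteStream(filename))
  .on('close', function () {
    fs.createReadStream(filename).pipe(writestream);
  });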
// create 'fs' module variable
var fs = require("fs");
// open the streams
var readerStream = fs.createReadStream('inputfile.txt');
var writerStream = fs.createWriteStream('outputfile.txt');
// pipe the read and write operations
// read input file and write data to output file
readerStream.pipe(writerStream);
I think the confusion in chaining the pipes is caused by the fact that the pipe method implicitly "makes choices" on its own about what to return. That is:
readableStream.pipe(writableStream) // Returns writable stream
readableStream.pipe(duplexStream) // Returns readable stream
But the general rule says that you can only pipe a Readable Stream into a Writable Stream. In other words, only Readable Streams have the pipe() method.
You cannot chain anything after the WriteStream because the latter is not duplex; therefore, for a gzipped archive, you would do the following:
var zlib = require('zlib');
var gunzip = zlib.createGunzip(); // decompression stream placed between the archive and the destination file

request.get(url, {
  gzip: true,
  encoding: null
})
.pipe(fs.createWriteStream(tmpPath))
.on('close', function() {
  console.info("downloaded %s", tmpPath);
  fs.createReadStream(tmpPath)
    .pipe(gunzip)
    .pipe(fs.createWriteStream(destPath))
    .on('close', function() {
      console.info("unarchived %s", destPath);
    })
    .on('error', (error) => {
      console.warn("gzip error:%#", error);
    })
})
.on('error', (error) => {
  console.warn("download error:%#", error);
})