nodejs: each line in separate file - node.js

I want to split a file: each line in a separate file. The initial file is really big. I finished with code bellow:
var fileCounter = -1;
function getWritable() {
fileCounter++;
writable = fs.createWriteStream('data/part'+ fileCounter + '.txt', {flags:'w'});
return writable;
}
var readable = fs.createReadStream(file).pipe(split());
readable.on('data', function (line) {
var flag = getWritable().write(line, function() {
readable.resume();
});
if (!flag) {
readable.pause();
}
});
It works but it is ugly. Is there more nodish way to do that? maybe with piping and without pause/resume.
NB: it's not a question about lines/files/etc . The question is about streams and I just try to illustrate it with the problem

You can use Node's built-in readline module.
var fs = require('fs');
var readline = require('readline');
var fileCounter = -1;
var file = "foo.txt";
readline.createInterface({
input: fs.createReadStream(file),
terminal: false
}).on('line', function(line) {
var writable = fs.createWriteStream('data/part'+ fileCounter + '.txt', {flags:'w'});
writable.write(line);
fileCounter++
});
Note that this will lose the last line of the file if there is no newline at the end, so make sure your last line of data is followed by a newline.
Also note that the docs indicate that it is Stability index 2, meaning:
Stability: 2 - Unstable The API is in the process of settling, but has
not yet had sufficient real-world testing to be considered stable.
Backwards-compatibility will be maintained if reasonable.

How about the following? Did you try? Pause and resume logic isn't realy needed here.
var split = require('split');
var fs = require('fs');
var fileCounter = -1;
var readable = fs.createReadStream(file).pipe(split());
readable.on('data', function (line) {
fileCounter++;
var writable = fs.createWriteStream('data/part'+ fileCounter + '.txt', {flags:'w'});
writable.write(line);
writable.close();
});
Piping dynamically would be hard...
EDIT: You could create a writable (so pipe()able) object that would, on('data') event, do the "create file, open it, write the data, close it" but it :
wouldn't be reusable
wouldn't follow the KISS principle
would require a special and specific logic for file naming (It would accept a string pattern as an argument in its constructor with a placeholder for the number. Etc...)
I realy don't recommend that path, or you're going to take ages implementing a non-realy-reusable module. Though, that would make a good writable implementation exercise.

Related

How to delete lines of text from file with createWriteStream with Node.js?

I'm trying to update a huge text document by deleting text that is dynamically received from an array. I cannot use readFileSync because the file is way too large so I have to stream it. The problem im encountering is the function deletes everything instead of only deleting what's in the array. Perhaps im not understanding how to properly delete something from a stream. How can this be done?
largeFile_example.txt
test_domain_1
test_domain_2
test_domain_3
test_domain_4
test_domain_5
test_domain_6
test_domain_7
test_domain_8
test_domain_9
test_domain_10
stream.js
const es = require('event-stream');
const fs = require('fs');
//array of domains to delete
var domains = ['test_domain_2','test_domain_6','test_domain_8'];
//loop
domains.forEach(function(domain){
//domain to delete
var dom_to_delete = domain;
//stream
var s = fs
.createReadStream('largeFile_example.txt')
.pipe(es.split())
.pipe(
es
.mapSync(function(line) {
//check if found in text
if(line === dom_to_delete){
//delete
var newValue = dom_to_delete.replace(line, '');
fs.createWriteStream('largeFile_example.txt', newValue, 'utf-8');
}
})
.on('error', function(err) {
console.log('Error while reading file.', err);
})
.on('end', function() {
//...do something
}),
);
})
You can simply use readline interface with the streams and you can read line by line. When you encounter any domain from the array just don't add it.
You can use for-of with async/await
const fs = require('fs');
const readline = require('readline');
async function processLine() {
const fileStream = fs.createReadStream('yourfile');
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity
});
// Note: crlfDelay recognize all instances of CR LF
// ('\r\n') in file as a single line break.
for await (const line of rl) {
// each line will be here as domain
// create a write stream and append it to the file
// line by line using { flag: a }
}
}
processLine();
To delete the domains from the existing file, you need to follow these steps:
Need to read the file as a stream.
Replace the text you don't want with the '' using regex or replace method.
add the updated content to the temp file or a new file.
There is no way you can read from one point and update the same line. I mean I am not aware of such a technique in Node.js(will be happy to know that). So that's why you need to create a new file and once updated remove the old file.
Maybe you can add some more value to how you code it as I am not sure why you want to do that. If your file is not large you can do that in-place, but your case is different.

How to append to a file in Node.js but limit the file to a certain size

I would like to truncate a file by newline \n so that it only grows to some max number of lines. How do I do that with something like fs.appendFileSync?
You can address this problem by investigating readline API from node:
const fs = require('fs');
const readline = require('readline');
async function processLineByLine() {
const fileStream = fs.createReadStream('input.txt');
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity
});
for await (const line of rl) {
// count your lines in the file
// you can copy into output stream the content
// of every line till it did not pass the max line number
}
// if the counter is not yet finished using
// rl.write() you can continue appending to the file
}
processLineByLine();
A second idea very similar to this one was answered here:
Parsing huge logfiles in Node.js - read in line-by-line

Is there a more elegant way to read then write *the same file* with node js stream

I wanna read file then change it with through2 then write into the same file, code like:
const rm = require('rimraf')
const through2 = require('through2')
const fs = require('graceful-fs')
// source file path
const replacementPath = `./static/projects/${destPath}/index.html`
// temp file path
const tempfilePath = `./static/projects/${destPath}/tempfile.html`
// read source file then write into temp file
await promiseReplace(replacementPath, tempfilePath)
// del the source file
rm.sync(replacementPath)
// rename the temp file name to source file name
fs.renameSync(tempfilePath, replacementPath)
// del the temp file
rm.sync(tempfilePath)
// promiseify readStream and writeStream
function promiseReplace (readfile, writefile) {
return new Promise((res, rej) => {
fs.createReadStream(readfile)
.pipe(through2.obj(function (chunk, encoding, done) {
const replaced = chunk.toString().replace(/id="wrap"/g, 'dududud')
done(null, replaced)
}))
.pipe(fs.createWriteStream(writefile))
.on('finish', () => {
console.log('replace done')
res()
})
.on('error', (err) => {
console.log(err)
rej(err)
})
})
}
the above code works, but I wanna know can I make it more elegant ?
and I also try some temp lib like node-temp
unfortunately, it cannot readStream and writeStream into the same file as well, and I open a issues about this.
So any one know a better way to do this tell me, thank you very much.
You can make the code more elegant by getting rid of unnecessary dependencies and using the newer simplified constructor for streams.
const fs = require('fs');
const util = require('util');
const stream = require('stream');
const tempWrite = require('temp-write');
const rename = util.promisify(fs.rename);
const goat2llama = async (filePath) => {
const str = fs.createReadStream(filePath, 'utf8')
.pipe(new stream.Transform({
decodeStrings : false,
transform(chunk, encoding, done) {
done(null, chunk.replace(/goat/g, 'llama'));
}
}));
const tempPath = await tempWrite(str);
await rename(tempPath, filePath);
};
Tests
AVA tests to prove that it works:
import fs from 'fs';
import path from 'path';
import util from 'util';
import test from 'ava';
import mkdirtemp from 'mkdirtemp';
import goat2llama from '.';
const writeFile = util.promisify(fs.writeFile);
const readFile = util.promisify(fs.readFile);
const fixture = async (content) => {
const dir = await mkdirtemp();
const fixturePath = path.join(dir, 'fixture.txt');
await writeFile(fixturePath, content);
return fixturePath;
};
test('goat2llama()', async (t) => {
const filePath = await fixture('I like goats and frogs, but goats the best');
await goat2llama(filePath);
t.is(await readFile(filePath, 'utf8'), 'I like llamas and frogs, but llamas the best');
});
A few things about the changes:
Through2 is not really needed anymore. It used to be a pain to set up passthrough or transform streams properly, but that is not the case anymore thanks to the simplified construction API.
You probably don't need graceful-fs, either. Unless you are doing a lot of concurrent disk I/O, EMFILE is not usually a problem, especially these days as Node has gotten smarter about file descriptors. But that library does help with temporary errors caused by antivirus software on Windows, if that is a problem for you.
You definitely do not need rimraf for this. You only need fs.rename(). It is similar to mv on the command line, with a few nuances that make it distinct, but the differences are not super important here. The point is there will be nothing at the temporary path after you rename the file that was there.
I used temp-write because it generates a secure random filepath for you and puts it in the OS temp directory (which automatically gets cleaned up now and then), plus it handles converting the stream to a Promise for you and takes care of some edge cases around errors. Disclosure: I wrote the streams implementation in temp-write. :)
Overall, this is a decent improvement. However, there remains the boundary problem discussed in the comments. Luckily, you are not the first person to encounter this problem! I wouldn't call the actual solution particularly elegant, certainly not if you implement it yourself. But replacestream is here to help you.
const fs = require('fs');
const util = require('util');
const tempWrite = require('temp-write');
const replaceStream = require('replacestream');
const rename = util.promisify(fs.rename);
const goat2llama = async (filePath) => {
const str = fs.createReadStream(filePath, 'utf8')
.pipe(replaceStream('goat', 'llama'));
const tempPath = await tempWrite(str);
await rename(tempPath, filePath);
};
Also...
I do not like temp files
Indeed, temp files are often bad. However, in this case, the temp file is managed by a well-designed library and stored in a secure, out-of-the-way location. There is virtually no chance of conflicting with other processes. And even if the rename() fails somehow, the file will be cleaned up by the OS.
That said, you can avoid temp files altogether by using fs.readFile() and fs.writeFile() instead of streaming. The former also makes text replacement much easier since you do not have to worry about chunk boundaries. You have to choose one approach or the other, however for very big files, streaming may be the only option, aside from manually chunking the file.
Streams are useless in this situation, because they return you chunks of file that can break the string that you're searching for. You could use streams, then merge all these chunks to get content, then replace the string that you need, but that will be longer code that will provoke just one question: why do you read file by chunks if you don't use them ?
The shortest way to achieve what you want is:
let fileContent = fs.readFileSync('file_name.html', 'utf8')
let replaced = fileContent.replace(/id="wrap"/g, 'dududud')
fs.writeFileSync('file_name.html', replaced)
All these functions are synchronous, so you don't have to promisify them

How can I create a txt file that holds the contents of an array in JavaScript?

I have several arrays that contain data that I would like to export, each array to a txt file, in order to be analyzed using MATLAB.
Let's say my array is:
var xPosition = [];
// some algorithm that adds content to xPosition
// TODO: export array into a txt file let's call it x_n.txt
It would be great to store each element of an array per line.
I have found a guide for the solution to my question in this post. The following code is what I ended up using:
var fs = require('fs');
var xPosition = [];
// some algorithm that adds content to xPosition
var file = fs.createWriteStream('./positions/x_n.txt');
file.on('error', function(err) { /* error handling */ });
xPosition.forEach(function(v) { file.write(v + '\n'); });
file.end();
The solution you found works, but here's how I'd have done it:
var fs = require('fs');
var xPosition = [1,2,3]; // Generate this
var fileName = './positions/x_n.txt';
fs.writeFileSync(fileName, xPosition.join('\n'));
This uses node's synchronous file writing capability, which is ideal for your purposes. You don't have to open or close file handles, etc. I'd use streams only if I had gigabytes of data to write out.

Pipe to stdout and writeable stream

I'm piping a file through a duplex string (courtesy of through) and I'm having trouble printing information to stdout and writing to the file. One or the other works just fine.
var fs = require('fs');
var path = require('path');
var through = require('through'); // easy duplexing, i'm young
catify = new through(function(data){
this.queue(data.toString().replace(/(woof)/gi, 'meow'));
});
var reader = fs.createReadStream('dogDiary.txt'); // woof woof etc.
var writer = fs.createWriteStream(path.normalize('generated/catDiary.txt')); // meow meow etc.
// yay!
reader.pipe(catify).pipe(writer)
// blank file. T_T
reader.pipe(catify).pipe(process.stdout).pipe(writer)
I'm assuming this is because process.stdout is a writeable stream, but I'm not sure how to do what I want (i've tried passing {end: false} to no avail).
Still struggling to wrap my head around streams, so forgive me if i've missed something obvious : )
I think what you want is:
reader.pipe(catify)
catify.pipe(writer)
catify.pipe(process.stdout)
These needed to be separated because pipes return their destinations and not their source.

Resources