I have legacy code running on Node version 0.12.7 which works fine, but it frequently fails with an EMFILE "too many open files" error.
How can I release the file descriptor opened by:
require("fs").readFile(resobj.name, 'utf8', function (err, data)
{
});
You will most likely need to read the files in batches, like this:
const fs = require('fs/promises');
const files = [...<array of millions of file paths>];
const MAX_FILES_TO_PROCESS = 1000;
let promises;
let contents;
// Process 1000 files at a time
(async () => {
for (let a = 0; a < files.length; a += MAX_FILES_TO_PROCESS) {
promises = files.slice(a, a + MAX_FILES_TO_PROCESS).map(path => fs.readFile(path));
contents = await Promise.all(promises);
//Process the contents, then continue on the next loop
}
})();
Two observations that may help you:
Use createReadStream instead of readFile, because the latter reads the entire file into memory; if you're handling thousands or millions of objects, that's not scalable (a short sketch follows below).
readFile doesn't return a descriptor because it closes the file automatically.
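As a minimal sketch of that first observation (reusing resobj.name from the question), a read stream processes the file in chunks and releases its descriptor when it finishes or errors:
const fs = require("fs");
const stream = fs.createReadStream(resobj.name, { encoding: "utf8" });
stream.on("data", (chunk) => {
  // process each chunk as it arrives instead of holding the whole file in memory
});
stream.on("end", () => {
  // the underlying descriptor is closed automatically here (autoClose defaults to true)
});
stream.on("error", (err) => {
  // on error the stream is destroyed and the descriptor is released as well
  console.error(err);
});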
I'm running into AggregateError: EMFILE: too many open files while streaming multiple files.
Machine Details:
macOS Monterey,
MacBook Pro (14-inch, 2021),
Chip Apple M1 Pro,
Memory 16GB,
Node v16.13.0
I've tried increasing the limits with no luck.
Ideally I would like to be able to cap the number of files open at one time, or to resolve this by closing each file as soon as it has been used.
Code below. I've tried to remove the unrelated code and replace it with '//...'.
const MultiStream = require('multistream');
const fs = require('fs-extra'); // Also tried graceful-fs and the standard fs
const { fdir } = require("fdir");
// Also have a require for the bz2 and split2 functions but editing from phone right now
//...
let files = [];
//...
(async() => {
const crawler = await new fdir()
.filter((path, isDirectory) => path.endsWith(".bz2"))
.withFullPaths()
.crawl("Dir/Sub Dir")
.withPromise();
for(const file of crawler){
files = [...files, fs.createReadStream(file)]
}
multi = await new MultiStream(files)
// Unzip
.pipe(bz2())
// Create chunks from lines
.pipe(split2())
.on('data', function (obj) {
// Code to filter data and extract what I need
//...
})
.on("error", function(error) {
// Handling parsing errors
//...
})
.on('end', function(error) {
// Output results
//...
})
})();
To avoid pre-opening a file handle for every single file in your array, you want to open each file on demand, only when it is that file's turn to be streamed. You can do that with multi-stream.
Per the multi-stream doc, you can lazily create the readStreams by changing this:
for(const file of crawler){
files = [...files, fs.createReadStream(file)]
}
to this:
let files = crawler.map((f) => {
return function() {
return fs.createReadStream(f);
}
});
After reading over the npm page for multistream, I think I have found something that will help. I have also edited where you add the streams to the files array, since there's no need to instantiate a new array and spread the existing elements each time.
To lazily create the streams, wrap them in a function:
var streams = [
fs.createReadStream(__dirname + '/numbers/1.txt'),
function () { // will be executed when the stream is active
return fs.createReadStream(__dirname + '/numbers/2.txt')
},
function () { // same
return fs.createReadStream(__dirname + '/numbers/3.txt')
}
]
new MultiStream(streams).pipe(process.stdout) // => 123
With that, we can update your logic by wrapping the read streams in functions so that the streams are not created until they are needed, which prevents you from having too many open at once. We can do this by updating your file loop:
for(const file of crawler){
files.push(function() {
return fs.createReadStream(file)
})
}
I have two functions.
The first function reads all the files in a folder and writes their data to a new file.
The second function takes that new file (output of function 1) as input and creates another file. Therefore it has to wait until the write stream of function 1 has finished.
const fs = require('fs');
const path = require('path');
async function f1(inputDir, outputFile) {
let stream = fs.createWriteStream(outputFile, {flags:'a'}); // new data should be appended to outputFile piece by piece (hence flag a)
let files = await fs.promises.readdir(inputDir);
for(let file of files) {
let pathOfCurrentFile = path.join(inputDir, file);
let stat = fs.statSync(pathOfCurrentFile);
if(stat.isFile()) {
let data = fs.readFileSync(pathOfCurrentFile, 'utf8');
// now the data is being modified for output
let result = data + 'other stuff';
stream.write(result);
}
}
stream.end();
}
function f2(inputFile, outputFile) {
let newData = doStuffWithMy(inputFile);
let stream = fs.createWriteStream(outputFile);
stream.write(newData);
stream.end();
}
f1('myFiles', 'myNewFile.txt');
f2('myNewFile.txt', 'myNewestFile.txt');
Here's what happens:
'myNewFile.txt' (output of f1) is created correctly
'myNewestFile.txt' is created but is either empty or only contains one or two words (it should contain a long text)
When I use a timeout before executing f2, it works fine, but I can't use a timeout because there can be thousands of input files in the inputDir, so I need a way to do it dynamically.
I've experimented with async/await, callbacks, promises, etc., but that stuff seems to be a little too advanced for me; I couldn't get it to work.
Is there anything else I can try?
Since you asked about a synchronous version, here's what that could look like. This should only be used in a single user script or in startup code, not in a running server. A server should only use asynchronous file I/O.
// synchronous version
function f1(inputDir, outputFile) {
let outputHandle = fs.openSync(outputFile, "a");
try {
let files = fs.readdirSync(inputDir, {withFileTypes: true});
for (let f of files) {
if (f.isFile()) {
let pathOfCurrentFile = path.join(inputDir, f.name);
let data = fs.readFileSync(pathOfCurrentFile, 'utf8');
fs.writeSync(outputHandle, data);
}
}
} finally {
fs.closeSync(outputHandle);
}
}
function f2(inputFile, outputFile) {
let newData = doStuffWithMy(inputFile);
fs.writeFileSync(outputFile, newData);
}
f1('myFiles', 'myNewFile.txt');
f2('myNewFile.txt', 'myNewestFile.txt');
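Since the question mentioned struggling with async/await, here is a rough sketch of what the asynchronous route could look like (not a drop-in solution): f1 resolves only after its write stream has emitted 'finish', so awaiting it before calling f2 removes the need for a timeout. doStuffWithMy is the question's own placeholder.
// asynchronous version (sketch)
async function f1(inputDir, outputFile) {
  const stream = fs.createWriteStream(outputFile, { flags: 'a' });
  const files = await fs.promises.readdir(inputDir, { withFileTypes: true });
  for (const f of files) {
    if (f.isFile()) {
      const data = await fs.promises.readFile(path.join(inputDir, f.name), 'utf8');
      stream.write(data + 'other stuff');
    }
  }
  // resolve only once everything has been flushed to outputFile
  await new Promise((resolve, reject) => {
    stream.on('finish', resolve).on('error', reject);
    stream.end();
  });
}
async function f2(inputFile, outputFile) {
  const newData = doStuffWithMy(inputFile); // placeholder from the question
  await fs.promises.writeFile(outputFile, newData);
}
(async () => {
  await f1('myFiles', 'myNewFile.txt');
  await f2('myNewFile.txt', 'myNewestFile.txt');
})();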
I have a large XML file (2GB) and I need to add a new line if a criterion is met. Example:
<chickens>
<chicken>
<name>sam</name>
<female>false</female>
</chicken>
<chicken>
<name>julia</name>
<female>true</female>
</chicken>
// many many more chickens
</chickens>
to:
<chickens>
<chicken>
<name>sam</name>
<female>false</female>
</chicken>
<chicken>
<name>julia</name>
<female>true</female>
<canLayEggs>true</canLayEggs> // <- Add this line if female is true;
</chicken>
// many many more chickens
</chickens>
However, the issue that I'm facing is that sometimes the chunk gets cut off like <female>true
and then the next chunk starts with </female>
Here is my code:
const fs = require("fs");
const input = "input.xml";
const MAX_CHUNK_SIZE = 50 * 1024 * 1024; //50 MB
const buffer = Buffer.alloc(MAX_CHUNK_SIZE);
let readStream = fs.createReadStream(input, "utf8", {
highWaterMark: MAX_CHUNK_SIZE,
});
let writeStream = fs.createWriteStream("output.xml");
readStream.on("data", (chunk) => {
let data = chunk;
if (data.includes("<category>f</category>")) {
data = data.replace(
/<female>true<\/female>/g,
"<female>true</female><canLayEggs>true</canLayEggs>"
);
}
writeStream.write(data, "utf-8");
});
readStream.on("end", () => {
writeStream.end();
})
I have tried Google but I can't seem to find the right term, and many tutorials out there don't really cover this. Any help is appreciated.
You are reading 50 MB per chunk, so in the 'data' callback you can call:
readStream.destroy();
Also, you don't need to allocate the 50 MB buffer: it isn't used anywhere, and after the text replacement the data is likely longer than 50 MB anyway.
It is good that you close the writeStream when the readStream ends.
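As a separate, rough sketch of one common way to deal with the chunk-boundary problem from the question: hold back the incomplete tail of each chunk and prepend it to the next one, so the regex never sees a split tag. This cuts at the last complete </chicken> and assumes the only transformation is the replacement from the question.
const fs = require("fs");
const readStream = fs.createReadStream("input.xml", { encoding: "utf8" });
const writeStream = fs.createWriteStream("output.xml");
let tail = ""; // incomplete trailing piece carried over from the previous chunk
readStream.on("data", (chunk) => {
  const data = tail + chunk;
  // only transform up to the last complete </chicken>; keep the rest for the next chunk
  const cut = data.lastIndexOf("</chicken>");
  if (cut === -1) {
    tail = data;
    return;
  }
  const safe = data.slice(0, cut + "</chicken>".length);
  tail = data.slice(cut + "</chicken>".length);
  writeStream.write(
    safe.replace(/<female>true<\/female>/g, "<female>true</female><canLayEggs>true</canLayEggs>"),
    "utf-8"
  );
});
readStream.on("end", () => {
  writeStream.write(tail, "utf-8"); // flush whatever is left (e.g. the closing </chickens>)
  writeStream.end();
});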
The files (a CSV and a zip) are submitted to my server, and I'm trying to determine whether the CSV is valid and whether all the images it references are present in the zip. I have to populate a Mongo database with all that information, but I want to do that in the background and send a response to the client as fast as possible.
So I have two readable streams and I have three different approaches:
Unzipping the file takes 24 seconds, so unzip + parsing the CSV + fs.exists is not an option.
Parsing the whole CSV, saving the filenames in an array, and then reading the zip using node-unzip and pipe takes 5 seconds.
Reading the CSV and the zip in parallel, using a shared array to determine simultaneously whether the files are present, is the fastest option at 4 seconds.
Does anyone have an idea of how to do it faster?
EDIT: The code used for validation is:
// lib/validator.js
function validateParallel(csv, zip) {
const shared = {};
return new Promise((resolve, reject) => {
const l = CSV_VALIDATORS.length - 1;
csv
.pipe(split())
.pipe(through2(validateLine.bind({ zip, reject, n: 0, l, shared })))
.on('finish', () => {
zip
.pipe(unzip.Parse())
.on('entry', (entry) => {
delete shared[entry.path];
})
.on('close', () => {
resolve(Object.keys(shared).length === 0);
});
});
});
}
// perfomance/validate.spec.js
const zip = fs.createReadStream('./performance/imports/import.zip');
const csv = fs.createReadStream('./performance/imports/stress-test.csv');
const hrstart = process.hrtime();
validator
.validateParallel(csv, zip)
.then(function(isValid) {
console.log(`valid=${isValid}`);
const hrend = process.hrtime(hrstart);
console.info("Execution time (hr): %ds %dms", hrend[0], hrend[1]/1000000);
});
validateLine takes the image name and adds it to the shared object. The output is:
valid=true
Execution time (hr): 4s 926.031869ms
I have simplified the code and removed error management to make it more readable.
Do you also have to validate the images themselves, or just make sure the paths referenced in the CSV are present in the zip? If the latter, you can run a shell process that executes unzip -l on the zip file, which prints only the file listing and should be quick.
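A rough sketch of that suggestion, assuming the unzip binary is installed on the server (-Z1 puts unzip into zipinfo mode and prints one entry name per line, which is easier to parse than the -l table; csvFilenames is a hypothetical array collected while parsing the CSV):
const { execFile } = require("child_process");
// list the entry names in the archive without extracting anything
function listZipEntries(zipPath) {
  return new Promise((resolve, reject) => {
    execFile("unzip", ["-Z1", zipPath], { maxBuffer: 64 * 1024 * 1024 }, (err, stdout) => {
      if (err) return reject(err);
      resolve(new Set(stdout.split("\n").filter(Boolean)));
    });
  });
}
// usage sketch: csvFilenames would be the image names collected while parsing the CSV
// listZipEntries("./performance/imports/import.zip")
//   .then((entries) => console.log(csvFilenames.every((name) => entries.has(name))));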
My English is not so good, so I hope to be clear.
Just for the sake of curiosity, I want to test multiple concurrent(*) writes to the same file and see them produce errors.
The manual is clear on that:
Note that it is unsafe to use fs.writeFile multiple times on the same
file without waiting for the callback. For this scenario,
fs.createWriteStream is strongly recommended.
So if I write a relatively large amount of data to a file and, while it is still being written, I call another write on the same file without waiting for the callback, I expect some sort of error.
I tried to write a small example to test this situation, but I can't manage to see any errors.
"use strict";
const fs = require('fs');
const writeToFile = (filename, data) => new Promise((resolve, reject) => {
fs.writeFile(filename, data, { flag: 'a' }, err => {
if (err) {
return reject(err);
}
return resolve();
});
});
const getChars = (num, char) => {
let result = '';
for (let i = 0; i < num; ++i) {
result += char;
}
return result + '\n';
};
const k = 10000000;
const data0 = getChars(k, 0);
const data1 = getChars(k, 1);
writeToFile('test1', data0)
.then(() => console.log('0 written'))
.catch(e => console.log('errors in write 0'));
writeToFile('test1', data1)
.then(() => console.log('1 written'))
.catch(e => console.log('errors in write 1'));
To test it, instead of opening the file with an editor (which is a bit slow on my machine), I use a Linux command to check the end of the first buffer and the beginning of the second buffer (and that they do not overlap):
tail -c 10000010 test1 | grep 0
But I'm not sure it is the right way to test it.
Just to be clear, I'm on Node v6.2.2 and macOS 10.11.6.
Can anyone point me to a small sketch that uses fs.writeFile and produces wrong output?
(*) concurrent = don't wait for the end of one file write to begin the next one