There are some untar libraries, but I cannot get them working.
My idea would be something like
untar(bufferStreamOrFilePath).extractToDirectory("/path", function(err){})
Is something like this available?
Just an update on this answer: instead of node-tar, consider using tar-fs, which yields a significant performance boost as well as a neater interface.
var tar = require('tar-fs');
var fs = require('fs');

var tarFile = 'my-other-tarball.tar';
var target = './my-other-directory';

// extracting a tarball into a directory
fs.createReadStream(tarFile).pipe(tar.extract(target));
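For reference, tar-fs can also pack a directory into a tarball, along the lines of its README (a minimal sketch; the paths are placeholders):

// packing a directory into a tarball
tar.pack('./my-directory').pipe(fs.createWriteStream('my-tarball.tar'));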
The tar-stream module is a pretty good one:
var tar = require('tar-stream');
var fs = require('fs');

var extract = tar.extract();

extract.on('entry', function(header, stream, callback) {
  // make directories or files depending on the header here,
  // and call callback() when you're done with this entry
  stream.on('end', callback);
  // if you don't consume the entry yourself, drain it so parsing can continue:
  stream.resume();
});

extract.on('finish', function() {
  console.log('done!');
});

fs.createReadStream("something.tar").pipe(extract);
Here is a function to extract a base64-encoded tar fully in memory, assuming that all the files in the tar are UTF-8 encoded text files.
const tar = require('tar');
const Duplex = require('stream').Duplex;

function bufferToStream(buffer) {
  let stream = new Duplex();
  stream.push(buffer);
  stream.push(null);
  return stream;
}
module.exports = function(base64EncodedTar) {
  return new Promise(function(resolve, reject) {
    const buffer = Buffer.from(base64EncodedTar, "base64");
    let files = {};
    try {
      bufferToStream(buffer).pipe(new tar.Parse())
        .on('entry', entry => {
          let file = {
            path: entry.path,
            content: ""
          };
          files[entry.path] = file;
          entry.on('data', function(tarFileData) {
            file.content += tarFileData.toString('utf-8');
          });
        })
        .on('close', function() {
          resolve(files);
        });
    } catch (e) {
      reject(e);
    }
  });
};
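Usage could then look something like this (untarBase64.js is a hypothetical filename for the module above, and base64EncodedTar is assumed to already hold the encoded tarball):

const untarBase64 = require('./untarBase64');

untarBase64(base64EncodedTar)
  .then(files => {
    Object.keys(files).forEach(path => {
      console.log(path, '->', files[path].content.length, 'characters');
    });
  })
  .catch(console.error);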
Expanding on the tar-stream answer, which seemed to be the simplest to use in the browser with jest-tested code. Here is a comparison of the Node libraries, based on my attempts to use them in a project:
tar: Fairly easy to use on files, or on buffers as in Gudlaugur Egilsson's answer. It had some annoying webpack polyfill issues that I had trouble with when putting it into a React app.
js-untar: This was pretty annoying to set up for jest testing because it uses web workers and blob URLs, which jest does not directly support. I didn't get as far as making it work in the browser, though it may work fine there. In order to get jest tests working, I had to use jsdom-worker-fix, and it was very slow in that environment. (It may be faster in-browser.)
tar-stream combined with gunzip-maybe seems to be fairly performant in the browser and doesn't have any issues with being used in jest tests. It worked fine on the multi-hundred-megabyte tarballs I tried.
This code extracts a tar or tar.gz stream:
var tar = require("tar-stream");
const gunzip = require("gunzip-maybe");
exports.unTar = (tarballStream) => {
const results = {};
return new Promise((resolve, reject) => {
var extract = tar.extract();
extract.on("entry", async function (header, stream, next) {
const chunks = [];
for await (let chunk of stream) {
chunks.push(chunk);
}
results[header.name] = Buffer.concat(chunks);
next();
});
extract.on("finish", function () {
resolve(results);
});
tarballStream.pipe(gunzip()).pipe(extract);
});
};
Then, in the browser, you can use readable-web-to-node-stream to process a tarball fetched in the browser:
const { ReadableWebToNodeStream } = require("readable-web-to-node-stream");
const response = await fetch(url, headers);
const extracted = await unTar(new ReadableWebToNodeStream(response.body));
I need to upload a v8 heap dump into an AWS S3 bucket after it's generated, but the file that is uploaded is either 0KB or 256KB. The file on the server is over 70MB in size, so it appears that the request isn't waiting until the heap dump is completely flushed to disk. I'm guessing the readable stream that is piped into fs.createWriteStream runs asynchronously, and the await on the call to the function isn't actually waiting. I'm using v3 of the AWS Node.js SDK. What am I doing incorrectly?
Code
async function createHeapSnapshot(fileName) {
  const snapshotStream = v8.getHeapSnapshot();
  // It's important that the filename end with `.heapsnapshot`,
  // otherwise Chrome DevTools won't open it.
  const fileStream = fs.createWriteStream(fileName);
  snapshotStream.pipe(fileStream);
}

async function pushHeapSnapshotToS3(fileName) {
  const heapDump = fs.createReadStream(fileName);
  const s3Client = new S3Client();
  const putCommand = new PutObjectCommand({
    Bucket: "my-bucket",
    Key: `heapdumps/${fileName}`,
    Body: heapDump
  });
  return s3Client.send(putCommand);
}

app.get('/heapdump', asyncMiddleware(async (req, res) => {
  const currentDateTime = Date.now();
  const fileName = `${currentDateTime}.heapsnapshot`;
  await createHeapSnapshot(fileName);
  await pushHeapSnapshotToS3(fileName);
  res.send({
    heapdumpFileName: `${currentDateTime}.heapsnapshot`
  });
}));
Your guess is correct. createHeapSnapshot() returns a promise, but that promise has no connection at all to when the stream is done, so when the caller awaits it, the promise resolves long before the stream actually finishes. async functions have no magic in them that somehow knows when a non-promisified asynchronous operation like .pipe() is done. So, your async function returns a promise that is entirely disconnected from the stream's completion.
Since streams don't have very much native support for promises, you can manually promisify the completion and errors of the streams:
function createHeapSnapshot(fileName) {
  return new Promise((resolve, reject) => {
    const snapshotStream = v8.getHeapSnapshot();
    // It's important that the filename end with `.heapsnapshot`,
    // otherwise Chrome DevTools won't open it.
    const fileStream = fs.createWriteStream(fileName);
    fileStream.on('error', reject).on('finish', resolve);
    snapshotStream.on('error', reject);
    snapshotStream.pipe(fileStream);
  });
}
Alternatively, you could use the newer pipeline() function, which replaces .pipe(), supports promises out of the box (built-in promise support was added in Node.js v15), and monitors for errors so the promise is rejected on failure:
const { pipeline } = require('stream/promises');

function createHeapSnapshot(fileName) {
  const snapshotStream = v8.getHeapSnapshot();
  // It's important that the filename end with `.heapsnapshot`,
  // otherwise Chrome DevTools won't open it.
  return pipeline(snapshotStream, fs.createWriteStream(fileName));
}
I am developing an internet radio web application, and I want users to live stream audio files. My approach is to create a readable stream from a music file and write to the res object as below.
On the server:
const fs = require('fs');
const express = require('express');

function delay(millis) {
  return new Promise(function(resolve) {
    setTimeout(resolve, millis);
  });
}

(async function() {
  const app = express();
  const writables = [];
  const readableStream = fs.createReadStream('music.mp3');

  readableStream.on('data', async function(chunk) {
    await delay(2711); // arbitrary delay
    writables.forEach(function(writable) {
      writable.write(chunk);
    });
  });

  app.get('/stream', async function(req, res) {
    writables.push(res);
  });

  await app.listen(8080);
})();
On the browser:
<audio src="http://localhost:8080/stream" autoplay></audio>
When I tested it, it didn't work. I started digging into the problem and found that as long as the data sent to the browser doesn't include the first chunk, it won't play at all. Probably that's because the file chunks are not independent of each other.
How can I read the chunks from the file such that they are independently playable?
I am trying to download (meaning create a copy of the file on my server) a .pdf file from a server that returns it to me in binary format, with Content-Type: application/octet-stream.
After a bit of online research, I came up with this:
const http = require('http');
const url = require('url');
const fs = require('fs');

http.get(url.parse(pdfURL), res => {
  let data = [];
  console.log(res.statusCode);
  res.on('data', chunk => {
    data.push(chunk);
  }).on('end', () => {
    let buffer = Buffer.concat(data);
    console.log(buffer.toString('base64'));
    fs.open(path, 'w', (e, fd) => {
      if (e) throw e;
      fs.write(fd, buffer, 0, buffer.length, null, e => {
        if (e) throw e;
        fs.close(fd, () => console.log('Wrote successfully'));
      });
    });
  });
});
Everything seems to work properly, but when I try to open the generated PDF, it tells me that the file is corrupt and unreadable.
Any idea what might be wrong?
Thanks
Edit:
I noticed that with Postman everything works as it should, so I think the way I treat the binary data is wrong.
OK, I got it: I wasn't de-gzipping the response. It works properly now.
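For reference, this is roughly what that fix can look like (a minimal sketch, assuming the server compresses the body and sets Content-Encoding: gzip; pdfURL and path are the same variables as in the snippet above):

const http = require('http');
const zlib = require('zlib');
const fs = require('fs');

http.get(pdfURL, res => {
  const encoding = res.headers['content-encoding'] || '';
  // only gunzip when the server actually compressed the response
  const source = /gzip/.test(encoding) ? res.pipe(zlib.createGunzip()) : res;
  source.pipe(fs.createWriteStream(path))
    .on('finish', () => console.log('Wrote successfully'))
    .on('error', e => console.error(e));
});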
This didn't work for me; I tried so many different approaches until I found got, an npm library that handles HTTP requests. Here's what worked for me:
const stream = require('stream');
const { promisify } = require('util');
const fs = require('fs');
const got = require('got');
const pipeline = promisify(stream.pipeline);
async function downloadImage(url, name) {
  await pipeline(
    got.stream(url),
    fs.createWriteStream(name)
  );
}
More info here: https://bleext.com/post/downloading-images-with-nodejs
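Usage might then look like this (the URL and filename are just placeholders):

downloadImage('https://example.com/picture.jpg', 'picture.jpg')
  .then(() => console.log('Download finished'))
  .catch(console.error);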
I'm trying to save a million data points to a JSON file in my Node.js application. The main idea is to save a grid of 1000x1000 pixels as an array with the x,y position and a color id, so that each pixel has a coordinate and a color. So I have a function to generate the data, and using fs.writeFile() I can save it. Here is my actual code to generate an example:
//resetPos('test.json');
function resetPos(path) {
  let data = [];
  for (let y = 1; y <= 1000; y++) {
    data.push([]);
  }
  data.forEach(function(e, i) {
    for (let x = 1; x <= 1000; x++) {
      e.push({
        "x": x,
        "y": i,
        "color": "#cccccc"
      });
    }
  });
  fs.writeFile(path, JSON.stringify(data), function(err) {
    if (err) throw err;
  });
  console.log(data);
}
let global_data = fs.readFileSync("test.json");
console.log(global_data[0]);
When I read the file, it shows "91". I've tried using .toJSON() and .toString(), but it didn't go the way I want. I'm looking to get an x,y coordinate as data[y][x].
Basically, there are two ways to read a .json file in Node.js.
Your first option is to use the require function, which you normally use to import .js files, but you can use it for .json files as well:
const packageJson = require('./package.json');
The advantage of this approach is that you get back the contents of the .json file as a JavaScript object or array, already parsed. The disadvantage is that you cannot reload the file within the current process if something has changed, since require caches the contents of any loaded file and always gives you back the cached value.
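A minimal sketch of that caching behavior (config.json is a hypothetical file):

const first = require('./config.json');
// ...even if config.json is modified on disk at this point...
const second = require('./config.json');
console.log(first === second); // true: require returns the cached object, not the new contents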
Your second option is exactly the opposite: It allows you to reload things, but it requires you to parse the file on your own. For that, you use the fs module's readFile function:
const fs = require('fs');
fs.readFile('./package.json', { encoding: 'utf8' }, (err, data) => {
  const packageJson = JSON.parse(data);
  // ...
});
If you want to use async and await, you can use Node.js's util.promisify function to write this in a synchronous-looking style while keeping the code asynchronous:
const fs = require('fs'),
      { promisify } = require('util');

const readFile = promisify(fs.readFile);

(async () => {
  const data = await readFile('./package.json', { encoding: 'utf8' });
  const packageJson = JSON.parse(data);
  // ...
})();
Apart from that, there is also the fs.readFileSync function, which works synchronously, but you should stay away from it for performance reasons, since it blocks the event loop.
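For completeness, the synchronous variant would look like this:

const packageJson = JSON.parse(fs.readFileSync('./package.json', { encoding: 'utf8' }));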
For writing a JSON file, you use the writeFile function of the fs module, which works basically the same way as its readFile counterpart:
const fs = require('fs');

const packageJson = { /* ... */ };
const data = JSON.stringify(packageJson);

fs.writeFile('./package.json', data, { encoding: 'utf8' }, err => {
  // ...
});
Again, you can use the util.promisify function as mentioned above to make things work with async and await, or you could use the synchronous fs.writeFileSync function (which, again, I would not recommend).
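A minimal sketch of that promisified variant (reusing the packageJson object from above):

const fs = require('fs'),
      { promisify } = require('util');

const writeFile = promisify(fs.writeFile);

(async () => {
  const data = JSON.stringify(packageJson);
  await writeFile('./package.json', data, { encoding: 'utf8' });
})();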
You can generally read in a JSON file with Node's require function.
Example: const myJSON = require('./file.json'). myJSON will then hold the JSON data from file.json as a normal JavaScript object.
You can write a JSON file yourself, using:
const fs = require('fs');
fs.writeFile('file.json', json, 'utf8', callback);
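Here, json is expected to already be a string; a minimal sketch of a complete call (reusing the myJSON object from above, with a hypothetical callback):

const fs = require('fs');

const json = JSON.stringify(myJSON, null, 2); // pretty-print with 2-space indentation
fs.writeFile('file.json', json, 'utf8', (err) => {
  if (err) throw err;
  console.log('file.json written');
});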
I need to build a function for processing large CSV files for use in a bluebird.map() call. Given the potential sizes of the file, I'd like to use streaming.
This function should accept a stream (a CSV file) and a function (that processes the chunks from the stream), and return a promise that resolves when the file has been read to the end, or rejects if it errors.
So, I start with:
'use strict';
var _ = require('lodash');
var promise = require('bluebird');
var csv = require('csv');
var stream = require('stream');
var pgp = require('pg-promise')({promiseLib: promise});
api.parsers.processCsvStream = function(passedStream, processor) {
  var parser = csv.parse(passedStream, {trim: true});
  passedStream.pipe(parser);

  // use readable or data event?
  parser.on('readable', function() {
    // call processor, which may be async
    // how do I throttle the amount of promises generated
  });

  var db = pgp(api.config.mailroom.fileMakerDbConfig);

  return new Promise(function(resolve, reject) {
    parser.on('end', resolve);
    parser.on('error', reject);
  });
};
Now, I have two inter-related issues:
I need to throttle the actual amount of data being processed, so as not to create memory pressure.
The function passed as the processor param is often going to be async, such as saving the contents of the file to the db via a library that is promise-based (right now: pg-promise). As such, it will create a promise in memory and move on, repeatedly.
The pg-promise library has functions to manage this, like page(), but I'm not able to wrap my head around how to mix stream event handlers with these promise methods. Right now, I return a promise in the handler for the readable event after each read(), which means I create a huge number of promised database operations and eventually fault out because I hit a process memory limit.
Does anyone have a working example of this that I can use as a jumping-off point?
UPDATE: Probably more than one way to skin the cat, but this works:
'use strict';

var _ = require('lodash');
var promise = require('bluebird');
var csv = require('csv');
var stream = require('stream');
var pgp = require('pg-promise')({promiseLib: promise});

api.parsers.processCsvStream = function(passedStream, processor) {
  // some checks trimmed out for example
  var db = pgp(api.config.mailroom.fileMakerDbConfig);
  var parser = csv.parse(passedStream, {trim: true});
  passedStream.pipe(parser);

  var readDataFromStream = function(index, data, delay) {
    var records = [];
    var record;
    do {
      record = parser.read();
      if (record != null)
        records.push(record);
    } while (record != null && (records.length < api.config.mailroom.fileParserConcurrency));
    parser.pause();
    if (records.length)
      return records;
  };

  var processData = function(index, data, delay) {
    console.log('processData(' + index + ') > data: ', data);
    parser.resume();
  };

  parser.on('readable', function() {
    db.task(function(tsk) {
      this.page(readDataFromStream, processData);
    });
  });

  return new Promise(function(resolve, reject) {
    parser.on('end', resolve);
    parser.on('error', reject);
  });
};
Does anyone see a potential problem with this approach?
You might want to look at promise-streams:
var ps = require('promise-streams');

passedStream
  .pipe(csv.parse({trim: true}))
  .pipe(ps.map({concurrent: 4}, row => processRowDataWhichMightBeAsyncAndReturnPromise(row)))
  .wait().then(_ => {
    console.log("All done!");
  });
Works with backpressure and everything.
Below is a complete application that correctly executes the same kind of task you want: it reads a file as a stream, parses it as CSV, and inserts each row into the database.
const fs = require('fs');
const promise = require('bluebird');
const csv = require('csv-parse');
const pgp = require('pg-promise')({promiseLib: promise});

const cn = "postgres://postgres:password@localhost:5432/test_db";
const rs = fs.createReadStream('primes.csv');

const db = pgp(cn);

function receiver(_, data) {
  function source(index) {
    if (index < data.length) {
      // here we insert just the first column value that contains a prime number;
      return this.none('insert into primes values($1)', data[index][0]);
    }
  }
  return this.sequence(source);
}

db.task(t => {
  return pgp.spex.stream.read.call(t, rs.pipe(csv()), receiver);
})
  .then(data => {
    console.log('DATA:', data);
  })
  .catch(error => {
    console.log('ERROR:', error);
  });
Note what I changed: I used the csv-parse library instead of csv, as a better alternative, and I added use of the stream.read method from the spex library, which properly serves a Readable stream for use with promises.
I found a slightly better way of doing the same thing, with more control. This is a minimal skeleton with precise parallelism control. With a parallel value of one, all records are processed in sequence without holding the entire file in memory; we can increase the parallel value for faster processing.
const csv = require('csv');
const csvParser = require('csv-parser');
const fs = require('fs');

const readStream = fs.createReadStream('IN');
const writeStream = fs.createWriteStream('OUT');

const transform = csv.transform({ parallel: 1 }, (record, done) => {
  asyncTask(...) // return Promise
    .then(result => {
      // ... do something on success
      return done(null, record);
    }, (err) => {
      // ... do something on error
      return done(null, record);
    });
});

readStream
  .pipe(csvParser())
  .pipe(transform)
  .pipe(csv.stringify())
  .pipe(writeStream);
This allows doing an async task for each record.
To return a promise instead, we can create an empty promise and complete it when the stream finishes.
.on('end', function() {
  // do something with csvData
  console.log(csvData);
});
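For example, a minimal sketch (reusing the readStream/writeStream pipeline from above) that resolves once the write stream has finished:

function runPipeline() {
  return new Promise((resolve, reject) => {
    readStream
      .pipe(csvParser())
      .pipe(transform)
      .pipe(csv.stringify())
      .pipe(writeStream)
      .on('finish', resolve)
      .on('error', reject);
  });
}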
So, put differently, you don't want streaming but rather some kind of data chunks? ;-)
Do you know https://github.com/substack/stream-handbook?
I think the simplest approach without changing your architecture would be some kind of promise pool. e.g. https://github.com/timdp/es6-promise-pool
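For illustration, a minimal sketch of what es6-promise-pool usage could look like here (rows and processRow are hypothetical; the rows would already be parsed, i.e. this is chunked rather than streamed processing):

const PromisePool = require('es6-promise-pool');

function makeProducer(rows, processRow) {
  let index = 0;
  return function () {
    if (index >= rows.length) {
      return null; // tells the pool there is no more work
    }
    return processRow(rows[index++]); // must return a promise
  };
}

// process at most 4 rows concurrently
const pool = new PromisePool(makeProducer(rows, processRow), 4);
pool.start().then(() => console.log('All rows processed'));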