Get file through HTTP request in chunks, then join them

I am using the Range header to download an mp4 file in parts, like so:
const chunkFile = fs.createWriteStream('5a52e9ba-9328-11ec-b909-0242ac120002.mp4.chunk');
const downloadRequest = https.get({
  ...
  headers: { Range: 'bytes=0-10000' }
}, response => {
  ...
  response.pipe(chunkFile);
});
I do this in a loop for all ranges and end up with a bunch of chunk files in a directory (I have simplified it for the sake of the question). Then I join the chunks back into one file like so:
function joinChunks({chunkHashes, fileName}) {
  const outputFile = fs.createWriteStream(fileName);
  for (let i = 0, {length} = chunkHashes; i < length; i++) {
    const chunkData = fs.readFileSync(chunkHashes[i]);
    outputFile.write(chunkData);
    fs.unlinkSync(chunkHashes[i]);
  }
}
Was I naive to think it would work like that? The resulting file is the same size, but it's broken. Is there a way to store the raw data from the HTTP responses in chunk files on disk, and then join that data to end up with the original file?
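For reference, here is a minimal sketch of one way this could work (my own rough take, with made-up helper names downloadRange and downloadInChunks, assuming the server supports range requests and answers with 206): download each range sequentially and append it straight to a single output file, so no separate chunk files are needed. One detail that often breaks joined files: HTTP ranges are inclusive, so bytes=0-10000 and bytes=10000-20000 overlap by one byte.

const https = require('https');
const fs = require('fs');

// Download one inclusive byte range and append it to an already-open stream.
function downloadRange(url, start, end, outputStream) {
  return new Promise((resolve, reject) => {
    https.get(url, { headers: { Range: `bytes=${start}-${end}` } }, response => {
      if (response.statusCode !== 206) {
        reject(new Error(`expected 206 Partial Content, got ${response.statusCode}`));
        return;
      }
      response.pipe(outputStream, { end: false }); // keep the file open for the next range
      response.on('end', resolve);
      response.on('error', reject);
    }).on('error', reject);
  });
}

async function downloadInChunks(url, totalSize, chunkSize, fileName) {
  const outputFile = fs.createWriteStream(fileName);
  // Ranges are inclusive: bytes=0-9999 is 10000 bytes, so the next
  // range must start at 10000, not 9999.
  for (let start = 0; start < totalSize; start += chunkSize) {
    const end = Math.min(start + chunkSize - 1, totalSize - 1);
    await downloadRange(url, start, end, outputFile);
  }
  outputFile.end();
}

Downloading sequentially keeps the bytes in order by construction; chunk files joined later can also work, but only if every download has fully finished before the join starts.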

Related

How to loop an array of links and get json response in Node.js

I'm accessing a URL that gives me JSON (https://collections.louvre.fr/ark:/53355/cl010270063.json).
The problem is that I have an array of 448 thousand links to access, and getting them one by one will take weeks.
I've tried this:
for (let i = 0; i < artLinks.length; i += chunkSize) {
  const chunk = artLinks.slice(i, i + chunkSize);
  let resArray = [];
  // Collect our responses in the resArray variable as an array
  Promise.all(chunk.map((endpoint) => axios.get(`https://collections.louvre.fr/en/ark:/${endpoint}.json`))).then((res) =>
    resArray.push(res.data));
  const filePath = path.join(__dirname, `../../json/artInfo-${i}.json`);
  await fs.writeFileSync(filePath, JSON.stringify(resArray));
  bar.tick();
}
So I could write a file with the information every 100 responses or so. But this creates an empty file, and the loop fires all the requests at once instead of waiting for the responses before proceeding to the next iteration.
How can I loop over an enormous array of links and get their responses in a non-dumb way?
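A minimal sketch of how the loop could actually wait (assuming the surrounding function is async and artLinks is defined as in the question): await the Promise.all for each batch, then unwrap each response's data before writing the file.

const axios = require('axios');
const fs = require('fs');
const path = require('path');

async function fetchAll(artLinks, chunkSize) {
  for (let i = 0; i < artLinks.length; i += chunkSize) {
    const chunk = artLinks.slice(i, i + chunkSize);
    // Wait for the whole batch before moving on to the next iteration.
    const responses = await Promise.all(
      chunk.map(endpoint =>
        axios.get(`https://collections.louvre.fr/en/ark:/${endpoint}.json`)
      )
    );
    const resArray = responses.map(res => res.data); // unwrap each axios response
    const filePath = path.join(__dirname, `../../json/artInfo-${i}.json`);
    fs.writeFileSync(filePath, JSON.stringify(resArray)); // sync call, no await needed
  }
}

The original code had two separate issues: Promise.all was never awaited (so the file was written before any response arrived), and .data was read off the array returned by Promise.all rather than off each individual response.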

Using stream-combiner and Writable Streams (stream-adventure)

I'm working on nodeschool.io's stream-adventure. The challenge:
Write a module that returns a readable/writable stream using the
stream-combiner module. You can use this code to start with:
var combine = require('stream-combiner')

module.exports = function () {
  return combine(
    // read newline-separated json,
    // group books into genres,
    // then gzip the output
  )
}
Your stream will be written a newline-separated JSON list of science fiction
genres and books. All the books after a "type":"genre" row belong in that
genre until the next "type":"genre" comes along in the output.
{"type":"genre","name":"cyberpunk"}
{"type":"book","name":"Neuromancer"}
{"type":"book","name":"Snow Crash"}
{"type":"genre","name":"space opera"}
{"type":"book","name":"A Deepness in the Sky"}
{"type":"book","name":"Void"}
Your program should generate a newline-separated list of JSON lines of genres,
each with a "books" array containing all the books in that genre. The input
above would yield the output:
{"name":"cyberpunk","books":["Neuromancer","Snow Crash"]}
{"name":"space opera","books":["A Deepness in the Sky","Void"]}
Your stream should take this list of JSON lines and gzip it with
zlib.createGzip().
HINTS
The stream-combiner module creates a pipeline from a list of streams,
returning a single stream that exposes the first stream as the writable side and
the last stream as the readable side like the duplexer module, but with an
arbitrary number of streams in between. Unlike the duplexer module, each
stream is piped to the next. For example:
var combine = require('stream-combiner');
var stream = combine(a, b, c, d);
will internally do a.pipe(b).pipe(c).pipe(d) but the stream returned by
combine() has its writable side hooked into a and its readable side hooked
into d.
As in the previous LINES adventure, the split module is very handy here. You
can put a split stream directly into the stream-combiner pipeline.
Note that split can send empty lines too.
If you end up using split and stream-combiner, make sure to install them
into the directory where your solution file resides by doing:
`npm install stream-combiner split`
Note: when you test the program, the source stream is automatically inserted into the program, so it's perfectly fine to have split() as the first parameter in combine(split(), etc., etc.)
I'm trying to solve this challenge without using the 'through' package.
My code:
var combiner = require('stream-combiner');
var stream = require('stream');
var split = require('split');
var zlib = require('zlib');

module.exports = function() {
  var ws = new stream.Writable({decodeStrings: false});

  function ResultObj() {
    name: '';
    books: [];
  }

  ws._write = function(chunk, enc, next) {
    if (chunk.length === 0) {
      next();
    }
    chunk = JSON.parse(chunk);
    if (chunk.type === 'genre') {
      if (currentResult) {
        this.push(JSON.stringify(currentResult) + '\n');
      }
      var currentResult = new ResultObj();
      currentResult.name = chunk.name;
    } else {
      currentResult.books.push(chunk.name);
    }
    next();
    var wsObj = this;
    ws.end = function(d) {
      wsObj.push(JSON.stringify(currentResult) + '\n');
    }
  }

  return combiner(split(), ws, zlib.createGzip());
}
My code does not work and returns 'Cannot pipe. Not readable'. Can someone point out where I'm going wrong?
Any other comments on how to improve are welcome too...
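A minimal sketch of one possible approach without 'through' (my own rough take, not an official solution): use a stream.Transform, which has both a writable and a readable side, so stream-combiner can pipe out of it. A plain stream.Writable has no readable side, which is exactly what 'Cannot pipe. Not readable' is complaining about; it also has no push() method and no hook that runs at end-of-input, whereas Transform gets _flush for the final genre.

var combiner = require('stream-combiner');
var stream = require('stream');
var split = require('split');
var zlib = require('zlib');

module.exports = function() {
  var currentGenre = null;
  var groupBooks = new stream.Transform();

  groupBooks._transform = function(line, enc, next) {
    if (line.length === 0) return next(); // split can emit empty lines
    var row = JSON.parse(line.toString());
    if (row.type === 'genre') {
      if (currentGenre) this.push(JSON.stringify(currentGenre) + '\n');
      currentGenre = { name: row.name, books: [] };
    } else {
      currentGenre.books.push(row.name);
    }
    next();
  };

  groupBooks._flush = function(done) {
    // runs once, after the last line has been written: emit the final genre
    if (currentGenre) this.push(JSON.stringify(currentGenre) + '\n');
    done();
  };

  return combiner(split(), groupBooks, zlib.createGzip());
};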

wait for previous stream to be empty before allowing reading

Say I have a file that contains a list of integers, one per line. I use fs.createReadStream and pipe that into split (so that each chunk is an integer). Then I pipe that into a duplex stream that is supposed to add the numbers and write the sum by piping into fs.createWriteStream.
var fs = require('fs');
var stream = require('stream');
var split = require('split');

var addIntegers = new stream.Duplex();
addIntegers.sum = 0;

addIntegers._read = function(size) {
  this.push(this.sum + '\n');
}

addIntegers._write = function(chunk, encoding, done) {
  this.sum += +chunk;
  done();
}

fs.createReadStream('list-of-integers.txt')
  .pipe(split())
  .pipe(addIntegers)
  .pipe(fs.createWriteStream('sum.txt'));
When I run this, sum.txt just gets continually filled with zeroes and the program never terminates (as expected). How do I wait for the input stream (split) to be empty before allowing the output stream (fs.createWriteStream) to read from addIntegers?
I figured it out.
I decided to use a Transform stream instead (thanks mscdex) because it has a method (_flush) that gets called after all written data has been consumed. (With the Duplex version, _read is called whenever the readable side wants data, regardless of whether the writes have finished, so it kept pushing the still-zero sum over and over.) The working code is below. Don't forget to npm i split :)
var fs = require('fs');
var stream = require('stream');
var split = require('split');

var addIntegers = new stream.Transform();
addIntegers.sum = 0;

addIntegers._transform = function(chunk, encoding, done) {
  this.sum += +chunk;
  done();
}

addIntegers._flush = function(done) {
  this.push(this.sum + '\n');
}

fs.createReadStream('list-of-integers.txt')
  .pipe(split())
  .pipe(addIntegers)
  .pipe(fs.createWriteStream('sum.txt'));

Reasons why a node stream read() function might return null?

I am building a node application that reads a CSV file from the file system, analyzes the file, and then parses the file using the csv-parse module. Each of these steps is a stream, piped one into the next.
The trouble I am having is that for some files, the parse step can read the stream, but for others, the read() method returns null on the readable event and I don't know why.
In the code below, specifically, I will sometimes see data come through on calling read() on the parser stream, and other times it will return null. CSV files that succeed always succeed, and CSV files that fail always fail. I tried to determine some difference between the files, but other than using different field names in the first row, and slightly different data in the body, I can't see any significant difference between the source files.
What are some reasons that a node stream's read() function might return null after a readable event?
Sample code:
var parse = require('csv-parse');
var fs = require('fs');
var stream = require('stream');

var byteCounter = new stream.Transform({objectMode: true});
byteCounter.setEncoding('utf8');
var totalBytes = 0;

// count the bytes we have read
byteCounter._transform = function(chunk, encoding, done) {
  var data = chunk.toString();
  if (this._lastLineData) {
    data = this._lastLineData + data;
  }
  var lines = data.split('\n');
  // this is because each chunk will probably not end precisely at the end of a line:
  this._lastLineData = lines.splice(lines.length - 1, 1)[0];
  lines.forEach(function(line) {
    totalBytes += line.length + 1; // we add an extra byte for the end-of-line
    this.push(line);
  }, this);
  done();
};

byteCounter._flush = function(done) {
  if (this._lastLineData) {
    this.push(this._lastLineData);
  }
  this._lastLineData = null;
  done();
};

// csv parser
var parser = parse({
  delimiter: ",",
  comment: "#",
  skip_empty_lines: true,
  auto_parse: true,
  columns: true
});

parser.on('readable', function() {
  var row;
  while (null !== (row = parser.read())) {
    // do stuff
  }
});

// start by reading a file, piping to byteCounter, then pipe to parser.
var myPath = "/path/to/file.csv";
var options = {
  encoding: "utf-8"
};
fs.createReadStream(myPath, options).pipe(byteCounter).pipe(parser);
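One thing worth checking (a debugging suggestion, not a diagnosis of these particular files): read() returning null simply means the stream's internal buffer is empty at that moment. If csv-parse hits malformed input, it emits 'error' and never buffers any rows, which from inside the 'readable' handler looks identical to an empty stream. Attaching 'error' and 'end' listeners would show which case is occurring:

parser.on('error', function(err) {
  // a malformed file surfaces here rather than through read()
  console.error('parse error:', err.message);
});
parser.on('end', function() {
  console.log('parsing finished, total bytes counted:', totalBytes);
});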

file upload with nodejs koa co-busboy

I use co-busboy to parse file fields when uploading files with Koa. The official example looks like this:
var parse = require('co-busboy');
var parts = parse(this);
var part;
while (part = yield parts) {
  if (!part.length) // it is a stream
    part.pipe(fs.createWriteStream('some file.txt'));
}
For some reasons, I want to save all of the "part" streams into an array and perform the actual file writing once all of the file streams have been fetched, i.e.:
var parse = require('co-busboy');
var parts = parse(this);
var part;
var partArray = [];
var cnt = 0;
while (part = yield parts) {
  partArray[cnt++] = part;
}
// after some processing, I perform the writing
for (var file of partArray) {
  file.pipe(fs.createWriteStream('some file.txt'));
}
The problem is that the while loop just does not continue. It seems that if I do not call part.pipe, the loop will halt.
So how can I make the while loop continue?
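A minimal sketch of one workaround (buffer() is a helper introduced here for illustration, not part of co-busboy): busboy does not hand over the next part until the current one has been consumed, so each file stream can be drained into a Buffer as it arrives and written out afterwards. Note this keeps every upload in memory, so it only suits reasonably small files.

var parse = require('co-busboy');
var fs = require('fs');

// Drain a stream into a single Buffer.
function buffer(stream) {
  return new Promise(function(resolve, reject) {
    var chunks = [];
    stream.on('data', function(chunk) { chunks.push(chunk); });
    stream.on('end', function() { resolve(Buffer.concat(chunks)); });
    stream.on('error', reject);
  });
}

// inside the Koa (v1) generator middleware:
var parts = parse(this);
var part;
var buffered = [];
while (part = yield parts) {
  if (part.length) continue; // a plain field, not a file stream
  buffered.push(yield buffer(part)); // drain this part before asking for the next
}
// after some processing, perform the writing
for (var i = 0; i < buffered.length; i++) {
  fs.writeFileSync('file-' + i + '.txt', buffered[i]);
}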
