awaiting completion of a highland stream - node.js

I am writing a small script to stream-download and process a number of sequentially named files from a URL. I am using highland.js and have it working perfectly one file at a time. I am attempting to refactor it into a loop in which I await the completion of one highland stream before starting another:
// my attempt to abstract the highland into a promise I can await
const processFile = async (url, path) => {
  const writeStream = fs.createWriteStream(path);
  return hl(request(url))
    .split()
    // various working transforms
    .map(splitDelim)
    .filter(filterSchedAOnly)
    .map(appendIdentity)
    .filter(filterOnlyIdentified)
    .map(convertToCSVsafeString)
    // end result should write to a file
    .pipe(writeStream)
    .toPromise();
  // also tried this
  // return new Promise((resolve, reject) => {
  //   writeStream.on("end", () => resolve());
  //   writeStream.on("error", () => reject());
  // });
};
(async () => {
  let i = 1;
  // infinite loop
  while (true) {
    const url = `http://.../${i}.txt`;
    const filePath = `${i}.processed.csv`;
    try {
      // does not work!
      await processFile(url, filePath);
    } catch (err) {
      console.log(err);
    }
    i++;
  }
})();
How should I wrap my processFile function so that I can await its completion before moving on to the next iteration?

This appears to be working:
function streamToPromise(stream) {
  return new Promise(function (resolve, reject) {
    // the write stream emits "finish" once all data has been flushed to disk
    stream.on("finish", () => resolve());
    // stream.on("end", () => resolve());
    stream.on("error", (err) => reject(err));
  });
}
const processFile = async (url, path, i) => {
  const stream = hl(request(url))
    .split()
    // ......
    .pipe(fs.createWriteStream(path));
  return streamToPromise(stream);
};
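If you are on a newer Node version (15+), the hand-rolled streamToPromise helper can also be swapped for the built-in finished helper from stream/promises. A minimal sketch, assuming the same fs/hl/request setup as above:

// sketch assuming Node 15+, where stream/promises ships a promisified finished()
const { finished } = require("stream/promises");

const processFile = async (url, path) => {
  const writeStream = fs.createWriteStream(path);
  hl(request(url))
    .split()
    // ...same transforms as in the question...
    .pipe(writeStream);
  // resolves once the write stream has flushed everything, rejects on a stream error
  await finished(writeStream);
};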

Related

NodeJS - Download MP3 file one at a time

I'm trying to make a quick Node script to download MP3s from an RSS feed. At the moment I have this:
const https = require('https');
const xml2js = require('xml2js');
const parser = new xml2js.Parser();
const fs = require('fs');

const URL_TO_PARSE = 'https://some-rss.feed.xml';

const req = https.get(URL_TO_PARSE, async (res) => {
  let xml = '';
  res.on('data', (stream) => {
    xml = xml + stream;
  });
  res.on('end', () => {
    parser.parseString(xml, async (err, result) => {
      if (err) {
        console.log(err);
      } else {
        let items = result.rss.channel[0].item;
        await Promise.all(items.map(async (item) => {
          let title = item.title[0];
          let enclosure = item.enclosure[0];
          let url = enclosure.$.url;
          let filepath = `./${title}`;
          console.log(`Downloading ${title} to ${filepath}`);
          await download_audio_file(url, filepath);
        }));
      }
    });
  });
});

const download_audio_file = async (url, filepath) => {
  https.get(url, (res) => {
    const writeStream = fs.createWriteStream(filepath);
    res.pipe(writeStream);
    writeStream.on('finish', () => {
      writeStream.close();
      console.log('File downloaded');
      Promise.resolve();
    });
    writeStream.on('error', (err) => {
      console.log(err);
      Promise.reject(err);
    });
  });
};
But it currently tries to download every file at the same time. Is there a better way to write this so it downloads just one at a time, and possibly also tracks the % progress?
I see 2 problems with your code.
The first one is that download_audio_file is not returning a promise that resolves when the file is fully downloaded.
You can fix that with this refactored version:
const download_audio_file = async (url, filepath) => {
  const promise = new Promise((resolve, reject) => {
    https.get(url, (res) => {
      const writeStream = fs.createWriteStream(filepath);
      res.pipe(writeStream);
      writeStream.on("finish", () => {
        writeStream.close();
        console.log("File downloaded");
        resolve();
      });
      writeStream.on("error", (err) => {
        console.log(err);
        reject(err);
      });
    });
  });
  return promise;
};
Secondly, you are using Promise.all, which awaits all the promises in parallel.
You can replace that code snippet with:
const req = https.get(URL_TO_PARSE, async (res) => {
  let xml = '';
  res.on('data', (stream) => {
    xml = xml + stream;
  });
  res.on('end', () => {
    parser.parseString(xml, async (err, result) => {
      if (err) {
        console.log(err);
      } else {
        let items = result.rss.channel[0].item;
        for (const item of items) {
          let title = item.title[0];
          let enclosure = item.enclosure[0];
          let url = enclosure.$.url;
          let filepath = `./${title}`;
          console.log(`Downloading ${title} to ${filepath}`);
          await download_audio_file(url, filepath);
        }
      }
    });
  });
});
Notice how I replaced the Promise.all with a for (const item of items) loop, which awaits each download before starting the next.
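The question also asked about tracking % progress, which the refactor above does not cover. One common approach (a sketch, not part of the original answer) is to compare bytes received against the Content-Length header when the server sends one; the download_audio_file_with_progress name below is just illustrative:

const download_audio_file_with_progress = (url, filepath) =>
  new Promise((resolve, reject) => {
    https.get(url, (res) => {
      // Content-Length may be missing; treat progress as unknown in that case
      const total = Number(res.headers["content-length"]) || 0;
      let received = 0;
      res.on("data", (chunk) => {
        received += chunk.length;
        if (total) {
          process.stdout.write(`\r${((received / total) * 100).toFixed(1)}%`);
        }
      });
      const writeStream = fs.createWriteStream(filepath);
      res.pipe(writeStream);
      writeStream.on("finish", () => resolve());
      writeStream.on("error", reject);
    }).on("error", reject);
  });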

In node.js, why is my data not getting passed back after an asynchronous file read using a Promise

I know for sure that my pullData module is getting the data back from the file read but the function calling it, though it has an await, is not getting the data.
This is the module (./initialise.js) that reads the data:
const fs = require('fs');

const getData = () => {
  return new Promise((resolve, reject) => {
    fs.readFile('./Sybernika.txt',
      { encoding: 'utf8', flag: 'r' },
      function (err, data) {
        if (err)
          reject(err);
        else
          resolve(data);
      });
  });
};

module.exports = { getData };
And this is where it gets called (app.js):
const init = require('./initialise');

const pullData = async () => {
  init.getData().then((data) => {
    return data;
  }).catch((err) => {
    console.log(err);
  });
};
const start = async () => {
  let data = await pullData();
  console.log(data);
};

start();
Putting console.log(data) just before return data in the resolve part of the call shows the data, so I know it's being read OK. However, that final console.log shows my data variable as undefined.
Any suggestions?
It's either
const pullData = async () => {
  return init.getData().then((data) => {
    return data;
  }).catch((err) => {
    console.log(err);
  });
};
or
const pullData = async () =>
  init.getData().then((data) => {
    return data;
  }).catch((err) => {
    console.log(err);
  });
Both versions make sure the promise returned by then/catch is passed back to the caller.
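For what it's worth, the same fix reads a little more directly with await instead of the then/catch chain; a sketch, assuming the same ./initialise module:

const pullData = async () => {
  try {
    // await the promise and hand the data back to the caller
    return await init.getData();
  } catch (err) {
    console.log(err);
  }
};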

Working with createReadStream in node.js using async/await

After trying TFischer's response, I am still having difficulty using fs.createReadStream to process my CSV file asynchronously:
return new Promise((resolve, reject) => {
  const promises = [];
  fs.createReadStream(inputFile)
    .pipe(csv())
    .on("data", row => promises.push(processData(row, myRepository)))
    .on("error", reject)
    .on("end", async () => {
      await Promise.all(promises);
      resolve();
    });
});

async function processData(row, myRepository) {
  console.log('Current row: ', row); // logs all CSV rows
  const record = await myRepository.findOne({where: {id: row.id}});
  console.log(record); // row.id is undefined here
  return record;
}
I want to be able to use the row argument inside the processData(row, myRepository) function as the current row of the CSV file being parsed, but it doesn't seem to work.
Can someone explain what is happening there?
If you just want to read chunks, createReadStream is your guy, as the stream it returns is already an async iterable:
const { createReadStream } = require("fs");
const { join } = require("path");

async function main() {
  const reader = createReadStream(join(__dirname, "index.html"), "utf8");
  for await (const chunk of reader) {
    console.log(chunk);
  }
}
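The same pattern should also apply to the CSV pipeline from the question, because the stream returned by .pipe(csv()) is itself readable and therefore async iterable. A sketch, assuming csv-parser and the myRepository object from the question (the processFileSequentially name is just illustrative):

const fs = require("fs");
const csv = require("csv-parser"); // assumed, as in the question

async function processFileSequentially(inputFile, myRepository) {
  // iterate the parsed rows one at a time, awaiting each lookup before the next
  for await (const row of fs.createReadStream(inputFile).pipe(csv())) {
    const record = await myRepository.findOne({ where: { id: row.id } });
    console.log(record);
  }
}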

Returning the result after reading the csv file

I am using PapaParse to read a remote CSV file and return the result, but every time it's empty and I'm not sure why.
function importData(url) {
  const parseStream = Papa.parse(Papa.NODE_STREAM_INPUT, {});
  let length = 0;
  const dataStream = request
    .get(url)
    .pipe(parseStream);
  let data: any[] = [];
  parseStream.on("data", (chunk: any) => {
    data.push(chunk);
  });
  dataStream.on("finish", (length) => {
    console.log(data); // this logs the data
    console.log(data.length); // this logs the data length
    length = data.length;
    return data; // empty
  });
  return length; // empty
}
The statement return length will be executed before the dataStream's finish event has been emitted. If you want your importData function to wait until that happens, you can wrap the parsing in a promise and wait until it has been resolved (this still needs error handling, but should give you a start):
function importData(url) {
  const parsePromise = new Promise((resolve, reject) => {
    const parseStream = Papa.parse(Papa.NODE_STREAM_INPUT, {});
    const dataStream = request
      .get(url)
      .pipe(parseStream);
    let data: any[] = [];
    parseStream.on("data", (chunk: any) => {
      data.push(chunk);
    });
    dataStream.on("finish", () => {
      resolve(data);
    });
  });
  return parsePromise
    .then((data) => {
      console.log(data.length);
      return data;
    });
}
You can simplify this using async/await:
async function importData(url) {
  const data = await new Promise((resolve, reject) => {
    const parseStream = Papa.parse(Papa.NODE_STREAM_INPUT, {});
    const dataStream = request
      .get(url)
      .pipe(parseStream);
    const data: any[] = [];
    parseStream.on("data", (chunk: any) => {
      data.push(chunk);
    });
    dataStream.on("finish", () => {
      resolve(data);
    });
  });
  console.log(data.length);
  return data;
}
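As noted above, error handling is still missing. A sketch of one way to add it, assuming the same request and Papa objects, so that a failed request or a parser error rejects the promise and the await throws:

async function importData(url) {
  const data = await new Promise((resolve, reject) => {
    const parseStream = Papa.parse(Papa.NODE_STREAM_INPUT, {});
    const dataStream = request
      .get(url)
      .on("error", reject) // network / request errors
      .pipe(parseStream);
    const rows = [];
    parseStream.on("data", (chunk) => rows.push(chunk));
    parseStream.on("error", reject); // parser errors
    dataStream.on("finish", () => resolve(rows));
  });
  console.log(data.length);
  return data;
}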

Node.js: Return promises in a defined order

I have several hundred JSON files that I need to process in a defined sequence, writing the content back as CSV in the same order as in the JSON files:
1. Write a CSV file with a header
2. Collect an array of JSON files to process
3. Read the file and return an array with the required information
4. Append the CSV file, created under #1, with the information
5. Continue with the next JSON file at step #3
'use strict';

const glob = require('glob');
const fs = require('fs');
const fastcsv = require('fast-csv');
const readFile = require('util').promisify(fs.readFile);

function writeHeader(fileName) {
  return new Promise((resolve, reject) => {
    fastcsv
      .writeToStream(fs.createWriteStream(fileName), [['id', 'aa', 'bb']], {headers: true})
      .on('error', (err) => reject(err))
      .on('finish', () => resolve(true));
  });
}

function buildFileList(globPattern) {
  return new Promise((resolve, reject) => {
    glob(globPattern, (err, files) => {
      if (err) {
        reject(err);
      } else {
        resolve(files);
      }
    });
  });
}

function readFromFile(file) {
  return new Promise((resolve, reject) => {
    readFile(file, 'utf8', (err, data) => {
      if (err) {
        reject(err);
      } else {
        const obj = JSON.parse(data);
        const key = Object.keys(obj['776'])[0];
        const solarValues = [];
        obj['776'][key].map((item, i) => solarValues.push([i, item[0], item[1][0][0]]));
        resolve(solarValues);
      }
    });
  });
}

function csvAppend(fileName, rows = []) {
  return new Promise((resolve, reject) => {
    const csvFile = fs.createWriteStream(fileName, {flags: 'a'});
    csvFile.write('\n');
    fastcsv
      .writeToStream(csvFile, rows, {headers: false})
      .on('error', (err) => reject(err))
      .on('finish', () => resolve(true));
  });
}

writeHeader('test.csv')
  .then(() => buildFileList('data/*.json'))
  .then(fileList => Promise.all(fileList.map(item => readFromFile(item))))
  .then(result => Promise.all(result.map(item => csvAppend('test.csv', item))))
  .catch(err => console.log(err.message));
JSON examples:
https://gist.github.com/Sineos/a40718c13ad0834b4a0056091e3ac4ca
https://gist.github.com/Sineos/d626c3087074c23a073379ecef84a55c
Question
While the code basically works, my problem is that the CSV is not written back in a defined order but mixed up, as in an asynchronous process.
I tried various combinations with and without Promise.all, resulting in either pending promises or a mixed-up CSV file.
This is my first take on Node.js promises, so any input on how to do it correctly is greatly appreciated. Many thanks in advance.
The Promise.all in your chain starts all the csvAppend calls at once, so their writes interleave. This code should process your files in order; we'll use async/await and a for...of loop to run each step in sequence:
async function processJsonFiles() {
  try {
    await writeHeader('test.csv');
    let fileList = await buildFileList('data/*.json');
    for (let file of fileList) {
      let rows = await readFromFile(file);
      await csvAppend('test.csv', rows);
    }
  } catch (err) {
    console.error(err.message);
  }
}

processJsonFiles();
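If the parallel reads are worth keeping, note that Promise.all already returns its results in the input order; only the appends need to be sequential. A sketch along those lines, reusing the question's helpers:

async function processJsonFilesParallelReads() {
  await writeHeader('test.csv');
  const fileList = await buildFileList('data/*.json');
  // reads run concurrently, but Promise.all preserves the input order
  const allRows = await Promise.all(fileList.map((file) => readFromFile(file)));
  // appends must stay sequential so the CSV keeps that order
  for (const rows of allRows) {
    await csvAppend('test.csv', rows);
  }
}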
