Download many files asynchronously - node.js

I want to download all the MP3 files listed in this XML feed, so I wrote the following code in Node.js:
var https = require('https');
var fs = require('fs');
var xml2js = require('xml2js');
var parser = new xml2js.Parser();
var request = require('request');

const xmlUrl = 'https://deejayreloadedpodcast.maxxer.it/podcast/pinocchio.xml';

var download = async function(url, dest, callback) {
    // download only if the file does not exist yet
    if(!fs.existsSync(dest)) {
        await request.get(url)
            .on('error', function(err) {
                console.log(err);
            })
            .pipe(fs.createWriteStream(dest))
            .on('close', callback);
    }
};
https.get(xmlUrl, function(res) {
    var response_data = '';
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
        response_data += chunk;
    });
    res.on('end', function() {
        parser.parseString(response_data, function(err, result) {
            if(err) {
                console.log('Got error: ' + err.message);
            }
            else {
                var json = JSON.stringify(result, null, 2);
                var channels = result['rss']['channel'];
                var items = channels[0]['item'];
                var urlsTemp = [];
                var namesTemp = [];
                for(var elem in items) {
                    var obj = items[elem];
                    var name = obj['title'][0];
                    var url = obj['enclosure'][0]['$']['url'];
                    urlsTemp.push(url);
                    namesTemp.push(name);
                }
                var urls = [];
                var names = [];
                for(i in urlsTemp) {
                    urls.push(urlsTemp[i]);
                    names.push(namesTemp[i]);
                }
                for(var i = 10; i < 20/*urls.length*/; i++) {
                    var dirPath = './puntate/';
                    var filename = names[i] + '.mp3';
                    download(urls[i], dirPath + filename, function() {
                        console.log('Finished downloading \'' + filename);
                    });
                }
            }
        });
    });
    res.on('error', function(err) {
        console.log('Got error: ' + err.message);
    });
});
This code fetches the XML, extracts the links and file names into two arrays (urls and names), and then downloads the audio files.
The problem is that it only works when downloading a few MP3s at a time (in the example above, just 10).
If I let the loop run from 0 to the full length of urls, the program no longer works: it raises no errors, but saves every MP3 with size 0 (i.e., empty).
Why? I thought the problem was the asynchronous code, but I already use async/await in the download function.
What's the problem?
Thank you
EDIT: I also tried downloading the files one at a time, as suggested below:
var i = 0;
var dirPath = './puntate/';
var filename = names[i] + '.mp3';
var fn = function(i) {
    console.log('(A)', i, urls.length);
    download(urls[i], dirPath + filename, function() {
        console.log('Finished downloading \'' + filename);
        console.log('(B)', i, urls.length);
        if(i < urls.length) {
            i++;
            console.log('(C)', i, urls.length);
            fn(i);
        }
    });
}
fn(i);
and this is the output (it stops after starting the second download):
(A) 0 3095
Finished downloading 'Puntata del 17 Settembre 2018.mp3
(B) 0 3095
(C) 1 3095
(A) 1 3095

I suggest you modify the for loop, since it runs synchronously and fires off all the downloads at once:
for(var i = 10; i < 20/*urls.length*/; i++) {
    var dirPath = './puntate/';
    var filename = names[i] + '.mp3';
    download(urls[i], dirPath + filename, function() {
        console.log('Finished downloading \'' + filename);
    });
}
to continuation-passing style, where each download starts only after the previous one has finished:
var i = 0; /* i starts from 0 */
var dirPath = './puntate/';
var fn = function(i) {
    var filename = names[i] + '.mp3';
    download(urls[i], dirPath + filename, function() {
        console.log('Finished downloading \'' + filename);
        /* if we have not downloaded all the links yet */
        if(i < urls.length - 1) { // -1 so we never run past the last index
            i++;
            fn(i);
        }
    });
}
fn(i);
Here is an enhanced version of the code.
Improvements:
- removed the unnecessary second for loop (items are pushed directly into urls[] and names[])
- if a file already exists, it is skipped (with a message) and the next missing one is downloaded
var https = require('https');
var fs = require('fs');
var xml2js = require('xml2js');
var parser = new xml2js.Parser();
var request = require('request');

var urls = [];
var names = [];
const xmlUrl = 'https://deejayreloadedpodcast.maxxer.it/podcast/pinocchio.xml';

var download = async function(url, dest, callback) {
    request.get(url)
        .on('error', function(err) {
            console.log(err);
        })
        .pipe(fs.createWriteStream(dest))
        .on('close', callback);
};

https.get(xmlUrl, function(res) {
    var response_data = '';
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
        response_data += chunk;
    });
    res.on('end', function() {
        parser.parseString(response_data, function(err, result) {
            if(err) {
                console.log('Got error: ' + err.message);
            }
            else {
                var json = JSON.stringify(result, null, 2);
                var channels = result['rss']['channel'];
                var items = channels[0]['item'];
                // var urlsTemp = []; // you don't need both temp arrays:
                // var namesTemp = []; // push items directly into urls[] and names[]
                for(var elem in items) {
                    var obj = items[elem];
                    var name = obj['title'][0];
                    var url = obj['enclosure'][0]['$']['url'];
                    urls.push(url);
                    names.push(name);
                }
                var i = 0;
                var dirPath = './puntate/';
                var fn = function(i) {
                    var filename = names[i] + '.mp3';
                    var fileExist = fs.existsSync(dirPath + filename);
                    // skip downloading if the file exists already
                    if(fileExist) {
                        console.log('File exists', i, urls.length);
                        if(i < urls.length - 1) { // -1 so we never run past the last index
                            i++;
                            fn(i);
                        }
                    } else { // download only if the file does not exist yet
                        console.log('(A)', i, urls.length);
                        download(urls[i], dirPath + filename, function() {
                            console.log('Finished downloading \'' + filename);
                            console.log('(B)', i, urls.length);
                            if(i < urls.length - 1) { // -1 so we never run past the last index
                                i++;
                                console.log('(C)', i, urls.length);
                                fn(i);
                            }
                        });
                    }
                }
                fn(i);
            }
        });
    });
    res.on('error', function(err) {
        console.log('Got error: ' + err.message);
    });
});
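As an aside, the same one-file-at-a-time flow can be expressed with promises and async/await instead of explicit continuation passing. This is only a minimal sketch under the same assumptions as the code above (the request module, and the urls and names arrays already populated); the key point is that the promise resolves only when the write stream's close event fires, so await genuinely waits for each file:

function downloadP(url, dest) {
    return new Promise(function(resolve, reject) {
        request.get(url)
            .on('error', reject)
            .pipe(fs.createWriteStream(dest))
            .on('close', resolve); // resolve only once the file is fully written
    });
}

async function downloadAll() {
    for (var i = 0; i < urls.length; i++) {
        var dest = './puntate/' + names[i] + '.mp3';
        if (!fs.existsSync(dest)) {
            await downloadP(urls[i], dest); // each download finishes before the next starts
            console.log('Finished downloading ' + names[i]);
        }
    }
}

This also shows why the original await request.get(url) did nothing: request.get returns a stream, not a promise, so there was nothing to await.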

Related

Synchronous NodeJS batch job

I'm trying to write a batch script that will:
1. Read XMLs from a directory
2. Parse each XML and find a value to use for a DB (database) lookup
3. Use the parsed value to look up additional metadata in the DB
4. Populate the XML with the metadata retrieved from the DB lookup (step 3)
5. Write the updated XML to a complete directory
6. Close the DB connection
The issue I'm running into is that I cannot control the code execution order, so that I can close the DB connection at the end of the script. If I attempt to close the connection, I get a 'connection undefined' error. Below is my code for reference. Is there a good way to accomplish something like this in Node.js, or should I look at doing this in Java or some other language?
'use strict';
let fs = require('fs');
let xml2js = require('xml2js');
const oracledb = require('oracledb');
const dbConfig = require('./dbconfig.js');

function pad(number, length) {
    var str = '' + number;
    while (str.length < length) {
        str = '0' + str;
    }
    return str;
}

async function run() {
    try {
        // Get a non-pooled connection
        let connection;
        if (!connection) {
            connection = await oracledb.getConnection(dbConfig);
            console.log('Connection was successful!');
        }
        let directory = "EDI_XMLS";
        let dirBuf = Buffer.from(directory);
        //var f = 0;
        let files = fs.readdirSync(directory);
        console.log(files);
        for (let f = 0; f < files.length; f++) {
            let parser = new xml2js.Parser();
            var xml_json_data = "";
            // read the file
            await fs.readFile(directory + "/" + files[f], async function(err, data) {
                // parse the file
                await parser.parseString(data, async function(err, result) {
                    let results;
                    var line_count = result.page.GLLines[0].GLLine.length;
                    console.dir('Invoice: ' + result.page.InvoiceNumber[0]);
                    console.dir('Line Total: ' + line_count);
                    console.log('File: ' + f);
                    try { // Lookup Data
                        results = await connection.execute('SELECT BUSINESS_UNIT, OPERATING_UNIT, DEPTID, PRODUCT, a.effdt FROM SYSADM.PS_A_NSS_SHPTO_ACC#FDEV a where (a.a_ship_to_customer = :shipTo) order by a.effdt desc', [pad(result.page.VoucherDescription[0], 10)], {
                            maxRows: 2
                        });
                        console.log(results.metaData);
                        console.log(results.rows);
                    }
                    catch (err) {
                        console.error(err);
                    }
                    for (let i = 0; i < line_count; i++) { // Populate data
                        result.page.GLLines[0].GLLine[i].GLBU[0] = results.rows[0][0];
                        result.page.GLLines[0].GLLine[i].OpUnit[0] = results.rows[0][1];
                        result.page.GLLines[0].GLLine[i].Department[0] = results.rows[0][2];
                        result.page.GLLines[0].GLLine[i].Product[0] = results.rows[0][3];
                    }
                    // Write to File
                    var builder = new xml2js.Builder();
                    var xml = builder.buildObject(result);
                    await fs.writeFile("complete/" + files[f], xml, function(err, data) {
                        if (err) console.log(err);
                        console.log("successfully written our updated xml to file");
                        console.dir('BUs: ' + JSON.stringify(result.page));
                    }); // end write
                }); // end parser
            }); // end readfile
            console.log('End');
        } // end for
    }
    catch (err) {
        console.error(err);
    }
    finally {
        await connection.close();
        console.log('Finally Done');
    }
}

run();
console.log('completely Done');
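For what it's worth, the execution order can't be controlled above because fs.readFile and parser.parseString are callback-based and return nothing awaitable, so the awaits on them don't actually wait and the loop races ahead to the finally block. A minimal sketch of the usual fix, assuming a recent Node (fs.promises, Node 10+) and a recent xml2js (which exposes parseStringPromise alongside the callback API), with the DB lookup elided:

'use strict';
const fsp = require('fs').promises; // promise-based fs API
const xml2js = require('xml2js');
const oracledb = require('oracledb');
const dbConfig = require('./dbconfig.js');

async function run() {
    let connection; // declared outside try, so finally can see it
    try {
        connection = await oracledb.getConnection(dbConfig);
        const directory = 'EDI_XMLS';
        const files = await fsp.readdir(directory);
        const parser = new xml2js.Parser();
        for (const file of files) {
            const data = await fsp.readFile(directory + '/' + file); // really waits
            const result = await parser.parseStringPromise(data);    // really waits
            // ... perform the connection.execute() lookup and populate `result` here ...
            const xml = new xml2js.Builder().buildObject(result);
            await fsp.writeFile('complete/' + file, xml);            // really waits
        }
    } finally {
        if (connection) await connection.close(); // runs only after the loop is done
        console.log('Finally Done');
    }
}

run().then(() => console.log('completely Done'));

Note that in the original, connection is also declared inside the try block, which is why the finally block sees it as undefined.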

Trouble with asynchronous functions in node.js

I'm very new to Node, and I'm trying to pull a list of IDs from an API, iterate through that list saving the output for each ID, and ultimately rename each generated file. The code below is the closest I've come; while it works sometimes, it frequently fails, as I believe one function isn't waiting for another to complete (e.g., it tries to read before a write has finished), but I'm sure I have other issues going on as well.
const apiKey = inputData.apiKey
var https = require('https');
var sync = require('sync');
var fs = require('fs');

var JSONfileloc = "./pdfs/file.json"
var queryurl = 'https://intakeq.com/api/v1/intakes/summary?startDate=2018-01-01'
var authHeaders = { 'X-Auth-Key': apiKey }
var queryOpts = { method: 'GET', headers: authHeaders }

function handleFile(error, file) {
    if (error) return console.error('Ran into a problem here', error)
}

fetch(queryurl, queryOpts)
    .then(function findAPI(res, err) {
        if (err) { console.log('I cant find the API ' + err) }
        return res.json()
        { console.log('found the API!') }
    })
    .then(function itID(res, err) {
        if (err) { console.log('I cant iterate the API ' + err) }
        for (var i = 0; i < res.length; i++) {
            var intakeID = res[i].Id;
            var APIoptions = { host: "intakeq.com", path: "/api/v1/intakes/" + intakeID, headers: authHeaders };
            var PDFoptions = { host: "intakeq.com", path: "/api/v1/intakes/" + intakeID + '/pdf', headers: authHeaders };
            console.log('Working on ID:' + intakeID)
            var JSONrequest = https.get(APIoptions, writeJSON)
        }
    })

//READ JSON FUNCTION
function readJSON(err, data) {
    if (err) throw err;
    if (data.indexOf('New Patient Forms') >= 0)
        var contents = fs.readFileSync(JSONfileloc, handleFile);
    var jsonContent = JSON.parse(contents)
    //pull PT Name
    pName = (jsonContent.ClientName);
    console.log('The Patient Name Is ' + jsonContent.ClientName)
    //pull PT DOB
    pDob = (jsonContent.Questions[3].Answer)
    console.log('Patient DOB Is ' + jsonContent.Questions[3].Answer)
    //pull Form Type
    pForm = (jsonContent.QuestionnaireName)
    console.log('The Form Submitted is ' + jsonContent.QuestionnaireName)
    //rename and move JSON
    fs.rename("./pdfs/file.json", './JSONLogs/' + pName + ' ' + pForm + ' ' + Date.now() + '.json', function(err) {
        if (err) console.log('Problem renaming! ' + err)
        else console.log('Copying & Renaming JSON File!');
    })
};

//WRITE JSON FUNCTION
function writeJSON(response, err) {
    var JSONfile = fs.createWriteStream(JSONfileloc, handleFile);
    if (err) throw err;
    response.pipe(JSONfile);
    console.log('JSON Created')
    fs.readFile(JSONfileloc, readJSON)
}
The research I've done leads me to believe that async.forEach is probably the right approach here, but I've been having a hard time getting that to work properly. Thanks in advance and any suggestions are much appreciated.
const apiKey = inputData.apiKey
var https = require('https');
var sync = require('sync');
var fs = require('fs');

var JSONfileloc = "./pdfs/file.json"
var queryurl = 'https://intakeq.com/api/v1/intakes/summary?startDate=2018-01-01'
var authHeaders = {
    'X-Auth-Key': apiKey
}
var queryOpts = {
    method: 'GET',
    headers: authHeaders
}

function handleFile(error, file) {
    if (error) return console.error('Ran into a problem here', error)
}

fetch(queryurl, queryOpts)
    .then(function findAPI(res) {
        return res.json();
    })
    .then(function itID(res) {
        const JSONRequests = [];
        for (var i = 0; i < res.length; i++) {
            var intakeID = res[i].Id;
            var APIoptions = {
                host: "intakeq.com",
                path: "/api/v1/intakes/" + intakeID,
                headers: authHeaders
            };
            var PDFoptions = {
                host: "intakeq.com",
                path: "/api/v1/intakes/" + intakeID + '/pdf',
                headers: authHeaders
            };
            // https.get exposes the response as a stream, not a promise.
            // This `httpsGet` function wraps it in a promise.
            JSONRequests.push(httpsGet(APIoptions, i));
        }
        return Promise.all(JSONRequests);
    })

function httpsGet(options, filename) {
    return new Promise((resolve, reject) => {
        https.get(options, (response) => {
            // The writeJSON logic, inlined for brevity.
            // Otherwise pass resolve to a separate writeJSON and call it in there.
            var JSONfile = fs.createWriteStream(filename + ".json");
            response.pipe(JSONfile);
            JSONfile.on('close', () => {
                readJSON(filename + ".json").then(() => {
                    resolve();
                })
            })
        })
    })
}

//READ JSON FUNCTION
function readJSON(filename) {
    var contents = fs.readFileSync(filename, 'utf-8'); // readFileSync takes no callback, so handleFile was removed and an encoding added
    var jsonContent = JSON.parse(contents)
    // Make your conditional checks here with the jsonContents
    //pull PT Name
    pName = (jsonContent.ClientName);
    console.log('The Patient Name Is ' + jsonContent.ClientName)
    //pull PT DOB
    pDob = (jsonContent.Questions[3].Answer)
    console.log('Patient DOB Is ' + jsonContent.Questions[3].Answer)
    //pull Form Type
    pForm = (jsonContent.QuestionnaireName)
    console.log('The Form Submitted is ' + jsonContent.QuestionnaireName)
    //rename and move JSON
    return new Promise((resolve, reject) => {
        // rename the file we just wrote (not the hard-coded "./pdfs/file.json" path)
        fs.rename(filename, './JSONLogs/' + pName + ' ' + pForm + ' ' + Date.now() + '.json', function (err) {
            if (err) {
                console.log('Problem renaming! ' + err);
                reject(err);
            } else {
                console.log('Copying & Renaming JSON File!');
                resolve();
            }
        })
    })
};
Updated to wrap the https.get response stream in a Promise, which can be handled much more cleanly.
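One small addition worth making here (my suggestion, not part of the original answer): finish the fetch chain with a .catch, so a failed request or a rejected download is reported instead of vanishing silently:

fetch(queryurl, queryOpts)
    .then(function findAPI(res) { return res.json(); })
    .then(function itID(res) { /* build JSONRequests and return Promise.all(JSONRequests) as above */ })
    .then(function() { console.log('All intakes written and renamed'); })
    .catch(function(err) { console.error('Pipeline failed:', err); });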

How to know non blocking Recursive job is complete in nodejs

I have written this non-blocking Node.js sample for recursive file search; the problem is that I am unable to figure out when the task is complete, e.g., in order to calculate the time the task took.
fs = require('fs');

searchApp = function() {
    var dirToScan = 'D:/';
    var stringToSearch = 'test';
    var scan = function(dir, done) {
        fs.readdir(dir, function(err, files) {
            files.forEach(function(file) {
                var abPath = dir + '/' + file;
                try {
                    fs.lstat(abPath, function(err, stat) {
                        if(!err && stat.isDirectory()) {
                            scan(abPath, done);
                        }
                    });
                }
                catch (e) {
                    console.log(abPath);
                    console.log(e);
                }
                matchString(file, abPath);
            });
        });
    }
    var matchString = function(fileName, fullPath) {
        if(fileName.indexOf(stringToSearch) != -1) {
            console.log(fullPath);
        }
    }
    var onComplete = function() {
        console.log('Task is completed');
    }
    scan(dirToScan, onComplete);
}
searchApp();
The above code does the search perfectly, but I am unable to figure out when the recursion ends.
It's not that straightforward; I guess you have to rely on a timer and promises.
fs = require('fs');
var Q = require('q');

searchApp = function() {
    var dirToScan = 'D:/';
    var stringToSearch = 'test';
    var promises = [];
    var traverseWait = 0;
    var onTraverseComplete = function() {
        Q.allSettled(promises).then(function() {
            console.log('Task is completed');
        });
    }
    var waitForTraverse = function() {
        if(traverseWait) {
            clearTimeout(traverseWait);
        }
        traverseWait = setTimeout(onTraverseComplete, 5000);
    }
    var scan = function(dir) {
        fs.readdir(dir, function(err, files) {
            files.forEach(function(file) {
                var abPath = dir + '/' + file;
                var future = Q.defer();
                try {
                    fs.lstat(abPath, function(err, stat) {
                        if(!err && stat.isDirectory()) {
                            scan(abPath);
                        }
                    });
                }
                catch (e) {
                    console.log(abPath);
                    console.log(e);
                }
                matchString(file, abPath);
                future.resolve(abPath);
                promises.push(future.promise); // push the promise, not the deferred itself
                waitForTraverse();
            });
        });
    }
    var matchString = function(fileName, fullPath) {
        if(fileName.indexOf(stringToSearch) != -1) {
            console.log(fullPath);
        }
    }
    scan(dirToScan);
}
searchApp();
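The 5-second timer is only a heuristic, though: it fires a fixed delay after the last file is queued, whether or not the traversal is really done. A more deterministic alternative, sketched here as a plain counter-based rewrite of the same search (a different technique from the answer above), fires the completion callback exactly when the count of in-flight operations drops to zero:

var fs = require('fs');

function scan(dir, onMatch, onComplete) {
    var pending = 1; // counts the initial readdir
    var done = function() {
        if (--pending === 0) onComplete();
    };
    var walk = function(d) {
        fs.readdir(d, function(err, files) {
            if (!err) {
                files.forEach(function(file) {
                    var abPath = d + '/' + file;
                    onMatch(file, abPath);
                    pending++; // one more lstat in flight
                    fs.lstat(abPath, function(err, stat) {
                        if (!err && stat.isDirectory()) {
                            pending++; // one more readdir in flight
                            walk(abPath);
                        }
                        done(); // this lstat finished
                    });
                });
            }
            done(); // this readdir finished
        });
    };
    walk(dir);
}

var start = Date.now();
scan('D:/', function(file, fullPath) {
    if (file.indexOf('test') != -1) console.log(fullPath);
}, function() {
    console.log('Task is completed in ' + (Date.now() - start) + ' ms');
});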

Nodejs streams writablestream drain event not firing

I am trying to read in a large file, do some computation, and then write to a much bigger file. To prevent excessive memory consumption, I am using streams. The problem I am facing is that the write stream is not firing the "drain" event, which signals that the writes have been flushed to disk. To avoid back-pressure, I wait for the drain event to fire before I start writing to the buffer again. While debugging I found that after a .write() call returns false and the line fvfileStream.once('drain', test) is executed, the program just stops and does not do anything.
Here is the code:
var fs = require('fs');

//a test function I created to see if the callback is called after drain.
var test = function() {
    console.log("Done Draining");
}

fs.readFile('/another/file/to/be/read', {
    encoding: "utf8"
}, function(err, data) {
    if (err) throw err;
    //Make an array containing tags.
    var tags = data.split('\n');
    //create a write stream.
    var fvfileStream = fs.createWriteStream('/path/TagFeatureVectors.csv');
    //read in the question posts
    var qfileStream = fs.createReadStream('/Big/file/QuestionsWithTags.csv', {
        encoding: "utf8"
    });
    var partialRow = null;
    var writable = true;
    var count = 0;
    var doRead = function() {
        var qData = qfileStream.read();
        var questions = qData.split('\n');
        if (partialRow != null) {
            questions[0] = partialRow + questions[0];
            partialRow = null;
        }
        var lastRow = questions[questions.length - 1];
        if (lastRow.charAt(lastRow.length - 1) != '\n') {
            partialRow = lastRow;
        }
        questions.forEach(function(row, index, array) {
            count++;
            var fields = row.split(',');
            console.log("Processing question number: " + count + " id: " + fields[0]);
            var tagString = fields[1];
            var regex = new RegExp(/<([^>]+)>/g);
            tags.forEach(function(tag, index, array) {
                var found = false;
                var questionTags;
                while ((questionTags = regex.exec(tagString)) != null) {
                    var currentTag = questionTags[1]
                    if (currentTag === tag) {
                        found = true;
                        break;
                    }
                };
                //This is where the writestream is written to
                if (found) {
                    writable = fvfileStream.write("1,", "utf8");
                } else {
                    writable = fvfileStream.write("0,", "utf8");
                }
            });
        });
        fvfileStream.write("\n");
    }
    qfileStream.on('readable', function() {
        if (writable) {
            doRead();
        } else {
            //Waiting for drain event.
            fvfileStream.once('drain', test);
        }
    });
    qfileStream.on('end', function() {
        fvfileStream.end();
    });
});
Updated
Based on advice provided by @loganfsmyth, I implemented transform streams, but still ran into the same issue. Here is my updated code:
var fs = require('fs');
var stream = require('stream');
var util = require('util');
var Transform = stream.Transform;

function FVCreator(options) {
    // allow use without new
    if (!(this instanceof FVCreator)) {
        return new FVCreator(options);
    }
    // init Transform
    Transform.call(this, options);
}
util.inherits(FVCreator, Transform);

var partialRow = null;
var count = 0;
var tags;

FVCreator.prototype._transform = function(chunk, enc, cb) {
    var that = this;
    var questions = chunk.toString().split('\n');
    if (partialRow != null) {
        questions[0] = partialRow + questions[0];
        partialRow = null;
    }
    var lastRow = questions[questions.length - 1];
    if (lastRow.charAt(lastRow.length - 1) != '\n') {
        partialRow = lastRow;
        questions.splice(questions.length - 1, 1);
    }
    questions.forEach(function(row, index, array) {
        count++;
        var fields = row.split(',');
        console.log("Processing question number: " + count + " id: " + fields[0]);
        var tagString = fields[1];
        var regex = new RegExp(/<([^>]+)>/g);
        tags.forEach(function(tag, index, array) {
            var found = false;
            var questionTags;
            while ((questionTags = regex.exec(tagString)) != null) {
                var currentTag = questionTags[1]
                if (currentTag === tag) {
                    found = true;
                    break;
                }
            };
            if (found) {
                that.push("1,", "utf8");
            } else {
                that.push("0,", "utf8");
            }
        });
    });
    this.push("\n", "utf8");
    cb();
};

fs.readFile('/another/file/to/be/read', {
    encoding: "utf8"
}, function(err, data) {
    if (err) throw err;
    //Make an array containing tags.
    tags = data.split('\n');
    //write to a file.
    var fvfileStream = fs.createWriteStream('/path/TagFeatureVectors.csv');
    //read in the question posts
    var qfileStream = fs.createReadStream('/large/file/to/be/read', {
        encoding: "utf8"
    });
    var fvc = new FVCreator();
    qfileStream.pipe(fvc).pipe(fvfileStream);
});
I am running this on OSX Yosemite.
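For reference, here is a minimal sketch of the standard write/drain handshake, independent of the code above and using only the documented stream API: keep writing while write() returns true, and when it returns false, stop and resume from the drain event. If drain never seems to fire, it usually means the writing loop never actually stopped writing, or nothing was left to flush:

var fs = require('fs');

function writeManyRows(dest, rows, onDone) {
    var out = fs.createWriteStream(dest);
    var i = 0;
    function writeMore() {
        var ok = true;
        while (i < rows && ok) {
            i++;
            ok = out.write(i + ',0,1\n'); // false means the internal buffer is full
        }
        if (i < rows) {
            out.once('drain', writeMore); // resume once the buffer flushes
        } else {
            out.end(onDone);
        }
    }
    writeMore();
}

writeManyRows('/tmp/rows.csv', 1000000, function() {
    console.log('Done writing');
});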

Node.js http.get with Node.js step module

I am new to the Node.js world and kind of stuck in a situation.
The code below is for reference:
var http = require('http');
var step = require('step');
var request = require('request');

exports.readimage2 = function(req, res) {
    //res.send(200,'OK');
    //var image_url = 'http://www.letsgodigital.org/images/artikelen/39/k20d-image.jpg'; //--- 10mb
    //var image_url = 'http://upload.wikimedia.org/wikipedia/commons/2/2d/Snake_River_(5mb).jpg';
    //var image_url = 'http://www.sandia.gov/images2005/f4_image1.jpg'; //--- 2mb
    var image_url = 'http://www.fas.org/nuke/guide/pakistan/pakistan.gif'; // --- some KB
    http.get(image_url, function(responseData) {
        var data = new Buffer(parseInt(responseData.headers['content-length'], 10));
        var pos = 0;
        responseData.on('data', function(chunk) {
            chunk.copy(data, pos);
            pos += chunk.length;
        });
        responseData.on('end', function() {
            res.send(200, data);
        });
    });
};
The above code stops working for large files when I use it with the step module.
Can anyone suggest how to do this properly with step?
Here is how I did it using step. The request module handled the image buffer download; thanks to a post on Stack Overflow, I just needed to set encoding to null in the request options so the response body comes back as a Buffer.
var canvas = new Canvas(3000, 3000),
    ctx = canvas.getContext('2d'),
    Image = Canvas.Image;

var image_url = "http://www.a2hosting.com/images/uploads/landing_images/node.js-hosting.png";
//var image_url = 'http://upload.wikimedia.org/wikipedia/commons/1/16/AsterNovi-belgii-flower-1mb.jpg';

step(
    function() {
        request.get({
            url: image_url,
            encoding: null // null encoding makes request return a Buffer instead of a string
        }, this);
    },
    function(err, response, body) {
        var img = new Image;
        img.src = body;
        ctx.drawImage(img, 0, 0, img.width, img.height);
        //res.send(200, data);
        res.send(200, '<img src="' + canvas.toDataURL() + '" />');
    }
);
Below is the code that works with Node's plain http module.
var http = require('http');
var step = require('step');
var request = require('request');

exports.imagedownload = function(req, res) {
    step(
        function() {
            console.log('*********** image download start ***********');
            fndownload(this);
        },
        function(err, result) {
            if (err) {
            }
            console.log('*********** image download end ***********');
            res.send(200, result);
        }
    );
};

function fndownload(callback) {
    var image_url = 'http://upload.wikimedia.org/wikipedia/commons/2/2d/Snake_River_(5mb).jpg'; // --- 5mb
    http.get(image_url, function(responseData) {
        var data = new Buffer(parseInt(responseData.headers['content-length'], 10));
        var pos = 0;
        responseData.on('data', function(chunk) {
            chunk.copy(data, pos);
            pos += chunk.length;
        });
        responseData.on('end', function() {
            //res.send(200, data);
            callback(null, data);
        });
    });
};
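One caveat with both http.get versions above (my observation, not from the original posts): they size the Buffer from the Content-Length header, which breaks when a server sends a chunked response without that header, and new Buffer(n) is deprecated in modern Node anyway. A sketch that avoids both issues collects the chunks and joins them at the end with the standard Buffer.concat:

var http = require('http');

function fetchBuffer(url, callback) {
    http.get(url, function(res) {
        var chunks = [];
        res.on('data', function(chunk) {
            chunks.push(chunk); // no need to know the total size in advance
        });
        res.on('end', function() {
            callback(null, Buffer.concat(chunks)); // join once, at the end
        });
    }).on('error', callback);
}

fetchBuffer('http://www.fas.org/nuke/guide/pakistan/pakistan.gif', function(err, data) {
    if (err) return console.error(err);
    console.log('Downloaded ' + data.length + ' bytes');
});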
