Node.js, How to copy several files in Node.js without a crash - node.js

I tried to copy several files with Node.js.
Here is an example of what I'm trying to do:
var request = require('request');
var fs = require('fs');
var photos = [{ 'url': 'http://xxxx.com/im1', 'name': 'name1' },
              { 'url': 'http://xxxx.com/im12', 'name': 'name2' }];
for (var i = 0; i < photos.length; i++) {
    request(photos[i].url).pipe(fs.createWriteStream(photos[i].name));
}
After maybe 1,000 calls I get a socket hang up error.
Following Timothy Strimple's advice I decided to use the async module.
My code is now something like this:
async.whilst(function () { return !stop; },
    function (callback) {
        console.log("get next 20 images");
        JM.api('/' + album.id + '/photos', { after: next }, function (resf) {
            if (!resf || resf.error) {
                console.log(!resf ? 'error occurred' : resf.error);
            }
            console.log("albums " + album.id + " " + resf.data.length + " dir " + dir);
            async.eachSeries(resf.data, function (photo, done) {
                request(photo.source).pipe(fs.createWriteStream(dir + "/" + photo.name));
                console.log("copy of image " + img_basename);
                done();
            }, function (err) {
                if (err) {
                    console.log('An image failed to copy');
                } else {
                    console.log('All 20 images have been copied successfully');
                }
                if (resf.paging && resf.paging.cursors) {
                    console.log("rest of the album to come");
                    next = resf.paging.cursors.after;
                    setTimeout(function () { callback(); }, 5000);
                } else {
                    console.log("end of the album");
                    stop = true;
                    setTimeout(function () { callback(); }, 5000);
                }
            });
        });
    },
    function (err) {
        if (err) {
            console.log('An image failed to process');
            albumcallback();
        } else {
            console.log('All images in this group have been processed successfully');
            albumcallback();
        }
    }
); // end whilst
I still get a crash after maybe 100 files copied. I'm sure that async.whilst and async.eachSeries are working correctly because my log shows that each call runs in series. But I still have a crash. I temporarily solved the problem by adding a wait after each copy, like this:
request(photo.source).pipe(fs.createWriteStream(dir + "/" + img_basename));
console.log("copy of image " + img_basename);
setTimeout(function () { done(); }, 5000);
Is this a limit of the request module? How do I change these few lines to make sure each connection is closed before continuing the program?

You probably need to move to an asynchronous loop. Something like eachLimit from the async module would probably be ideal.
async.eachLimit(photos, 10, function(photo, done) {
    var r = request(photo.url).pipe(fs.createWriteStream(photo.name));
    r.on('finish', done);
}, function(err) {
    // All images done or there was an error
});
Now it will process all the items in your photos list, but it will only process 10 of them concurrently. This will prevent it from spinning up hundreds or thousands of concurrent outgoing connections.

The request call and the pipe call are asynchronous. So I have to rewrite this line: request(photos[i].url).pipe(fs.createWriteStream(photos[i].name));
See here:
Downloading N number of remote files using Node.js synchronously
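For reference, here is one way the copy line inside async.eachSeries could be rewritten so that done only fires after the file has been fully written, and stream errors are forwarded instead of hanging the series. This is just a sketch reusing the request, fs, dir and resf variables from the question:
async.eachSeries(resf.data, function (photo, done) {
    var ws = fs.createWriteStream(dir + "/" + photo.name);
    request(photo.source)
        .on('error', done)            // a failed download ends this item with an error
        .pipe(ws)
        .on('finish', function () {   // file fully written, socket released
            console.log("copy of image " + photo.name);
            done();
        })
        .on('error', done);           // a failed write also ends this item with an error
}, function (err) {
    if (err) console.log('An image failed to copy');
});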

Related

I only get all data one time

I'm creating a REST API with Node.js without the Express.js framework, and for the database I use fs. I'm having a problem with the GET method; all other methods work fine. When I try to get all JSON files from the directory, I can get everything only the first time. The second and every following time, nothing happens, not even an error; it just hangs for a few minutes, and after that I get net::ERR_EMPTY_RESPONSE in Chrome and TypeError: "NetworkError when attempting to fetch resource." in Firefox. Getting one JSON file works fine. I think the problem is in the back end, because I have the same problem with Postman.
I'm using Node 10.14.1. As I understand it, when I try to get all files from the directory a second time, my server.js file doesn't send any response, and the app stops before chosenHandler at server.js line 64.
Get handler:
// Resources - get
// Required data: none
// Optional data: ID
handlers._resources.get = (data, callback) => {
  // Check that the ID is valid
  checkId(data.queryStringObject.id)
  if (resourceDBId) {
    // Lookup the resource
    _data.read('resources', resourceDBId, (err, data) => !err && data
      ? callback(ok, data)
      : callback(notFound, {Error: 'User doesn\'t exist'}))
  } else {
    _data.readAll('resources', (err, data) => {
      if (!err && data) {
        if (data.last) {
          resourcesData.push(data.data)
          callback(ok, resourcesData)
        }
        if (!data.last) resourcesData.push(data.data)
      } else callback(internalServerError, {Error: 'Can\'t get all resources'})
    })
  }
}
All the code is here: https://github.com/FreeDevStan/sale
The readAll method in the lib/data.js file needs to initialize i.
Because i is used without being initialized, it behaves as a global variable, the while loop is not entered, the callback is never called, and so no response is possible.
I recommend you change it as below.
lib.readAll = (dir, callback) => {
  fs.readdir(lib.baseDir + dir, (err, data) => {
    if (!err && data) {
      let i = 0;
      while (i < data.length) {
        if (i < data.length - 1) {
          fs.readFile(lib.baseDir + dir + '/' + data[i], 'utf-8', (err, content) => {
            let parsedContent = helpers.parseJsonToObject(content)
            err ? callback(err, content) : callback(false, {last: false, data: parsedContent})
          })
        }
        if (i === data.length - 1) {
          fs.readFile(lib.baseDir + dir + '/' + data[i], 'utf-8', (err, content) => {
            let parsedContent = helpers.parseJsonToObject(content)
            err ? callback(err, content) : callback(false, {last: true, data: parsedContent})
          })
        }
        i++
      }
    } else callback(err, data)
  })
}
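As a design note, if you would rather have readAll call back a single time with the whole collection (instead of once per file with a last flag, which forces the handler to accumulate into resourcesData), a counter-based variant is possible. This is only a sketch; lib.readAllOnce is a hypothetical name and it reuses the same lib.baseDir and helpers.parseJsonToObject helpers:
lib.readAllOnce = (dir, callback) => {
  fs.readdir(lib.baseDir + dir, (err, files) => {
    if (err || !files) return callback(err, files)
    let results = []
    let pending = files.length
    if (pending === 0) return callback(false, results)
    files.forEach(file => {
      fs.readFile(lib.baseDir + dir + '/' + file, 'utf-8', (err, content) => {
        if (!err) results.push(helpers.parseJsonToObject(content))
        if (--pending === 0) callback(false, results) // fire once, after every read has finished
      })
    })
  })
}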

Getting an error like RequestsLimitError: You just made too many request to instagram API in Node.js?

I am working with the Instagram API in Node.js. I have one array that stores more than 20k Instagram IDs. I loop over that array and, one by one, take an Instagram ID and fetch its bio, but then I get this error: RequestsLimitError: You just made too many request to instagram API. I tried setting a timeout after every 5 calls, but I still get the same error. How can it be resolved?
Here is my code:
var InstaId = ["12345687" /* ...20k more IDs stored here in the array... */];
var changesessionFlage = 0;
async.each(InstaId, function (id, callback) {
    async.parallel([
        function (cb) {
            if (id) {
                setTimeout(function () {
                    Client.Account.getById(sess, id).then(function (bio) {
                        console.log("changesessionFlage" + changesessionFlage);
                        changesessionFlage++
                        //console.log("bio : ", bio._params); // here i am getting bio one by one user
                        if (changesessionFlage == 6) {
                            changesessionFlage = 0;
                        }
                        cb(null, bio._params);
                    })
                    .catch(function (err) {
                        console.log("get boi: ", err)
                        cb(null, bio._params);
                    })
                }, (changesessionFlage == 5) ? 10000 : 0)
            }
        }
    ], function (err, results) {
        if (err) {
            console.log(err);
            return;
        }
        Result = results
        callback();
    });
}, function (err) {
    if (err) {
        console.log(err);
        return;
    }
    else {
        console.log("Result=>", Result)
        if (Result) {
            console.log("Result[0]=>", Result[0])
            var ws = XLSX.utils.json_to_sheet(Result[0]);
            var wb = XLSX.utils.book_new();
            XLSX.utils.book_append_sheet(wb, ws, "People");
            var wbout = XLSX.write(wb, { bookType: 'xlsx', type: 'binary' });
            res.end(wbout, 'binary');
        }
    }
});
Does anyone know how to fix this issue? Please help me.
Your setTimeout is used incorrectly; all the API calls are made at once after the 10000 ms delay.
Since this is a one-time job, just split the 20K usernames into 4K batches and execute one batch every hour. This way you will stay under the 5k/hour API limit.
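A rough sketch of that batching idea (Client, sess and InstaId are the objects from the question; the batch size, delay and bookkeeping are illustrative, not part of any Instagram client API):
var BATCH_SIZE = 4000;
var ONE_HOUR = 60 * 60 * 1000;

function runBatch(start) {
    if (start >= InstaId.length) return console.log('all IDs processed');
    var batch = InstaId.slice(start, start + BATCH_SIZE);
    async.eachSeries(batch, function (id, done) {
        Client.Account.getById(sess, id)
            .then(function (bio) {
                // store bio._params wherever you collect results
                done();
            })
            .catch(function (err) {
                console.log('get bio:', err);
                done(); // keep going even if one ID fails
            });
    }, function () {
        // wait an hour before starting the next batch of 4k
        setTimeout(function () { runBatch(start + BATCH_SIZE); }, ONE_HOUR);
    });
}
runBatch(0);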

Webdriverio & Selenium throw NoSessionIdError & RuntimeError

I'm running a cluster of Selenium Standalone Server with Firefox on Amazon ECS, attached to another container for my API.
I'm querying my API to get a response and a screenshot of a long-running webpage. 90% of the time, I get a NoSessionIdError or a RuntimeError:
{ Error: A session id is required for this command but wasn't found in the response payload
at execute(<Function>) - selectorExecute.js:58:65
at execute("return !!document.evaluate;") - ensureClientSideSelectorSupport.js:23:17
type: 'NoSessionIdError',
message: 'A session id is required for this command but wasn\'t found in the response payload' }
{ status: -1,
type: 'ESOCKETTIMEDOUT',
message: 'Couldn\'t connect to selenium server',
orgStatusMessage: 'ESOCKETTIMEDOUT' } }
Here is my code:
var webdriverio = require('webdriverio');
var options = {
    desiredCapabilities: {
        browserName: 'firefox'
    },
    host: 'selenium'
};
var WebRefresh = function () {
    this.client = webdriverio.remote(options);
    this.statusMessage = "";
    this.errorMessage = "";
};
WebRefresh.prototype.refreshURL = function (url, cb) {
    var self = this;
    self.client
        .init()
        .url(url)
        .pause(10000)
        .element('iframe').then(function (res) {
            console.log("FRAME 1");
            return self.client.frame(res.value).waitUntil(
                function () {
                    return self.client.getHTML('.statusMessageHolder=Success.');
                }, 300000, "TIMED OUT", 1000);
        }).catch(function (err) {
            self.errorMessage += "WaitUntil-" + err.type + "| ";
            console.error(err);
        })
        .getHTML('.statusMessageHolder', false).then(function (text) {
            self.statusMessage = text;
        }).catch(function (err) {
            self.errorMessage += "GetStatus-" + err.type + "| ";
        })
        .getHTML('.error-title', false).then(function (text) {
            self.errorMessage = text;
        }).catch(function (err) {
            console.error(err);
            self.errorMessage += "GetError-" + err.type + "| ";
        }).screenshot().then(function (data) {
            cb(self.statusMessage, self.errorMessage, data.value);
        }).catch(function (err) {
            console.error(err);
            self.errorMessage += "GetScreenshot-" + err.type + "| ";
            cb(self.statusMessage, self.errorMessage, undefined);
        })
        .end();
    self.client.on('error', function (err) {
        console.error(err);
    });
};
module.exports = WebRefresh;
I'm sure there is something wrong with that; I'm not used to promises. Could you help me?
EDIT
I have 1 Selenium instance linked to 1 API instance, 10 instances in parallel behind an ELB. I have 500 requests I need to send, 10 requests per batch. I wait for a response before sending another request.
I switched to Chrome; the results are a bit better but still not what I expect. The first requests have better results than the later ones.
I added .manage().deleteAllCookies().close().end(); at the end, but it doesn't make a huge difference.

memory leak in node.js app on AWS

I have some code in Node; basically it makes API calls to an external service and dumps the returned data into a database. But it must have a serious memory leak, since the Node server runs out of memory partway through. The AWS instance I am using has 2 CPUs and 4GB of RAM. I have spent a lot of time trying to figure out where the leak is, with no luck yet. Below is the code; any hint will be helpful.
function refreshSitesBy5Min(rawData, callback){
  var sites = JSON.parse(rawData).data;
  if (typeof sites !== 'undefined' && sites.length > 0){
    log.info('refreshing sites 5min');
    sites.forEach(function(elem, index, array){
      db.site_5min.find({siteid: elem.id, ts : moment(elem.ts).format('YYYY-MM-DDThh:mm:ss')}, function(err, found){
        if (typeof found === 'undefined' || found == null || found.length == 0){
          db.site_5min.save({
            siteid : elem.id,
            gran : '5min',
            ts : moment(elem.ts).format('YYYY-MM-DDThh:mm:ss'),
            wh_sum : elem.Wh_sum
          }, function(err, inserted){
            if (err){
              log.error(err);
            }
          });
        }
        else{
          db.site_5min.save({
            id: found.id,
            siteid : elem.id,
            gran : '5min',
            ts : moment(elem.ts).format('YYYY-MM-DDThh:mm:ss'),
            wh_sum : elem.Wh_sum
          }, function(err, updated){
            if (err){
              log.error(err);
            }
          })
        }
      })
    })
  }
  else{
    log.warn('no sites data');
  }
  callback();
}
And this is the code that calls the previous method:
function refreshSiteByGran(globalToken, gran, frequency){
  log.info('refreshing site for ' + gran + ' table');
  // db.site.find({}, function(err, sites){
  db.run("select * from site", function(err, sites){
    if (err){
      log.error(err);
    }
    if (sites){
      function handler(i){
        if (i < sites.length){
          var thePath = '/v3/sites/' + sites[i].siteid + '/data?fields=Wh_sum&tz=US/Pacific&gran=' + gran;
          var end = moment().subtract(1, 'days').format('YYYY-MM-DDThh:mm:ss');
          var start;
          if (gran === '5min' || gran === 'hourly'){
            start = moment(end).subtract(frequency, 'days').format('YYYY-MM-DDThh:mm:ss');
          }
          else if (gran === 'daily'){
            start = moment(end).subtract(frequency, 'days').format('YYYY-MM-DDThh:mm:ss');
          }
          else if (gran === 'monthly'){
            start = moment(end).subtract(frequency, 'months').format('YYYY-MM-DDThh:mm:ss');
          }
          thePath = thePath + '&start=' + start + '&end=' + end;
          log.warn('thePath: ' + thePath);
          var options = locusUtil.setOptions(thePath, globalToken.token.access_token);
          request(options, function(err, result, body){
            if (err){
              log.error(err + ' path: ' + thePath);
            }
            if (body && JSON.parse(body).statusCode == 401){
              getLocusToken(function(){
                setTimeout(function(){
                  handler(i);
                }, 2000);
              })
            }
            else if (body && JSON.parse(body).statusCode == 200){
              var data = JSON.parse(body).data;
              // log.info('any data? ' + JSON.stringify(body, null, 4));
              if (typeof data !== 'undefined' && data.length > 0){
                if (gran === '5min'){
                  refreshSitesBy5Min(body, function(){
                    log.info('inserted: ' + data[0].id);
                    setTimeout(function(){
                      handler(i+1);
                    }, 2000);
                  })
                }
                if (gran === 'hourly'){
                  refreshSitesByHourly(body, function(){
                    log.info('inserted: ' + data[0].id);
                    setTimeout(function(){
                      handler(i+1);
                    }, 2000);
                  })
                }
                if (gran === 'daily'){
                  refreshSitesByDaily(body, function(){
                    log.info('inserted: ' + data[0].id);
                    setTimeout(function(){
                      handler(i+1);
                    }, 2000);
                  })
                }
                if (gran === 'monthly'){
                  refreshSitesByMonthly(body, function(){
                    log.info('inserted: ' + data[0].id);
                    setTimeout(function(){
                      handler(i+1);
                    }, 2000);
                  })
                }
              }
              else{
                setTimeout(function(){
                  handler(i+1);
                }, 2000);
              }
            }
            // re-try for concurrency error
            else if (body && JSON.parse(body).statusCode == 429){
              log.warn('error body ' + JSON.stringify(body));
              setTimeout(function(){
                handler(i);
              }, 2000);
            }
            // if any other error, just skip
            else {
              setTimeout(function(){
                handler(i+1);
              }, 2000);
            }
          })
        }
        else{
          return;
        }
      }
      handler(0);
    }
  });
}
I believe the problem is inside these two blocks. I used memwatch to monitor V8 garbage collection, and I see usage_trend increasing fast, so there must be a leak.
This is very simple to solve...
First, get rid of the forEach loop, located here...
sites.forEach(function(elem, index, array){
Instead, create a recursive function that simply passes an index to the following iteration. What this does is create a loop that executes correctly in accordance with the CPU and memory allotted. No need for process.nextTick() or any of that fancy jazz.
Asynchronous loops are not technically the answer, as they overload systems rather quickly with thousands of queues. Instead, iterate through each record, then only proceed to the next when the current process is finished.
Also, delete the current array index before proceeding to the next.
Eventually, the end of the loop is reached when the index returns "undefined". That is when the callback to the main function refreshSitesBy5Min is invoked.
function refreshSitesBy5Min(rawData, callback) {
    var sites = JSON.parse(rawData).data
    getSite(0)
    function getSite(index) {
        // we have reached the end
        if (!sites[index])
            return callback()
        runProcess(sites[index])
        // clear up memory after every iteration
        delete sites[index]
        // done with iteration, move on to the next
        getSite(++index)
    }
}
Still not done yet...
Big JSON Object
If your JSON object is massive, you will need to stream your JSON and handle tiny chunks at a time.
https://github.com/uhop/stream-json
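A minimal sketch of what that looks like with stream-json's StreamArray helper (module paths as documented in the library's README; adapt the source stream and record handling to your own data):
const fs = require('fs');
const { parser } = require('stream-json');
const { streamArray } = require('stream-json/streamers/StreamArray');

fs.createReadStream('big.json')          // or any readable stream of JSON
    .pipe(parser())
    .pipe(streamArray())                 // emits one { key, value } per array element
    .on('data', ({ value }) => {
        // handle a single record here instead of holding the whole array in memory
    })
    .on('end', () => console.log('done streaming'));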
Big Database Result Set
Your SQL query should use LIMIT if you are returning more than 500 records at a time; even smaller is better. So if the record set being returned is 100,000 rows, just grab 500 at a time in a recursive function: simply increment the index and multiply it by the number of records per page, in this scenario 500.
var offset = iter * 500
limit: [offset, 500]
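Sketched out, the paged query could look something like this; it assumes the db.run wrapper from the question accepts a SQL string and an (err, rows) callback, so adjust the call to whatever your driver actually provides:
function fetchSites(iter, onBatch, onDone) {
    var offset = iter * 500;
    db.run("select * from site limit 500 offset " + offset, function (err, rows) {
        if (err) return onDone(err);
        if (!rows || rows.length === 0) return onDone();    // no more rows, we are finished
        onBatch(rows, function () {
            fetchSites(iter + 1, onBatch, onDone);          // recurse into the next page of 500
        });
    });
}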

Exit Node Process After Successful fs.appendFile

I'm having trouble creating processes in parallel with Node and exiting when they're done, for a simple HTTP GET request. I've noticed that if I fire process.exit() inside a callback for appendFile, some files will not be created or appended in a Node cluster setup. Ideally, the way below is how I would like to fire events, since the process exits as soon as the job is done:
var rp = require("request-promise"),
    config = require("./config"),
    cluster = require("cluster"),
    os = require("os"),
    fs = require("fs");
var keywordArray = [
    'keyword1',
    'keyword2',
    ...
];
if (cluster.isMaster) {
    var numCPUs = os.cpus().length;
    var clusterDivision = Math.ceil(keywordArray.length/numCPUs);
    // Reset the json if previously set
    keywordArray.forEach(function(arrayItem) {
        fs.unlink(config.dataDirectory + arrayItem + '.json', function(err) {
            if (err) console.error(err);
            console.log('successfully unlinked ' + arrayItem + '.json from ' + config.dataDirectory);
        });
    });
    // Create a worker for each CPU
    // Seperate the array out evenly for each worker
    for (var j=1;j<=numCPUs;j++) {
        var tempArray = [];
        var removed = keywordArray.splice(0, clusterDivision);
        if (removed.length > 0) {
            // The array contains something so let's do something with the keyword
            console.log('creating a worker');
            cluster.fork().send(removed);
        } else {
            // We don't need a cluster here
        }
    }
    process.on('exit', function() {
        console.log('exited');
    });
} else if (cluster.isWorker) {
    // Code to run if we're in a worker process
    // Send the object we created above from variables so they're available to the workers
    process.on('message', function(seperatedArrayItem) {
        seperatedArrayItem.forEach(function(arrayItem) {
            function radarRequest(err, response, body) {
                var responseBody = JSON.parse(body);
                console.log(arrayItem);
                fs.appendFileSync(config.dataDirectory + arrayItem + '.json', JSON.stringify(responseBody.results, null, '\t'), function (err) {
                    if (err) console.error(err);
                    console.log('success writing file');
                });
            }
            rp({
                url: config.radarSearchURI +
                    '?key='+ config.apiKey +
                    '&location=' + config.latitude + ',' + config.longitude +
                    '&radius=' + config.searchRadius +
                    '&keyword=' + arrayItem, headers: config.headers
            }, radarRequest);
        });
        setTimeout(function() {
            process.exit(0);
        }, 5000);
    });
}
The only way I can make sure all files are properly appended is by using a Timeout, which is exactly what I don't want to - and shouldn't - do. Is there another way I can ensure an appendFile has happened successfully and then kill the node process? Here's a way that works (assuming the process doesn't take longer than 5 seconds):
process.on('message', function(seperatedArrayItem) {
    seperatedArrayItem.forEach(function(arrayItem) {
        function radarRequest(err, response, body) {
            var responseBody = JSON.parse(body);
            console.log(arrayItem);
            fs.appendFile(config.dataDirectory + arrayItem + '.json', JSON.stringify(responseBody.results, null, '\t'), function (err) {
                if (err) console.error(err)
                console.log('success writing file');
            });
        }
        rp({
            url: config.radarSearchURI +
                '?key='+ config.apiKey +
                '&location=' + config.latitude + ',' + config.longitude +
                '&radius=' + config.searchRadius +
                '&keyword=' + arrayItem, headers: config.headers
        }, radarRequest);
    });
    setTimeout(function() {
        process.exit(0);
    }, 5000);
});
You can use an async flow control module like async to kill the process after all files are written. I'd also recommend cluster.worker.disconnect() so that the node process will simply exit gracefully, but that isn't a requirement.
async.forEach(seperatedArrayItem, function(item, done){
    // append file and call 'done' when it is written.
}, function(){
    // Will be called when all item 'done' functions have been called.
    cluster.worker.disconnect();
});
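Filled in, the worker's message handler could look roughly like this. It is a sketch that reuses rp, config, fs and cluster from the question and assumes var async = require('async') has been added; error handling is kept minimal:
process.on('message', function (seperatedArrayItem) {
    async.forEach(seperatedArrayItem, function (arrayItem, done) {
        rp({
            url: config.radarSearchURI +
                '?key=' + config.apiKey +
                '&location=' + config.latitude + ',' + config.longitude +
                '&radius=' + config.searchRadius +
                '&keyword=' + arrayItem,
            headers: config.headers
        }, function (err, response, body) {
            if (err) return done(err);
            var results = JSON.parse(body).results;
            fs.appendFile(config.dataDirectory + arrayItem + '.json',
                JSON.stringify(results, null, '\t'),
                done);                      // done fires only after the append completes
        });
    }, function (err) {
        if (err) console.error(err);
        cluster.worker.disconnect();        // exit gracefully once every file is written
    });
});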
Node's fs.appendFile( ... ) is an asynchronous function, so it expects us to pass a callback so that we know when it has finished its main operation, can be informed of any error that occurred, and so on.
This means we need to call Node's process.exit( ... ) within the scope of the provided callback. I've written this code to test:
'use strict';
var fs = require('fs');
function jsonValue(obj) {
    return JSON.stringify(obj, null, '\t');
}
fs.appendFile('file.json', jsonValue(['t', 'e', 's', 't']), function(error) {
    if (error) {
        throw error;
    }
    console.log('success writing file'); // no error, so log...
    process.exit(); // and exit right now
    console.log('exited?'); // this will not be printed
});
Well, it worked as defined.
Another way that works is to use the synchronous version of fs.appendFile( ... ) and call process.exit() sequentially:
fs.appendFileSync('file.json', jsonValue(['t', 'e', 's', 't']));
console.log('success writing file'); // no error (I hope so =), so log...
process.exit(); // and exit right now
console.log('exited?'); // this will not be printed
This is clean code and works, but you lose the robustness and convenience gained with the callback...
