I am writing an app using node.js which sends a request to github and fetches the html page of github project issues. when I send more request for 40th page I am getting 429 response for request. how can i overcome this RATE limit of github?
function requestPage(pageNo){
var changedUrl = url+"?page="+pageNo+"&q=is%3Aissue+is%3Aopen"; //URL for requesting all the pages individually
request(changedUrl, function(error, response, html){ //requesting thee web page
if(error){
return error;
}
else{
var $ = cheerio.load(html);
if(pageNo == 40){
console.log(response.statusCode);
fs.writeFile("page.html", html ,'utf8',function(err){
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
}
//functions
}
});
}
for (var i = 1; i <= noOfPages; i++) {
requestPage(i);
}
using the right tool for the right job
First, please note that you should better use github API instead of requesting the HTML pages. It should make your work easier, as you'll directly get the data you are interested in in JSON format, without having to parse HTML.
Issues API endpoints are documented here.
You also should check whether github allows you to crawl their page, and make sure you don't overload their servers with your requests.
fixing your code
However, in case you want to continue the way you're doing, you arrange your code this way:
handle rate limiting
Inside request function callback, you should add a condition to check the response from github:
request(changedUrl, function(error, response, html){
if(error){
return error;
}
else{
if (response.headers.status === '429 Too Many Requests') {
setTimeout(function() {requestPage(pageNo)}, 54000000}) // retry in some delay (find out the appropriate timeout value)
}
else {
... //continue the processing
behaving well
Another change required in your code may avoid you from being rate-limited. It should also spare your CPU and github servers' too.
This is bad:
for (var i = 1; i <= noOfPages; i++) {
requestPage(i);
}
Reason: your are sending noOfPages requests to github quasi-simultaneously.
How to fix: use a recursive function instead of a for loop, and set a delay between the calls.
Fixed code:
function requestPage(pageNo){
var changedUrl = url+"?page="+pageNo+"&q=is%3Aissue+is%3Aopen"; //URL for requesting all the pages individually
request(changedUrl, function(error, response, html){ //requesting thee web page
if(error){
return error;
}
else{
if (response.headers.status === '429 Too Many Requests') {
// retry to request the same page after some delay
setTimeout(function() {requestPage(pageNo)}, 54000000})
}
else {
var $ = cheerio.load(html);
if(pageNo == 40){
console.log(response.statusCode);
fs.writeFile("page.html", html ,'utf8',function(err){
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
if (i < noOfPages) {
// request next page after a small delay
setTimeout(function() {requestPage(pageNo+1)}, 15000})
}
}
}
});
}
// request the first page immediately
requestPage(1);
I leave it up to you to cleanup the hard-coded if(pageNo == 40){, which probably should be if(pageNo == noOfPages){ and could be arranged with the following if clause.
In ES7 with babel, just one page at a time:
import req from 'request-promise';
async function getPages(urls) {
for (let url of urls) {
let html = await req(url);
console.log(html);
}
Related
I'm trying to write a function to use callbacks to send a message in Facebook messenger. I need to do this because I'm having problems sending text from an array. The messages are sent, but not in the correct order. I THINK this is because Nodejs is looping over the elements faster than it can send the text. See my question about this here.
So now I am trying to rewrite my send functions using callbacks, in the vain hope that I can somehow FORCE NodeJS to actually WAIT before jumping to the next element!
So far I have the following code:
Main send function:
sendWithCallback: function(messageData, callback) {
request({
uri: 'https://graph.facebook.com/v2.6/me/messages',
qs: {
access_token: config.FB_PAGE_TOKEN
},
method: 'POST',
json: messageData
}, function (error, response, body) {
if (!error && response.statusCode === 200) {
let recipientId = body.recipient_id;
console.log("Message sent to recipient '%s'", recipientId);
callback(true);
} else {
console.error("Could not send message: ", response.statusCode, response.statusMessage, body.error)
callback(false);
}
}
);
},
Function for sending a "multi part" message (i.e. an Array of text):
sendMultipartMessage: function(recipientId, textArray) {
let messageData, msgPart, msgLength, count = 0;
msgLength = textArray.length;
while (count < msgLength) {
msgPart = textArray[count];
messageData = {
recipient: {
id: recipientId
},
message: {
text: msgPart
}
};
}
self.sendWithCallback(messageData, function(sent) {
if (sent) {
count++;
console.log("Message part %s sent.", msgPart);
}
else {
console.log("Couldn't send message");
}
});
},
In my head, this code works properly! It sends the text (taken from the array), then increments the count until it is equal to messageLength. But in reality it DOESN'T do that. Instead, it just goes into an infinite loop (which I can't see happening in my logs) and then crashes the app.
WHAT am I doing wrong?
If we simplify your loop it essentially becomes this:
let count = 0;
while (count < msgLength) {
messageData = data;
}
You never increment count.
I think that you intend to move the self.sendWithCallback call inside of the while loop. However, this still won't do what you want and will run forever. Even if it did do what you wanted, it wouldn't solve the problem of sending messages out in order.
JavaScript's concurrency model uses an event loop with "run-to-completion." You can pass messages to the event queue using something like request which you call by sendWithCallback. This only adds a message to the queue, but that message is not processed until the current running block completes. That means that your while loop actually has to complete before any of your requests start running. We can construct a simpler example with setTimeout:
let count = 0;
while (count < 1) {
setTimeout(() => {
count++;
}, 1000);
}
console.log('while loop completed');
In the above the while loop never completes because count never gets incremented in the same block (console.log will never be called). It needs to complete before it can start processing the asynchronous messages you are creating with setTimeout.
You could actually just rewrite it like this:
textArray.forEach(msgPart => self.sendWithCallback(msgPart, sent => {
if (!sent) console.error('Could not send message');
});
However this doesn't guarantee the order that the messages were sent and it will send messages even if one of the messages triggers an error. If you want to send them in order you will have to recursively call sendWithCallback within the callback for the next message once the previous one completes. That might look something like this:
let count = 0;
const sendMessage = (textArray, count) => {
self.sendMessageWithCallback(textArray[count], sent => {
count++;
if (sent && count < textArray.length) {
sendMessages(textArray, count);
}
});
}
sendMessages(textArray, 0);
If you were using promises and async/await you could write this much more simply as something like:
for (count = 0; count < msgLength; count++) {
await self.sendMessageAsync(textArray[count]);
}
However this would require a larger rewrite of the surrounding code and using something like request-promise instead of just request.
This is some of my code that I have in my index.js. Its waiting for the person to visit url.com/proxy and then it loads up my proxy page, which is really just a form which sends back an email and a code. From my MongoDB database, I grab the users order using the code, which contains some information I need (like product and the message they're trying to get). For some reason, it seems like its responding before it gets this information and then holds onto it for the next time the form is submitted.
The newline in my res.send(product + '\n' + message) isnt working either, but thats not a big deal right now.
But.. for example, the first time I fill out the form ill get a blank response. The second time, I'll get the response to whatever I filled in for the first form, and then the third time ill get the second response. I'm fairly new to Web Development, and feel like I'm doing something obviously wrong but can't seem to figure it out. Any help would be appreciated, thank you.
app.get('/proxy', function(req,res){
res.sendFile(__dirname+ "/views/proxy.html");
});
var message = "";
var product = "";
app.post('/getMessage', function(req,res)
{
returnMsg(req.body.user.code, req.body.user.email);
//res.setHeader('Content-Type', 'text/plain');
res.send(product + "\n" + message);
});
function returnMsg(code, email){
MongoClient.connect(url, function(err, db){
var cursor = db.collection('Orders').find( { "order_id" : Number(code) })
cursor.each(function(err, doc){
assert.equal(err, null);
if (doc!= null)
{
message = doc["message"];
product = doc["product"];
}
else {
console.log("wtf");
// error code here
}
});
console.log(email + " + " + message);
var document = {
"Email" : email,
"Message" : message
}
db.collection("Users").insertOne(document);
db.close();
});
}
You need to do lots of reading about your asynchronous programming works in node.js. There are significant design problems with this code:
You are using module level variables instead of request-level variables.
You are not correctly handling asynchronous responses.
All of this makes a server that simply does not work correctly. You've found one of the problems already. Your async response finishes AFTER you send your response so you end up sending the previously saved response not the current one. In addition, if multiple users are using your server, their responses will tromp on each other.
The core design principle here is first that you need to learn how to program with asynchronous operations. Any function that uses an asynchronous respons and wants to return that value back to the caller needs to accept a callback and deliver the async value via the callback or return a promise and return the value via a resolved promise. The caller then needs to use that callback or promise to fetch the async value when it is available and only send the response then.
In addition, all data associated with a request needs to stay "inside" the request handle or the request object - not in any module level or global variables. That keeps the request from one user from interfering with the requests from another user.
To understand how to return a value from a function with an asynchronous operation in it, see How do I return the response from an asynchronous call?.
What ends up happening in your code is this sequence of events:
Incoming request for /getMessage
You call returnMsg()
returnMsg initiates a connection to the database and then returns
Your request handler calls res.send() with whatever was previously in the message and product variables.
Then, sometime later, the database connect finishes and you call db.collection().find() and then iterate the cursor.
6/ Some time later, the cursor iteration has the first result which you put into your message and product variables (where those values sit until the next request comes in).
In working out how your code should actually work, there are some things about your logic that are unclear. You are assigning message and product inside of cursor.each(). Since cursor.each() is a loop that can run many iterations, which value of message and product do you actually want to use in the res.send()?
Assuming you want the last message and product value from your cursor.each() loop, you could do this:
app.post('/getMessage', function(req, res) {
returnMsg(req.body.user.code, req.body.user.email, function(err, message, product) {
if (err) {
// send some meaningful error response
res.status(500).end();
} else {
res.send(product + "\n" + message);
}
});
});
function returnMsg(code, email, callback) {
let callbackCalled = false;
MongoClient.connect(url, function(err, db) {
if (err) {
return callback(err);
}
var cursor = db.collection('Orders').find({
"order_id": Number(code)
});
var message = "";
var product = "";
cursor.each(function(err, doc) {
if (err) {
if (!callbackCalled) {
callback(err);
callbackCalled = true;
}
} else {
if (doc != null) {
message = doc["message"];
product = doc["product"];
} else {
console.log("wtf");
// error code here
}
}
});
if (message) {
console.log(email + " + " + message);
var document = {
"Email": email,
"Message": message
}
db.collection("Users").insertOne(document);
}
db.close();
if (!callbackCalled) {
callback(null, message, product);
}
});
}
Personally, I would use promises and use the promise interface in your database rather than callbacks.
This code is still just conceptual because it has other issues you need to deal with such as:
Proper error handling is still largely unfinished.
You aren't actually waiting for things like the insert.One() to finish before proceeding.
I'm going to be honest. I'm way in over my head here.
I need to scrape data from a dynamic site for my employer. Before the data is visible on the page, there are some clicks and waits necessary. Simple PHP scraping won't do. So I found out about this NodeJS + PhantomJS combo. Quite a pain to set up, but I did manage to load a site, run some code and get a result.
I wrote a piece of jQuery which uses timeout loops to wait for some data to be loaded. Eventually I get a js object that I want to write to a file (JSON).
The issue I'm facing.
I build up the the js object inside the PhantomJS .evaluate scope, which runs in a headerless browser, so not directly in my Node.JS server scope. How do I send the variable I built up inside evaluate back to my server so I can write it to my file?
Some example code (I know it's ugly, but it's for illustrative purposes). I use node-phantom-simple as a bridge between Phantom and Node
var phantom = require('node-phantom-simple'),
fs = require('fs'),
webPage = 'https://www.imagemedia.com/printing/business-card-printing/'
phantom.create(function(err, ph) {
return ph.createPage(function(err, page) {
return page.open(webPage, function(err, status) {
page.onConsoleMessage = function(msg) {
console.log(msg);
};
console.log("opened site? ", status);
page.evaluate(function() {
setTimeout(function() {
$('.price-select-cnt').eq(0).find('select').val('1266').change()
timeOutLoop()
function timeOutLoop() {
console.log('looping')
setTimeout(function() {
if ($('#ajax_price_tool div').length != 6) {
timeOutLoop()
} else {
$('.price-select-cnt').eq(1).find('select').val('25')
$('.price-select-cnt').eq(2).find('select').val('Premium Card Stock')
$('.price-select-cnt').eq(3).find('select').val('Standard').change()
timeOutLoop2()
}
}, 100)
}
function timeOutLoop2() {
console.log('looping2')
setTimeout(function() {
if ($('.pricing-cost-cnt').text() == '$0' || $('.pricing-cost-cnt').text() == '') {
timeOutLoop2()
} else {
var price = $('.pricing-cost-cnt').text()
console.log(price)
}
}, 100)
}
}, 4000)
});
});
});
});
function writeJSON(plsWrite) {
var key = 'file'
fs.writeFile('./results/' + key + '.json', plsWrite, 'utf8', function() {
console.log('The JSON file is saved as');
console.log('results/' + key + '.json');
});
}
So do do I write the price this code takes from the website, get it out of the evaluate scope and write it to a file?
I have an Express app which requires very low response rate ~<200ms. Right now we can only get this number but that's a separate topic.
We're planning to fetch a piece of data from the database, if found in Redis return the data if not then fire the request and save that to redis so the next requests can get it from Redis.
I'm running some testing and was wondering if there's a way to reduce the amount of database fetching requests?
For example, currently our application has 300req/s per box. We have six boxes running on AWS. If for the first time that piece of data is not available in Redis, there might be around ~500 requests trying to fetch the data from DB and cache that in Redis. We're trying to reduce that number down. Not sure if there's a way in Node.js or Redis to handle that.
Here's the code that I'm testing.
client.getAsync('key').then(function (data) {
if(data) {
console.log(data); // Return this data if found
res.send(data);
} else {
// I'm trying to reduce the number of calls for concurrent requests in this block.
console.log('not found');
var dataFromDb = // fetch data from DB
client.set('key', dataFromDb); // Fire and forget
res.send('not found'); // Return not found right away
}
});
And I test the call by using ab
ab -n 20 -c 10 http://localhost:8081/redis
This is the results I got
not found
not found
not found
not found
not found
not found
something
not found
something
something
something
something
something
something
something
something
something
something
In this example, there's 7 requests trying to fetch database with the same data and save to Redis.
My question is, is there anyway I can reduce the number of requests down? Because fetching DB is quite slow as of now ~900ms (We're trying to optimize that)
Yes there is. I did same thing. I will describe only logic here. Method to fetchCache should return a promise. Also you keep array of { cacheKey, promise }. Each time you send a request - you add key to this array. When next time you need to fetch cache - you check array first and if key there grabbing this promise. Else calling fetchCache.
Here is my code. It works, but probably hard to read. Should give you a basic understanding.
class DictTranslatableRepo {
constructor(model) {
var self = this;
self.title = model + "s Repo";
self.model = models[model];
self.running = {};
self.curItems = {};
}
*start() {
var self = this;
var curItems = yield self.model.findAll();
_.forEach(curItems, function(row) {
self.curItems[row.key] = row.value;
});
};
*map(from) {
var self = this;
if (from == "") return "";
if (!_.isUndefined(self.curItems[from])) return self.curItems[from];
if (_.isUndefined(self.running[from])) {
self.running[from] = [];
return new Promise(function(resolve, reject) {
self.running[from].push(resolve);
self.job(from, function(err, to) { // Main job
var callbackArr = self.running[from];
delete self.running[from];
_.forEach(callbackArr, function(callback) {
callback(to);
});
});
});
} else {
return new Promise(function(resolve, reject) {
self.running[from].push(resolve);
});
}
};
job(from, callback) {
var self = this;
var to = "as shown";
co(function*() {
try {
to = yield translator.translate(from);
yield self.model.add({key: from, value: to});
self.curItems[from] = to;
callback(null, to);
} catch (err) {
callback(err);
//logger.error("Cant translate entity: " + from);
}
}).catch(function(err) {
// Unhandled Error
callback(new Error(err));
});
};
}
My map method is your fetchCache method.
I'm sorry if this is a basic question, but I am trying to implement a program in node.js that should wait for the value of a variable available trough a request to a cloud api (photon.variable()) to be 1. This variable should not be requested more than once per second. My first attempt is included in the sample code below. Despite knowing it does not work at all, I think it could be useful to show the functionality I would like to implement.
var photondata = 0;
while (photondata < 1)
{
setTimeout(function () {
photon.variable("witok", function(err, data) {
if (!err) {
console.log("data: ", data.result);
photondata = data.result;
}
else console.log(err);
})}, 1000);
}
Since you couldn't do async stuff in loops before, the traditional approach would be to create a function that adds itself to setTimeout for as long as needed, then calls some other function when it's done. You still need to do this in the browser if not using Babel.
These days, you can stop execution and wait for things to happen when using a generator function (which latest versions of Node now support). There are many libraries that will let you do this and I will advertise ours :)
CL.run(function* () {
var photondata = 0;
while (true) {
yield CL.try(function* () {
var data = yield photon.variable("witok", CL.cb());
console.log("data: ", data.result);
photondata = data.result;
}, function* (err) {
console.log(err.message);
});
if (photondata >= 1) break;
yield CL.sleep(1000);
}
// do whatever you need here
});