nodejs request running last - node.js

I am trying to figure out how to make the callback function in request run in order. Currently, my loop runs 10 times but does not wait for the callback function in request to finish before moving to the next iteration. My output is nothing like what I'd expect it to be and I'm not sure why certain things are being printed before others. Here is how my code is as of now:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var app = express();
var url;
for(var i=0; i < 10; i++ ){
url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
request(url, function(err, resp, body){
console.log("hello");
var $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
return;
}else{
console.log($('.error').text().substring(0, 14) );
var pfname = $('.pfname');
var plname = $('.plname');
var professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
console.log(url);
return;
}
});
}
Here is the output I am getting:
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
Michael Beeson
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
hello
Sami Khuri
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
hello
aaa aaa
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
Here is the proper output:
aaa aaa
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1
Sami Khuri
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=2
Michael Beeson
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=3
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found

There are multiple issues in your code, but the main issue is that you're running an async operation inside the for loop so your for loop will start all the async operations and then they will, one-by-one complete later. Any variables shared between the loop invocations will tromp one another.
So, in a nutshell, I did:
Removed all shared variables so each loop invocation has its own variables (no conflicts).
Switched over to request-promise so we can use Promise.all() to more easily tell us when they are all done.
Returned the value we want from each .then() handler so that will be collected by Promise.all() as the final values for each invocation of the loop.
Because there appears to be no reason to sequence your operations, I let them all run in a parallel (that's faster) and then let Promise.all() put the results in order for us in the final array of results.
Here's the code:
const express = require('express');
const path = require('path');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const app = express();
let promises = [];
for (let i = 0; i < 10; i++ ) {
let url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
promises.push(rp(url).then(function(body) {
console.log(url);
let $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
return null;
} else {
console.log($('.error').text().substring(0, 14) );
let pfname = $('.pfname');
let plname = $('.plname');
let professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
return professorName;
}
}));
}
// see when they are all done
Promise.all(promises).then(results => {
// array of results, some entries that were not found may be null
console.log(results);
}).catch(err => {
console.log(err);
});
If you want to sequence them one at a time so the second request doesn't start until the first one is done, that could be done like this using async/await:
const express = require('express');
const path = require('path');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const app = express();
async function run() {
let results = [];
for (let i = 0; i < 10; i++ ) {
let url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
try {
let body = await rp(url);
console.log("hello");
let $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
results.push(null);
} else {
console.log($('.error').text().substring(0, 14) );
let pfname = $('.pfname');
let plname = $('.plname');
let professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
console.log(url);
results.push(professorName);
}
} catch(e) {
console.log(url, e);
results.push(null);
}
}
return results;
}
run().then(results => {
console.log(results);
}).catch(err => {
console.log(err);
});
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.

Related

Can't get text from a div

I want to get the content of the div mw-content-text from some wikipedia page (this is just examples to learn node.js) I have made this:
var fetch = require('node-fetch');
var cheerio = require('cheerio');
var fs = require('fs');
var vv = [
'https://en.wikipedia.org/wiki/Ben_Silbermann',
'https://en.wikipedia.org/wiki/List_of_Internet_entrepreneurs'
];
var bo=[],
$;
vv.forEach((t)=>{
fetch(t)
.then(res => res.text())
.then((body) => {
$ = cheerio.load(body);
var finded = $('#mw-content-text').text();
bo.push(finded);
});
});
console.log(bo);
If I output body, it is filled with a string containing the whole html page (so, this step is ok),
If I output $ it contains a collection (but I'm not sure if it's populated, I use the node.js command prompt but it looks that it's not the right tool, any advice on that too?)
Anyway, variable bo returns me an empty array
The issue here is that we're logging bo before the fetch call is complete. I'd suggest using the async/await syntax to ensure we wait for all the gets to return, then we can log the result.
You could follow with some more processing like removing empty lines, whitespace etc, but that shouldn't be too hard.
var fetch = require('node-fetch');
var cheerio = require('cheerio');
var vv = [
'https://en.wikipedia.org/wiki/Ben_Silbermann',
'https://en.wikipedia.org/wiki/List_of_Internet_entrepreneurs'
];
async function getDivcontent() {
const promises = vv.map(async t => {
const body = await fetch(t).then(res => res.text());
const $ = cheerio.load(body);
return $('#mw-content-text').text();
});
return await Promise.all(promises);
}
async function test() {
let result = await getDivcontent();
console.log("Result:" + result);
}
test();

Firebase Database Get All Value In Order Cloud Functions

I develop for Firebase Cloud Functions. I have a Firebase Realtime Database like this:
----- myData
-------eqewrwrepere (this one is a device token)
---------Lta+sde-fer (this one is a firebase id)
firstvalue : "a"
secondvalue : "b"
----------Qrgd+ad-qdda (this one is second firebase id)
firstvalue : "c"
secondvalue : "d"
-------eqwerSAsdqe (this one is another device token)
---------Lta+sde-fer (this one is a firebase id)
firstvalue : "x"
secondvalue : "y"
----------Qrgd+ad-qdda (this one is second firebase id)
firstvalue : "z"
secondvalue : "t"
I fetch these data by this code. With this code i fetch all data and put them an array. And when fetching done, i loop this array for finding items. I am an iOS developer, so i am a newbie for NodeJS. Here is what i want to do:
Get firstvalue for each database data.
Make a api request with firstvalue of each database data.
Api returns an image.
Write image temp directory.
Process this image for visionApi.
Extract text.
Update database.
Send notification for deviceToken
Now i am able to retrieve database items in my array. When i make a request in for loop, request called async. So for loop continues, but request response or writing file and vision processing executed only once.
In for loop, get databasearray[0], make request, write file, process it with vision api, update database and go for next databasearray[1] item.
I read about Promises on different pages. But i did not understand.
Thank you.
'use strict';
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp(functions.config().firebase);
var request = require('request');
var fs = require('fs');
//var fs = require("fs");
// Get a reference to the Cloud Vision API component
const Vision = require('#google-cloud/vision');
const vision = new Vision.ImageAnnotatorClient();
// Imports the Google Cloud client library
//const {Storage} = require('#google-cloud/storage');
var fs = require("fs");
var os = require("os");
var databaseArray = [];
exports.hourly_job = functions.pubsub
.topic('hourly-job')
.onPublish((event) => {
console.log("Hourly Job");
var db = admin.database();
var ref = db.ref("myData")
ref.once("value").then(function(allData) {
allData.forEach(function(deviceToken) {
deviceToken.forEach(function(firebaseIDs) {
var deviceTokenVar = deviceToken.key;
var firebaseIDVar = firebaseIDs.key;
var firstvalue = firebaseIDs.child("firstvalue").val();
var secondvalue = firebaseIDs.child("secondvalue").val();
var items = [deviceTokenVar, firebaseIDVar, firstvalue, secondvalue];
databaseArray.push([...items]);
});
});
return databaseArray;
}).then(function(databasem) {
var i;
for (i = 0; i < databaseArray.length; i++) {
var databaseArrayDeviceToken = databaseArray[i][0];
console.log("DeviceToken: " + databaseArrayDeviceToken);
var databaseArrayFirebaseID = databaseArray[i][1];
console.log("FirebaseID: " + databaseArrayFirebaseID);
var databaseArrayfirstvalue = databaseArray[i][2];
console.log("firstval: " + databaseArrayfirstvalue);
var databaseArraysecondval = databaseArray[i][3];
console.log("Second: " + databaseArraysecondval);
var url = "http://api.blabla" + databaseArrayfirstvalue;
/////////////here make a request, pause loop, process returned image, but how //////////////////////
request.get({
url: url,
encoding: 'binary'
}, function(error, httpResponse, body) {
if (!error && httpResponse.statusCode == 200) {
fs.writeFileSync('/tmp/processed.jpg', body, 'binary')
console.log("file written");
})
}
});
return true;
});
I found solution with Mocas helps. Here is the solution. I use async/await functions in code. Now for loop waits for the function response. But now I have different problems. I think main async function hangs because of awaits. And then next hourly trigger, it runs again. So console log shows 15-16-17 or more ‘i’ values in for loop. I have 4 element in database array but console log shows more than this every hour. And it increases every time. So I guess that I should cancel this await functions after a timeout. But I don’t know how. Here is code:
use strict';
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp(functions.config().firebase);
var request = require('request-promise').defaults({ encoding: null });
var fs = require('fs');
// Get a reference to the Cloud Vision API component
const Vision = require('#google-cloud/vision');
const vision = new Vision.ImageAnnotatorClient();
var os = require("os");
var databaseArray = [];
var uniqueFilename = require('unique-filename')
exports.hourly_job = functions.pubsub
.topic('hourly-job')
.onPublish((event) => {
console.log("Hourly Job");
var db = admin.database();
var ref = db.ref("myData")
ref.once("value").then(function(allData) {
allData.forEach(function(deviceToken) {
deviceToken.forEach(function(firebaseIDs) {
var deviceTokenVar = deviceToken.key;
var firebaseIDVar = firebaseIDs.key;
var firstvalue = firebaseIDs.child("firstvalue").val();
var secondvalue = firebaseIDs.child("secondvalue").val();
var items = [deviceTokenVar, firebaseIDVar, firstvalue, secondvalue];
databaseArray.push([...items]);
//console.log(databaseArray);
//return true;
});
//return true;
});
return databaseArray;
}).then(function (databasem) {
main().catch(console.error);
});
return true;
});
const main = async () => {
var i;
for (i = 0; i < databaseArray.length; i++) {
console.log("Database Arrays " + i + ". elements: ");
var databaseArrayDeviceToken = databaseArray[i][0];
console.log("DeviceToken: " + databaseArrayDeviceToken);
var databaseArrayFirebaseID = databaseArray[i][1];
console.log("FirebaseID: " + databaseArrayFirebaseID);
var databaseArrayfirst = databaseArray[i][2];
console.log("first: " + databaseArrayfirst);
var databaseArraysecond = databaseArray[i][3];
console.log("second: " + databaseArraysecond);
if (databaseArrayfirst != "") {
var apiUrl = "http://api.blabla;
try {
const apiBody = await request.get(apiUrl);
///////////////////////////vison start//////////////////////
const visionResponseBody = await vision.documentTextDetection(apiBody)
var visionResponse = visionResponseBody[0].textAnnotations[0].description;
console.log("Vision response text " + visionResponse );
...some logic here about response...
/////////////////////////////////////////////////
var getdatabasevar = await admin.database().ref("myData/" + databaseArrayDeviceToken + "/" + databaseArrayFirebaseID);
await getdatabasevar.update({
"firstvalue": visionResponse
});
/////////////////////////////////////////////////
var getanotgerdatabasevar = await admin.database().ref("myData/" + databaseArrayDeviceToken + "/" + databaseArrayFirebaseID + "/" + "secondvalue");
await getanotgerdatabasevar.once("value")
.then(function(var) {
..some logic..
//send notification
});
} catch (error) {
console.error(error);
}
///////////////////////////vison end//////////////////////
}
};
return true;
};

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');
function rei(product) {
//Request page from rei.com and follow the redirect
return needle("get", "https://rei.com/product/" + product, {
follow_max: 5
}).then(function(response) {
var $ = cheerio.load(response.body);
var test = $("th:contains(Weight)").parent()[0];
console.log(test);
}).catch(function(error) {
console.log(error);
})
};
rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this:
var needle = require('needle');
var cheerio = require('cheerio');
var fs = require('fs');
function rei(product) {
//Request page from rei.com and follow the redirect
return needle("get", "https://rei.com/product/" + product, {
follow_max: 5
}).then(function(response) {
var $ = cheerio.load(response.body);
// your data in script
var content = $('script[data-client-store="product-details"]').html();
content = JSON.parse(content);
for (var spec of content.specs) {
if (spec.name == 'Weight') {
console.log(spec.values)
}
}
}).catch(function(error) {
console.log(error);
})
};
rei(893905);

scraping a page that redirects

i try to scrape a simple page (require cheerio and request):
https://www.ishares.com/uk/individual/en/products/251824/
The code fails. I believe it is because, in order to get to the above, users are prompted on previous page for "individual" or "institutional" so are being redirected.
I have tried different variations of the url, but all fail.
how can i get the raw HTML using node.js ?
here is the code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio'); // fast flexible implement of jQuery for server.
var fs = require('fs');
var app = express();
var port = 8000;
var timeLog = []; // for dl to measure the time of events.
// var startTime = Date.now();
timeLog[0] = Date.now();
console.log('program initiated at time: '+new Date());
// example 1: pull the webpage and print to console
var url ="https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf";
url = "https://www.ishares.com/uk/individual/en/products/251824/";
url="https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";
request(url,function functionName(err,resp,body) {
var $ = cheerio.load(body);
var distYield = $('.col-distYield');
var distYieldText = distYield.text();
console.log('we got to line 24');
console.log(distYieldText);
timeLog[2] = Date.now();
console.log('data capture time: '+(timeLog[2] - timeLog[0])/1000+' seconds');
if (err) {
console.log(err);
}else {
//console.log(body);
console.log('the body was written: success');
}
});
// example 2: download webpage and save file
var destination = fs.createWriteStream('./downloads/iSharesSEMB.html');
request(url)
.pipe(destination);
// example 3:
var destination = fs.createWriteStream('./downloads/iSharesSEMB2.html');
request(url)
.pipe(destination)
.on("finish",function () {
console.log('done');
})
.on('error',function (err) {
console.log(err);
});
timeLog[1] = Date.now();
console.log('program completed at time: '+new Date());
console.log('Asynchronous program run time: '+(timeLog[1] - timeLog[0])/1000+' seconds');
Alright, I got it to work. I enabled cookie support for request but then got into a redirect loop. Adding a promise worked it out. Here's only the relevant HTML request part:
const request = require('request'),
cheerio = require('cheerio');
const url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";
options = {
jar: true
}
const getDistYield = url => {
return new Promise((resolve, reject) => {
request(url, options, function(err,resp,body) {
if (err) reject(err);
let $ = cheerio.load(body);
resolve($('.col-distYield'));
})
})
}
getDistYield(url)
.then((tag) => {
console.log(tag.text())
}).catch((e) => {
console.error(e)
})
Outputs:
Distribution Yield
The distribution yield represents the ratio of distributed income over the last 12 months to the fund’s current NAV.
as of 20-Feb-2018
4.82
Also, notice I've used the last URL you provided.
I hope this works it out for you :)
have amended the resolve part to just get the value (and not the text) which is a nested class.
resolve($('.col-distYield > span:nth-child(2)'));

NodeJs + Request-promise - error catching

I'm having trouble with error handling with my function in my bot for Discord. What I've got right now is a command that scraps information from a website, I want to make it so if there is an error (404), the user will get some feedback. How would I go about doing this? Right now I currently have something, but I'm not sure what I'm doing wrong. Here is a snippet of code:
//modules used
const rp = require('request-promise-native');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');
if (message.content.startsWith(prefix + 'latest')) {
//website url variables
const website_domain = "https://hypebeast.com/";
let website_path = args[0];
let website_url = website_domain + website_path;
//extra arguments variable
let extra_arg = args.slice(1).join(" ");
if (extra_arg.length > 0) {
message.reply('too many arguments! Please refer to `h.help` for correct usage.');
} else {
//opening url and scrapping elements
function scrapData(website_url) {
return rp(website_url)
.then(body => {
let items = [],
$ = cheerio.load(body).catch(errors.StatusCodeError, function (reason) {
console.log(reason);
});
//web scrapping here
$('.post-box').each(function() {
let title = $(this).find($('.title h2 span')).first().text(),
caption = $(this).find($('.post-box-excerpt p')).first().text(),
article_url = $(this).find($('.col-hb-post-image a')).first().attr('href'),
thumbnail_long = $(this).find($('.thumbnail img')).first().attr('src');
//adding title, caption, etc to list
items.push({title, caption, article_url, thumbnail_long});
//check items in console
console.log(items);
})
return items;
})
}
I have just modified your code little try this below code.
//modules used
const rp = require('request-promise-native');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');
if (message.content.startsWith(prefix + 'latest')) {
//website url variables
const website_domain = "https://hypebeast.com/";
let website_path = args[0];
let website_url = website_domain + website_path;
//extra arguments variable
let extra_arg = args.slice(1).join(" ");
if (extra_arg.length > 0) {
message.reply('too many arguments! Please refer to `h.help` for correct usage.');
} else {
var options = {
uri: website_url,
transform: function (body) {
return cheerio.load(body);
}
};
rp(options)
.then(function ($) {
// Process html like you would with jQuery...
$('.post-box').each(function() {
let title = $(this).find($('.title h2 span')).first().text(),
caption = $(this).find($('.post-box-excerpt p')).first().text(),
article_url = $(this).find($('.col-hb-post-image a')).first().attr('href'),
thumbnail_long = $(this).find($('.thumbnail img')).first().attr('src');
//adding title, caption, etc to list
items.push({title, caption, article_url, thumbnail_long});
//check items in console
console.log(items);
});
})
.catch(function (err) {
console.log(err);
});
}

Resources