I've been trying to scrape this URL, unsuccessfully, to retrieve the "Date of Organization in Massachusetts." I suspect I might just be mislabeling the DOM, but I have already tried a series of ids and classes. Any suggestions? I'm using cheerio and request.
var url = 'http://corp.sec.state.ma.us/CorpWeb/CorpSearch/CorpSummary.aspx?FEIN=800829800&SEARCH_TYPE=1';
request(url, function(err, resp, body) {
  if (err) throw err;

  var $ = cheerio.load(body);
  var orgdate = $('#MainContent_tblOrg .p1 td #MainContent_lblOrganisationDate').text();
  console.log(orgdate);
});
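One thing worth trying (just a guess, since the page may also build this field via ASP.NET postbacks or client-side script): select the label's ID on its own. IDs are unique, so the extra ancestor selectors only add ways for the match to fail. If it still comes back empty, dump the surrounding markup to see what the server actually returned:

var request = require('request');
var cheerio = require('cheerio');

var url = 'http://corp.sec.state.ma.us/CorpWeb/CorpSearch/CorpSummary.aspx?FEIN=800829800&SEARCH_TYPE=1';

request(url, function (err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);

  // try the ID on its own first
  console.log($('#MainContent_lblOrganisationDate').text().trim());

  // if that prints nothing, inspect what the server actually sent back
  // console.log($('#MainContent_tblOrg').html());
});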
I'm able to get data through a single request, but here I'm trying to send multiple HTTP requests. I'm stuck: I can't get the data back, and I don't know how to pass it to the view page, i.e. the EJS template.
router.get('/specials', function(req, res, next){
  var callbackThree = function(error, resp, body) {
    var data = JSON.parse(body);
    res.render("specials", { data: data });
  };
  var callbackTwo = function(error, resp, body) {
    request("https://siteblabla.com/wsmenu/sub_menu_list/789/", callbackThree);
  };
  var callbackOne = function(error, resp, body) {
    request("https://siteblabla.com/wsspecials/specials_list/123/", callbackTwo);
  };
  // request("api.com/users", callbackOne);
});
You need to use Promises, and there is an npm package called ejs-promise that you can make use of in your case.
You can download it at the below URL,
https://www.npmjs.com/package/ejs-promise
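Independent of ejs-promise, here is a minimal sketch of the general idea, reusing the two URLs from your question (the helper and variable names are just placeholders): wrap request in a Promise and chain the calls before rendering.

var request = require('request');

// promisified GET that parses the JSON body (sketch)
function getJson(url) {
  return new Promise(function (resolve, reject) {
    request(url, function (err, resp, body) {
      if (err) return reject(err);
      resolve(JSON.parse(body));
    });
  });
}

router.get('/specials', function (req, res, next) {
  getJson("https://siteblabla.com/wsspecials/specials_list/123/")
    .then(function (specials) {
      return getJson("https://siteblabla.com/wsmenu/sub_menu_list/789/")
        .then(function (menu) {
          // both results are passed to the EJS template
          res.render("specials", { specials: specials, menu: menu });
        });
    })
    .catch(next);
});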
Hope this helps!
I am trying to get the API response time from a website with cheerio. I first have to wait for the site to fetch the time, but I am not sure how exactly to do that. Here is what I have tried. At the moment it does not work, because it doesn't wait for the website to fetch the time.
request.get("site", function (err, res, body) {
if (!err) {
var $ = cheerio.load(body);
$('.metrics-container').filter(function(){
var data = $(this);
var response_time_api = data.children().children().children()[1].children;
console.log(response_time_api)
});
}
});
Here is the image from the page source I am fetching:
There are several tutorials that describe how to scrape websites with request and cheerio. In these tutorials they send the output to the console or stream the DOM with fs into a file as seen in the example below.
request(link, function (err, resp, html) {
  if (err) return console.error(err);

  var $ = cheerio.load(html);
  var img = $('#img_wrapper').data('src');
  console.log(img);
}).pipe(fs.createWriteStream('img_link.txt'));
But what if I would like to process the output during script execution? How can I access the output or send it back to the calling function? I could, of course, load img_link.txt and get the information from there, but that would be too costly and doesn't make sense.
You can wrap request in a function that calls back with the html:
function getHtml(link, callback) { // the function name is arbitrary
  request(link, function (err, resp, body) {
    callback(err, body);
  });
}
Then assign it to exports and use it in any other module.
Remove the pipe altogether.
request(link, function (err, resp, html) {
  if (err) return console.error(err);

  var $ = cheerio.load(html);
  var img = $('#img_wrapper').data('src'); // img now holds the src attr of the image
  return img; // note: this only returns from the callback itself, not to the surrounding code
});
Update
From your comments, it seems like your request function is working as expected; the problem is rather how to access the data from another module.
I suggest you read this: Purpose of Node.js module.exports and how you use it.
This is also a good resource article describing how require and exports work.
Put the code above in a module
Export the function with module.exports
Require the module in another file
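For example, a minimal two-file sketch (the file names and the example URL are just placeholders):

// getHtml.js
var request = require('request');

module.exports = function getHtml(link, callback) {
  request(link, function (err, resp, body) {
    callback(err, body);
  });
};

// app.js (or any other module)
var cheerio = require('cheerio');
var getHtml = require('./getHtml');

getHtml('http://example.com/page', function (err, html) {
  if (err) return console.error(err);
  var $ = cheerio.load(html);
  console.log($('#img_wrapper').data('src'));
});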
I'm playing around with using Node.js as a custom front end for Drupal, and I'm trying to come up with a way to match the backend menu system, blocks, and views with the routing in Express.
Example route:
module.exports = {
  '/work': function(req, res){
    // get view json for this page
    request('http://site.api/casestudies', function(err, response, body){
      var views_body = JSON.parse(body);
      // get node id from alias
      request('http://site.api/alias-to-nid' + req.url, function(err, response, body){
        body = JSON.parse(body);
        var reqUrl = 'http://site.api/rest/api/' + body.path;
        request(reqUrl, function(err, response, body){
          body = JSON.parse(body);
          // get the data we need
          var node_title = body.title,
              node_body = body.body.und[0].safe_value,
              pageclass = 'not-front section-work';
          res.render('work', {title: node_title, class: pageclass, node_title: node_title, node_body: node_body, views_body: views_body});
        });
      });
    });
  }
}
So, I hit /work and grab the JSON for the casestudies view that should exist on that page, then I look up the node id from the /work alias using another request, and finally use the node id in yet another nested request call to grab the rest of the JSON for the page before sending it on to the template.
Now - I have a feeling that this is a terrible way to go about this. What should I be doing instead!?
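One possible direction, sketched here with the same endpoints from the question (the helper name and error handling are just placeholders): wrap request in a small Promise helper so the three dependent calls chain instead of nesting.

var request = require('request');

// promisified GET that parses the JSON body
function getJson(url) {
  return new Promise(function (resolve, reject) {
    request(url, function (err, response, body) {
      if (err) return reject(err);
      resolve(JSON.parse(body));
    });
  });
}

module.exports = {
  '/work': function (req, res) {
    var views_body;
    getJson('http://site.api/casestudies')
      .then(function (views) {
        views_body = views;
        // look up the node id from the alias
        return getJson('http://site.api/alias-to-nid' + req.url);
      })
      .then(function (alias) {
        // fetch the full node JSON
        return getJson('http://site.api/rest/api/' + alias.path);
      })
      .then(function (node) {
        res.render('work', {
          title: node.title,
          class: 'not-front section-work',
          node_title: node.title,
          node_body: node.body.und[0].safe_value,
          views_body: views_body
        });
      })
      .catch(function (err) {
        console.error(err);
        res.status(500).send('Error building page');
      });
  }
};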
I'm apparently a little newer to JavaScript than I'd care to admit. I'm trying to pull a webpage using Node.js and save the contents as a variable, so I can parse it however I feel like.
In Python, I would do this:
from bs4 import BeautifulSoup # for parsing
import urllib
text = urllib.urlopen("http://www.myawesomepage.com/").read()
parse_my_awesome_html(text)
How would I do this in Node?
I've gotten as far as:
var request = require("request");
request("http://www.myawesomepage.com/", function (error, response, body) {
/*
Something here that lets me access the text
outside of the closure
This doesn't work:
this.text = body;
*/
})
var request = require("request");

var parseMyAwesomeHtml = function(html) {
  //Have at it
};

request("http://www.myawesomepage.com/", function (error, response, body) {
  if (!error) {
    parseMyAwesomeHtml(body);
  } else {
    console.log(error);
  }
});
Edit: As Kishore noted, there are nice options for parsing available. Also see cheerio if you have Python/gyp issues with jsdom on Windows. Cheerio on GitHub
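For instance, a small sketch of parseMyAwesomeHtml using cheerio (the selectors are just examples):

var cheerio = require('cheerio');

var parseMyAwesomeHtml = function (html) {
  var $ = cheerio.load(html);
  // example: print the page title and every link href
  console.log($('title').text());
  $('a').each(function () {
    console.log($(this).attr('href'));
  });
};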
That request() call is asynchronous, so the response is only available inside the callback. You have to call your parse function from it:
function parse_my_awesome_html(text){
  ...
}

request("http://www.myawesomepage.com/", function (error, response, body) {
  parse_my_awesome_html(body);
});
Get used to chaining callbacks; that's essentially how any I/O happens in JavaScript :)
jsdom is pretty good for things like this, if you want to parse the response.
var request = require('request'),
    jsdom = require('jsdom');

request({ uri: 'http://www.myawesomepage.com/' }, function (error, response, body) {
  if (error || response.statusCode !== 200) {
    console.log('Error when contacting myawesomepage.com');
    return;
  }
  jsdom.env({
    html: body,
    scripts: [
      'http://code.jquery.com/jquery-1.5.min.js'
    ]
  }, function (err, window) {
    var $ = window.jQuery;
    // jQuery is now loaded on the jsdom window created from 'body'
    console.log($('body').html());
  });
});
Also, if your page has a lot of JavaScript/AJAX content being loaded, you might want to consider using PhantomJS.
Source: http://blog.nodejitsu.com/jsdom-jquery-in-5-lines-on-nodejs/