Cheerio scraping returning only two rows - node.js

So I tested my scraping on a static HTML file before adding it to my Node app.
The problem is that it's not returning all the rows.
On the site:
$('#sport tr').length
//Returns 13
In Cheerio:
$('#sport tr').length
//Returns 2
I'm stumped, here is the code I'm using. I've contained the URL as proof, so you can visit it yourself if you wish.
I'm suspecting it's something to do with var $ = cheerio.load(html); however I'm not experienced in Cheerio to say outright that's the problem.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function(req, res){
var url = 'http://www.olbg.com/football.php';
var json = [];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
console.log($('#sport tr').length);
var headers = [];
$('#sport tr th').each(function(i, th) {
var text = $(th).text();
if (text.trim() !== "") {
headers[i] = text.replace(/[\t\n\r\s]/mgi, '');
}
});
$('#sport tr').each(function(i, tr) {
// skip if header
if (!$(tr).is('th')) {
var temp = {};
temp["Event"] = $(tr).find('td').eq(0).text().trim();
temp["TopSelection"] = $(tr).find('td').eq(1).text().trim();
temp["BookieOdds"] = $(tr).find('td').eq(2).text().trim();
temp["OLBGRating"] = $(tr).find('td').eq(3).find('img').length;
if (temp["Event"] !== "" || temp["TopSelection"] !== ""){
json.push(temp);
}
}
});
}
// To write to the system we will use the built in 'fs' library.
// In this example we will pass 3 parameters to the writeFile function
// Parameter 1 : output.json - this is what the created filename will be called
// Parameter 2 : JSON.stringify(json, null, 4) - the data to write, here we do an extra step by calling JSON.stringify to make our JSON easier to read
// Parameter 3 : callback function - a callback function to let us know the status of our function
fs.writeFile('output.json', JSON.stringify(json), function(err){
console.log('File successfully written!');
})
// Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
res.send(json);
});
});
app.listen("8081");
console.log("Magic happens on port 8081");
exports = module.exports = app;

The reason that you're not getting the expected result is because the (table) html on that page is mangled. If you look at the second <td> in the second <tr> of the table#sport, you'll see an "extra" </td>. This causes the <td> that the table#sport is inside to close (and an implicit closing of table#sport) on some parsers because that is the closest open <td>. So that is why the parser reports only 2 <tr>s instead of 13. The other <tr>s you're expecting are now outside of table#sport.
Probably your best bet is to pass the html through an HTML tidying program/script (e.g. this one with the clean option enabled) first before passing it to cheerio. After that, your selector should return the elements you're probably expecting.

Related

Anystock not working with Anychart-NodeJS

I'm setting up a node.js server that renders static jpg/png images using Anychart.
It is possible for me to return the simple example pie charts in the examples but when I try to return the examples for AnyStock, I get some weird results.
The code should create and return a stock chart on the url: xx.xxx.xxx.xx:3000/insert.
Instead the code returns this chart without any graphs or candlesticks:
When I set the same graph up on a plain html site, I get following result:
The node.js code:
var fs = require('fs');
var express = require('express');
var app = express();
var path = require('path');
var router = express.Router();
app.get('/', function(req, res) {
var query = require('url').parse(req.url, true).query;
var stock_id = query.stock_id;
var type = query.type;
if (type == "insert") {
var JSDOM = require('jsdom').JSDOM;
var jsdom = new JSDOM('<head><script src="https://cdn.anychart.com/releases/8.9.0/js/anychart-core.min.js" type="text/javascript"></script><script src="https://cdn.anychart.com/releases/8.9.0/js/anychart-stock.min.js" type="text/javascript"></script></head><body><div id="container" style="width: 500px; height: 400px;"></div></body>', {
runScripts: 'dangerously'
});
var window = jsdom.window;
var anychart = require('anychart')(window);
var anychartExport = require('anychart-nodejs')(anychart);
var table, mapping, chart;
table = anychart.data.table();
table.addData([
['2015-12-24', 511.53, 514.98, 505.79, 506.40],
['2015-12-25', 512.53, 514.88, 505.69, 507.34],
['2015-12-26', 511.83, 514.98, 505.59, 506.23],
['2015-12-27', 511.22, 515.30, 505.49, 506.47],
['2015-12-28', 510.35, 515.72, 505.23, 505.80],
['2015-12-29', 510.53, 515.86, 505.38, 508.25],
['2015-12-30', 511.43, 515.98, 505.66, 507.45],
['2015-12-31', 511.50, 515.33, 505.99, 507.98],
['2016-01-01', 511.32, 514.29, 505.99, 506.37],
['2016-01-02', 511.70, 514.87, 506.18, 506.75],
['2016-01-03', 512.30, 514.78, 505.87, 508.67],
['2016-01-04', 512.50, 514.77, 505.83, 508.35],
['2016-01-05', 511.53, 516.18, 505.91, 509.42],
['2016-01-06', 511.13, 516.01, 506.00, 509.26],
['2016-01-07', 510.93, 516.07, 506.00, 510.99],
['2016-01-08', 510.88, 515.93, 505.22, 509.95],
['2016-01-09', 509.12, 515.97, 505.15, 510.12],
['2016-01-10', 508.53, 516.13, 505.66, 510.42],
['2016-01-11', 508.90, 516.24, 505.73, 510.40]
]);
mapping = table.mapAs();
mapping.addField('open', 1, 'first');
mapping.addField('high', 2, 'max');
mapping.addField('low', 3, 'min');
mapping.addField('close', 4, 'last');
mapping.addField('value', 4, 'last');
chart = anychart.stock();
chart.plot(0).ohlc(mapping).name('ACME Corp.');
chart.title('AnyStock Basic Sample');
chart.container('container');
chart.draw();
anychartExport.exportTo(chart, 'jpg').then(function(image) {
fs.writeFile('anychart.jpg', image, function(fsWriteError) {
if (fsWriteError) {
console.log(fsWriteError);
} else {
res.sendFile(path.join(__dirname + '/anychart.jpg'));
}
});
}, function(generationError) {
console.log(generationError);
});
} else if (type == "image") {
res.sendFile(path.join(__dirname + '/anychart.jpg'));
}
});
app.listen(3000);
I suspect there's something wrong with the way I includes the JS-files in the jsdom. If I exclude the two files in the jsdom, I get the same result..
Please let me know if you have any suggestions.
Can you please check and share the browser console messages? That tends to be the first troubleshooting step :)
So after a few days of waiting time, the AnyChart Support returned to me with the following answer for my question above:
we can't guarantee that this module will work as expected. It depends
on many other libraries that can't provide stable versions in
different OS.
Instead they recommend to use their Export Server solution which is different from what I was looking for.
Our setup is built on a LAMP-server, so we don't want to run another server just for a few images a day.
If any of you have a suggestion for a solution where I can export my AnyStock charts to JPG, PNG or GIFs please let me know.
Thanks :-)

Call function from Dust template

I'm developing a simple web site where I need to retrieve a list of objects from a database. I wanted to try nodejs so, after days of reading and tests, I finally decided to use this configuration:
Server technology: Nodejs + Express
Template engine: Dust
Database/Data source: Parse
I wired all these stuff and it seems working well, but I have now the first problem: I need to call a function from a Dust template, this is the code:
{>layout/}
{<content}
<ul>
{#photos}
<li>{photo.get("name")}{~n}</li>
{/photos}
</ul>
{/content}
but it doesn't work because it prints out {photo.get("name")} (literally) instead of printing the name of each photo. The query with Parse works correctly as I can see the loaded objects via console.log().
I'm new both with nodejs and dust so I'm not sure the problem is related only to dust. Any idea?
I have no any other solution except creation of a helper:
var dust = require('dustjs-linkedin');
dust.helpers.exec = function(chunk, context, bodies, params) {
var args = JSON.parse(params.args.replace(/'/g, '"'));
var object = context.stack.head;
params.func.split('.').some(function(property) {
if (typeof(object[property]) === "function") {
var result = object[property].apply(object, args);
chunk.write(result);
return true;
} else {
object = object[property];
return false;
}
})
return chunk;
};
Suppose we have following data:
app.get('/dust-test', function(req, res) {
function Photo(name) {
var props = {'name': name};
this.get = function(prop) {
return props[prop];
}
}
var photos = ['foo', 'bar', 'nanana'].map(function(name) {
return new Photo(name);
})
res.render("dust-test", {
photo: new Photo('me'),
photos: photos
});
});
Usage:
<li>{#exec func="photo.get" args="['name']" /}</li>
{#photo}
<li>{#exec func="get" args="['name']" /}</li>
{/photo}
<ul>
{#photos}
<li>{#exec func="get" args="['name']" /}{~n}</li>
{/photos}
</ul>
Where args - is an array of arguments in json format (single quotes are used)

I want to pipe a readable css file to the http response

I have an issue with outputting the readable stream to the http response.
behind the scenes there is a regular request and response streams coming from the generic http createServer. I check to see if the 'req.url' ends in css, and I create a readable stream of this file. I see the css contents in the console.log, with the right css code I expect. Then, I try to pipe the readable css file stream to the response, but in Chrome, the file response is blank when I inspect the response. It is a 200 response though. Any thoughts at first glance? I've tried different variations of where I have code commented out.
router.addRoute("[a-aA-z0-9]{1,50}.css$", function(matches){
var cssFile = matches[0];
var pathToCss = process.cwd() + "/" + cssFile;
// takes care of os diffs regarding path delimiters and such
pathToCss = path.normalize(pathToCss);
console.log(matches);
console.log("PATH TO CSS");
console.log(pathToCss)
var readable = fs.createReadStream(pathToCss);
var write = function(chunk){
this.queue(chunk.toString());
console.log(chunk.toString());
}
var end = function(){
this.queue(null);
}
var thru = through(write,end);
//req.on("end",function(){
res.pipe(readable.pipe(thru)).pipe(res);
//res.end();
//});
});
you need to pipe your readable stream into your through-stream, and then pipe it into the response:
readable.pipe(thru).pipe(res);
edit: for preparing your css path, just use path.join instead of concatenating your path and normalizing it:
var pathToCss = path.join(process.cwd(), cssFile);
I separated out this route (css) from my normal html producing routes, the problem I had was that my normal routes in my router object returned strings, like res.end(compiled_html_str), and the css file readable stream was going through that same routing function. I made it separate by isolating it from my router.
var cssMatch = [];
if(cssMatch = req.url.match(/.+\/(.+\.css$)/)){
res.writeHead({"Content-Type":"text/css"});
var cssFile = cssMatch[1];
var pathToCss = process.cwd() + "/" + cssFile;
// takes care of os diffs regarding path delimiters and such
pathToCss = path.normalize(pathToCss);
console.log(cssMatch);
console.log("PATH TO CSS");
console.log(pathToCss)
var readable = fs.createReadStream(pathToCss);
var cssStr = "";
readable.on("data",function(chunk){
cssStr += chunk.toString();
});
readable.on("end",function(){
res.end(cssStr);
});
}

Node request throwing: Error: Invalid URI "www.urlworksinbrowser.com" or options.uri is a required argument

I'm using Node v0.10.11 on Ubuntu 12.04. I can't figure out what I'm missing to make streams of URLs work with the request module.
This program is trying to go to a mailing list site, find the download links for each month, then download the pages for each month.
Mikael's readme says "The first argument can be either an url or an options object. The only required option is URI, all others are optional.
uri || url - fully qualified uri or a parsed url object from url.parse()"
If I call url.parse(www.targeturl.com), I get
Error: options.uri is a required argument
If I don't use url.parse, I get
Error: Invalid URI "www.freelists.org/archive/si-list/06-2013"
(this link works perfectly fine in my browsers)
I've cut the code down to 42 lines. Any advice welcome
var request = require('request'),
url = require('url'),
stream = require('stream'),
cheerio = require('cheerio'), // a reduced jQuery style DOM library
Transform = require('stream').Transform
var DomStripStream = function(target) {
this.target = target;
stream.Transform.call(this,{objectMode: true});
}
DomStripStream.prototype = Object.create(
Transform.prototype, {constructor: {value: DomStripStream}}
)
DomStripStream.prototype.write = function () {
this._transform.apply(this, arguments);
};
DomStripStream.prototype.end = function () {
this._transform.apply(this, arguments);
this.emit("end");
};
DomStripStream.prototype._transform = function(chunk, encoding, callback) {
chunk = chunk ? chunk.toString() : "";
$ = cheerio.load(chunk);
domLinks = $(this.target);
$(domLinks).each(function (i, link) {
currLink = 'www.freelists.org' + $(link).attr('href')
// currLink = url.parse(currLink)
request(currLink, function (error, response, body) {
console.log(error);
})
});
}
var fs = require("fs"),
output = fs.createWriteStream("out.txt"),
mainPage = new DomStripStream('td a')
request('http://www.freelists.org/archive/si-list').
pipe(mainPage).
pipe(output);
add http:// or https:// in the url

fs.exists not accepting variables

I'm using post requests to search for binaries but can't get the following code to work correctly. fs.exists() won't accept the variable b_path but will work correctly if given a hard coded string. b_path prints to the console as expected, correctly building the path to the binary.
app.post('*', function(req, res) {
// generate the name of the binary
var request = require('url').parse(req.url, true);
var len = request.pathname.toString().length;
var binary = request.pathname.slice(1,len);
binary = binary.concat(' ');
var b_path = app.get('binaries_path')+binary;
fs.exists(b_path, function (exists) {
if(exists) {
console.log('exists');
}
}
}
Why does this occur?

Resources