I'm using Cheerio and request with Node.js to do some basic web scraping, but can't seem to figure out how to access the data. The page loads via request and I'm able to console.log the page title using Cheerio, but when I get to the scripts it's a complex mess of objects.
In the body section of the page it looks like..
<body>
<script src="someUrl" script type="text/javascript" />
<script src="someUrl" script type="text/javascript" />
<script src="someUrl" script type="text/javascript" />
<script type="text/javascript">var months = [6,12,24,36,48,60]; var amounts = [5000,10000,15000,20000,25000]</script>
I'm trying to get to the variables in the last script to store them as variables in my node script for use, but I can't seem to access them, even as text.
When I try this in node, I get the page title followed by some huge object response in the console, not the variable text to parse. Suggestions?
$ = cheerio.load(body);
console.log($('title').text());
var text = $('script');
console.dir(text[3]);
You can parse those variables with regex but cheerio is a little messy:
var cheerio = require('cheerio')
var html = `
<body>
<script src="someUrl" type="text/javascript" />
<script src="someUrl" type="text/javascript" />
<script src="someUrl" type="text/javascript" />
<script type="text/javascript">var months = [6,12,24,36,48,60]; var amounts = [5000,10000,15000,20000,25000]</script>
</body>
`
var str, $ = cheerio.load(html, {xmlMode: true}); // xmlMode: true is a workaround for many cheerio bugs.
console.log(str = $('script:not([src])')[0].children[0].data) // no cleaner way to do this, cheerio?
// var months = [6,12,24,36,48,60]; var amounts = [5000,10000,15000,20000,25000]
var months = JSON.parse(str.match(/months = (\[.*?\])/)[1])
console.log(months)
// [ 6, 12, 24, 36, 48, 60 ]
var amounts = JSON.parse(str.match(/amounts = (\[.*?\])/)[1])
console.log(amounts)
// [ 5000, 10000, 15000, 20000, 25000 ]
Related
Hello I'm trying getElementById to change text inside my html web page. below you can find my 1st attempt
<!DOCTYPE html>
<html>
<body>
<h2>Use JavaScript to Change Text</h2>
<p>This example writes "Hello JavaScript!" into an HTML element with id="demo":</p>
<p id="demo"></p>
<script>
var net = require('net');
var sleep = require('sleep');
var element = document.getElementById("demo");
element.innerHTML = "Hello JavaScript!";
</script>
This code doesn't work because I can see the:
Use JavaScript to Change Text
This example writes "Hello JavaScript!" into an HTML element with id="demo":
but the:
"Hello JavaScript!"
is missing.
Changing the positions of the vars at the beginning of the js script makes the code working:
<!DOCTYPE html>
<html>
<body>
<h2>Use JavaScript to Change Text</h2>
<p>This example writes "Hello JavaScript!" into an HTML element with id="demo":</p>
<p id="demo"></p>
<script>
var element = document.getElementById("demo");
element.innerHTML = "Hello JavaScript!";
var net = require('net');
var sleep = require('sleep');
</script>
Why? I need both sleep and net later on when I'll write other parts of the code but I need to manipulate again the "demo" html as well.
There is a problem with both lines. The require() function is not a client side function, recognized by the browser. Typically require() is used in server side NodeJS code, but there is a require.js library file that you can add...
var net = require('net');
var sleep = require('sleep');
Add this to your project:
http://requirejs.org/docs/release/2.2.0/minified/require.js
And take a look at this :
http://requirejs.org/docs/api.html
Source :
Javascript require() function giving ReferenceError: require is not defined
It seems likely that the lines with
var net = require('net');
var sleep = require('sleep');
are actually causing an error, which in turn causes the JS to stop evaluating, so that it doesn't hit the getElementById line. You can check this in you browser's developer tools, in the console.
I am trying to load local javascripts with jsdom.
Now I am wondering how I can make jsdom loading the javascripts from "__dirname/../public".
Can someone help me?
My current code is:
var fs = require('fs');
var jsdom = require('jsdom');
jsdom.defaultDocumentFeatures = {
FetchExternalResources: ["script"],
ProcessExternalResources: ["script"],
MutationEvents : '2.0',
QuerySelector : false
};
exports.test = function(req, res) {
var html = fs.readFileSync(__dirname+'/../public/html/lame.html');
var document = jsdom.jsdom(html, null, {documentRoot: __dirname+'/../public/'});
var window = document.createWindow();
window.addEventListener('load', function () {
//window.$('script').remove();
//window.$('[id]').removeAttr('id');
res.send(window.document.innerHTML);
window.close();
});
}
The simple HTML page is:
<html>
<head>
<title id="title">bum</title>
<script type="text/javascript" src="/javascripts/jquery.min.js"></script>
<script type="text/javascript" src="/javascripts/stuff.js"></script>
</head>
<body>
<h1>hello world</h1>
<h2 id="bam">XXX</h2>
</body>
</html>
I've run into the same issue and got it to work. Try these, in order:
documentRoot is not documented. The option which is documented is url. So replace documentRoot with url.
If the above is not enough, then add a base element. I've set my templates like this:
<head>
<base href="#BASE#"></base>
<!-- ... -->
</head>
where #BASE# is replaced with the same value as the one passed to url.
The solutions above are extracted from actual code in use in a test suite.
I've been having trouble setting up a test framework for a NodeJS + Backbone app with the constant "require is not defined" error. I finally got it working using an in-browser test framework which picks up all of the dependencies I need and running a test.js file.
Currently, I'm only doing basic testing of my Backbone models, views, and collections. Now, I want to add in API testing but I'm back to the same "require is not defined" error. What is causing this? It's clear that I'm missing something fundamental here. I just want to add:
var request = require('supertest')
, express = require('express');
var app = express();
Snippet of test.js:
describe('Application', function(){
it("creates a global variable for the namespace", function() {
should.exist(App);
})
});
describe('Models', function() {
describe('SearchFormModel', function() {
beforeEach(function() {
this.SearchFormModel = new App.Model.SearchFormModel();
this.defaultFields = this.SearchFormModel.attributes;
})
it("created a SearchFormModel", function() {
should.exist(this.SearchFormModel);
})
it("should have 7 default fields", function() {
Object.keys(this.SearchFormModel).length.should.equal(7);
})
it("should default all fields to empty string", function() {
for (var key in this.defaultFields) {
this.defaultFields[key].should.equal("");
}
})
});
});
test-runner.html:
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Title & Meta -->
<title>Frontend tests</title>
<meta charset="utf-8">
<!-- Stylesheets -->
<link rel="stylesheet" href="../node_modules/mocha/mocha.css">
</head>
<body>
<div id="mocha"></div>
<!-- Testing Libraries -->
<script src="../node_modules/mocha/mocha.js"></script>
<script src="../node_modules/chai/chai.js"></script>
<script>
// Use the expect version of chai assertions - http://chaijs.com/api/bdd
var should = chai.should();
// Tell mocha we want TDD syntax
mocha.setup('tdd');
</script>
<!-- Libs -->
<script src="../public/lib/jquery-1.8.2.min.js"></script>
<script src="../public/lib/underscore-min.js"></script>
<script src="../public/lib/backbone-min.js"></script>
<script src="../public/lib/bootstrap.min.js"></script>
<script src="../public/lib/highcharts.js"></script>
<script src="../public/lib/bootstrap-datepicker.js"></script>
<script src="../public/js/modules/exporting.js"></script>
<!-- Source files -->
<script src="../public/js/namespace.js"></script>
<script src="../public/js/jst.js"></script>
<script src="../public/js/utils.js"></script>
<script src="../public/js/models/models.js"></script>
<script src="../public/js/models/search.js"></script>
<script src="../public/js/models/plot.js"></script>
<script src="../public/js/models/search_result.js"></script>
<script src="../public/js/views/header.js"></script>
<script src="../public/js/views/plot.js"></script>
<script src="../public/js/views/list.js"></script>
<script src="../public/js/views/search.js"></script>
<script src="../public/js/router.js"></script>
<script src="../public/js/app.js"></script>
<!-- Test -->
<script src="test.js"></script>
<script>
mocha.run();
</script>
</body>
</html>
require and commonjs only works in Node.js
If you run Browser test, then you need to code it like you'll run it in the browser. Also note that Unit Test should be done in isolation, you shouldn't need to load you app server (express) to run your test.
I'd like to point you to an easy solution from there, but there's just too many choices. Very basically, you should start running browser test in the browser by loading an html file.
Then, you'll want to automatize this and run browser test from the terminal. That's when you want to run test in PhantomJs and the likes and output browser results on the terminal. Around this, you can checkout Karma and Testem who're two browser test runner (remember here Mocha alone won't run browser test via command line).
As you're using Backbone, you might be interested in the Backbone-Boilerplate Karma + Grunt test setup as a starting point. See more on this here: https://github.com/backbone-boilerplate/backbone-boilerplate
I'm trying to connect the browser to my application via socket.io.
<script type="text/javascript" src="http://localhost:4000/socket.io/socket.io.js"></script>
<script>
var socket = io.connect('http://localhost:4000');
</script>
With this standard method all works fine.
Now I'm trying to transform this connection in "dynamic" based on the IP of the server, something like this:
<html>
<head>
var socket;
function loadFile(filename){
var ip_server = location.host;
var body = document.getElementsByTagName( 'body' )[0],
fileref = document.createElement('script');
fileref.setAttribute("type","text/javascript");
fileref.setAttribute("src", "http://"+ip_server+"/"+filename);
body.appendChild( fileref );
}
</head>
<body>
<script type="text/javascript">
loadFile("socket.io/socket.io.js");
socket = io.connect('http://'+location.host);
</script>
</body>
</html>
But firebug says ReferenceError: io is not defined on line socket = io.connect('http://'+location.host);.
How can I solve? There's a simple way to do what I'm thinking?
Thanks
Socket.io has "magical" integration with Node.js which means that something much simpler will work automatically:
<script src="/socket.io/socket.io.js"></script>
var socket = io.connect();
This will find the library and the socket with no explicit host or path. It should "just work."
const socket = io.connect(location.href);
Is there a way to change the Google Analytics code included in a template using Plates?
For example, for the below template:
<!DOCTYPE html>
<html>
<head></head>
<body>
<script id="googleAnalytics">
var _gaq=[['_setAccount','GA_ACCOUNT_CODE'],['_trackPageview']];
(function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
s.parentNode.insertBefore(g,s)}(document,'script'));
</script>
</body>
</html>
I would like to use a different GA_ACCOUNT_CODE depending on the environment the code runs in.
Is this possible with Plates? If not, what is the common way one would solve this issue in NodeJS & Flatiron?
Plates' idea is great but far from complete. Here is a solution.
app.js
var fs = require('fs');
var plates = require('plates');
var flatiron = require('flatiron');
var app = flatiron.app;
app.use(flatiron.plugins.http);
app.router.get('/', function(){
var html = fs.readFileSync('index.html', 'utf-8');
var map = plates.Map();
var data = {"GA_ACCOUNT_CODE": "YOUR_CODE_FROM_CONFIG"}
map.where('data-ga').is('GA_ACCOUNT_CODE').insert('GA_ACCOUNT_CODE');
html = plates.bind(html, data, map);
this.res.html(html);
});
app.start(3000);
index.html
<!DOCTYPE html>
<html>
<head></head>
<body>
<script id="googleAnalytics" data-ga="GA_ACCOUNT_CODE">
var GA_ACCOUNT_CODE = document.getElementById('googleAnalytics').getAttribute('data-ga');
var _gaq=[['_setAccount',GA_ACCOUNT_CODE],['_trackPageview']];
(function(d,t){var g=d.createElement(t),s=d.getElementsByTagName(t)[0];
g.src=('https:'==location.protocol?'//ssl':'//www')+'.google-analytics.com/ga.js';
s.parentNode.insertBefore(g,s)}(document,'script'));
</script>
</body>
</html>
Another way would just be string.replace() like this:
var html = fs.readFileSync('index.html', 'utf-8');
html = html.replace('GA_ACCOUNT_CODE', 'YOUR_CODE_FROM_CONFIG');
this.res.html(html);
Have a look at cheerio for support. It's like any other css selector and dom manipulation frontend library, just backends.