How can I pipe external HTML content into Jade? - node.js

I'm trying to get some HTML from a page online and place it inside my jade template so I can style it without copying and pasting every time I need it.
var request = require("request");
var cheerio = require("cheerio");

var loadContent = function() {
  request({
    uri: "http://www.mywebsite.com.br/test"
  }, function(error, response, body) {
    var $ = cheerio.load(body);
    var result;
    $('.content').each(function() {
      result = {"content": $(this).html()};
    });
    placeContent(result);
    return true;
  });
};

var placeContent = function(content) {
  return content;
};

module.exports = loadContent;
Inside my gulpfile.js, besides the right requirements, I have:
gulp.task('jadeBuild', function() {
  var options = {
    pretty: true
  };
  return gulp.src(src + '/*.jade')
    .pipe(data(function() {
      return loadContent();
    }))
    .pipe(jade(options))
    .pipe(gulp.dest(build))
    .pipe(connect.reload());
});
And my jade file:
.mycontent
  #{content}
What am I missing?

Try changing #{content} to !{content} in your jade file. This will tell jade not to escape any of the characters (which can be dangerous depending on where the input is coming from!).
See http://jade-lang.com/reference/interpolation/
Also, when you loop over each .content you are overwriting result every time. You need to append to result if you want to aggregate all the content together. Something like:
var result = {content: ''};
$('.content').each(function() {
  result.content += $(this).html();
});
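Applied to the template from the question, that would look something like this (a minimal sketch; the pipe marks the line as plain text so the unescaped interpolation is rendered):
.mycontent
  | !{content}
Equivalently, .mycontent!= content buffers the local as unescaped HTML.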

Related

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
  // Request page from rei.com and follow the redirect
  return needle("get", "https://rei.com/product/" + product, {
    follow_max: 5
  }).then(function(response) {
    var $ = cheerio.load(response.body);
    var test = $("th:contains(Weight)").parent()[0];
    console.log(test);
  }).catch(function(error) {
    console.log(error);
  });
}

rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
The spec table most likely isn't in the HTML that needle receives (it's rendered client-side), but the product details are embedded as JSON inside a script tag, so you can parse that directly. Try this:
var needle = require('needle');
var cheerio = require('cheerio');
var fs = require('fs');

function rei(product) {
  // Request page from rei.com and follow the redirect
  return needle("get", "https://rei.com/product/" + product, {
    follow_max: 5
  }).then(function(response) {
    var $ = cheerio.load(response.body);
    // your data is in a script tag
    var content = $('script[data-client-store="product-details"]').html();
    content = JSON.parse(content);
    for (var spec of content.specs) {
      if (spec.name == 'Weight') {
        console.log(spec.values);
      }
    }
  }).catch(function(error) {
    console.log(error);
  });
}

rei(893905);

NodeJS: Use a pug template to display results

I have a method in a NodeJS app that handles scraping a URL and, when successful, saving that data in a Mongo database and showing the results.
Main method:
// url parameter
app.get('/urls/', function(req, res) {
  var client = new MetaInspector(req.query.url, {
    timeout: 5000
  });

  client.on("fetch", function() {
    var imagesArray = [];
    var keywordsArray = [];
    var now = new Date();
    var dateVal = dateFormat(now, "mm/dd/yyyy h:MM:ss");

    for (var i = 0; i < client.images.length; i++) {
      // we only want jpgs. nothing else.
      if (client.images[i].indexOf('.jpg') > -1) {
        imagesArray.push({
          "image": client.images[i]
        });
      }
    }

    for (var i = 0; i < client.keywords.length; i++) {
      keywordsArray.push({
        "keyword": client.keywords[i]
      });
    }

    var newUrls = Urls({
      url: client.url,
      date_added: dateVal,
      keywords: req.body.keywords,
      author: client.author,
      description: client.description,
      ogTitle: client.ogTitle,
      ogDescription: client.ogDescription,
      image: client.image,
      images: imagesArray,
      keywords: keywordsArray
    });

    newUrls.save(function(err) {
      if (err) throw err;
      res.send('Success' + newUrls);
    });
  });

  client.on("error", function(err) {
    console.log(err);
  });

  client.fetch();
});
This all works well and good. But I'm using Pug and Express and have specific routes set up. Instead of sending the newUrls object with res.send, I'd like to have it go to a particular route and be passed to a particular Pug template I already have set up:
// Route.js
var express = require('express');
var router = express.Router();
var Urls = require('../models/urlModel');
var Footer = require('../models/footerModel');

/* URL Saved Success Page */
router.get('/saved', function (req, res) {

});

module.exports = router;
My view lives in a pug file located at:
/views/saved.pug
div#body
  include nav.pug
  div.container.item-container
    div.row
      div.col-md-8
        h1 Item successfully saved.
        h5 {item}
        h6 {description}
I've tried using the res.send method, but that doesn't work. Any suggestions on how to handle this?
From my understanding, you want the request redirected to /saved with a payload after the URLs are saved to the database. In this scenario, you could use res.redirect with a query string:
newUrls.save(function(err) {
  var payload = JSON.stringify({
    url: client.url,
    date_added: dateVal,
    keywords: req.body.keywords,
    author: client.author,
    description: client.description,
    ogTitle: client.ogTitle,
    ogDescription: client.ogDescription,
    image: client.image,
    images: imagesArray,
    keywords: keywordsArray
  });

  // append the payload as a query string
  res.redirect(`/saved?payload=${payload}`);
});
and in the /saved route, you could parse the query and use res.render:
router.get('/saved', function (req, res) {
  let payload = JSON.parse(req.query.payload);
  if (payload) {
    res.render('saved', payload);
  }
});
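With the payload passed as render locals like this, the fields become available to saved.pug through Pug's #{} interpolation. A minimal sketch using fields from the payload above (note that the {item} and {description} placeholders in the original template would also need this syntax to render):
div.col-md-8
  h1 Item successfully saved.
  h5 #{ogTitle}
  h6 #{description}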

WebScraping & web navigation simulation

I'm making a web scraper and I already know how to scrape some data and convert it to JSON with this code I made:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();
var url = 'http://www.footmercato.net/';

request(url, function(err, resp, body) {
  if (!err) {
    var $ = cheerio.load(body);
    var data = [];
    var i = 1;
    $('.text').each(function(i, element) {
      var article = $('p');
      var jsObject = { title: "", article: "", date: "" };
      var articleTxt = article.text();
      jsObject.article = articleTxt;
      data.push(jsObject);
    });
    var json = JSON.stringify(data);
    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
      console.log('File successfully written!');
    });
  }
});

app.listen('8080');
But I would like to navigate the website I'm scraping, fill out forms, and go to other pages.
Does somebody know if I can do it with cheerio, or how I can add it to my existing code?
Thanks
You can use webdriverio. It will actually open a browser window, and then you can manipulate the DOM through the webdriverio API to handle forms and mouse clicks and navigate from one page to another:
var webdriverio = require('webdriverio');

var options = {
  desiredCapabilities: {
    browserName: 'firefox'
  }
};

webdriverio
  .remote(options)
  .init()
  .url('http://www.google.com')
  .getTitle().then(function(title) {
    console.log('Title was: ' + title);
  })
  .end();
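Filling out a form follows the same chained pattern. A rough sketch, assuming webdriverio's v4-style commands, with purely hypothetical selectors and values that you would replace with the ones from the site you are scraping:

var webdriverio = require('webdriverio');

webdriverio
  .remote({ desiredCapabilities: { browserName: 'firefox' } })
  .init()
  .url('http://www.footmercato.net/')
  // hypothetical search form: type a query and submit it
  .setValue('input[name="q"]', 'transferts')
  .click('button[type="submit"]')
  .getUrl().then(function(url) {
    // confirm we navigated to the results page
    console.log('Now on: ' + url);
  })
  .end();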

How to concurrently download files using cheerio and nodejs?

I have a website with multiple pages; each page lists download links which I want to scrape and download.
I have a few issues with it:
My script only downloads about 4-5 files and then gets stuck.
I would like to concurrently download as many files as my CPU can handle.
I hit the maximum event emitters warning; I don't understand why, so I just raised the limit.
How do I follow redirects purely using the request module (without follow-redirects)?
How do I download the file like the browser does, without specifying its name? There is no content-disposition header, but I think the browser follows redirects and the redirected URL has the filename in its path.
My current code looks like so:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var https = require('follow-redirects').https;

require('events').EventEmitter.prototype._maxListeners = 1000;

for (var i = 1; i <= 10000; i++) {
  (function(i) {
    var url = 'http://mywebsite.com/files?page=' + i;
    request(url, gotHTML);
  })(i);
}

function gotHTML(err, resp, html) {
  var $ = cheerio.load(html);
  $('.file-header').each(function() {
    var data = $(this);
    var fileLink = data.children().first().children().first().attr('href');
    var fileName = fileLink.substring(10);
    var downloadLink = 'https://mywebsite.com/api/download/' + fileName;
    download(downloadLink, function() {
      console.log('downloaded');
    });
  });
}

function download(url, cb) {
  var request = https.get(url, function(response) {
    var location = request.res.headers.location;
    console.log(location);
    location = location.split('/').pop();
    console.log(location);
    var file = fs.createWriteStream(location);
    response.pipe(file);
    file.on('finish', function() {
      file.close(cb);
    });
  });
}
The default HTTP/HTTPS Agent only uses a maximum of 5 sockets (maxSockets) for requests to the same origin. So this could be causing some issues for you.
Try changing this:
var request = https.get(url, function(response) {
to this:
var options = require('url').parse(url);
options.agent = false; // or use a custom https.Agent with a higher `maxSockets`
var request = https.get(options, function(response) {
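If you would rather keep connection pooling but with a larger pool, here is a minimal sketch of the custom-Agent variant mentioned in the comment above (the maxSockets value is only an illustration):

var https = require('https');
var urlLib = require('url');

// one shared agent with a bigger socket pool, reused for every download
var agent = new https.Agent({ maxSockets: 20 });

function download(url, cb) {
  var options = urlLib.parse(url);
  options.agent = agent;
  https.get(options, function(response) {
    // ...same redirect handling and piping to a write stream as before
  });
}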

Send PDF file from AngularJS to NodeJS

I need to send a PDF file from an AngularJS client to a NodeJS service.
I did the AngularJS service, and when I receive the file it's a string like this:
%PDF-1.3
3 0 obj
<</Type /Page
/Parent 1 0 R
/Reso
How can I convert this string back to a PDF in NodeJS?
This is the client code:
var sendByEmail = function() {
  $scope.generatingPdf = true;
  $('#budget').show();
  var pdf = new JsPDF('p', 'pt', 'letter');
  var source = $('#budget')[0];
  pdf.addHTML(source, 0, 0, function() {
    var resultPdf = pdf.output();
    BillService.sendByEmail("rbrlnx#gmail.com", resultPdf).then(function() {
    });
    $('#budget').hide();
  });
};

var sendByEmail = function(email, file) {
  var deferred = $q.defer();
  var data = {
    email: email,
    file: file
  };
  BillService.sendByEmail(data, function(result) {
    deferred.resolve(result);
  }, function() {
    deferred.reject();
  });
  return deferred.promise;
};
The server code controller is empty:
var sendByEmail = function(req, res, next) {
  var file = req.body.file;
};
I experimented with this a while ago, and I came up with this. It's not production ready by a long shot, but maybe you'll find it useful. It's free of front-end libraries (except Angular, of course), but assumes you're using Express 4.x and body-parser.
The result, in the browser and on the server: a tiny node server serving the static index.html and Angular files, and a POST route receiving a PDF in base64 (as delivered by the HTML FileReader API) and saving it to disk.
Instead of saving to disk, you can send it as an email attachment. See for instance here or here for some info on that.
The example below assumes a user uploading a PDF through a file input, but the idea is the same for all other ways of sending a document to your back-end system. The most important thing is to send the PDF data as base64, because this is the format that most file writers and email packages use (as opposed to straight-up binary, for instance). This also goes for images, documents, etc.
How did I do that:
In your HTML:
<div pdfs>Your browser doesn't support File API.</div>
A directive called pdfs:
myApp.directive('pdfs', ['upload', function(upload) {
  return {
    replace: true,
    scope: function() {
      files = null;
    },
    template: '<input id="files" type="file">',
    link: function(scope, element) {
      element.bind('change', function(evt) {
        scope.$apply(function() {
          scope.files = evt.target.files;
        });
      });
    },
    controller: function($scope, $attrs) {
      $scope.$watch('files', function(files) {
        //upload.put(files)
        if (typeof files !== 'undefined' && files.length > 0) {
          for (var i = 0; i < files.length; i++) {
            readFile(files[i]);
          }
        }
      }, true);

      function readFile(file) {
        var reader = new FileReader();
        reader.addEventListener("loadend", function(evt) {
          upload.post({ name: file.name, data: reader.result });
        });
        if (file.type === 'application/pdf') {
          reader.readAsDataURL(file);
        }
      }
    }
  };
}]);
A tiny service:
myApp.service('upload', function($http) {
  this.post = function(file) {
    $http.post('/pdf', file);
  };
});
And a node server:
var express = require('express');
var bodyParser = require('body-parser');
var fs = require("fs");

var app = express();

app.use(express.static('.'));
app.use(bodyParser.json({ limit: '1mb' }));

app.post('/pdf', function(req, res) {
  var name = req.body.name;
  var pdf = req.body.data;
  pdf = pdf.replace('data:application/pdf;base64,', '');
  res.send('received');
  fs.writeFile(name, pdf, 'base64', function(err) {
    console.log(err);
  });
});

var server = app.listen(3000, function() {
  console.log('Listening on port %d', server.address().port);
});
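And if, as mentioned above, you would rather email the PDF than write it to disk, here is a rough sketch of the same route using nodemailer (the transport settings and addresses are placeholders you would need to replace):

var nodemailer = require('nodemailer');

// placeholder SMTP settings: substitute your own transport configuration
var transporter = nodemailer.createTransport({
  host: 'smtp.example.com',
  port: 587,
  auth: { user: 'user', pass: 'pass' }
});

app.post('/pdf', function(req, res) {
  var pdf = req.body.data.replace('data:application/pdf;base64,', '');
  transporter.sendMail({
    from: 'sender@example.com',
    to: 'recipient@example.com',
    subject: 'Your PDF',
    attachments: [{ filename: req.body.name, content: pdf, encoding: 'base64' }]
  }, function(err, info) {
    if (err) console.log(err);
    res.send('received');
  });
});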
