I Have a HTML response. I need to parse it and generate a DOM object. After generation of DOM object I need to search a particular string inside it and get complete hierarchy of HTML tags in which it resides. Is there any NPM package available.
There's an even simpler API for this now in htmlparser2:
var htmlparser = require("htmlparser2");
var dom = htmlparser.parseDOM("<html>your html string</html>");
console.log(dom);
You have htmlparser2 package that can parse HTML stream. You can get the DOM with DomHandler which is bundled with htmlparser2 itself. See the example given there. E.g.
var htmlparser = require("htmlparser2");
var rawHtml = "<html>your html string</html>";
var handler = new htmlparser.DomHandler(function (error, dom) {
console.log(dom);
});
var parser = new htmlparser.Parser(handler);
parser.write(rawHtml);
parser.done();
Parsing DOM object manually is tedious job.
I think that everyone needs a Soup Select(soupselect package) to parse complex DOM objects.
A soupselect is great package in handling DOM.
See following example:
var htmlparser = require("htmlparser2");
var select = require('soupselect').select;
var handler = new htmlparser.DomHandler(function (error, dom) {
if (error)
console.log('error:', error);
else {
// selector reference:
// http://www.w3schools.com/jquery/jquery_ref_selectors.asp
var sel = select(dom, 'body p');
console.log("text in the first <p>: '" + sel[0].children[0].data + "'");
}
});
var parser = new htmlparser.Parser(handler);
var rawHtml =
"<html>"
+ "<head><title>My Title</title></head>"
+ "<body>"
+ "<p>"
+ " Hello World"
+ "</p></body></html>";
parser.parseComplete(rawHtml);
output:
text in the first <p>: ' Hello World'
parseDOM function is now deprecated in htmlparser2 package. You can use parseDocument function now.
const { parseDocument } = require("htmlparser2");
let dom = parseDocument(row_html);
console.log('DOM: ', dom);
Related
I need to union 2 pdf files, on the server site Netsuite script api2.
Or other native javascript option.
Thanks
Use the pdfset element in a template file. This is a fragment from an SS2 Suitelet:
function renderSet(opts){
var tpl = ['<?xml version="1.0"?>','<pdfset>'];
opts.files.forEach(function(id, idx){
const partFile = file.load({id:id});
var pdf_fileURL = xml.escape({xmlText:partFile.url});
tpl.push("<pdf src='" + pdf_fileURL + "'/>");
});
tpl.push("</pdfset>");
log.debug({title:'bound template', details:xml.escape({xmlText:tpl.join('\n')})});
return render.xmlToPdf({
xmlString: tpl.join('\n')
});
}
var pdf = renderSet({files:[file1, file2]});
pdf.name = basename +'_'+ getDateStamp() +'.pdf';
response.writeFile({
file:pdf,
isInline: false
});
Using gulp 3.9.1
I am attempting to return a bunch of files and perform a task that requires a var to be passed between two pipes.
I'm using node uuid to create a v3 UUID for each file path to
ultimately end up with a uuid for each page. I'm grabbing the file path with gulp-print.
I want to store that uuid value as a var. In the next pipe Im using
gulp-inject-string to write it into the page during the build.
Help: Either I need help getting the file path inside the gulp-inject-string pipe or I need to pass the var between the two different pipes. If I globally set a var with a default value outside the src it gets passed easily to the pipe(inject).
Super simplified code below:
// test code
var gulp = require('gulp');
var print = require('gulp-print');
var inject = require('gulp-inject-string');
var reload = browserSync.reload;
const uuidv3 = require('uuid/v3');
var uuid;
gulp.task('uuid', function() {
return gulp.src('**/*.html'])
// create uuid
.pipe(print(function(filepath) {
uuid = uuidv3(filepath, uuidv3.URL);
return "compiled: " + filepath + ' uuid: ' + uuid;
}))
// need to to add UUIDv3 to each page
.pipe(inject.before('</head>', '<meta name="dc.identifier" content="' + uuid + '">'))
.pipe(gulp.dest('/prod/./'))
.pipe(reload({ stream: true }));
});
It's worth noting that I need a cross platform way to get the file path starting in the root of the project and including forward slashes. The gulp(print) does this perfectly starting at the root of the project and ignoring anything upstream from that point. The format of the path is important because it's one half of the equation in creating the uuid and the uuid's must match on Mac or PC platforms.
examples:
/index.html
/dir1/file.html
/dir1/dir2/dir3/file.html
var gulp = require('gulp');
var print = require('gulp-print');
var inject = require('gulp-inject-string');
const uuidv3 = require('uuid/v3');
var tap = require('gulp-tap');
// you can declare here
var uuid;
gulp.task('pages', function() {
// or you can declare here
var uuid;
return gulp.src('**/*.html')
// bunch of stuff happens here involving templating/minifying
// create uuid
.pipe(print(function(filepath) {
// then set it here and use it further below
// it will be available
uuid = uuidv3(filepath, uuidv3.URL);
return "compiled: " + filepath + ' uuid: ' + uuid;
}))
// need to to add UUIDv3 to each page
//.pipe(inject.before('</head>', '<meta name="dc.identifier" content="' + uuid + '">\n'))
.pipe(tap(function(file, t) {
return t.through(inject.before('</head>', '<meta name="dc.identifier" content="' + uuid + '">\n');
})
.pipe(gulp.dest('/prod/./'))
.pipe(reload({stream:true}));
});
You are just creating a variable at a higher scope that you can set and refer to later. If you need a bunch of them create an array with filepath as an index. But I would try it first as just a simple value.
I solved the problem. It was an amateur mistake. I returned the statement where the var was set so the var was essentially killed. Updated code that allows the var to pass through the pipes.
var gulp = require('gulp');
var print = require('gulp-print');
var replace = require('gulp-replace');
const uuidv3 = require('uuid/v3');
var uuid;
gulp.task('build', function() {
return gulp.src('**/*.html')
// get a cross-platform filepath and create a uuid
.pipe(print(function(filepath) {
uuid = uuidv3(filepath, uuidv3.URL);
}))
// inject uuid
.pipe(replace('dc.identifier" content=""', function() {
return 'dc.identifier" content="' + uuid + '"';
}))
.pipe(gulp.dest('/prod/./'));
});
The var uuid passes through the pipes just fine now. This code creates a UUID based on a cross-platform file path and injects it into an empty dc.identifier meta tag.
So I tested my scraping on a static HTML file before adding it to my Node app.
The problem is that it's not returning all the rows.
On the site:
$('#sport tr').length
//Returns 13
In Cheerio:
$('#sport tr').length
//Returns 2
I'm stumped, here is the code I'm using. I've contained the URL as proof, so you can visit it yourself if you wish.
I'm suspecting it's something to do with var $ = cheerio.load(html); however I'm not experienced in Cheerio to say outright that's the problem.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function(req, res){
var url = 'http://www.olbg.com/football.php';
var json = [];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
console.log($('#sport tr').length);
var headers = [];
$('#sport tr th').each(function(i, th) {
var text = $(th).text();
if (text.trim() !== "") {
headers[i] = text.replace(/[\t\n\r\s]/mgi, '');
}
});
$('#sport tr').each(function(i, tr) {
// skip if header
if (!$(tr).is('th')) {
var temp = {};
temp["Event"] = $(tr).find('td').eq(0).text().trim();
temp["TopSelection"] = $(tr).find('td').eq(1).text().trim();
temp["BookieOdds"] = $(tr).find('td').eq(2).text().trim();
temp["OLBGRating"] = $(tr).find('td').eq(3).find('img').length;
if (temp["Event"] !== "" || temp["TopSelection"] !== ""){
json.push(temp);
}
}
});
}
// To write to the system we will use the built in 'fs' library.
// In this example we will pass 3 parameters to the writeFile function
// Parameter 1 : output.json - this is what the created filename will be called
// Parameter 2 : JSON.stringify(json, null, 4) - the data to write, here we do an extra step by calling JSON.stringify to make our JSON easier to read
// Parameter 3 : callback function - a callback function to let us know the status of our function
fs.writeFile('output.json', JSON.stringify(json), function(err){
console.log('File successfully written!');
})
// Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
res.send(json);
});
});
app.listen("8081");
console.log("Magic happens on port 8081");
exports = module.exports = app;
The reason that you're not getting the expected result is because the (table) html on that page is mangled. If you look at the second <td> in the second <tr> of the table#sport, you'll see an "extra" </td>. This causes the <td> that the table#sport is inside to close (and an implicit closing of table#sport) on some parsers because that is the closest open <td>. So that is why the parser reports only 2 <tr>s instead of 13. The other <tr>s you're expecting are now outside of table#sport.
Probably your best bet is to pass the html through an HTML tidying program/script (e.g. this one with the clean option enabled) first before passing it to cheerio. After that, your selector should return the elements you're probably expecting.
I have an issue with outputting the readable stream to the http response.
behind the scenes there is a regular request and response streams coming from the generic http createServer. I check to see if the 'req.url' ends in css, and I create a readable stream of this file. I see the css contents in the console.log, with the right css code I expect. Then, I try to pipe the readable css file stream to the response, but in Chrome, the file response is blank when I inspect the response. It is a 200 response though. Any thoughts at first glance? I've tried different variations of where I have code commented out.
router.addRoute("[a-aA-z0-9]{1,50}.css$", function(matches){
var cssFile = matches[0];
var pathToCss = process.cwd() + "/" + cssFile;
// takes care of os diffs regarding path delimiters and such
pathToCss = path.normalize(pathToCss);
console.log(matches);
console.log("PATH TO CSS");
console.log(pathToCss)
var readable = fs.createReadStream(pathToCss);
var write = function(chunk){
this.queue(chunk.toString());
console.log(chunk.toString());
}
var end = function(){
this.queue(null);
}
var thru = through(write,end);
//req.on("end",function(){
res.pipe(readable.pipe(thru)).pipe(res);
//res.end();
//});
});
you need to pipe your readable stream into your through-stream, and then pipe it into the response:
readable.pipe(thru).pipe(res);
edit: for preparing your css path, just use path.join instead of concatenating your path and normalizing it:
var pathToCss = path.join(process.cwd(), cssFile);
I separated out this route (css) from my normal html producing routes, the problem I had was that my normal routes in my router object returned strings, like res.end(compiled_html_str), and the css file readable stream was going through that same routing function. I made it separate by isolating it from my router.
var cssMatch = [];
if(cssMatch = req.url.match(/.+\/(.+\.css$)/)){
res.writeHead({"Content-Type":"text/css"});
var cssFile = cssMatch[1];
var pathToCss = process.cwd() + "/" + cssFile;
// takes care of os diffs regarding path delimiters and such
pathToCss = path.normalize(pathToCss);
console.log(cssMatch);
console.log("PATH TO CSS");
console.log(pathToCss)
var readable = fs.createReadStream(pathToCss);
var cssStr = "";
readable.on("data",function(chunk){
cssStr += chunk.toString();
});
readable.on("end",function(){
res.end(cssStr);
});
}
For example, I have this JSON document "foo.json":
{
"foo": [
{
"bar": "Hello World!"
},
{
"bar": "The End"
}
]
}
In Node.js, I would like to use templating (handlebars or any) to generate a string from the JSON document, such as:
<p>Hello World!</p><p>The End</p>
... And then assign that string value to a variable in Node.js. Finally, I'll concatenate more values to the variable and output the final variable value as an html document.
Can this be done without using a framework like Express?
If you want to use handlebars, just grab the npm module:
npm install handlebars
Then in your script, you can use handlebars to render your output based on a simple template that iterates over the array foo and creates a <p> for each item, containing the text of the bar property:
var handlebars = require('handlebars');
// get your data into a variable
var fooJson = require('foo.json');
// set up your handlebars template
var source = '{{#each foo}}<p>{{this.bar}}</p>{{/each}}';
// compile the template
var template = handlebars.compile(source);
// call template as a function, passing in your data as the context
var outputString = template(fooJson);
If you want to use a .hbs template file instead of a string source you can use the fs module to read the file with fs.readFile, call toString() on the returned buffer, and use that to call a rendering function. Try this:
var handlebars = require('handlebars');
var fs = require('fs');
// get your data into a variable
var fooJson = require('path/to/foo.json');
// read the file and use the callback to render
fs.readFile('path/to/source.hbs', function(err, data){
if (!err) {
// make the buffer into a string
var source = data.toString();
// call the render function
renderToString(source, fooJson);
} else {
// handle file read error
}
});
// this will be called after the file is read
function renderToString(source, data) {
var template = handlebars.compile(source);
var outputString = template(data);
return outputString;
}