node.js \ sanitize html and also remove tags - node.js

how can I tell "sanitize-html" to actually remove the html tags (keep only the content within)? currently if for example I set it to keep the div sections, in the output it writes also the <div>some content</div> - I want only the inside...('some content')
to make it short - I don't want the tags, attributes etc. - only the content of those elements..
var Crawler = require("js-crawler");
var download = require("url-download");
var sanitizeHtml = require('sanitize-html');
var util = require('util');
var fs = require('fs');
new Crawler().configure({depth: 1})
.crawl("http://www.cnn.com", function onSuccess(page) {
var clean = sanitizeHtml(page.body,{
allowedTags: [ 'p', 'em', 'strong','div' ],
});
console.log(clean);
fs.writeFile('sanitized.txt', clean, function (err) {
if (err) throw err;
console.log('It\'s saved! in same location.');
});
console.log(util.inspect(clean, {showHidden: false, depth: null}));
var str = JSON.stringify(clean.toString());
console.log(str);
/*download(page.url, './download')
.on('close', function () {
console.log('One file has been downloaded.');
});*/
});

I'm the author of sanitize-html.
You can set allowedTags to an empty array. sanitize-html does not discard the contents of a disallowed tag, only the tag itself (with the exception of a few tags like "script" and "style" for which this would not make sense). Otherwise it wouldn't be much use for its original intended purpose, which is cleaning up markup copied and pasted from word processors and the like into a rich text editor.
However, if you have markup like:
<div>One</div><div>Two</div>
That will come out as:
OneTwo
To work around that, you can use the textFilter option to ensure the text of a tag is always followed by at least one space:
textFilter: function(text) {
return text + ' ';
}
However, this will also introduce extra spaces in sentences that contain inline tags like "strong" and "em".
So the more I think about it, the best answer for you is probably a completely different npm module:
https://www.npmjs.com/package/html-to-text
It's widely used and much better suited than your use case. sanitize-html is really meant for situations where you want the tags... just not the wrong tags.

Related

get var inside script tag using puppeteer

good evening everyone, I'm trying to get the array that is in this.data = [{"id_penggiat":"6f81520f-1a8b-4d1d-a7d5-defdb630818b"...] . using cheerio and puppets. I have tried my best but not getting results. help me to solver this
<script>
.......
var PenggiatField = {
fieldid: '#penggiatField',
data: true,
debug: false,
Init: function(params) {
var field = this;
this.data = [{"id_penggiat":"6f81520f-1a8b-4d1d-a7d5-defdb630818b","id_kerma":"9388706e-7156-4d07-bf77-cb9c6d7f30d3","id_mitra":null,"id_unit":"89c82da3-7b28-4305-82c4-347dae042847","id_unit_teknis":null,"no_pihak":"1","nm_pihak":"Institut Teknologi Sepuluh Nopember","alamat":"Kampus ITS Sukolilo Surabaya","ttd_nama":"Tri Joko Wahyu Adi, ST., MT., Ph.D.","ttd_jabatan":"Direktur","pic_nama":"","pic_jabatan":"","pic_email":null,"pelaksana":["c2c72bf4-ec6c-46dd-8be8-d9fb77a90bd6","7e42c5bb-0c44-4b93-9162-ec6fb31cd67c","902bd59e-745d-4903-b1bf-171fe6f0fb8c","c93fb75a-1065-4587-a7af-475d4256f44d","9a6592f0-ff86-401d-8021-0e9f6fc285fe","8f2d4244-d868-45e3-a906-578ce2f9f572","1f8fc06b-4707-4e11-bdc8-312f39017758"],"create_date":"2022-03-16 04:41:14.146436","id_creator":"79ce0204-5c6d-43dd-a310-8c13a557a402","last_update":"2022-03-16 04:54:25.228835","id_updater":"79ce0204-5c6d-43dd-a310-8c13a557a402","soft_delete":"0","a_lembaga":"university","id_institusi":"89c82da3-7b28-4305-82c4-347dae042847","nm_mitra":null,"nm_unit":"Institut Teknologi Sepuluh Nopember","nm_unit_teknis":null,"id_klas_mitra":"12","nm_klas_mitra":"Institusi Pendidikan","id_negara":"ID","nm_negara":"Indonesia"},{"id_penggiat":"c4c3db69-8cbb-4a8e-ae6c-544b532760c6","id_kerma":"9388706e-7156-4d07-bf77-cb9c6d7f30d3","id_mitra":"f4b81538-6442-4a19-b9f6-8581d152725d","id_unit":null,"id_unit_teknis":null,"no_pihak":"2","nm_pihak":"Sekolah Menengah Atas Negeri 1 Karas Magetan","alamat":"Jl. Karas, Punden, Kuwon, Kec. Kendal, Kabupaten Magetan, Jawa Timur 63395","ttd_nama":"Bahtiar Kholili, S.Pd., M.M.Pd.","ttd_jabatan":"Kepala Sekolah","pic_nama":"","pic_jabatan":"","pic_email":null,"pelaksana":null,"create_date":"2022-03-16 04:41:14.146436","id_creator":"79ce0204-5c6d-43dd-a310-8c13a557a402","last_update":"2022-03-16 04:54:25.228835","id_updater":"79ce0204-5c6d-43dd-a310-8c13a557a402","soft_delete":"0","a_lembaga":"mitra","id_institusi":"f4b81538-6442-4a19-b9f6-8581d152725d","nm_mitra":"Sekolah Menengah Atas Negeri 1 Karas Magetan","nm_unit":null,"nm_unit_teknis":null,"id_klas_mitra":"12","nm_klas_mitra":"Institusi Pendidikan","id_negara":"ID","nm_negara":"Indonesia"}];
........
</script>
in puppeteer the following would do the job, but it is not a general solution.
await page.goto(pageUrl)
const filteredArray = await page.$$eval('script', scripts =>
JSON.parse(
scripts
.map(src => src.innerHTML)
.filter(el => el.includes('"id_penggiat":'))[0] // selecting the desired <script>
.split('\n')
.filter(el => el.includes('"id_penggiat":'))[0] // selecting the exact script line within the <sript>'s innerHTML
.replace(/this.data\W=\W|;$/gm, '')
.trim()
)
)
console.log(filteredArray)
what's happening?
page.$$eval collects all <script> tags into an array
we map over all scripts' innerHTML
select the one that includes the "id_penggiat": string fragment
any JavaScript scripts will be received in a huge string with innerHTML, all lines will be delimited by \n linebreaks: we can split them one-by-one
let's select the exact line within the tag that includes "id_penggiat": string fragment
clean the output string from the un-parseable parts like the variable name, this, and the ending ;. plus trim the extra spaces from the beginning of the line.
the final result can be parsed as JSON.
note: using the 1st index [0] on the Array.filter-ed values picking only the first occurrences, if you want all of them: you will need to process further.

Is it absolutely the case that arrays are used by reference in other modules in Node.js?

I have
// file cars.js
var bodyshop = require('./bodyshop')
var connections = [];
many functions which operate on connections. adding them, changing them etc.
code in this file includes things like
bodyshop.meld(blah)
bodyshop.mix(blah)
exports.connections = connections
and then
// file bodyshop.js
let cars = require('./cars');
even more functions which operate on connections. adding them, changing them etc.
code in this file includes things like
cars.connections[3].color = pink
cars.connections.splice(deleteMe, 1)
module.exports = { meld, mix, flatten }
Is it absolutely honestly the case that code in bodyshop such as cars.connections.splice(deleteMe, 1) will indeed delete an item from "the" connections (ie, the one and only connections, declared in cars.js) and code in bodyshop such as cars.connections[3].color = pink will indeed change the color of index 3 of "the" self-same one and only connections?
Is it quite OK / safe / acceptable that I used the syntax "module.exports = { }" at the end of bodyshop, rather than three lines like "exports.meld = meld" ?
Is this sentence indeed to totally correct?? "In Node.js if you export from M an array, when using the array in another module X which requires M, the array will be by reference in X, i.e. not by copy" ... ?
I created two files with the following methods and the array as you mentioned.
First File: test1.js
const testArray = [];
const getArray = () => {
return testArray;
};
module.exports = {
testArray,
getArray
}
Second File: test2.js
const { testArray, getArray } = require('./test1');
console.log('testing the required array before modifying it');
console.log(getArray());
testArray.push('test');
console.log('testing the method result after modifying the required array content');
console.log(getArray());
If you can create the mentioned files and run them locally, you will see the following result.
>node test2.js
testing the required array before modifying it
[]
testing the method result after modifying the required array content
[ 'test' ]
The points observed is,
yes, it's okay if you want to export it with the syntax module.exports = { }, It not much an issue.
If any of the methods modify this array outside of the required file, it will affect here as well, This because require will be a reference, not a copy.
The one possible solution will be creating a JSON copy of it while requiring as below:
const { testArray, getArray } = require('./test1');
const testArrayCopy = JSON.parse(JSON.stringify(testArray));
console.log('testing the required array before modifying it');
console.log(getArray());
testArrayCopy.push('test');
console.log('testing the method result after modifying the required array content');
console.log(getArray());
This is the result:
>node test2.js
testing the required array before modifying it
[]
testing the method result after modifying the required array content
[]
Note: JSON copy will not help you in parsing DateTime properly.

How to split text depending on word count

I am trying to make a lyric project using discord.js, cheerio and the website called genius.com.
I have successfully found a way to scrape the lyrics from the website, I am onto the part where I need to split it because discord has a max word limit of 2000.
I can check how many characters/words are in the overall lyrics by doing lyrics.length, I just need to find a way to split the string and send both, in the future I might implement richEmbeds to make it more stylish but for now I'm focusing on the basics.
var request = require('request');
var cheerio = require('cheerio');
/*
This is a project for my discord bot, the reason for the 2000 word limit is because
discords character limit is currently set to 2000, this means that i will have to add
a function to split the lyrics and send each part
*/
//Define the URL that we are going to be scraping the data from
var UR_L = "https://genius.com/Josh-a-and-jake-hill-not-afraid-of-dying-lyrics";
//send a request to the website and return the contents of the website
request(UR_L, function(err, resp, body) {
//load the website using cheerio
$ = cheerio.load(body);
//define lyrics as the selector to text form
var lyrics = $('p').text();
if (lyrics.length > "2000" && lyrics.length < "4000") {
} else if (lyrics.length > "4000" && lyrics.length < "6000") {
} else {
//send the lyrics as one message
}
})
You can find a live version running here on repl.it.
You don't need to use any fancy function, that function is already built in discord.js: you can attach some options to a message, and MessageOptions.split is what you're searching for. When you want to send the text, do it like this:
channel.send(lyrics, { split: true });
If lyrics.length is greater that the limit, discord.js will cut your messages and send them one after the other, making it seem like it's only one.
channel is the TextChannel you want to send the messages to.
Discord has a 2000 characters limit not a 2000 words limit.
One solution to your problem could be this:
// This will result in an array with strings of max 2000 length
const lyricsArr = lyrics.match(/.{1,2000}/g);
lyricsArr.forEach(chunk => sendMessage(chunk))
Given the async nature of sending messages, you might want to look into modules like p-iteration to ensure the chunks arrive in the correct order.
That being said, there exists APIs for getting lyrics of songs, which I would recommend instead of scraping. See apiseeds lyrics API as an example.
UPDATE
const lyrics = 'These are my lyrics';
const lyricsArr = lyrics.match(/.{1,8}/g);
console.log(lyricsArr); // [ 'These ar', 'e my lyr', 'ics' ]
lyricsArr.forEach((chunk, i) => {
// Break if this is the last chunk.
if (i == lyricsArr.length -1) {
return;
}
// If last character is not a space, we split a word in two.
// Add additional non-wordbreaking symbols between the slashes (in the regex) if needed.
if (!chunk[chunk.length - 1].match(/[ ,.!]/)) {
const lastWord = chunk.match(/\s([^ .]+)$/)
lyricsArr[i + 1] = lastWord[1] + lyricsArr[i + 1];
lyricsArr[i] = lyricsArr[i].split(/\s[^ .]*$/)[0];
}
})
console.log(lyricsArr) // [ 'These', 'are my', 'lyrics' ]
Updated as per the comments.
This is some crude code that i did not spend much time on, but it does the job.
Some info when using this approach:
You need to add any symbols that should not be considered wordbreaking to the regex in the second if
This has not been tested thoroughly, so use at your own risk.
It will definitely break if you have a word in the lyrics longer than the chunk size. Since this is around 2000, I imagine it will not be problem.
This will no longer ensure that the chunk length is below the limit, so change the limit to around 1900 to be safe
You can use .split( ) Javascript function.
word_list = lyrics.split(" ")
And word_list.length to access the number of words in your message and word_list[0] to select the first word for instance.

Orchard CMS Contrib.Review module

I am beginner in Orchard CMS and i need add voting functionality to content. I have installed Contib.Vote and Contrib.Review modules. After that i have added Review part to page content type. Also, i have executed recipe. At the first look everything is fine, but link for review refer to the same page with # symbol and nothing is happenning by clicking on it. It seems like module does not work or work incorrectly. Please help with my problem.
UPD.
Hi devqon and thanx for your help. Your answer was really useful for me. According to your advice i was looking around javascript inside Review Part view file (Parts_Reviews.cshtml). Just for a test i changed its source code a little bit.
#using (Script.Foot())
{
<script type="text/javascript">
//<![CDATA[
(function () {
var numberOfReviewsToShowByDefault = 5;
var $showAllReviewsLink = $('#showAllReviewsLink');
var $deleteReviewConfirmationDialogDiv = $('#deleteReviewConfirmationDialogDiv');
$deleteReviewConfirmationDialogDiv.dialog({ autoOpen: false, modal: true, resizable: false });
$('#deleteReviewLink').click(function () {
$('#reviewId').val($(this).attr("data-review-id"));
ShowDeleteReviewDialog();
return false;
});
$('#showReviewFormLink').click(function () {
$('#createReviewLinkDiv').slideToggle('fast', function () { $('#reviewFormDiv').slideToggle('fast'); });
return false;
});
$('#cancelCreateReviewLink').click(function () {
$('#reviewFormDiv').slideToggle('fast', function() { $('#createReviewLinkDiv').slideToggle('fast'); });
return false;
});
$('#deleteReviewForm').submit(function () {
$('input[type=submit]', this).attr('disabled', 'disabled');
});
$('#cancelDeleteReviewButton').click(function () {
CloseConfirmationDialogDiv();
return false;
});
var rowCount = $('#reviewsList li').length;
if (rowCount > numberOfReviewsToShowByDefault) {
SetupToggle();
}
if (document.location.hash === '#Reviews') {
var topPx = $('#reviews-heading').position().top;
$('body,html').animate({ scrollTop: topPx }, 'slow');
}
if ($("#comment").length) {
var characterCountUpdater = new CharacterCountUpdater($("#comment"), $("#commentCharactersLeft"));
setInterval(function() { characterCountUpdater.UpdateCharacterCount(); }, 100);
$("#comment").keypress(function() { characterCountUpdater.UpdateCharacterCount(); });
if ($("#comment").val().length) {
$("#showReviewFormLink").trigger("click");
}
}
function CharacterCountUpdater(commentBox, charactersLeftBox)
{
this.commentBox = commentBox;
this.charactersLeftBox = charactersLeftBox;
this.maxLength = commentBox.attr("maxlength");
commentBox.removeAttr("maxlength");
return this;
}
Now form for review is displayed. The form looks good, submit button works, character counter works too. But i still can't apply my rating. Stars not react on clicking. That is why submit operation ends with error 'In order to submit a review, you must also submit a rating.'. Look like something inside Parts.Stars.NoAverage.cshtml does not work. Please, help me.
According to the project's site it is a known issue: broken from version 1.7.2.
When looking at the code of the Parts_Reviews.cshtml it says the following on lines 20-24:
string showReviewUri = "#";
if (!Request.IsAuthenticated)
{
showReviewUri = Url.Action("LogOn", "Account", new { area = "Orchard.Users", ReturnUrl = Context.Request.RawUrl });
}
and on line 29:
<div id="createReviewLinkDiv"><span id="createReviewLinkSpan">#noReviewsYetText<a id="showReviewFormLink" href="#showReviewUri">#reviewLinkText</a></span></div>
Therefore, it was intended to let the anchor be # when the request is authenticated (you are logged on). This means it probably will be handled in JavaScript, which can be seen on lines 105-112:
$('#showReviewFormLink').click(function () {
$('#createReviewLinkDiv').slideToggle('fast', function () { $('#reviewFormDiv').slideToggle('fast'); });
return false;
});
$('#cancelCreateReviewLink').click(function () {
$('#reviewFormDiv').slideToggle('fast', function() { $('#createReviewLinkDiv').slideToggle('fast'); });
return false;
});
This piece of code should let you see the form to write a review, so something is going wrong there presumably. When there's something wrong in this jQuery code it probably gives an error in the console, so check out the browser's console when you click the 'Be the first to write a review' link.
This should get you further, if you don't know what to do please provide the error and I will try to dig more. I haven't downloaded the module so I don't have live feed.
Console of Firefox tells: $(...).live is not a function. It refers to Contrib.Stars.js source code file. This function is not supported in jquery now and i replaced it by .on() function in all places api.jquery.com/on. Now module works fine.
Check out my comment at the site below to see how I was was able to get it working again on Orchard 1.8.1:
Orchard Reviews Project Site
You basically just need to change 3 different lines in the Contrib.Stars.js file but I would recommend copying the .js file along with the Review module's different views to a custom theme directory, in order to override everything and force the Reviews module to use your edited .js file:
On line 12 & 13:
Change this:
$(".stars-clear").live(
"click",
To this:
$("body").on(
"click", ".stars-clear",
On line 44 & 45:
Change this:
.live(
"mouseenter",
To this:
.mouseenter(
On line 48 & 49:
Change this:
.live(
"mouseleave",
To this:
.mouseleave(

Internet Explorer truncating flashVars containing JSON

This only happens in IE.
I'm using swfobject and loading the flash vars as such
var flashVars = {
myVar:'{"url":"http://google.com/", "id":"9999"}',
};
var params = {
allowFullScreen:"true",
wmode:"transparent",
allowScriptAccess:'always'
};
swfobject.embedSWF("mySwf.swf", "mySwf", "512", "318", "10.0.0", "./js/swfobject/expressInstall.swf", flashVars, params);
Everything works perfectly in all browser but IE. I checked myVar and it comes into the swf as { and that's it. I know it's dying at the '. I've tried putting a \ infront, then tried \\ and kept adding one slash until I got to \\\\\\\\. I even inverted all the slashes and tried the same ritual. Nothing.
I can get the string to finally come through, with inverted quotes and using double slashes, but then my JSON parser gets mad about there being slashes in my string.
Here's an example of what works, but of what is invalid JSON:
"{\\'url\\':\\'http://google.com/\\', \\'id\\':\\'9999\\'}"
Yep IE treats flashVars differently to all the other major browsers, I believe you need to make use of the JavaScript encodeURIComponent method which will escape all reserved characters from your String, eg:
// Removing all reserved characters from the flashVar value.
var flashVars = {
myVar: encodeURIComponent('{"url":"http://google.com/", "id":"9999"}'),
};
If you are passing multiple values in the flashVars then you could iterate through them and encode all chars in a single pass:
var flashVars = {
myVar: '{"url":"http://google.com/", "id":"9999"}',
anotherVar: 42
};
// Escape all values contained in the flashVars object.
for (key in flashVars) {
if (flashVars.hasOwnProperty(key)) {
flashVars[key] = encodeURIComponent(flashVars[key]);
}
}
As #dgmdan and #bcmoney suggested, it would probably make your code easier to read if you made use of JSON.stringify - however, you need to bear in mind that IE8 and below do not have a native JSON object, so you will need to include Crockford's JS Library in your HTML page.
// Making use of a JSON library.
var flashVars = {
myVar: encodeURIComponent(JSON.stringify({ url: "http://google.com/", id: "9999"})),
};
Also, it's worth bearing in mind that flashVars are limited to ~64k; so if you are planning to pass a lot of data, it might be better to use an ExternalInterface call to pull them from the JavaScript instead.
Try this to replace your first 3 lines:
var subVars = { url: "http://google.com/", id: "9999" };
var flashVars = { myVar: JSON.stringify(subVars) };

Resources