Good evening everyone. I'm trying to get the array that is assigned to this.data = [{"id_penggiat":"6f81520f-1a8b-4d1d-a7d5-defdb630818b"...] using cheerio and puppeteer. I have tried my best but I am not getting any results. Please help me solve this.
<script>
.......
var PenggiatField = {
fieldid: '#penggiatField',
data: true,
debug: false,
Init: function(params) {
var field = this;
this.data = [{"id_penggiat":"6f81520f-1a8b-4d1d-a7d5-defdb630818b","id_kerma":"9388706e-7156-4d07-bf77-cb9c6d7f30d3","id_mitra":null,"id_unit":"89c82da3-7b28-4305-82c4-347dae042847","id_unit_teknis":null,"no_pihak":"1","nm_pihak":"Institut Teknologi Sepuluh Nopember","alamat":"Kampus ITS Sukolilo Surabaya","ttd_nama":"Tri Joko Wahyu Adi, ST., MT., Ph.D.","ttd_jabatan":"Direktur","pic_nama":"","pic_jabatan":"","pic_email":null,"pelaksana":["c2c72bf4-ec6c-46dd-8be8-d9fb77a90bd6","7e42c5bb-0c44-4b93-9162-ec6fb31cd67c","902bd59e-745d-4903-b1bf-171fe6f0fb8c","c93fb75a-1065-4587-a7af-475d4256f44d","9a6592f0-ff86-401d-8021-0e9f6fc285fe","8f2d4244-d868-45e3-a906-578ce2f9f572","1f8fc06b-4707-4e11-bdc8-312f39017758"],"create_date":"2022-03-16 04:41:14.146436","id_creator":"79ce0204-5c6d-43dd-a310-8c13a557a402","last_update":"2022-03-16 04:54:25.228835","id_updater":"79ce0204-5c6d-43dd-a310-8c13a557a402","soft_delete":"0","a_lembaga":"university","id_institusi":"89c82da3-7b28-4305-82c4-347dae042847","nm_mitra":null,"nm_unit":"Institut Teknologi Sepuluh Nopember","nm_unit_teknis":null,"id_klas_mitra":"12","nm_klas_mitra":"Institusi Pendidikan","id_negara":"ID","nm_negara":"Indonesia"},{"id_penggiat":"c4c3db69-8cbb-4a8e-ae6c-544b532760c6","id_kerma":"9388706e-7156-4d07-bf77-cb9c6d7f30d3","id_mitra":"f4b81538-6442-4a19-b9f6-8581d152725d","id_unit":null,"id_unit_teknis":null,"no_pihak":"2","nm_pihak":"Sekolah Menengah Atas Negeri 1 Karas Magetan","alamat":"Jl. Karas, Punden, Kuwon, Kec. 
Kendal, Kabupaten Magetan, Jawa Timur 63395","ttd_nama":"Bahtiar Kholili, S.Pd., M.M.Pd.","ttd_jabatan":"Kepala Sekolah","pic_nama":"","pic_jabatan":"","pic_email":null,"pelaksana":null,"create_date":"2022-03-16 04:41:14.146436","id_creator":"79ce0204-5c6d-43dd-a310-8c13a557a402","last_update":"2022-03-16 04:54:25.228835","id_updater":"79ce0204-5c6d-43dd-a310-8c13a557a402","soft_delete":"0","a_lembaga":"mitra","id_institusi":"f4b81538-6442-4a19-b9f6-8581d152725d","nm_mitra":"Sekolah Menengah Atas Negeri 1 Karas Magetan","nm_unit":null,"nm_unit_teknis":null,"id_klas_mitra":"12","nm_klas_mitra":"Institusi Pendidikan","id_negara":"ID","nm_negara":"Indonesia"}];
........
</script>
in puppeteer the following would do the job, but it is not a general solution.
await page.goto(pageUrl)
// The callback runs inside the page, so everything it needs is declared within it.
const filteredArray = await page.$$eval('script', scripts => {
  // Marker used twice: first to pick the <script>, then to pick the line inside it.
  const hasMarker = text => text.includes('"id_penggiat":')
  // Source text of the first <script> that mentions the marker.
  const scriptSource = scripts.map(node => node.innerHTML).filter(hasMarker)[0]
  // First line of that script that mentions the marker.
  const dataLine = scriptSource.split('\n').filter(hasMarker)[0]
  // Drop the assignment prefix and the trailing semicolon so only the JSON literal is left.
  const jsonText = dataLine.replace(/this.data\W=\W|;$/gm, '').trim()
  return JSON.parse(jsonText)
})
console.log(filteredArray)
what's happening?
page.$$eval collects all <script> tags into an array
we map over all scripts' innerHTML
select the one that includes the "id_penggiat": string fragment
any JavaScript scripts will be received in a huge string with innerHTML, all lines will be delimited by \n linebreaks: we can split them one-by-one
let's select the exact line within the tag that includes "id_penggiat": string fragment
clean the output string from the un-parseable parts like the variable name, this, and the ending ;. plus trim the extra spaces from the beginning of the line.
the final result can be parsed as JSON.
note: using the 1st index [0] on the Array.filter-ed values picking only the first occurrences, if you want all of them: you will need to process further.
I am using Cloudinary to host my media on the cloud for my NodeJS project.
To delete an image from the Cloudinary cloud, I need to pass the Public ID of that image to the Cloudinary API.
I realised that the Public ID is embedded in the URL; how do I extract it from the URL?
Because, I don't want to store my data in this format :
image : {
url : `http://res.cloudinary.com/cloud_name/image/upload/v1647610701/rsorl4rtziefw46fllvh.png`,
publicId : `rsorl4rtziefw46fllvh`
}
Rather, I find it better to store it like this :
image : `http://res.cloudinary.com/cloud_name/image/upload/v1647610701/rsorl4rtziefw46fllvh.png`
The solution to this problem is to implement a function which extracts the publicId from any URL passed in as an argument.
Here's the function :
/**
 * Returns the part of the last "/"-separated segment of imageURL that precedes
 * its first ".".
 *
 * @param {string} imageURL - URL of the image.
 * @returns {string} File name of the URL up to (not including) its first dot.
 */
const getPublicId = (imageURL) => {
  const segments = imageURL.split("/");
  const fileName = segments[segments.length - 1];
  const [publicId] = fileName.split(".");
  return publicId;
};
Edited after #loic-vdb 's suggestion
Explanation :
It splits the string into an array using "/" as the separator.
imageURL="http://res.cloudinary.com/cloud_name/image/upload/v1647610701/rsorl4rtziefw46fllvh.png";
becomes,
imageURL = [ 'http:',
'',
'res.cloudinary.com',
'cloud_name',
'image',
'upload',
'v1647610701',
'rsorl4rtziefw46fllvh.png' ]
Next, pop the array (returns the last element of the array)
imageURL = 'rsorl4rtziefw46fllvh.png';
Now, split this string into an array using "." as the separator; we get:
imageURL = [ 'rsorl4rtziefw46fllvh', 'png' ]
Finally, select the 0th element, which is our PublicId, and return it:
imageURL = 'rsorl4rtziefw46fllvh';
Based on the answer by a Cloudinary support team member
... the public_id contains all folders and the last part of the public_id is the filename.
Here is what I tried and worked
const path = require("path");
/**
 * Extracts the Cloudinary public ID (folders included) from a delivery URL.
 *
 * Everything after the first "/upload/" is taken, a leading version component
 * ("v" followed by digits) is dropped, and the file extension is removed from
 * the end of the last path segment only.
 * NOTE(review): transformation segments between "/upload/" and the public ID
 * (e.g. "w_100,h_100/") are not stripped; confirm your stored URLs never carry them.
 *
 * @param {string} imageURL - Delivery URL containing "/upload/".
 * @returns {string} The public ID, e.g. "folder/sub/pic".
 * @throws {TypeError} If imageURL does not contain "/upload/".
 */
const getPublicId = (imageURL) => {
  const marker = "/upload/";
  const markerIndex = imageURL.indexOf(marker);
  if (markerIndex === -1) {
    throw new TypeError(`Expected a URL containing "${marker}", got: ${imageURL}`);
  }
  // Slice rather than split, so a folder path that itself contains "upload/" stays intact.
  const afterUpload = imageURL.slice(markerIndex + marker.length);
  // The version component belongs to the URL, not to the public ID.
  const publicIdWithExtensionName = afterUpload.replace(/^v\d+\//, "");
  // URLs always use "/", so use the POSIX flavour regardless of the host OS.
  const extensionName = path.posix.extname(publicIdWithExtensionName);
  // Trim from the end only: String#replace would remove the first match, which
  // may sit inside a folder name.
  return extensionName
    ? publicIdWithExtensionName.slice(0, -extensionName.length)
    : publicIdWithExtensionName;
};
especially for cases where you store your assets in folders
I'm studying about Jsoup and developing my own app showing some events.
I want to parse this Home Page
And I want to parse This Image
So, I wrote code like this.
private String url = "http://event.lottecinema.co.kr/LCHS/Contents/Event/event-summary-list.aspx";
...
Elements imgs = document.select("html body form#form1 div#wrap div#container.sub div#content div.event_Hwrap.allevPg ul#emovie_list_20.emvie_list li a " );
...
for(Element img : imgs) {
imageTitles.add(img.attr("alt")); //event title list }
But there are not any images...
I don't know why.. because I parsed some images other Google sites.
you may try this selector
// Select the <img> elements themselves: "alt" and "src" are read from each selected element.
Elements imgs = document.select("body img");
for (Element img : imgs)
{
    String imgalt_attrval = img.attr("alt");
    String imgsrc_attrval = img.attr("src");
}
I have the following xml:
<foo><toReplace/></foo>
I want to replace <toReplace/> tag with the following string:
"<b>bar</b>"
How can I do that?
Right now I have the following code:
var xml = "<foo><toReplace/></foo>";
var parser = new dom.DOMParser().parseFromString(xml, "text/xml");
parser.getElementsByTagName("toReplacce")[0].textNode = "<b>bar</b>";
console.log(parser.toString()); // "<foo><b>bar</b>"
The problem is that it escapes the HTML. How can I replace the content with the HTML string here?
you can always use the module from npm
var unescape = require('unescape');
console.log(unescape(parser.toString()))
When I tested your code there is a small typo: (toReplacce instead of toReplace)
var dom = require('xmldom');
var xml = "<foo><toReplace/></foo>";
// parseFromString returns the parsed document.
var parser = new dom.DOMParser().parseFromString(xml, "text/xml");
var a = parser.getElementsByTagName("toReplace")[0];
//console.dir(a);
// Assigning a string to an ad-hoc property such as textvalue leaves the tree
// untouched, so parse the markup into real nodes and swap them in for the
// placeholder element instead.
var fragment = new dom.DOMParser().parseFromString("<b>bar</b>", "text/xml");
// Nodes from another document have to be imported before they can be inserted.
var replacement = parser.importNode(fragment.documentElement, true);
a.parentNode.replaceChild(replacement, a);
console.log(parser.toString()); // expected: <foo><b>bar</b></foo>
how can I tell "sanitize-html" to actually remove the html tags (keep only the content within)? currently if for example I set it to keep the div sections, in the output it writes also the <div>some content</div> - I want only the inside...('some content')
to make it short - I don't want the tags, attributes etc. - only the content of those elements..
var Crawler = require("js-crawler");
var download = require("url-download");
var sanitizeHtml = require('sanitize-html');
var util = require('util');
var fs = require('fs');
// Crawl only the start page (depth 1), sanitize its HTML, then print the result
// and write it to sanitized.txt.
new Crawler().configure({depth: 1})
  .crawl("http://www.cnn.com", function onSuccess(page) {
    // Tags that are allowed to remain in the sanitized markup.
    var sanitizeOptions = {
      allowedTags: [ 'p', 'em', 'strong','div' ],
    };
    var clean = sanitizeHtml(page.body, sanitizeOptions);
    console.log(clean);

    // Completion handler for the file write: rethrow on failure, otherwise confirm.
    function reportSaved(err) {
      if (err) throw err;
      console.log('It\'s saved! in same location.');
    }
    fs.writeFile('sanitized.txt', clean, reportSaved);

    console.log(util.inspect(clean, {showHidden: false, depth: null}));
    var str = JSON.stringify(clean.toString());
    console.log(str);
    /*download(page.url, './download')
      .on('close', function () {
        console.log('One file has been downloaded.');
      });*/
  });
I'm the author of sanitize-html.
You can set allowedTags to an empty array. sanitize-html does not discard the contents of a disallowed tag, only the tag itself (with the exception of a few tags like "script" and "style" for which this would not make sense). Otherwise it wouldn't be much use for its original intended purpose, which is cleaning up markup copied and pasted from word processors and the like into a rich text editor.
However, if you have markup like:
<div>One</div><div>Two</div>
That will come out as:
OneTwo
To work around that, you can use the textFilter option to ensure the text of a tag is always followed by at least one space:
textFilter: function(text) {
return text + ' ';
}
However, this will also introduce extra spaces in sentences that contain inline tags like "strong" and "em".
So the more I think about it, the best answer for you is probably a completely different npm module:
https://www.npmjs.com/package/html-to-text
It's widely used and much better suited to your use case. sanitize-html is really meant for situations where you want the tags... just not the wrong tags.