How to parse with parse5? - node.js

For example, with parse5: the parse function returns a document, but that document has no querySelector function (compare https://developer.mozilla.org/en-US/docs/Web/API/Document).
import fetch from 'node-fetch';
import { parse } from 'parse5';
// Fetch the page without following redirects, parse the body with parse5 and
// try to look up the <title> element on the parsed result.
(async () => {
  const response = await fetch('https://google.com', { redirect: "manual" });
  const html = await response.text();
  const dom = parse(html);
  console.log(dom.querySelector('title'));
})();

The object that the parse function returns is called a Document, but it's not the same as the Web API Document you'd find in a browser. Instead, it's the root of a very simple tree data structure. The documentation for this data structure is at https://github.com/inikulin/parse5/blob/master/packages/parse5/docs/tree-adapter/default/interface-list.md
Despite its simplicity, the tree has everything you need to parse the document. It's likely that you'll have to recursively search for the element you're looking for. Try this code as an example:
const fetch = require('node-fetch');
const { parse } = require('parse5');
/**
 * Depth-first, pre-order search for the first descendant of `node` whose
 * `tagName` equals `tag`.
 *
 * @param {object} node - Tree node; its `childNodes` list (if any) is searched.
 * @param {string} tag - Tag name to look for.
 * @returns {object|null} The first matching descendant, or null when none is found.
 */
const getDescendantByTag = (node, tag) => {
  // Nodes without a childNodes list are leaves.
  for (const child of node.childNodes ?? []) {
    if (child.tagName === tag) return child;
    const match = getDescendantByTag(child, tag);
    if (match) return match;
  }
  return null;
};
// Fetch the page without following redirects, parse it and print the first
// <title> node found in the parse5 tree.
fetch('https://google.com', { redirect: 'manual' })
  .then((response) => response.text())
  .then((text) => {
    console.log(getDescendantByTag(parse(text), 'title'));
  })
  .catch((error) => {
    // Without a handler a network or parse failure becomes an unhandled rejection.
    console.error('Failed to fetch or parse the page:', error);
    process.exitCode = 1;
  });

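I'm not sure, but I think Node.js has no built-in implementation of querySelector.
You can use a polyfill to work around this, such as this implementation from GitHub (note that it relies on a browser-style global document object, so it only helps where such a DOM is available):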
/**
 * Polyfills the querySelector and querySelectorAll methods.
 *
 * Does nothing when the document already provides either method. Otherwise it
 * installs both on `document` and `document.body`, implemented on top of
 * document.all, document.createStyleSheet(), addRule()/removeRule() and
 * element.currentStyle (legacy Internet Explorer APIs).
 *
 * @see https://gist.github.com/Fusselwurm/4673695
 */
(function () {
    var style;

    /**
     * Returns up to `maxCount` elements that match `selector`.
     *
     * A temporary rule `selector { foo: bar }` is added to a private style
     * sheet; every element whose currentStyle then reports foo === "bar"
     * matches the selector.
     *
     * @param {string} selector CSS selector to match.
     * @param {number} maxCount Maximum number of elements to collect.
     * @returns {Array} Matching elements in document.all order.
     */
    var select = function (selector, maxCount) {
        var all = document.all,
            l = all.length,
            i,
            resultSet = [];

        style.addRule(selector, "foo:bar");
        for (i = 0; i < l; i += 1) {
            if (all[i].currentStyle.foo === "bar") {
                resultSet.push(all[i]);
                // Stop as soon as the limit is reached; the previous `>` test
                // kept scanning until one element too many had been collected.
                if (resultSet.length >= maxCount) {
                    break;
                }
            }
        }
        // The private sheet only ever holds the temporary rule, at index 0.
        style.removeRule(0);
        return resultSet;
    };

    // Native support present: leave the document untouched.
    if (document.querySelectorAll || document.querySelector) {
        return;
    }

    style = document.createStyleSheet();
    document.querySelectorAll = document.body.querySelectorAll = function (selector) {
        return select(selector, Infinity);
    };
    document.querySelector = document.body.querySelector = function (selector) {
        return select(selector, 1)[0] || null;
    };
}());
Polyfills help with browser support and with implementations that are missing in Node.js. You can use the same technique for other functions (for example, to support functions that do not exist in IE7 in a React application).
NOTE: Also available as npm package, see https://www.npmjs.com/package/polyfill-queryselector

hast-util-select can help
/*
npm i parse5 hast-util-from-parse5 hast-util-to-html hast-util-select
*/
import {parse} from 'parse5'
import {fromParse5} from 'hast-util-from-parse5'
import {toHtml} from 'hast-util-to-html'
import {matches, select, selectAll} from 'hast-util-select'
/**
 * Parse an HTML string with parse5 (source location info enabled) and convert
 * the resulting parse5 AST with fromParse5.
 *
 * @param {string} source - HTML source text.
 * @returns {object} The value returned by fromParse5 for the parsed document.
 */
function parseHtml(source) {
  const parse5Options = { sourceCodeLocationInfo: true }
  return fromParse5(parse(source, parse5Options))
}
// Sample document: two divs carry the class we select on, one does not.
const html = `
<html>
<body>
<div class="some-class">find me 1</div>
<div class="some-class">find me 2</div>
<div>ignore me</div>
</body>
</html>
`
const tree = parseHtml(html)
// Print every matching element serialized back to HTML.
selectAll("div.some-class", tree).forEach((div) => {
  console.log(toHtml(div))
})
gives
<div class="some-class">find me 1</div>
<div class="some-class">find me 2</div>

Related

Best practice using promises chaining in node JS

I'm a little bit confused about promises. First, I have some ugly code like this:
/**
 * Question code: ensures a cookie file is present (logging in when it is
 * missing), then runs the presence check and collects ids from its result.
 * Mixes `await` with `.then()` callbacks.
 */
async function presence(ctx) {
try {
// NOTE(review): declared as "prsenceData", but the code below pushes to and
// returns "presenceData" - confirm which spelling is intended.
var prsenceData = [];
var isSuccess = Boolean(false);
var ckFilePath = "./somepath/cookie.json";
if (!fs.existsSync(ckFilePath)) {
// No cookie file yet: log in and store the cookies from the login result.
await menuLogin.login(ctx).then(login => {
isSuccess = Boolean(login[0].status);
myCk.saveCookies(login[0].cookies, ckFilePath);
if (!isSuccess) {
myCk.deleteCookies(ckFilePath);
// This return only leaves the .then() callback; presence() keeps running.
return false;
}
});
} else {
await myCk.checkToDelete(ckFilePath).then(isDel => {
if (isDel) {
// As above: returns from the callback, not from presence().
return false;
}
});
}
await presenceNow.check(fs.existsSync(ckFilePath), ctx).then(data => {
// Print every message of the first result entry.
for (let id = 0; id < data[0].pesan.length; id++) {
console.log(data[0].pesan[id]);
}
// NOTE(review): pushes the whole data[0].id array once per element;
// data[0].id[id] may have been intended - confirm.
for (let id = 0; id < data[0].id.length; id++) {
presenceData.push(data[0].id);
}
// No messages and the cookie file still exists: remove the cookie file.
if (data[0].pesan.length == 0 && fs.existsSync(ckFilePath)) {
myCk.deleteCookies(ckFilePath);
}
});
} catch (e) {
// Errors are only logged; execution continues with the return below.
console.log(e);
}
return presenceData;
}
Can anyone explain why the presenceNow.check() function is not called if my ckFilePath does not exist? If ckFilePath exists, my code runs fine. And maybe someone can show me better code for this case? Thanks.
Mixing async/await and promise chains like this is something of a code smell: it suggests that the author lacked an understanding of async/await. It's also something of a mixed metaphor.
If you refactor it to actually use async/await you get something like this that's a lot easier to understand.
My suspicion is that your presenceNow.check() method is not being called because the function is returning via one of the two return paths above it:
the file exists and myCk.checkToDelete() returns true, or
the file does not exist, and the login is unsuccessful.
const fs = require('fs/promises');
/**
 * Resolves to true when `path` is accessible and false otherwise.
 *
 * fs.promises.access() fulfils with undefined on success and rejects when the
 * path cannot be accessed, so its result cannot be used as a boolean directly.
 *
 * @param {string} path - File system path to probe.
 * @returns {Promise<boolean>}
 */
async function pathExists(path) {
  try {
    await fs.access(path);
    return true;
  } catch {
    return false;
  }
}

/**
 * Ensures a cookie file is available (logging in when it is missing), runs the
 * presence check and collects the ids from its result.
 *
 * @param {object} ctx - Context passed through to menuLogin.login() and presenceNow.check().
 * @returns {Promise<Array|false>} The collected presence data, or false when the
 *   cookie file was flagged by checkToDelete() or the login did not succeed.
 */
async function presence(ctx) {
  const presenceData = [];
  const ckFilePath = "./somepath/cookie.json";

  if (await pathExists(ckFilePath)) {
    const isDel = await myCk.checkToDelete(ckFilePath);
    if (isDel) {
      return false;
    }
  } else {
    const login = await menuLogin.login(ctx);
    myCk.saveCookies(login[0].cookies, ckFilePath);
    if (!login[0].status) {
      myCk.deleteCookies(ckFilePath);
      return false;
    }
  }

  // Re-check: the file may have been created by saveCookies() above.
  const data = await presenceNow.check(await pathExists(ckFilePath), ctx);
  for (const message of data[0].pesan) {
    console.log(message);
  }
  // NOTE(review): kept from the original - pushes the whole data[0].id array
  // once per element; data[0].id[id] may have been intended.
  for (let id = 0; id < data[0].id.length; id++) {
    presenceData.push(data[0].id);
  }
  if (data[0].pesan.length === 0 && (await pathExists(ckFilePath))) {
    myCk.deleteCookies(ckFilePath);
  }
  return presenceData;
}

Deleting child's child elements while web scrapping and writing it to a html file using NodeJS puppeteer

I'm doing web scraping and writing the data to another HTML file.
On the line " const content = await page.$eval('.eApVPN', e => e.innerHTML);" I'm fetching the inner HTML of a div. This div contains multiple p tags, and inside those p tags there are multiple hyperlink (a) tags.
I want to remove the href of those tags, but I'm unable to do so.
const fs = require('fs').promises;
const helps = require('./_helpers');
const OUTDIR = './results/dataset/'
// Make sure the output directory exists before the scraper writes into it.
// With `recursive: true`, mkdir creates missing parent directories and does
// not reject when the directory is already there, so no stat() probe or
// error-message matching is needed.
fs.mkdir(OUTDIR, { recursive: true }).catch((err) => {
  console.error(`Could not create output directory ${OUTDIR}:`, err);
});
const scraperObject = {
  /**
   * Visits every coin page listed in ./results/dataset.csv and writes the
   * inner HTML of the `.eApVPN` element to `${OUTDIR}<symbol>.html`.
   *
   * The cookies of the first successfully loaded page are saved to
   * ./storage/cookies. A failure on one page is logged and the loop continues.
   *
   * @param {object} browser - Browser instance; only `newPage()` is used here.
   * @returns {Promise<void>}
   */
  async scraper(browser) {
    const dataSet = await helps.readCSV('./results/dataset.csv');
    // Was `dataset` (lower-case s), which threw a ReferenceError before any page was visited.
    console.log("dataset is : ", dataSet);
    let cookies = null;
    const page = await browser.newPage();
    try {
      for (const { coinPage: url, symbol: filename } of dataSet) {
        try {
          console.log(`Navigating to ${url}...`);
          await page.goto(url);
          if (cookies == null) {
            cookies = await page.cookies();
            await fs.writeFile('./storage/cookies', JSON.stringify(cookies, null, 2));
          }
          await helps.autoScroll(page);
          await page.waitForSelector('.eApVPN');
          const content = await page.$eval('.eApVPN', e => e.innerHTML);
          // The promise API takes no callback; failures reject and are handled by the catch below.
          await fs.writeFile(`${OUTDIR}${filename}.html`, content);
          console.log("Written to HTML successfully!");
        } catch (err) {
          console.log(err, '------->', filename);
        }
      }
    } finally {
      // Close the page even when iterating the data set throws.
      await page.close();
    }
  }
}
module.exports = scraperObject;
Unfortunately Puppeteer doesn't have native functionality to remove nodes. However, you can use .evaluate method to evaluate any javascript script against the current document. For example a script which removes your nodes would look something like this:
// Runs inside the page: remove every element matched by the selector.
await page.evaluate((sel) => {
  document.querySelectorAll(sel).forEach((element) => {
    element.remove();
  });
}, ".eApVPN>a")
The above code will remove any <a> nodes directly under a node with eApVPN class. Then you can extract the data with your $eval selector.

Working code below to fetch all the data from my MongoDb databse but now I want to only fetch based on lets say courses category. How can I do it?

Below is working code that I used to fetch all my data from my MongoDB database. It worked fine, and now I am looking to fetch only the data that matches certain criteria. Let's say I have a list of courses with title, description, category, and price. I want to show only the ones whose category is 'coding'. How can I do it?
This is React Code
import "./style/courses.css";
import {
Link
} from "react-router-dom";
import React, {
useEffect,
useState
} from "react";
import './style/home.css';
/**
 * Course list component: loads all courses from
 * http://localhost:8000/courses once and renders one product card per course
 * (image, title, category, first 100 characters of the description, price and
 * a link to /courses/<_id>).
 */
function Course() {
const [courses, setCourses] = useState([]);
useEffect(() => {
// Request the full course list and parse the response body as JSON.
fetch("http://localhost:8000/courses")
.then(response => response.json())
.then(data => {
setCourses(data); // store the fetched courses in state
});
}, []); // empty dependency array: the effect is not re-run on re-render
// One card per course, keyed by the course's _id.
return (
courses.map(course => {
return ( <
div className = "main"
key = {
course._id
} >
<
div className = "product" >
<
img src = {
course.image
}
alt = {
course.title
}
/>
<
div className = "product__info" >
<
p className = "info__name" > {
course.title
} < /p> <
p className = "info__name" > {
course.category
} < /p>
<
p className = "info__description" > {
course.description.substring(0, 100)
}... < /p>
<
p className = "info__price" > $ {
course.price
} < /p>
<
Link to = {
`/courses/${course._id}`
}
className = "info__button" >
View <
/Link>
<
/div> <
/div> <
/div>
)
})
)
}
export default Course;
<script src="https://cdnjs.cloudflare.com/ajax/libs/react/16.6.3/umd/react.production.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react-dom/16.6.3/umd/react-dom.production.min.js"></script>
This is backend routes
const express = require("express");
const mongoose = require("mongoose");
const categoryController = require("../controllers/categorycontroller.js");

// Router whose GET / requests are handled by categoryController.findCategory.
const router = express.Router();
router.get('/', categoryController.findCategory);

module.exports = router;
with initial route
// Mount the category router under /courses/category.
// NOTE(review): the identifier is spelled "categoryeRoute" - confirm it matches the name the router is imported under.
app.use('/courses/category', categoryeRoute)
This is controller
const Category = require("../models/course");
//find course by category
exports.findCategory = async (req, res) => {
try {
const Categoryquery = req.query.category;
const firstCategory = await Category.find({category:Categoryquery});
res.send(firstCategory);
} catch {
res.status(404).send({ error: "category is not found!" });
}
};
Your code is excellent...
This line const firstCategory = await Category.find({category:Categoryquery})
When testing your API with Postman (or whatever tool you used), I believe you must have used a direct value for the category, like const firstCategory = await Category.find({category:"science"}), which in turn displays only the science category.
But the major reason why your code isn't working is that you need to send a value to filter the courses by, since you already have this line:
const Categoryquery = req.query.category;
I suggest using a drop-down with all the categories in the frontend, so that you can in turn use onChange to pass the category value to the backend. Note that req.query here simply means the value is extracted from the query string of your URL.
Check this out

download <title> of a url with minimal data usage

For the purpose of generating links to other websites, I need to download the content of the <title> tag.
But I would like to use as little bandwidth as possible.
In the most hardcore variant, I would process the input stream and close the connection as soon as </title> is reached.
Or, e.g., fetch the first 1024 characters on the first attempt and, if they do not contain the whole title, fetch the whole thing as a fallback.
What could I use in nodejs to achieve this?
In case someone else is interested, here is what I ended up with (a very initial version; use it just to get an idea of what to use).
A few notes though:
the core of this solution is to read as few chunks of the HTTP response as possible, until the </title> is reached.
I am not sure how friendly it is, by convention, to interrupt the connection with response.destroy() (which probably closes the underlying socket).
import {get as httpGet, IncomingMessage} from 'http';
import {get as httpsGet} from 'https';
import {titleParser} from '../ParseTitleFromHtml';
/**
 * Reads the response chunk by chunk and stops as soon as titleParser finds a
 * title in the text received so far.
 *
 * The response is always destroyed before returning, so no further data is
 * downloaded once the title is known (or parsing fails).
 *
 * @param response HTTP response stream to read from.
 * @returns The parsed title, or null when the stream ended without one.
 */
async function parseFromStream(response: IncomingMessage): Promise<string|null> {
    // Streaming decoder: a multi-byte character split across two chunks is
    // decoded correctly instead of being turned into replacement characters.
    const decoder = new TextDecoder('utf-8');
    let body = '';
    try {
        for await (const chunk of response) {
            body += typeof chunk === 'string' ? chunk : decoder.decode(chunk, {stream: true});
            const title = titleParser.parse(body);
            if (title !== null) {
                return title;
            }
        }
        return null;
    } finally {
        // Also reached when titleParser.parse() throws.
        response.destroy();
    }
}
/** Reasons with which TitleDownloader.downloadTitle() rejects. */
export enum TitleDownloaderRejectionCodesEnum
{
// response status outside 200-299 (other than 404)
INVALID_STATUS_CODE = 'INVALID_STATUS_CODE',
// the request's 2000 ms timeout fired
TIMEOUT = 'TIMEOUT',
// response status 404, or a request error whose message contains ENOTFOUND
NOT_FOUND = 'NOT_FOUND',
FAILED = 'FAILED', // all other errors
}
/** Downloads only as much of a page as is needed to read its title. */
export class TitleDownloader
{
    /**
     * Requests `url` over http or https and resolves with the title parsed
     * from the response stream.
     *
     * @param url Absolute http(s) URL.
     * @returns The title, or null when the response contained none.
     * Rejects with a TitleDownloaderRejectionCodesEnum value:
     * NOT_FOUND (404 or ENOTFOUND), INVALID_STATUS_CODE (other non-2xx),
     * TIMEOUT (2000 ms timeout) or FAILED (anything else).
     */
    public async downloadTitle (url: string): Promise<string|null>
    {
        const isHttps = /^https:/i.test(url);
        const method = isHttps ? httpsGet : httpGet;
        return new Promise((resolve, reject) => {
            const clientRequest = method(
                url,
                (response) => {
                    const statusCode = response.statusCode;
                    if (statusCode === undefined || statusCode < 200 || statusCode >= 300) {
                        // Settle first: destroying the request may emit an 'error' event.
                        reject(statusCode === 404
                            ? TitleDownloaderRejectionCodesEnum.NOT_FOUND
                            : TitleDownloaderRejectionCodesEnum.INVALID_STATUS_CODE
                        );
                        clientRequest.destroy();
                        return;
                    }
                    // A failure while reading or parsing must settle the promise
                    // instead of surfacing as an unhandled rejection.
                    parseFromStream(response).then(
                        resolve,
                        () => reject(TitleDownloaderRejectionCodesEnum.FAILED)
                    );
                }
            );
            clientRequest.setTimeout(2000, () => {
                reject(TitleDownloaderRejectionCodesEnum.TIMEOUT);
                clientRequest.destroy();
            })
            .on('error', (err: any) => {
                const isNotFound = Boolean(err)
                    && typeof err.message === 'string'
                    && err.message.indexOf('ENOTFOUND') !== -1;
                reject(isNotFound
                    ? TitleDownloaderRejectionCodesEnum.NOT_FOUND
                    : TitleDownloaderRejectionCodesEnum.FAILED
                );
            });
        });
    }
}
export const titleDownloader = new TitleDownloader();

Refresh the content without page refresh

Hello, I have developed a chat application using socket.io, Express.js and Mongoose, and it works fine. It refreshes every few seconds and fetches new clients from the database if any exist. The problem is that the user can see that the div is refreshing, and sometimes the response also takes some time. How can I avoid this? Here is my code.
This is my server side code
// Every 50 seconds, push a fresh contact list to each connected client that
// has a username.
setInterval(function () {
  var allOnLine = io.sockets.clients();
  // Index loop instead of for...in, which also walks inherited or extra enumerable keys.
  for (var i = 0; i < allOnLine.length; i++) {
    var username = allOnLine[i].username;
    // Skip clients without a username. The old comparison against the string
    // "undefined" did not catch a username that is actually undefined.
    if (username == null || username === "undefined") {
      continue;
    }
    notifyAll(username);
  }
}, 50000);
and here is notify all method
/**
 * Looks up the contacts of `userName` and emits an 'updateusers' event to that
 * user's room with the list of contact ids; contacts that are currently
 * connected get the suffix " -On".
 *
 * @param {string} userName - User whose contact list is sent.
 */
function notifyAll(userName) {
  contactModel.find({'userId': userName}, function (err, contacts) {
    if (err) {
      console.error('Could not load contacts for ' + userName + ':', err);
      return;
    }

    // Usernames of all connected clients, for constant-time lookup.
    var onlineNames = Object.create(null);
    var allOnLine = io.sockets.clients();
    for (var c = 0; c < allOnLine.length; c++) {
      var onlineName = allOnLine[c].username;
      if (onlineName == null || onlineName === "undefined") {
        continue;
      }
      onlineNames[onlineName] = true;
    }

    // Local list (previously an implicit global shared between calls).
    var usernames = [];
    for (var a = 0; a < contacts.length; a++) {
      var contactId = contacts[a].contactId;
      usernames.push(onlineNames[contactId] === true ? contactId + " -On" : contactId);
    }

    io.sockets.to(userName).emit('updateusers', usernames);
  });
}
This is my client code
// Rebuild the contact list whenever the server sends a new 'updateusers' list.
socket.on('updateusers', function (usernames) {
  var $list = jQuery('#usernames');
  $list.html('');
  jQuery.each(usernames, function (key, value) {
    // Build the element through jQuery instead of concatenating markup, so a
    // username containing quotes or HTML cannot break out of the attribute.
    // NOTE(review): the original markup had no content and no closing tag;
    // showing the name as text is assumed - confirm. The id "chatLink" is kept
    // as in the original although it repeats for every contact.
    jQuery('<div>', { 'class': 'chatContact', id: 'chatLink', text: value })
      .on('click', function () {
        privateChat(value);
      })
      .appendTo($list);
  });
});
Any help is appreciated. I have also posted this question before but got no answer.
Your problem is that you delete everything from #usernames and after that you write all the contacts again. It would be better to remove only the offline contacts from $('#usernames') and then add the new online contacts to that list. I wrote some functions to show you this functionality. I created an HTML list of online contacts, and I also created an array of new online contacts. Here is the code:
<div id="u">
<div class="d" onclick="chat('asd1')">asd1</div>
<div class="d" onclick="chat('asd12')">asd12</div>
<div class="d" onclick="chat('asd13')">asd13</div>
<div class="d" onclick="chat('asd142')">asd14</div>
</div>
Here you have the javascript that you need to run after the DOM is ready:
// Names that should be shown in the list after the update (demo data).
var onlineUsernames = ["asd211","asd12","asd13","asd14"];
// id of the element that holds the user divs.
var usernamesContainerID = 'u';
// jQuery object for that container; new user divs are appended to it.
var $usernamesContainer = $('#'+usernamesContainerID);
/**
 * Returns the text between the first and second single quote of an onclick
 * value such as "chat('asd1')".
 *
 * @param {string} onclickValue - Value of an onclick attribute.
 * @returns {string|undefined} The quoted text, or undefined when the value contains no quote.
 */
function extractUsernameFromAttr(onclickValue)
{
    var pieces = onclickValue.split("'");
    return pieces[1];
}
/**
 * Collects the username of every user div, read from its onclick attribute.
 *
 * @param {object} $userDivs - jQuery collection of user divs.
 * @returns {Array} Usernames in the order of the divs.
 */
function buildExistingUsernames($userDivs)
{
    var names = [];
    $userDivs.each(function (position, userDiv) {
        names.push(extractUsernameFromAttr(userDiv.getAttribute('onclick')));
    });
    return names;
}
/**
 * Removes a user div from the usernames container.
 *
 * @param {object} $user - Child DOM node of the container to remove.
 */
function removeUserFromList($user)
{
    var container = document.getElementById(usernamesContainerID);
    container.removeChild($user);
}
/**
 * Appends a new user div (class "d", text and onclick built from `value`) to
 * the usernames container.
 *
 * @param {string} value - Username to add.
 */
function addUserToList(value)
{
    var attributes = {
        'class': 'd',
        text: value,
        onclick: "chat('" + value + "')"
    };
    var $entry = $('<div/>', attributes);
    $entry.appendTo($usernamesContainer);
}
/**
 * Removes the div of every listed user whose name is not in `usernames`.
 *
 * @param {Array} existingUsernames - Names currently in the list, in div order.
 * @param {Array} usernames - Names that should remain.
 * @param {object} $userDivs - Divs matching existingUsernames by index.
 */
function deleteOfflineContacts(existingUsernames,usernames,$userDivs)
{
    $.each(existingUsernames, function (position, existingName) {
        var stillOnline = $.inArray(existingName, usernames) !== -1;
        if (stillOnline) {
            return;
        }
        removeUserFromList($userDivs[position]);
    });
}
/**
 * Adds a div for every name in `usernames` that is not listed yet.
 *
 * @param {Array} existingUsernames - Names currently in the list.
 * @param {Array} usernames - Names that should be in the list.
 */
function addOnlineContacts(existingUsernames,usernames)
{
    $.each(usernames, function (position, onlineName) {
        var alreadyListed = $.inArray(onlineName, existingUsernames) !== -1;
        if (alreadyListed) {
            return;
        }
        addUserToList(onlineName);
    });
}
/**
 * Synchronises the displayed user divs with onlineUsernames: divs of users
 * that are no longer online are removed, then missing online users are added.
 *
 * @param {object} $userDivs - The user divs currently in the list.
 */
function update($userDivs)
{
    var listedNames = buildExistingUsernames($userDivs);

    deleteOfflineContacts(listedNames, onlineUsernames, $userDivs);
    addOnlineContacts(listedNames, onlineUsernames);
}
// Capture the divs present now and synchronise the list three seconds later.
var $userDivs = $usernamesContainer.children("div");
setTimeout(() => {
    update($userDivs);
}, 3000);
If you need it here is a working example: http://jsfiddle.net/9gRyQ/2/

Resources