How to get all fonts used on a page using node.js?

I need to crawl all the pages on a site (the crawling part works fine) and then run this script on my server using Node.js. I tried implementing the following logic:
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var jsdom = require("jsdom");
var { JSDOM } = jsdom;
var START_URL = "http://balneol.com/";
var SEARCH_FONT = "helvetica";
var MAX_PAGES_TO_VISIT = 100000;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(START_URL);
crawl();
function crawl() {
    if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
        console.log("Reached max limit of number of pages to visit.");
        return;
    }
    var nextPage = pagesToVisit.pop();
    if (nextPage in pagesVisited) {
        // We've already visited this page, so skip it and crawl the next one
        crawl();
    } else {
        // New page we haven't visited
        visitPage(nextPage, crawl);
    }
}
function visitPage(url, callback) {
    // Add page to our set
    pagesVisited[url] = true;
    numPagesVisited++;
    // Make the request
    console.log("Visiting page " + url);
    request(url, function(error, response, body) {
        // Check status code (200 is HTTP OK)
        console.log("Status code: " + response.statusCode);
        if (response.statusCode !== 200) {
            callback();
            return;
        }
        // Parse the window.document body
        // var window = jsdom.jsdom(body).defaultView();
        var { window } = new JSDOM(body);
        // var $ = cheerio.load(body);
        var helveticaFound = searchForHelvetica(window, 'font-family');
        if (helveticaFound) {
            console.log('Word ' + SEARCH_FONT + ' found at page ' + url);
        } else {
            // NOTE: $ is undefined here because the cheerio.load call above is commented out
            collectInternalLinks($);
            // In this short program, our callback is just calling crawl()
            // callback();
        }
    });
}
function searchForHelvetica(window, css) {
    if (typeof getComputedStyle == "undefined")
        getComputedStyle = function(elem) {
            return elem.currentStyle;
        };
    var who, hoo, values = [], val,
        nodes = window.document.body.getElementsByTagName('*'),
        L = nodes.length;
    for (var i = 0; i < L; i++) {
        who = nodes[i];
        console.log(nodes[i]);
        if (who.style) {
            hoo = '#' + (who.id || who.nodeName + '(' + i + ')');
            console.log(who.style._values);
            // return false;
            val = who.style.fontFamily || getComputedStyle(who, '')[css];
            if (val) {
                // NOTE: verbose is never defined anywhere in this script
                if (verbose) values.push([hoo, val]);
                else if (values.indexOf(val) == -1) values.push(val);
                // before IE9 you need to shim Array.indexOf (shown below)
            }
        }
    }
    // console.log(values);
    // return values;
}
function collectInternalLinks($) {
    var relativeLinks = $("a[href^='/']");
    console.log("Found " + relativeLinks.length + " relative links on page");
    relativeLinks.each(function() {
        pagesToVisit.push(baseUrl + $(this).attr('href'));
    });
}
If you look at my visitPage function, you will see the following two lines of code:
var { window } = new JSDOM(body);
var helveticaFound = searchForHelvetica(window, 'font-family');
As you can see, on the second line I am passing the window object to the searchForHelvetica function.
In my searchForHelvetica function, if I console.log(nodes[i]);, I don't get the HTML element, and hence the rest of the script doesn't quite run as expected. Does the jsdom window differ from the window object in the browser? How do I get the script working? I.e., basically use the window object to run through all the elements on the page and spit out all the fonts used on the page?
EDIT:
To break the problem down to a micro level, if I console.log(who); inside the searchForHelvetica function, I get the following result:
HTMLElement {}
HTMLDivElement {}
HTMLDivElement {}
HTMLDivElement {}
HTMLAnchorElement {}
HTMLImageElement {}
HTMLDivElement {}
HTMLFormElement {}
HTMLDivElement {}
HTMLLabelElement {}
HTMLInputElement {}
HTMLButtonElement {}
HTMLButtonElement {}
HTMLSpanElement {}
etc..
But if I were to do the same in a web browser, the result would be different, e.g.:
nodes = window.document.body.getElementsByTagName('*');
console.log(nodes[1]); // <div id="mobile-menu-box" class="hide">...</div>
How do I get a similar result in Node.js?
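For what it's worth, the nodes are real jsdom DOM elements; they only print as HTMLElement {} because Node's console has no DOM formatter the way browser devtools do. A minimal sketch of how to make the same loop readable under jsdom, using the standard outerHTML property and the window.getComputedStyle that jsdom itself provides (jsdom's computed styles only reflect stylesheets it could parse, so treat the result as best-effort; the inline HTML here is just a made-up example):

var { JSDOM } = require("jsdom");

var { window } = new JSDOM(
    "<body><div id='box' style='font-family: Helvetica'>Hi</div></body>"
);
var nodes = window.document.body.getElementsByTagName('*');

for (var i = 0; i < nodes.length; i++) {
    // Print the markup explicitly instead of relying on console formatting
    console.log(nodes[i].outerHTML);
    // Use the getComputedStyle belonging to the jsdom window, not a browser global
    console.log(window.getComputedStyle(nodes[i]).fontFamily);
}

Running this prints the div's markup followed by its font-family, which is the browser-like output the question is after.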

Related

Titanium http request leak

I have to make a load of subsequent HTTP requests to load product images into the app, as it has to function in an offline mode.
Around 2000 calls.
The HTTP client seems to have a memory leak which causes the persistent MBytes in Instruments to rise to around 200 without being garbage-collected.
After use, the HTTP client is set to null.
I have tried setting the file property of the HTTPClient, without any success.
I have set the onload function to only call the callback function, which in turn calls the HTTP send function again (thus looping through the 2000 products to get the respective pictures).
I changed from SDK 7.5.0.v20180824022007 to SDK 8.1.0.v20190423134840 and even SDK 9.0.0.v20181031080737, but the problem remains.
The code of my HTTP common module:
function HttpClient(options = {}) {
    this.root = options.root || "ROOT_OF_API";
    this.endpoint = options.endpoint || false;
    this.needsChecksum = options.needsChecksum || false;
    this.data = {};
    this.method = options.method || "Post";
    this.timeout = options.timeout || 5000;
    this.calculateChecksum = function () {
        var moment = require('alloy/moment');
        if (!Alloy.Models.user.authenticated()) {
            return false;
        }
        var sp = (moment().unix() - Alloy.Models.meta.get("timeDiff"));
        var hash = Ti.Utils.md5HexDigest("nX" + sp + "FossilSFAapp" + Alloy.Models.user.get('token').substring(10, 14) + "CS");
        var checksum = sp + "-" + hash.substring(4, 8);
        this.data.checksum = checksum;
    };
}
HttpClient.prototype.setData = function (data) {
    this.data = data;
};
HttpClient.prototype.send = function (callback) {
    // set new checksum for request if needed
    if (this.needsChecksum) {
        this.calculateChecksum();
    }
    // add app version
    if (this.method === "POST") {
        this.data.appversion = Ti.App.version;
    }
    // send
    var client = Ti.Network.createHTTPClient({
        onload: function () {
            callback({
                success: true
            });
        },
        onerror: function (e) {
            callback({
                message: e.message,
                success: false
            });
        },
        timeout: this.timeout
    });
    client.open(this.method, this.root + this.endpoint);
    if (this.setFile) {
        client.file = Ti.Filesystem.getFile(Ti.Filesystem.applicationDataDirectory, this.fileName);
    }
    client.setRequestHeader('Content-Type', 'application/json');
    client.send(JSON.stringify(this.data));
    client = null; // nulled immediately, while the request is still in flight
};
module.exports = HttpClient;
and then the module is used in the product model like so:
var HttpClient = require('./HttpClient');
var httpClient = new HttpClient();
function getImage(i) {
    if (collection.at(i) && collection.at(i).get('iimage0') && collection.at(i).needsImageUpdate()) {
        httpClient.endpoint = collection.at(i).get('acarticlenumber') + ".jpg";
        httpClient.fileName = 'productImages/' + collection.at(i).get('acarticlenumber') + '.jpg';
        httpClient.send(function(e) {
            if (i < collection.length) {
                i++;
                getImage(i);
            } else {
                finished();
            }
        });
    } else {
        if (i < collection.length) {
            i++;
            getImage(i);
        } else {
            finished();
        }
    }
}
// start getting images at index 0
getImage(0)
Does anyone have an idea why these memory leaks appear?
It only ever occurs when actually sending the HTTP request.
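A hedged guess rather than a confirmed diagnosis: the onload/onerror closures keep the client (and the captured callback) reachable while the request is in flight, so assigning client = null immediately after send() releases nothing. A minimal sketch of the send method that drops those references from inside the handlers instead, once the request has actually completed (same Titanium APIs as above; checksum and file handling omitted for brevity):

HttpClient.prototype.send = function (callback) {
    var client = Ti.Network.createHTTPClient({
        onload: function () {
            callback({ success: true });
            // release the references only after the request has finished
            client = callback = null;
        },
        onerror: function (e) {
            callback({ message: e.message, success: false });
            client = callback = null;
        },
        timeout: this.timeout
    });
    client.open(this.method, this.root + this.endpoint);
    client.setRequestHeader('Content-Type', 'application/json');
    client.send(JSON.stringify(this.data));
};

If Instruments still shows the growth with this change, the retained memory may instead be the response data held alive by the recursive callback chain in getImage.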

NodeJs + Request-promise - error catching

I'm having trouble with error handling in a function in my bot for Discord. What I've got right now is a command that scrapes information from a website; I want to make it so that if there is an error (404), the user gets some feedback. How would I go about doing this? Right now I have something, but I'm not sure what I'm doing wrong. Here is a snippet of the code:
//modules used
const rp = require('request-promise-native');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');
if (message.content.startsWith(prefix + 'latest')) {
    //website url variables
    const website_domain = "https://hypebeast.com/";
    let website_path = args[0];
    let website_url = website_domain + website_path;
    //extra arguments variable
    let extra_arg = args.slice(1).join(" ");
    if (extra_arg.length > 0) {
        message.reply('too many arguments! Please refer to `h.help` for correct usage.');
    } else {
        //opening url and scraping elements
        function scrapData(website_url) {
            return rp(website_url)
                .then(body => {
                    let items = [],
                        $ = cheerio.load(body).catch(errors.StatusCodeError, function (reason) {
                            console.log(reason);
                        });
                    //web scraping here
                    $('.post-box').each(function() {
                        let title = $(this).find($('.title h2 span')).first().text(),
                            caption = $(this).find($('.post-box-excerpt p')).first().text(),
                            article_url = $(this).find($('.col-hb-post-image a')).first().attr('href'),
                            thumbnail_long = $(this).find($('.thumbnail img')).first().attr('src');
                        //adding title, caption, etc to list
                        items.push({title, caption, article_url, thumbnail_long});
                        //check items in console
                        console.log(items);
                    })
                    return items;
                })
        }
I have just modified your code a little; try the code below.
//modules used
const rp = require('request-promise-native');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');
if (message.content.startsWith(prefix + 'latest')) {
    //website url variables
    const website_domain = "https://hypebeast.com/";
    let website_path = args[0];
    let website_url = website_domain + website_path;
    //extra arguments variable
    let extra_arg = args.slice(1).join(" ");
    if (extra_arg.length > 0) {
        message.reply('too many arguments! Please refer to `h.help` for correct usage.');
    } else {
        var options = {
            uri: website_url,
            transform: function (body) {
                return cheerio.load(body);
            }
        };
        rp(options)
            .then(function ($) {
                // Process html like you would with jQuery...
                let items = []; // declare the list before pushing into it
                $('.post-box').each(function() {
                    let title = $(this).find($('.title h2 span')).first().text(),
                        caption = $(this).find($('.post-box-excerpt p')).first().text(),
                        article_url = $(this).find($('.col-hb-post-image a')).first().attr('href'),
                        thumbnail_long = $(this).find($('.thumbnail img')).first().attr('src');
                    //adding title, caption, etc to list
                    items.push({title, caption, article_url, thumbnail_long});
                    //check items in console
                    console.log(items);
                });
            })
            .catch(function (err) {
                console.log(err);
            });
    }
}
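To actually send the 404 feedback the question asks for, one caveat: request-promise-native returns native promises, so the Bluebird-style two-argument .catch(errors.StatusCodeError, handler) is not available with it; an instanceof check inside a single catch works instead. A sketch (message.reply is the Discord.js call already used above):

rp(options)
    .then(function ($) {
        // ... scrape as above ...
    })
    .catch(function (err) {
        if (err instanceof errors.StatusCodeError) {
            // non-2xx response, e.g. a 404 for a bad path
            message.reply('could not fetch that page (HTTP ' + err.statusCode + ').');
        } else {
            // network failures, parse errors, etc.
            console.log(err);
        }
    });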

How to wait a request response and return the value?

When I use the request module in my node.js server, I have a problem with waiting for the response before returning.
I would like to receive the "responseObject" value in requestController.
I have searched for the best way to solve this problem, but I still have not found one.
How can I solve this problem?
Thanks in advance! :)
=========================================================================
var requestToServer = require('request');
function getRequest(requestObject) {
    var urlInformation = requestObject['urlInformation'];
    var headerInformation = requestObject['headerInformation'];
    var jsonObject = new Object();
    // Creating the dynamic body set
    for (var i = 0; i < headerInformation.length; i++)
        jsonObject[headerInformation[i]['headerName']] = headerInformation[i]['headerValue'];
    requestToServer({
        url: urlInformation,
        method: 'GET',
        headers: jsonObject
    }, function(error, response, body) {
        // todo response controlling
        var responseObject = response.headers;
        responseObject.body = body;
    });
}
// Controlling the submitted request
exports.requestController = function(requestObject) {
    var method = requestObject['methodInformation'];
    var resultObject = null;
    // Selecting the method
    if (method == "GET")
        resultObject = getRequest(requestObject);
    else if (method == "POST")
        resultObject = postRequest(requestObject);
    else if (method == "PUT")
        resultObject = putRequest(requestObject);
    else if (method == "DELETE")
        resultObject = deleteRequest(requestObject);
    console.log(JSON.stringify(resultObject));
}
You can use callbacks in the following way.
function getRequest(requestObject, callback) {
    // some code
    requestToServer({
        url: urlInformation,
        method: 'GET',
        headers: jsonObject
    }, function(error, response, body) {
        // todo response controlling
        var responseObject = response.headers;
        responseObject.body = body;
        callback(responseObject);
    });
}
And
// Controlling the submitted request
exports.requestController = function(requestObject) {
    var method = requestObject['methodInformation'];
    // Selecting the method
    if (method == "GET")
        getRequest(requestObject, function(resultObject) {
            console.log(JSON.stringify(resultObject));
        });
    // some code
}
Hope it helps.
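On Node versions with async/await support, the same flow can also be expressed with a Promise, which avoids threading a callback through every method branch. A sketch under the question's own names (requestToServer, requestObject, urlInformation come from the code above; only the GET branch is shown):

function getRequest(requestObject) {
    return new Promise(function (resolve, reject) {
        requestToServer({
            url: requestObject['urlInformation'],
            method: 'GET'
        }, function (error, response, body) {
            if (error) return reject(error);
            var responseObject = response.headers;
            responseObject.body = body;
            resolve(responseObject);
        });
    });
}

// Controlling the submitted request
exports.requestController = async function (requestObject) {
    if (requestObject['methodInformation'] == "GET") {
        var resultObject = await getRequest(requestObject);
        console.log(JSON.stringify(resultObject));
    }
};

Note the controller itself becomes async, so its caller receives a Promise as well; the value cannot be returned synchronously either way.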

Stop node-simplecrawler instance when creating a new one (make it behave like a singleton)

Hello, everyone!
I am making a scraper which uses node-simplecrawler. Everything runs fine, but what I can't figure out is how to stop one instance when creating a new one (I want to have only one running at a time). I am using Express for this, and all the scraping logic is in one route. In order to cancel the crawling right now, I need to stop the node process and run the app again.
Here is the part of the code that concerns running the crawler (note: I've simplified the code a little bit, so it's shorter):
module.exports = function(socket) {
    var express = require('express');
    var router = express.Router();
    // [... requires continue...]
    /* GET scanning page. */
    router.post('/', function(req, res, next) {
        res.render('scanning'); // Load the socket.io host page
        var render = {};
        var pages = [];
        var timer = new Date();
        // Helper func to log the requests.
        function log(message) {
            var now = new Date();
            console.log(now - timer + 'ms', message);
            timer = now;
        }
        // Ensure URL format, parse URL
        // Check if URL exists
        request(url.href, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                // URL exists, so let's scan it
                // Exclude links to the following extensions:
                var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp',
                    'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip',
                    'rar', '7z', 'css', 'js', 'gzip', 'exe', 'xml', 'svg'];
                var exts = exclude.join('|');
                var regexReject = new RegExp('\\.(' + exts + ')', 'i'); // escape the dot so it only matches a literal "."
                var rootURL = url.protocol + '//' + url.host + '/';
                // Crawler configuration
                var crawler = new Crawler(url.host);
                crawler.initialPort = 80;
                crawler.initialPath = url.pathname;
                crawler.maxConcurrency = 1;
                crawler.ignoreWWWDomain = false; // This is a little suspicious...
                crawler.filterByDomain = true; // Only URLs from the current domain
                crawler.scanSubdomains = true;
                crawler.downloadUnsupported = false;
                crawler.parseHTMLComments = false;
                crawler.parseScriptTags = false;
                crawler.acceptCookies = false;
                // crawler.maxDepth = 1 // Debug only!
                /*
                 * Fetch Conditions
                 */
                // Get only URLs, ignore feeds, only from this host
                crawler.addFetchCondition(function (parsedURL) {
                    return (
                        !parsedURL.path.match(regexReject) && // Only links
                        (parsedURL.path.search('/feed') === -1) && // Ignore feeds
                        (parsedURL.host === url.host) // Page is from this domain
                    );
                });
                // Should we only include subpages?
                if (onlySubpages) {
                    crawler.addFetchCondition(function(parsedURL) {
                        // return parsedURL.path.search(url.pathname) > -1;
                        return parsedURL.path.startsWith(url.pathname);
                        // console.log(url, parsedURL);
                    });
                }
                // Exclude urls with fragments?
                if (excludeUrls.length >= 1) {
                    crawler.addFetchCondition(function(parsedURL) {
                        var urlFragmentsOk = true;
                        excludeUrlFragments.forEach(function(fragment) {
                            if (parsedURL.path.search('/' + fragment) > -1) {
                                urlFragmentsOk = false;
                            }
                        });
                        return urlFragmentsOk;
                    });
                }
                // Include only URLs with fragments
                if (includeOnlyUrls.length >= 1) {
                    crawler.addFetchCondition(function(parsedURL) {
                        var urlFragmentsOk = false;
                        var includeUrlFragments = includeOnlyUrls.replace(/\s/, '').split(',');
                        includeUrlFragments.forEach(function(fragment) {
                            if (parsedURL.path.search('/' + fragment) !== -1) {
                                urlFragmentsOk = true;
                            }
                        });
                        return urlFragmentsOk;
                    });
                }
                // Run the crawler
                crawler.start();
                // Execute for each URL, on fetchcomplete
                crawler.on('fetchcomplete', function(item, responseBuffer, response) {
                    // [Do stuff with the scraped page]
                });
                // Completed crawling. Now let's get to work!
                crawler.on('complete', function() {
                    // [Get all scraped pages and do something with them]
                });
                // Error handling
                crawler.on('queueerror', function(errorData, URLData) {
                    console.log('Queue error:', errorData, URLData);
                });
                crawler.on('fetchdataerror', function(queueitem, response) {
                    console.log('Fetch error:', queueitem, response);
                });
                crawler.on('fetchtimeout', function(queueItem, crawlerTimeoutValue) {
                    console.log('Fetch timeout:', queueItem, crawlerTimeoutValue);
                });
                crawler.on('fetchclienterror', function(queueItem, errorData) {
                    console.log('Fetch local error:', queueItem, errorData);
                });
                // NOTE: this registers 'fetchtimeout' a second time; the earlier handler already covers it
                crawler.on('fetchtimeout', function(queueItem, crawlerTimeoutValue) {
                    console.log('Crawler timeout:', queueItem, crawlerTimeoutValue);
                });
            } else if (error) {
                console.log(error);
            }
        });
    });
    return router;
}
Every simplecrawler instance has a stop method that can be called to prevent the crawler from making any further requests (requests won't be stopped in flight, however).
I would probably store the crawler instance in a scope outside of the route handler, check if it's defined first thing in the route handler, call the stop method in that case, and then construct a new scraper.
I stripped out a lot of the meat of your code, but something like this is what I had in mind:
module.exports = function(socket) {
    var express = require('express');
    var router = express.Router();
    var Crawler = require('simplecrawler');
    var crawler;
    router.post('/', function(req, res, next) {
        // Check if URL exists
        request(url.href, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                // Stop any crawler that's already running
                if (crawler instanceof Crawler) {
                    crawler.stop();
                }
                // Crawler configuration
                crawler = new Crawler(url.host);
                crawler.initialPort = 80;
                crawler.initialPath = url.pathname;
                // Run the crawler
                crawler.start();
                // Execute for each URL, on fetchcomplete
                crawler.on('fetchcomplete', function(item, responseBuffer, response) {
                    // [Do stuff with the scraped page]
                });
                // Completed crawling. Now let's get to work!
                crawler.on('complete', function() {
                    // [Get all scraped pages and do something with them]
                });
            } else if (error) {
                console.log(error);
            }
        });
    });
    return router;
}

Search for var in external page

I want to make a script that searches for a variable in an external page.
For example, I want the script to visit this page: Here,
check whether a server is available, and notify me somehow.
Can someone help me with this?
My solution is to retrieve the text of the source of a page, then search for any strings we want in that source. Just add another search call to extend the search. Works in Firefox.
<script>
function print(text) {
    alert(text);
}
function search(where, what) {
    var position = where.indexOf(what);
    if (position !== -1) {
        print("Found \"" + what + "\" at " + position);
        return;
    } // else
    print("\"" + what + "\" not found");
}
var xhr = new XMLHttpRequest();
xhr.open("GET", "https://billing.dacentec.com/hostbill/index.php?/cart/dedicated-servers/", true);
xhr.onload = function (e) {
    if (xhr.readyState === 4) {
        if (xhr.status === 200) {
            var externalText = xhr.responseText;
            search(externalText, "technology");
            search(externalText, "secure");
            search(externalText, "we");
        } else {
            print(xhr.statusText);
        }
    }
};
xhr.onerror = function (e) {
    print(xhr.statusText);
};
xhr.send(null);
</script>
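Since the question is about checking a page from a script, the same search can also run server-side in Node.js, without a browser or cross-origin concerns; a sketch using the request module that the other snippets on this page already rely on (the URL and search strings are just the ones from the snippet above):

var request = require('request');

function search(where, what) {
    var position = where.indexOf(what);
    if (position !== -1) {
        console.log('Found "' + what + '" at ' + position);
    } else {
        console.log('"' + what + '" not found');
    }
}

request('https://billing.dacentec.com/hostbill/index.php?/cart/dedicated-servers/',
    function (error, response, body) {
        if (error || response.statusCode !== 200) {
            return console.log('Request failed:', error || (response && response.statusCode));
        }
        search(body, 'technology');
        search(body, 'secure');
    });

From there, "notify me somehow" could be anything from a console message to an email, and scheduling the script with cron would turn it into a simple availability check.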
