Get the HTML content of a page after the JS loading

Get the HTML content of a page after the JS loading - node.js

I'm trying to get the results of a search in the Rust documentation. I made this code to do it :
// Fetch the rustdoc index page with a search query and look for the search
// results container in the markup the server returns.
let HTMLParser = require('node-html-parser');
let https = require('https');
const search = "foo";
let options = {
    host: "doc.rust-lang.org",
    // Encode the term so characters such as spaces or '&' cannot corrupt the query string.
    path: "/std/index.html?search=" + encodeURIComponent(search)
};
let request = https.get(options, (res) => {
    if (res.statusCode !== 200) {
        // Drain the unread body so the socket is released.
        res.resume();
        return console.log(`An error occured : ${res.statusCode}. Retry later.`);
    }
    res.setEncoding("utf8");
    let output = "";
    res.on("data", (chunk) => {
        output += chunk;
    });
    res.on("end", () => {
        let root = HTMLParser.parse(output);
        // Prints "null": the response is the static page; the search results are
        // produced by the page's own JavaScript, which is never executed here.
        console.log(root.querySelector(".search-results"));
    });
});
// https.get() ends the request itself, so no request.end() call is needed.
// Without an 'error' listener a DNS or connection failure would be thrown as an
// unhandled 'error' event.
request.on("error", (err) => {
    console.log(`Request failed : ${err.message}`);
});
But when I run this code, I get the HTML content of the index.html page as if I had requested it without the ?search=foo parameter. I found that the page changes dynamically with some JS when we search for something: the base content is set to hidden and the search div becomes visible. So it seems that the JS has not run when I get the request result, but I need it to run in order to get the results of the search in the documentation. I don't know how I can do that.
Thank you in advance for your answers !

The Rust doc page does not seem to hit a backend when a search is performed. I noticed this using the browser developer tools.
It looks like the page loads a search-index which contains the readily available docs. You can use this js to search for docs. The logic is written in the main.js.
Let me know if you are looking for more info, as I have not found out how the link generation on each doc item is created.
EDIT
All the logic required to build the url is in main.js. The method is as follows. If you take a close look at the aliases.js, main.js, storage.js and search-index.js files, you can reuse almost all of it to create the links and the required search outputs.
/**
 * Work out the text prefix and link target for one search-index entry.
 *
 * Reads the free variables `itemTypes` (maps `item.ty` to a type name) and
 * `rootPath` (prefix prepended to every generated link).
 *
 * @param {Object} item - entry with `ty`, `name`, `path` and, optionally, `parent`.
 * @returns {Array} two-element array `[displayPath, href]`.
 */
function buildHrefAndPath(item) {
    var kind = itemTypes[item.ty];
    var label = item.name;

    // Built on demand: the extern-crate branch never reads item.path.
    function moduleDir() {
        return rootPath + item.path.replace(/::/g, '/') + '/';
    }

    // Modules link to their own index page.
    if (kind === 'mod') {
        return [item.path + '::', moduleDir() + label + '/index.html'];
    }
    // Primitives and keywords are shown without a path prefix.
    if (kind === 'primitive' || kind === 'keyword') {
        return ['', moduleDir() + kind + '.' + label + '.html'];
    }
    // Extern crates link to the crate root.
    if (kind === 'externcrate') {
        return ['', rootPath + label + '/index.html'];
    }
    // Free-standing items get a page of their own.
    if (item.parent === undefined) {
        return [item.path + '::', moduleDir() + kind + '.' + label + '.html'];
    }

    // Items with a parent link to an anchor on the parent's page.
    var owner = item.parent;
    var fragment = '#' + kind + '.' + label;
    var ownerKind = itemTypes[owner.ty];
    var shownPath = ownerKind === 'primitive'
        ? owner.name + '::'
        : item.path + '::' + owner.name + '::';
    return [shownPath, moduleDir() + ownerKind + '.' + owner.name + '.html' + fragment];
}

Related

How to authorize with AWS signature 4 -> API Gateway -> Lambda

I've googled around a lot with no luck in finding the solution to my problem. I've read through the entire authentication process for AWS Signature 4 and followed their tutorial as well as view other sources. I'm trying to have client side authentication for a desktop application that makes request to API Gateway.
When I use Postman it works properly but I tried generating my own signature in Nodejs but to no avail, I keep getting 403 messages back from the call.
The function below returns the authenticated requestUrl which is then run by axios.get(requestUrl). When I use the Postman generated request it works perfectly fine but, once I use my generated request I have problems.
Am I missing something while authenticating? Here is what my code currently looks like:
/**
 * Build a pre-signed URL (AWS Signature Version 4, query-string authentication)
 * for a GET request to the API Gateway endpoint configured below.
 *
 * Relies on two names that are not defined in this snippet and must be in
 * scope: `crypto` (used for createHash/createHmac) and
 * `getSignatureKey(key, dateStamp, region, service)`, which must return the
 * derived signing key.
 *
 * @returns {string} the request URL including the `X-Amz-Signature` parameter.
 */
function Authorize() {
    const host = "EXAMPLE.execute-api.us-east-1.amazonaws.com"
    const reg = 'us-east-1'
    const meth = 'GET'
    const serv = 'execute-api'
    const endpoint = '/development/putImage'

    // Keys
    const access = "EXAMPLE"
    const key = "KEY"

    // Timestamps: amzDate is YYYYMMDD'T'HHMMSS'Z', dateStamp is its YYYYMMDD prefix.
    const amzDate = new Date().toJSON().replace(/[-:]/g, "").replace(/\.[0-9]*/, "");
    const dateStamp = amzDate.slice(0, 8);

    // ************* TASK 1: CREATE CANONICAL REQUEST *************
    const canonicalUri = endpoint
    // A canonical header is "<lowercase-name>:<trimmed value>\n". There must be
    // no space after the colon, otherwise the server computes a different hash.
    const canonicalHeaders = "host:" + host.trim() + "\n"
    const signedHeaders = 'host'
    const algorithm = 'AWS4-HMAC-SHA256'
    const credentialScope = dateStamp + "/" + reg + "/" + serv + "/" + "aws4_request"

    // The canonical query string must list the parameters sorted by name;
    // sending them in any other order yields a signature mismatch (HTTP 403).
    const queryParams = [
        ['X-Amz-Algorithm', algorithm],
        ['X-Amz-Credential', access + "/" + credentialScope],
        ['X-Amz-Date', amzDate],
        ['X-Amz-SignedHeaders', signedHeaders]
    ];
    queryParams.sort(function (a, b) {
        return a[0] < b[0] ? -1 : (a[0] > b[0] ? 1 : 0);
    });
    let canonicalQueryString = queryParams.map(function (pair) {
        return encodeURIComponent(pair[0]) + "=" + encodeURIComponent(pair[1]);
    }).join("&");

    // Empty payload for get request
    const payloadHash = crypto.createHash('sha256').update('').digest('hex');

    // Set canonical request
    const canonicalRequest = [
        meth,
        canonicalUri,
        canonicalQueryString,
        canonicalHeaders, // already ends with "\n", which gives the required blank separator
        signedHeaders,
        payloadHash
    ].join("\n");
    console.log(canonicalRequest)

    // ************* TASK 2: CREATE THE STRING TO SIGN*************
    const stringToSign = algorithm + '\n' + amzDate + '\n' + credentialScope + '\n' +
        crypto.createHash('sha256').update(canonicalRequest).digest('hex');

    // ************* TASK 3: CALCULATE THE SIGNATURE *************
    const signingKey = getSignatureKey(key, dateStamp, reg, serv)
    const signature = crypto.createHmac('sha256', signingKey).update(stringToSign).digest('hex');

    // ************* TASK 4: ADD SIGNING INFORMATION TO THE REQUEST *************
    // The signature is appended last; it is not part of the canonical query string.
    canonicalQueryString += '&X-Amz-Signature=' + signature
    const requestUrl = "https://" + host + endpoint + "?" + canonicalQueryString
    console.log(requestUrl)
    return requestUrl
}

The below code worked for me well. For more info, please visit https://docs.aws.amazon.com/opensearch-service/latest/developerguide/request-signing.html#request-signing-node
// Sign an HTTP request with Signature Version 4 using the AWS SDK for
// JavaScript v3 packages, send it, and print the response body.
const { HttpRequest } = require("@aws-sdk/protocol-http");
const { defaultProvider } = require("@aws-sdk/credential-provider-node");
const { SignatureV4 } = require("@aws-sdk/signature-v4");
const { NodeHttpHandler } = require("@aws-sdk/node-http-handler");
const { Sha256 } = require("@aws-crypto/sha256-browser");
// ... (the statements below use `await`, so they must run inside an async function)
var request = new HttpRequest({
    body: JSON.stringify({"users":["G0000000B","G0000000A"]}),
    headers: {
        'Content-Type': 'application/json',
        'apiKey':'XXXXXXXXXXXX',
        'apiSecret': 'XXXXXXXXXXXXXXXXXX',
        // The host header is part of what gets signed; keep it equal to `hostname`.
        'host': 'service2.xxx.xxx.xx'
    },
    hostname: 'service2.xxx.xxx.xx',
    method: 'POST',
    // A request path must start with '/'.
    path: '/API/user/list'
});
var signer = new SignatureV4({
    credentials: defaultProvider(),
    region: 'ap-southeast-1',
    service: 'execute-api',
    sha256: Sha256
});
const signedRequest = await signer.sign(request);
// Send the request
var client = new NodeHttpHandler();
var { response } = await client.handle(signedRequest)
console.log(response.statusCode + ' ' + response.body.statusMessage);
var responseBody = '';
// Settle the promise when the stream finishes or fails; a promise whose
// executor never calls resolve/reject would leave this `await` pending forever.
await new Promise((resolve, reject) => {
    response.body.on('data', (chunk) => {
        responseBody += chunk;
    });
    response.body.on('end', () => {
        console.log('Response body: ' + responseBody);
        resolve();
    });
    response.body.on('error', reject);
}).catch((error) => {
    console.log('Error: ' + error);
});

Unable to split converted mimetype in javascript

I am using the following code to rename a file that has been uploaded to my node.js server. I get the 'mimetype' of the uploaded file and convert it to a string, and my goal is to 'split' that string at the forward slash ('/'). I should then be able to use the 'extension' (i.e. the 'jpg' or 'gif' or 'png' or whatever) to create a new file name. My problem is that I keep getting an 'Unexpected token' error when I do the 'split'. I have confirmed the mimetype has been converted to a string, so I cannot for the life of me understand why the 'split' does not work. Code is as follows:
//A means of ensuring only images are uploaded.
//'files' is an array of uploaded image files, omitted for clarity
var len = files.length;
// `let` gives every iteration its own `i`, so the asynchronous fs.rename
// callback reports the index of the file it belongs to instead of `len`.
for (let i = 0; i < len; i++) {
    // Compare against the value undefined, not the string "undefined".
    if (files[i] !== undefined) {
        const host = req.hostname;
        // Public URL of the upload; used for logging only.
        const filePath = req.protocol + "://" + host + '/' + files[i].path;
        const type = String(files[i].mimetype); //convert mimetype to string
        const image = type.startsWith('image/');
        if (image) {
            console.log('photo #' + i + ' uploaded');
            console.log('uploaded file: ' + files[i].filename + ' saved within: ' + files[i].destination + ' at path: ' + files[i].path);
            console.log('photo #' + i + ' filepath: ' + filePath);
            console.log('photo #' + i + ' image extension is: ' + type); //returns 'image/jpg'
            console.log('photo #' + i + ' TYPEOF is: ' + typeof type); //returns 'string'
            // `extends` is a reserved word and cannot be used as a variable name;
            // using it is what raised the "Unexpected token" syntax error.
            var extensionParts = type.split("/"); //"split" on the forward slash
            var targetPath = filePath + "." + extensionParts[1] //'extensionParts[1]' should be jpg or png or whatever
            console.log('RENAMED target path for photo #' + i + ' is: ' + targetPath);
            // fs.rename works on file-system paths, not URLs, so rename the
            // stored file itself.
            var sourcePath = files[i].path;
            var renamedPath = sourcePath + "." + extensionParts[1];
            fs.rename(sourcePath, renamedPath, function(err) {
                if (err) {
                    console.log("Unable to rename photo #" + i + " file...!")
                } else {
                    console.log("Successfully renamed the file!")
                }
            })
        } else {
            console.log("file # " + i + " received--however wrong format");
        }
    } //if NOT undefined
} //for loop
I thank you in advance for any suggestions...this is driving me crazy...

I have an example with javascript (I can't reply in your answer )

How to get all fonts used on a page using node.js?

I need to crawl all the pages on a site (the crawling part works fine), so I need to run this script on my server using node.js. I tried implementing the following logic:
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var jsdom = require("jsdom");
var { JSDOM } = jsdom;
// Crawl configuration: the entry page, the font name to look for, and a cap
// on how many pages may be requested.
var START_URL = "http://balneol.com/";
var SEARCH_FONT = "helvetica";
var MAX_PAGES_TO_VISIT = 100000;
// Crawl state shared by crawl(), visitPage() and collectInternalLinks().
var pagesVisited = {}; // keyed by URL; a key is added when the page is requested
var numPagesVisited = 0;
var pagesToVisit = []; // pending URLs; crawl() takes the next one with pop()
// URL here is the 'url-parse' export required above.
var url = new URL(START_URL);
// Scheme and host of the start page; prefixed to root-relative links.
var baseUrl = url.protocol + "//" + url.hostname;
// Seed the queue and start crawling.
pagesToVisit.push(START_URL);
crawl();
/**
 * Take the next unvisited URL from `pagesToVisit` and hand it to visitPage(),
 * passing crawl itself as the continuation.
 *
 * Stops when `MAX_PAGES_TO_VISIT` is reached or when the queue is empty.
 * Reads and mutates the shared `pagesToVisit` array; reads `pagesVisited`,
 * `numPagesVisited` and `MAX_PAGES_TO_VISIT`.
 */
function crawl() {
    // A loop (rather than recursion) skips runs of already-visited URLs
    // without growing the call stack.
    while (true) {
        if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
            console.log("Reached max limit of number of pages to visit.");
            return;
        }
        // Without this guard pop() yields undefined and the crawler would try
        // to request the URL "undefined".
        if (pagesToVisit.length === 0) {
            console.log("No more pages to visit.");
            return;
        }
        var nextPage = pagesToVisit.pop();
        // Own-property check: `in` would also match inherited keys such as
        // "constructor" and wrongly treat them as visited.
        if (!Object.prototype.hasOwnProperty.call(pagesVisited, nextPage)) {
            // New page we haven't visited
            visitPage(nextPage, crawl);
            return;
        }
        // We've already visited this page, so try the next one
    }
}
/**
 * Request one page, check it for the searched font and queue its internal links.
 *
 * Marks `url` as visited and increments `numPagesVisited` before the request
 * is made. `callback` is invoked to continue the crawl after a failed request,
 * a non-200 response, or a page on which the font was not found. When the font
 * is found the match is logged and the crawl stops (callback is not called).
 *
 * @param {string} url - address of the page to request.
 * @param {Function} callback - continuation, called with no arguments.
 */
function visitPage(url, callback) {
    // Add page to our set
    pagesVisited[url] = true;
    numPagesVisited++;
    // Make the request
    console.log("Visiting page " + url);
    request(url, function(error, response, body) {
        // On a network failure `response` is undefined, so check before reading it.
        if (error || !response) {
            console.log("Request failed for " + url + ": " + (error ? error.message : "no response"));
            callback();
            return;
        }
        // Check status code (200 is HTTP OK)
        console.log("Status code: " + response.statusCode);
        if (response.statusCode !== 200) {
            callback();
            return;
        }
        // Build a DOM from the response body so element styles can be inspected.
        var { window } = new JSDOM(body);
        var helveticaFound = searchForHelvetica(window, 'font-family');
        if (helveticaFound) {
            console.log('Word ' + SEARCH_FONT + ' found at page ' + url);
            return;
        }
        // collectInternalLinks() expects a cheerio-style selector function.
        var $ = cheerio.load(body);
        collectInternalLinks($);
        // In this short program, our callback is just calling crawl()
        callback();
    });
}
/**
 * List the distinct values of a font property used by the elements in a window.
 *
 * For each element the inline `style.fontFamily` is preferred; otherwise the
 * value is read through `window.getComputedStyle` when that function exists.
 *
 * @param {Object} window - DOM window (for example the one created by jsdom).
 * @param {string} css - CSS property name, e.g. 'font-family'.
 * @returns {string[]} distinct non-empty values in document order.
 */
function collectFontFamilies(window, css) {
    var values = [];
    // Use the window's own getComputedStyle: Node has no such global.
    var canCompute = typeof window.getComputedStyle === 'function';
    var nodes = window.document.body.getElementsByTagName('*');
    for (var i = 0; i < nodes.length; i++) {
        var who = nodes[i];
        // Tip: console.log(who) prints only the class name under jsdom
        // (e.g. "HTMLDivElement {}"); log who.outerHTML to see the markup.
        if (!who.style) continue;
        var val = who.style.fontFamily ||
            (canCompute ? window.getComputedStyle(who).getPropertyValue(css) : '');
        if (val && values.indexOf(val) === -1) values.push(val);
    }
    return values;
}

/**
 * Tell whether any element of the page uses the font named by `SEARCH_FONT`.
 *
 * The comparison is a case-insensitive substring match against each font
 * value found by collectFontFamilies().
 *
 * @param {Object} window - DOM window whose document body is scanned.
 * @param {string} css - CSS property to inspect, e.g. 'font-family'.
 * @returns {boolean} true when at least one value mentions the font.
 */
function searchForHelvetica( window , css) {
    var wanted = String(SEARCH_FONT).toLowerCase();
    return collectFontFamilies(window, css).some(function (family) {
        return String(family).toLowerCase().indexOf(wanted) !== -1;
    });
}
/**
 * Queue every root-relative link of the current page for crawling.
 *
 * Selects anchors whose href starts with '/', logs how many were found and
 * pushes `baseUrl` + href onto the shared `pagesToVisit` array.
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page.
 */
function collectInternalLinks($) {
    var anchors = $("a[href^='/']");
    var total = anchors.length;
    console.log(`Found ${total} relative links on page`);
    anchors.each(function() {
        var href = $(this).attr('href');
        pagesToVisit.push(baseUrl + href);
    });
}
If you see my visit page function you will see the below two lines of code:
// Excerpt from visitPage(): create a jsdom window from the response body and
// hand it to the font search together with the CSS property to inspect.
var { window } = new JSDOM(body);
var helveticaFound = searchForHelvetica(window, 'font-family');
As you can see, on the 2nd line I am passing the window object to the searchForHelvetica function.
In my searchForHelvetica function, if I console.log(nodes[i]); I don't get the HTML element, and hence the rest of the script doesn't run as expected. Does the jsdom window differ from the window object in the browser? How do I get the script working, i.e. use the window object to run through all the elements on the page and output all the fonts used on the page?
EDIT::-
To break the problem down to a micro level, if i console.log(who); inside searchForHelvetica function , i get the following result:
HTMLElement {}
HTMLDivElement {}
HTMLDivElement {}
HTMLDivElement {}
HTMLAnchorElement {}
HTMLImageElement {}
HTMLDivElement {}
HTMLFormElement {}
HTMLDivElement {}
HTMLLabelElement {}
HTMLInputElement {}
HTMLButtonElement {}
HTMLButtonElement {}
HTMLSpanElement {}
etc..
But if I were to do the same in a web browser, the result would be different, e.g.
// Browser-console comparison: the same lookup prints the element's markup.
nodes = window.document.body.getElementsByTagName('*');
console.log(nodes[1]) // <div id="mobile-menu-box" class="hide">...</div>
How do i get a similar result in node.js ?

req.param inside res.render causes strange console log

I am using EJS templates with Node.js and Express. I am trying to pass a request parameter to my EJS template. It is working; however, for some reason my console log is reporting something strange.
Versions:
Node 0.10.26
Express 4.6.1
EJS 0.8.5
Here is the route that handles the ejs template:
var express = require('express');
var router = express.Router();
var data = require('../data.json');
var pkg = require('../../package.json');

/**
 * Render the 'index' view with the catalogue data, the package metadata and
 * the request's `debug` parameter.
 *
 * NOTE: the same object is handed to the view engine as its options, so a
 * truthy `debug` value also makes EJS 0.8.x print the compiled template
 * source to the console.
 */
function renderIndex(req, res) {
    var locals = {
        'acs' : data.acs,
        'products' : data.products,
        'pkg' : pkg,
        'debug' : req.param('debug')
    };
    res.render('index', locals);
}

router.get('/', renderIndex);
module.exports = router;
This is the console log (I replaced anything long with "..." to save space)
var __stack = {
lineno: 1,
input: "<!DOCTYPE html>\n<html lang=\"en\"> ... </html>\n",
filename: "/web/app/views/index.ejs" };
function rethrow(err, str, filename, lineno){
var lines = str.split('\n')
, start = Math.max(lineno - 3, 0)
, end = Math.min(lines.length, lineno + 3);
// Error context
var context = lines.slice(start, end).map(function(line, i){
var curr = i + start + 1;
return (curr == lineno ? ' >> ' : ' ')
+ curr
+ '| '
+ line;
}).join('\n');
// Alter exception message
err.path = filename;
err.message = (filename || 'ejs') + ':'
+ lineno + '\n'
+ context + '\n\n'
+ err.message;
throw err;
}
try {
var buf = [];
with (locals || {}) { (function(){
buf.push('<!DOCTYPE html>\n<html lang="en">...</html>\n'); })();
}
return buf.join('');
} catch (err) {
rethrow(err, __stack.input, __stack.filename, __stack.lineno);
}
Like I said, it is working, however I can't tell why this is being logged in the console. Thanks for the help!

The problem is that the second argument passed to res.render() is passed to both the rendering engine AND your template. Because of this behavior, ejs (at least through 1.0 as of this writing), looks for a debug property in that object to determine if debug information will be printed.

Change the file name on the fly for downloading

I am saving user uploads by renaming each file from its original name to userID + '_' + new Date().getTime() + fileExt. I am storing the file properties in a mongodb collection as:
{
name : String //generated name
, originalName : String //its original name
...
}
Now when the user requests for downloading of file I have to provide the file with its original name (which is persisted in db so no problem to grab it) to the user.
For the following request
GET /users/:userId/uploads/:fileId?type=download
I have this handler
//the mongoose query
UserUpload.findById(req.params.fileId).exec(function(err, doc){
var fileLocation = __dirname + '/public/uploads/users/' + req.params.userId + '/' + doc.name;
if(req.query.type && req.query.type == 'download') {
// I don't know what should I do now
// Downloader.download(fileLocation, newName);
}
});
I read wiki of node-static module but could not figure out how to do that job?

I found the answer here : Download a file from NodeJS Server using Express . Both using express and without using express.
It is too simple if you are using Express. Here is the documentation for res.download. I can't believe that the solution is just one line of code :
res.download('/path/to/file.ext', 'newname.ext');

Here is what I use in one of my projects. smil is the type of file I need to download; never mind it.
In that project I have DOWNLOAD_DIR as a global variable, which contains the full path to the download folder.
It may make a lot of people cringe (especially the fs.existsSync call), but it's a start.
// Folder that receives downloaded files, plus the core modules used below.
// `fs` is required here because download() and download_smil() call
// fs.createWriteStream and fs.existsSync.
var DOWNLOAD_DIR = '/path/to/download',
    url = require('url'),
    http = require('http'),
    fs = require('fs');
/*
download
IN_:
file_url, url of the file to download
callback, called exactly once when the download has finished or failed
OUT:
file_name, full path to downloaded file, null if the file couldn't be downloaded.
COM:
download a file, the file name will be the last part of the url (http://why.so/serious.txt => serious.txt).
The file is written to DOWNLOAD_DIR + '/' + file name.
WARNING: Does NOT follow redirections; any non-2xx response is reported as a failure.
*/
function download(file_url, callback) {
    var target = url.parse(file_url),
        // Use the port from the URL when there is one, and keep the query string.
        options = {host: target.hostname, port: target.port || 80, path: target.path},
        file_name = target.pathname.split('/').pop(),
        file_path = DOWNLOAD_DIR + '/' + file_name,
        file = null,
        finished = false;

    // Report the outcome once, whichever event arrives first.
    function done(err) {
        if (finished) {
            return;
        }
        finished = true;
        if (err) {
            console.log('Can t download ' + file_url + '\t(' + err + ')');
            if (file) {
                file.destroy();
                // Best-effort removal of the partial file so a later existence
                // check does not mistake it for a finished download.
                fs.unlink(file_path, function () {});
            }
            callback(null);
            return;
        }
        console.log('\t\tDownloaded ' + file_name);
        callback(file_path);
    }

    console.log('Downloading file from ' + file_url);
    console.log('\tto ' + file_name);
    // Errors are handled on the request and on the streams themselves; a
    // process-wide 'uncaughtException' listener would pile up with every call
    // and catch unrelated failures.
    http.get(options, function (res) {
        if (res.statusCode < 200 || res.statusCode >= 300) {
            res.resume(); // drain the body so the socket is released
            done(new Error('HTTP status ' + res.statusCode));
            return;
        }
        //Creating the file
        file = fs.createWriteStream(file_path, {flags: 'w', encoding: 'binary'});
        file.on('error', done);
        //When the file is complete ('finish' fires once all data has been flushed)
        file.on('finish', function () {
            done(null);
        });
        res.on('error', done);
        // pipe() ends the file when the response ends.
        res.pipe(file);
    }).on('error', done);
}
/*
download_smil
IN_:
file_url, url of the file to download
callback, called with the full path to the file (and true as second argument
when the file was already present), or with null when the request failed
OUT:
file_name, full path to downloaded file
COM:
Follow one http redirection and then call download function to download it.
You can modify the cad function to use custom names.
*/
function download_smil(file_url, callback) {
    // Check-and-download: skip the transfer when the file is already on disk.
    function cad(link, callback) {
        var pathname = url.parse(link).pathname,
            file = pathname.slice(pathname.lastIndexOf('/') + 1),
            // Same path for the existence check and for the value reported back.
            local_path = DOWNLOAD_DIR + '/' + file;
        //Does the file already exist?
        if (fs.existsSync(local_path)) {
            //YES: callback
            console.log('File ' + file + ' already exist, skipping');
            callback(local_path, true);
        } else {
            //NO: Download it
            console.log('Will download ' + link);
            download(link, callback);
        }
    }
    //GET the page
    http.get(file_url, function (res) {
        // Only the status line and headers are needed; drain the body.
        res.resume();
        //Check if it is a redirect
        if (res.statusCode > 300 && res.statusCode < 400 && res.headers.location) {
            // url.resolve handles both absolute and relative Location values
            // and keeps the scheme of the original URL.
            cad(url.resolve(file_url, res.headers.location), callback);
        } else {
            //The url is good
            cad(file_url, callback);
        }
    }).on('error', function (err) {
        console.log('Can t download ' + file_url + '\t(' + err + ')');
        callback(null);
    });
}

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Get the HTML content of a page after the JS loading - node.js

Related

How to authorize with AWS signature 4 -> API Gateway -> Lambda

Unable to split converted mimetype in javascript

How to get all fonts used on a page using node.js?

req.param inside res.render causes strange console log

Change the file name on the fly for downloading

Categories

Resources