how to display title of websites in node.js - node.js

I want to get title of different site. like this.
localhost:1234/index/?url=google.com&url=www.yahoo.com/&url=twitter.com
if i got to this url it crawl on all the mention site in the url and display title of website.
- Google
- Yahoo
- Twitter

var Urls = 'localhost:1234/index/?url=google.com&url=www.yahoo.com/&url=twitter.com';
// remove all special characters like '/' '&' and '='
Urls = Urls.replace(/\&/g, '').replace(/\//g, '').replace(/\=/g, '');
// split it based on url
Urls = Urls.split('url');
//delete first element as its not required
delete Urls[0]
Urls.forEach(function (url) {
//split each element based on '.'
url = url.split('.');
url.forEach(function (ele) {
// if its not 'www' and 'com'
if (ele !== 'www' && ele !== 'com') {
// the title of url
console.log(ele);
}
})
})
you need to remove all special character as above using regular expression and if urls contains ".org" or ".in" .. etc, then that also need to include inside if condition

Related

Is there a regex to be able to match two url's , one that has a wildcard and one that doesn't?

I am writing a program in Nodejs with the following scenarios.
I have an array of url's that include wildcards, such as the following:
https://*.example.com/example/login
http://www.example2.com/*/example2/callback
Secondly, I have an incoming redirect url that I need to validate matches what is in the array of url's above. I was wondering if there was a way using Regex or anything else that I can use something like arr.includes(incomingRedirectUrl) and compare the two.
I can match non-wildcard url's using array.includes(incomingRedirectUrl), but when it comes to matching the array that has wildcards, I cannot think of a solution.
For example,
https://x.example.com/example/login should work because it matches the first url in the above example, only replacing the "*" with the x.
Is there a way I can achieve this? Or do I have to break down the url's using something like slice at the "*" to compare the two?
Thanks in advance for any help.
for (let i = 0; i < arr.length; i++) {
if (arr[i].indexOf('*') !== -1) {
wildcardArr.push(arr[i]);
} else {
noWildcardArr.push(arr[i]);
}
}
***Note, the reason I check noWildcardArr first is because most of the validate redirect url's do not contain wildcard
if (noWildcardArr.includes(incomingRedirectUrl)) {
//Validated correct url, proceed with the next part of my code (this part already works)
} else if (wildcardArr.includes(incomingRedirectUrl)) {
//need to figure out this logic here, not sure if the above is possible without formatting wildcardArr but url should be validated if url matches with wildcard
} else {
log.error('authorize: Bad Request - Invalid Redirect URL');
context.res = {
status: 400,
body: 'Bad Request - Invalid Redirect URL',
};
}
You could compile your URL array into proper regex and then iterate over them to see if it matches. Similar to something like a web framework would do that allows URL path parameters such as /users/:id.
function makeMatcher(urls) {
const compiled = urls.map(url => {
// regex escape the url but dont escape *
let exp = url.replace(/[-[\]{}()+?.,\\^$|#\s]/g, '\\$&');
// replace * with .+ for the wildcard
exp = exp.replaceAll('*', '.+');
// the expression is used to create the match function
return new RegExp(`^${exp}$`);
});
// return the match function, which returns true, on the first match,
// or false, if there is no match at all
return function match(url) {
return compiled.find(regex => url.match(regex)) == undefined ?
false :
true;
};
}
const matches = makeMatcher([
'https://*.example.com/example/login',
'http://www.example2.com/*/example2/callback'
]);
// these 2 should match
console.log(matches('https://x.example.com/example/login'));
console.log(matches('http://www.example2.com/foo/example2/callback'));
// this one not
console.log(matches('http://nope.example2.com/foo/example2/callback'));

How to extract value from a url

I have a few requests coming in which follow the pattern below
contacts/id/
contacts/x/id/name
contacts/x/y/id/address
contacts/z/address/
I want to extract the value which follows right after 'contacts'
In above cases,
1. id
2. x
3. x
4. z
Here is my regex
(?<=contacts)\/[^\/]+
https://regex101.com/r/ePmv5Y/1
But it is matching along with the trailing '/' for eg. /id, /x etc
How do I optimize to get rid of this trailing slash?
We can use match() here:
var urls = ["contacts/id/", "contacts/x/id/name", "contacts/x/y/id/address", "contacts/z/address/"];
for (var i=0; i < urls.length; ++i) {
var output = urls[i].match(/\bcontacts\/(.*?)\//)[1];
console.log(urls[i] + " => " + output);
}
I have a few requests coming in
If you mean http requests, then this is likely the pathname of the requested URL, and they'll start with a /. (This is the value of req.url in a Node.js server.)
To match on a URL pathname, you can use this expression: ^\/contacts\/([^/?]+). Here's a link to another regular expression builder that demonstrates it and includes an explanation for every character: https://regexr.com/6tugf
The [^/?] is a negated set that matches any token which is not a / or a ? and the + means that it matches 1 or more of those tokens. It's important to include the ? because otherwise it could match into the query string portion of the URL — for example, in this URL:
https://domain.tld/contacts/x/id/name?filter=recent # URL
/contacts/x/id/name?filter=recent # req.url in Node.js
/contacts/x/id/name # pathname
?filter=recent # query string
And here's a runnable code snippet demonstrating the same expression, using String.prototype.match():
const contactIdRegexp = /^\/contacts\/([^/?]+)/;
const inputs = [
'/contacts/id/', // id
'/contacts/x/id/name', // x
'/contacts/x/y/id/address', // x
'/contacts/z/address/', // z
'/contacts/x/id/name?filter=recent', // x
];
for (const str of inputs) {
const id = str.match(contactIdRegexp)?.[1];
console.log(id);
}
You can add the / inside the lookbehind:
(?<=contacts\/)[^\/]+
See a regex demo.
If you like to continue without regex, You can try below.
//get the URL object.
const url = new URL(`${req.protocol}://${req.get('host')}${req.originalUrl}`);
//extract the pathname and split using "/"
const pathName= url.pathname.split("/");
//get the required value using array index.
const val = pathName[2];

Pattern match in nodejs rest url

In my node app I am using the router.use to do the token validation.
I want to skip validation for few urls, so I want to check if the url matches, then call next();
But the URL I want to skip has a URLparam
E.g., this is the URL /service/:appname/getall.
This has to be matched against /service/blah/getall and give a true.
How can this be achieved without splitting the url by '/'
Thanks in advance.
The parameters will match :[^/]+ because it is a : followed by anything other than a / 1 or more times.
If you find the parameters in the template and replace them with a regex that will match any string you can do what you asked for.
let template = '/service/:appname/getall'
let url = '/service/blah/getall'
// find params and replace them with regex
template = template.replace(/:[^/]+/g, '([^/]+)')
// the template is now a regex string '/service/[^/]+/getall'
// which is essentially '/service/ ANYTHING THAT'S NOT A '/' /getall'
// convert to regex and only match from start to end
template = new RegExp(`^${template}$`)
// ^ = beggin
// $ = end
// the template is now /^\/service\/([^\/]+)\/getall$/
matches = url.match(template)
// matches will be null is there is no match.
console.log(matches)
// ["/service/blah/getall", "blah"]
// it will be [full_match, param1, param2...]
Edit: use \w instead of [^/], because:
The name of route parameters must be made up of “word characters” ([A-Za-z0-9_]). https://expressjs.com/en/guide/routing.html#route-parameters
I believe this is true for most parsers so I have updated my answer. The following test data will only work with this updated method.
let template = '/service/:app-:version/get/:amt';
let url = '/service/blah-v1.0.0/get/all';
template = template.replace(/:\w+/g, `([^/]+)` );
template = new RegExp(`^${template}$`);
let matches = url.match(template);
console.log(url);
console.log(template);
console.log(matches);
// Array(4) ["/service/blah-v1.0.0/get/all", "blah", "v1.0.0", "all"]

Groovy - Check last character & if append on condition

This is an idomism question regarding groovy - what is the simplest way to check the last char of a String and if present not append it to a modified String?
For example in Java you could do some like this;
String url = "http://www.google.com"
//check to see if url already has / at the end
if (url.charAt(url.size()-1)=='/')
url =url + "Media"
else
url = "/Media"
​def url = 'http://test.com/'
url += url?.endsWith('/') ? 'Media' : '/Media'
//This should work as well but it would throw an
//ArrayIndexOutOfBoundException if url is empty ('')
//url += url[-1] == '/' ? 'Media' : '/Media'
println url​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​​

Can I allow the extension user to choose matching domains?

Can I allow the domain matching for my extension to be user configurable?
I'd like to let my users choose when the extension runs.
To implement customizable "match patterns" for content scripts, the Content script need to be executed in by the background page using the chrome.tabs.executeScript method (after detecting a page load using the chrome.tabs.onUpdated event listener).
Because the match pattern check is not exposed in any API, you have to create the method yourself. It is implemented in url_pattern.cc, and the specification is available at match patterns.
Here's an example of a parser:
/**
* #param String input A match pattern
* #returns null if input is invalid
* #returns String to be passed to the RegExp constructor */
function parse_match_pattern(input) {
if (typeof input !== 'string') return null;
var match_pattern = '(?:^'
, regEscape = function(s) {return s.replace(/[[^$.|?*+(){}\\]/g, '\\$&');}
, result = /^(\*|https?|file|ftp|chrome-extension):\/\//.exec(input);
// Parse scheme
if (!result) return null;
input = input.substr(result[0].length);
match_pattern += result[1] === '*' ? 'https?://' : result[1] + '://';
// Parse host if scheme is not `file`
if (result[1] !== 'file') {
if (!(result = /^(?:\*|(\*\.)?([^\/*]+))(?=\/)/.exec(input))) return null;
input = input.substr(result[0].length);
if (result[0] === '*') { // host is '*'
match_pattern += '[^/]+';
} else {
if (result[1]) { // Subdomain wildcard exists
match_pattern += '(?:[^/]+\\.)?';
}
// Append host (escape special regex characters)
match_pattern += regEscape(result[2]);
}
}
// Add remainder (path)
match_pattern += input.split('*').map(regEscape).join('.*');
match_pattern += '$)';
return match_pattern;
}
Example: Run content script on pages which match the pattern
In the example below, the array is hard-coded. In practice, you would store the match patterns in an array using localStorage or chrome.storage.
// Example: Parse a list of match patterns:
var patterns = ['*://*/*', '*exampleofinvalid*', 'file://*'];
// Parse list and filter(exclude) invalid match patterns
var parsed = patterns.map(parse_match_pattern)
.filter(function(pattern){return pattern !== null});
// Create pattern for validation:
var pattern = new RegExp(parsed.join('|'));
// Example of filtering:
chrome.tabs.onUpdated.addListener(function(tabId, changeInfo, tab) {
if (changeInfo.status === 'complete') {
var url = tab.url.split('#')[0]; // Exclude URL fragments
if (pattern.test(url)) {
chrome.tabs.executeScript(tabId, {
file: 'contentscript.js'
// or: code: '<JavaScript code here>'
// Other valid options: allFrames, runAt
});
}
}
});
To get this to work, you need to request the following permissions in the manifest file:
"tabs" - To enable the necessary tabs API.
"<all_urls>" - To be able to use chrome.tabs.executeScript to execute a content script in a specific page.
A fixed list of permissions
If the set of match patterns is fixed (ie. the user cannot define new ones, only toggle patterns), "<all_urls>" can be replaced with this set of permissions. You may even use optional permissions to reduce the initial number of requested permissions (clearly explained in the documentation of chrome.permissions).

Resources