Express middleware infinite loop - node.js

Express middleware runs into an infinite loop for encoded URLs.
I have French letters in my URLs, so every time I hit a URL like /équipement, the Express server runs into an infinite loop trying to process the encoded URL (/%C3%A9quipement/) and I get an ERR_TOO_MANY_REDIRECTS error.
This doesn't happen in my local environment; it only happens in the dev environment, which is hosted on Azure behind an IIS web server.
I have three middlewares configured:
server.js
app.use(middlewares.ipcheck)
app.use('/:lang*', middlewares.redirection)
app.use(microcache.cacheSeconds(600, middlewares.microcache))
Based on the Azure logs, the code hits the microcache middleware and then goes back to the ipcheck middleware.
ipcheck.js
// exported as an async middleware (wrapper assumed; the snippet uses await)
module.exports = async (req, res, next) => {
  if (req.query.skipGeo || req.cookies.geoCheck || isBot(req)) {
    console.log(`Cookie Present \n`)
    let hour = 3600000
    res.cookie('geoCheck', 'passed', { maxAge: 365 * 24 * hour })
    next()
  } else {
    console.log(`Checking against mmdb \n`)
    const isValidIP = await user.hasValidIp(req.headers['x-forwarded-for'] || req.connection.remoteAddress)
    process.env.isValidIP = isValidIP
    if (!isValidIP) {
      console.log(`Geo Block \n`)
      res.status(307).sendFile(path.resolve('public/geo_block.html'))
    } else {
      console.log(`Success \n`)
      next()
    }
  }
}
microcache.js
return config.useMicroCache.get() && `${req.hostname}${req.originalUrl}` // useMicroCache is a bool val set based on env.
redirection.js
// middleware wrapper assumed, as in server.js above
module.exports = (req, res, next) => {
  const params = req.params
  let lang = params.lang
  const site = utils.user.currentSite(req)
  if (!req.originalUrl.includes('api')) {
    if (!validLanguage(site, lang)) {
      lang = req.cookies.lang && validLanguage(site, req.cookies.lang) ? req.cookies.lang : site.languages.default
      config.useMicroCache.set(false)
      res.redirect(301, `/${lang}${req.originalUrl}`)
      return
    } else if (lang !== req.cookies.lang) {
      config.useMicroCache.set(false)
    } else {
      config.useMicroCache.set(true)
    }
    if (!isLowerCase(req.path)) {
      let parsedUrl = url.parse(req.originalUrl)
      parsedUrl.pathname = parsedUrl.pathname.toLowerCase()
      res.redirect(301, url.format(parsedUrl))
      return
    }
  }
  next()
}
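One thing worth ruling out (an assumption on my part, not something the question confirms): percent-encoded sequences such as %C3%A9 contain uppercase hex letters, so if isLowerCase() compares the raw encoded path to its lowercased form, the case check fails, the middleware 301s to the lowercased encoding, and if the IIS/ARR front end on Azure re-normalizes the encoding back to uppercase, every pass produces another 301. Comparing the decoded path instead would sidestep that. A minimal sketch, assuming isLowerCase() simply compares a string to its lowercased form:
if (!isLowerCase(decodeURIComponent(req.path))) {
  // lowercase the decoded pathname, then re-encode it for the redirect
  let parsedUrl = url.parse(req.originalUrl)
  parsedUrl.pathname = encodeURI(decodeURIComponent(parsedUrl.pathname).toLowerCase())
  res.redirect(301, url.format(parsedUrl))
  return
}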

Related

Titanium http request leak

I have to make a large number of sequential HTTP requests to load product images into the app, because it has to work in offline mode.
That's around 2000 calls.
The HTTP client seems to have a memory leak that causes the persistent MB shown in Instruments to rise to around 200 without being garbage collected.
After use, the HTTP client is set to null.
I have tried setting the file property of the HTTPClient, without any success.
I have set the onload function to only call the callback function, which in turn calls the HTTP send function again (thus looping through the 2000 products to get the respective pictures).
I changed from SDK 7.5.0.v20180824022007 to SDK 8.1.0.v20190423134840 and even SDK 9.0.0.v20181031080737, but the problem remains.
The code of my HTTP common module:
function HttpClient(options = {}) {
  this.root = options.root || "ROOT_OF_API";
  this.endpoint = options.endpoint || false;
  this.needsChecksum = options.needsChecksum || false;
  this.data = {};
  this.method = options.method || "POST";
  this.timeout = options.timeout || 5000;
  this.calculateChecksum = function () {
    var moment = require('alloy/moment');
    if (!Alloy.Models.user.authenticated()) {
      return false;
    }
    var sp = (moment().unix() - Alloy.Models.meta.get("timeDiff"))
    var hash = Ti.Utils.md5HexDigest("nX" + sp + "FossilSFAapp" + Alloy.Models.user.get('token').substring(10, 14) + "CS")
    var checksum = sp + "-" + hash.substring(4, 8)
    this.data.checksum = checksum
  }
};

HttpClient.prototype.setData = function (data) {
  this.data = data
};

HttpClient.prototype.send = function (callback) {
  // set a new checksum for the request if needed
  if (this.needsChecksum) {
    this.calculateChecksum()
  }
  // add app version
  if (this.method === "POST") {
    this.data.appversion = Ti.App.version;
  }
  // send
  var client = Ti.Network.createHTTPClient({
    onload: function () {
      callback({
        success: true
      })
    },
    onerror: function (e) {
      callback({
        message: e.message,
        success: false
      })
    },
    timeout: this.timeout
  });
  client.open(this.method, this.root + this.endpoint);
  if (this.setFile) {
    client.file = Ti.Filesystem.getFile(Ti.Filesystem.applicationDataDirectory, this.fileName);
  }
  client.setRequestHeader('Content-Type', 'application/json');
  client.send(JSON.stringify(this.data));
  client = null;
};

module.exports = HttpClient;
and then the module is used in the product model like so:
var HttpClient = require('./HttpClient');
var httpClient = new HttpClient();

function getImage(i) {
  if (collection.at(i) && collection.at(i).get('iimage0') && collection.at(i).needsImageUpdate()) {
    httpClient.endpoint = collection.at(i).get('acarticlenumber') + ".jpg";
    httpClient.fileName = 'productImages/' + collection.at(i).get('acarticlenumber') + '.jpg'
    httpClient.send(function (e) {
      if (i < collection.length) {
        i++
        getImage(i)
      } else {
        finished()
      }
    });
  } else {
    if (i < collection.length) {
      i++
      getImage(i)
    } else {
      finished()
    }
  }
}

// start getting images at index 0
getImage(0)
Does anyone have an idea why these memory leaks appear?
It only ever occurs when actually sending the HTTP request.
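One pattern that may be worth trying (an assumption about where the references are held, not a confirmed diagnosis): each send() creates a fresh HTTPClient whose onload/onerror closures keep the proxy and the callback chain alive, and setting the local client variable to null right after send() does not release them. Explicitly breaking those references once the request finishes sometimes helps. Also note that the original checks this.setFile, which the calling code never sets, so client.file was never assigned; the sketch below keys off this.fileName instead, and uses e.error for the message (adjust to whatever your SDK's error event actually exposes):
HttpClient.prototype.send = function (callback) {
  var client = Ti.Network.createHTTPClient({
    timeout: this.timeout,
    onload: function () {
      cleanup();
      callback({ success: true });
    },
    onerror: function (e) {
      cleanup();
      callback({ success: false, message: e.error });
    }
  });

  // break the closure <-> proxy references so the client can be collected
  function cleanup() {
    client.onload = null;
    client.onerror = null;
    client = null;
  }

  client.open(this.method, this.root + this.endpoint);
  if (this.fileName) {
    client.file = Ti.Filesystem.getFile(Ti.Filesystem.applicationDataDirectory, this.fileName);
  }
  client.setRequestHeader('Content-Type', 'application/json');
  client.send(JSON.stringify(this.data));
};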

Express - Serve folders based on request

For Express.js and Node.js:
Assume I have 3 types of users in my application.
Based on the type of user (extracted from a cookie), how do I serve a particular folder?
Say I have 3 folders: x, y and z.
I have a condition that says: for user_type x, serve folder x's contents.
Same with y and z.
I tried the following code, but it didn't work.
function checkCookieMiddleware(req, res, next) {
  const req_cookies = cookie.parse(req.headers.cookie || '');
  if (req_cookies.type) {
    if (req_cookies.type === "X") {
      express.static(basePath + "/client/x");
    }
    else if (req_cookies.type === "Y") {
      express.static(basePath + "/client/y");
    }
    else {
      next();
    }
  }
  else {
    next();
  }
}
app.use(checkCookieMiddleware, express.static(basePath + "/client/z"));
I found an npm package, express-dynamic-static, that looks to do what you're looking for. If you don't want to pull in another dependency, its source code is fairly small; you could copy it into a custom middleware yourself.
If you were to use it, then I think your code might look something like this:
const express = require('express');
const dynamicStatic = require('express-dynamic-static')();
const app = express();

app.use(dynamicStatic);

function checkCookieMiddleware(req, res, next) {
  const req_cookies = cookie.parse(req.headers.cookie || '');
  if (req_cookies.type) {
    if (req_cookies.type === 'X') {
      dynamicStatic.setPath(basePath + '/client/x');
    } else if (req_cookies.type === 'Y') {
      dynamicStatic.setPath(basePath + '/client/y');
    } else {
      // Z
      dynamicStatic.setPath(basePath + '/client/z');
    }
  }
  next();
}
app.use(checkCookieMiddleware);
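If you'd rather not pull in the package at all, you can get the same effect by creating one express.static instance per folder up front and delegating to the matching one from your middleware; a minimal sketch, reusing the cookie and basePath names from above:
const serveX = express.static(basePath + '/client/x');
const serveY = express.static(basePath + '/client/y');
const serveZ = express.static(basePath + '/client/z');

function serveByUserType(req, res, next) {
  const req_cookies = cookie.parse(req.headers.cookie || '');
  // delegate to the matching static middleware; fall back to z
  if (req_cookies.type === 'X') return serveX(req, res, next);
  if (req_cookies.type === 'Y') return serveY(req, res, next);
  return serveZ(req, res, next);
}

app.use(serveByUserType);
Unlike setPath, this keeps the choice per request, so two users with different cookies hitting the server at the same time can't affect each other.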

Tons of Node.js HTTP requests cause my computer to lose its internet connection until I restart the process

I have this basic code:
const cookie = require('cookie')
const https = require('https')
const http = require('http')
const MongoClient = require('mongodb').MongoClient // (require not shown in the original snippet)
const proto = { https, http }

// Use connect method to connect to the server
MongoClient.connect(url, function (err, client) {
  console.log("Connected successfully to server")
  const db = client.db(dbName)
  const foos = db.collection('foos')
  perform(foos, function () {
    console.log('done')
  })
})

function perform(a, done) {
  const stream = a.find({ url: null })
  // the cursor stream emits one document at a time
  stream.on('data', function (doc) {
    stream.pause()
    request(doc.url, function () {
      stream.resume()
    })

    function request(url, fn, redirect, cookies) {
      cookies = cookies || {}
      console.log(redirect ? 'redirect' : 'start', url)
      const val = url.match(/^https/) ? 'https' : 'http'
      var headers = {}
      if (Object.keys(cookies).length) {
        var ck = []
        Object.keys(cookies).forEach(key => {
          ck.push(cookie.serialize(key, cookies[key]))
        })
        headers.Cookie = ck.join('; ')
      }
      proto[val].get(url, { headers }, function (response) {
        // console.log(response.headers)
        console.log(response.statusCode, url)
        if (response.statusCode == 302 || response.statusCode == 301 || response.statusCode == 307 || response.statusCode == 303) {
          if (response.headers['set-cookie']) {
            response.headers['set-cookie'].forEach(function (str) {
              var cks = cookie.parse(str)
              for (var key in cks) {
                switch (key) {
                  case 'expires':
                  case 'path':
                  case 'domain':
                    break
                  default:
                    cookies[key] = cks[key]
                }
              }
            })
          }
          var newUrl = response.headers.location
          if (!newUrl.match(/^https?:\/\//)) {
            if (newUrl.match(/\/\//)) {
              newUrl = 'http:' + newUrl
            } else if (newUrl.match(/\//)) {
              newUrl = domain + newUrl
            } else {
              newUrl = domain + '/' + newUrl
            }
          }
          request(newUrl, fn, true, cookies)
        } else {
          // do something
          fn()
        }
      }).on('error', function (err) { // Handle errors
        console.log(err.message)
        fn()
      })
    }
  })

  stream.on('end', function () {
    done()
  })
}
It essentially just loads a bunch of URLs from the database and makes a request for each. It uses the streaming feature of MongoDB cursors, so it only does one request at a time; when a request completes, it starts the next. However, after about 3-5 minutes of running this script, the process hangs. Not only that, the browser I'm using hangs too! I'm running this as a Node.js script, but for some reason it seems to block all network traffic on my computer after 3-5 minutes.
The thing is, when I restart the process (which takes only a second), everything is fine again and the requests go through. Likewise, if I try to go to a URL in the browser (any URL, like stackoverflow.com) and it hangs because of the script, restarting the process lets the browser window complete its request! I have no idea why this is.
I'm wondering if you know why this might be happening and how I might go about fixing it.
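One detail worth checking (an assumption based on the code above, not a confirmed diagnosis): the response body is never consumed. For non-redirect responses the handler just calls fn(), so every socket stays open holding its unread data until it times out, and after a few minutes the machine can run out of usable connections, which would also starve the browser. Draining the body before moving on looks roughly like this:
proto[val].get(url, { headers }, function (response) {
  // drain (or otherwise consume) the body so the socket is released
  response.resume()
  console.log(response.statusCode, url)
  if ([301, 302, 303, 307].includes(response.statusCode)) {
    // ... collect cookies and follow the redirect as before ...
  } else {
    fn()
  }
}).on('error', function (err) {
  console.log(err.message)
  fn()
})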

How to check whether the req.url path exists in the app?

I'm using Sequelize and Node.js in my application. I know this is checked built in (by the router), but I want to check it manually, in an if() condition.
Below are some example URL paths:
/user
/user/11d9b6130159 => user/:id
/user/11d9bdfg0159/sample => user/:id/sample
What I want is a middleware that checks whether the current URL matches one of these app routes, like:
if (url.parse(req.url).path === "/user") {
  // do some action
}
But this fails for the remaining URLs. Please suggest a way to solve this. Thanks.
If you really want to do the URL parsing manually, then the approach could be like this:
EDIT: Based on your comment, I modified the sample code (it now handles more than 3 levels). You can easily extend it based on your needs.
const url = require('url');

const path = ctx.request.href; // ctx.request.href is Koa-style; with Express you could parse req.originalUrl instead
const pathName = url.parse(path).pathname;
const pathNameParts = pathName.split('/');

if (pathNameParts && pathNameParts[1] && pathNameParts[1] === 'user') {
  if (pathNameParts[2]) {
    const id = pathNameParts[2]; // :id is now defined
    if (pathNameParts[3] && pathNameParts[3] === 'sample') {
      if (pathNameParts[4]) {
        const id2 = pathNameParts[4]; // :id2 is now defined
        if (pathNameParts[5] && pathNameParts[5] === 'disable') {
          // do some action for /user/:id/sample/:id2/disable
        } else {
          // do some action for /user/:id/sample/:id2
        }
      } else {
        // do some action for /user/:id/sample
      }
    } else {
      // do some action for /user/:id
    }
  } else {
    // do some action for /user
  }
}
So I would do this only if you really want to do the parsing yourself. Otherwise, use something like the Express router or the Koa router. Using the Express router, it would be like:
app.use('/user/:id', function (req, res, next) {
  console.log('ID:', req.params.id);
  next();
});
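If you want the :id-style matching without wiring up a full router, the path-to-regexp package (the matcher Express uses internally) can be called directly from a middleware; a minimal sketch, assuming a recent path-to-regexp where match() is available:
const { match } = require('path-to-regexp');

// pre-build matchers for the routes you care about
const matchUserSample = match('/user/:id/sample');
const matchUser = match('/user/:id');

app.use(function (req, res, next) {
  const pathName = url.parse(req.url).pathname;
  const m = matchUserSample(pathName) || matchUser(pathName);
  if (m) {
    // m.params holds the extracted segments, e.g. { id: '11d9b6130159' }
    console.log('matched', m.path, m.params);
  }
  next();
});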

Stop node-simplecrawler instance when creating a new one (make it behave like a singleton)

Hello, everyone!
I am making a scraper which uses node-simplecrawler. Everything runs fine, but what I can't figure out is how to stop one instance when creating a new one (I want to have only one running at a time). I am using express for this and all the scraping logic is in one route. In order to cancel the crawling right now, I need to stop the node process and run the app again.
Here is the part of the code that concerns running the crawler (note: I've simplified the code a little bit, so it's shorter):
module.exports = function (socket) {
  var express = require('express');
  var router = express.Router();
  // [... requires continue...]

  /* GET scanning page. */
  router.post('/', function (req, res, next) {
    res.render('scanning'); // Load the socket.io host page
    var render = {};
    var pages = [];
    var timer = new Date();

    // Helper func to log the requests.
    function log(message) {
      var now = new Date();
      console.log(now - timer + 'ms', message);
      timer = now;
    }

    // Ensure URL format, parse URL
    // Check if URL exists
    request(url.href, function (error, response, body) {
      if (!error && response.statusCode == 200) {
        // URL exists, so let's scan it
        // Exclude links to the following extensions:
        var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp',
          'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip',
          'rar', '7z', 'css', 'js', 'gzip', 'exe', 'xml', 'svg'];
        var exts = exclude.join('|');
        var regexReject = new RegExp('\\.(' + exts + ')', 'i');
        var rootURL = url.protocol + '//' + url.host + '/';

        // Crawler configuration
        var crawler = new Crawler(url.host);
        crawler.initialPort = 80;
        crawler.initialPath = url.pathname;
        crawler.maxConcurrency = 1;
        crawler.ignoreWWWDomain = false; // This is a little suspicious...
        crawler.filterByDomain = true;   // Only URLs from the current domain
        crawler.scanSubdomains = true;
        crawler.downloadUnsupported = false;
        crawler.parseHTMLComments = false;
        crawler.parseScriptTags = false;
        crawler.acceptCookies = false;
        // crawler.maxDepth = 1 // Debug only!

        /*
         * Fetch Conditions
         */

        // Get only URLs, ignore feeds, only from this host
        crawler.addFetchCondition(function (parsedURL) {
          return (
            !parsedURL.path.match(regexReject) &&      // Only links
            (parsedURL.path.search('/feed') === -1) && // Ignore feeds
            (parsedURL.host === url.host)              // Page is from this domain
          );
        });

        // Should we only include subpages?
        if (onlySubpages) {
          crawler.addFetchCondition(function (parsedURL) {
            // return parsedURL.path.search(url.pathname) > -1;
            return parsedURL.path.startsWith(url.pathname);
            // console.log(url, parsedURL);
          });
        }

        // Exclude URLs with fragments?
        if (excludeUrls.length >= 1) {
          crawler.addFetchCondition(function (parsedURL) {
            var urlFragmentsOk = true;
            excludeUrlFragments.forEach(function (fragment) {
              if (parsedURL.path.search('/' + fragment) > -1) {
                urlFragmentsOk = false;
              }
            });
            return urlFragmentsOk;
          });
        }

        // Include only URLs with fragments
        if (includeOnlyUrls.length >= 1) {
          crawler.addFetchCondition(function (parsedURL) {
            var urlFragmentsOk = false;
            var includeUrlFragments = includeOnlyUrls.replace(/\s/, '').split(',');
            includeUrlFragments.forEach(function (fragment) {
              if (parsedURL.path.search('/' + fragment) !== -1) {
                urlFragmentsOk = true;
              }
            });
            return urlFragmentsOk;
          });
        }

        // Run the crawler
        crawler.start();

        // Execute for each URL, on fetchcomplete
        crawler.on('fetchcomplete', function (item, responseBuffer, response) {
          // [Do stuff with the scraped page]
        });

        // Completed crawling. Now let's get to work!
        crawler.on('complete', function () {
          // [Get all scraped pages and do something with them]
        });

        // Error handling
        crawler.on('queueerror', function (errorData, URLData) {
          console.log('Queue error:', errorData, URLData);
        });
        crawler.on('fetchdataerror', function (queueitem, response) {
          console.log('Fetch error:', queueitem, response);
        });
        crawler.on('fetchtimeout', function (queueItem, crawlerTimeoutValue) {
          console.log('Fetch timeout:', queueItem, crawlerTimeoutValue);
        });
        crawler.on('fetchclienterror', function (queueItem, errorData) {
          console.log('Fetch local error:', queueItem, errorData);
        });
        crawler.on('fetchtimeout', function (queueItem, crawlerTimeoutValue) {
          console.log('Crawler timeout:', queueItem, crawlerTimeoutValue);
        });
      } else if (error) {
        console.log(error);
      }
    });
  });

  return router;
}
Every simplecrawler instance has a stop method that can be called to prevent the crawler from making any further requests (requests won't be stopped in flight, however).
I would probably store the crawler instance in a scope outside the route handler, check whether it's defined at the top of the route handler, call its stop method if so, and then construct a new crawler.
I stripped out a lot of the meat of your code, but something like this is what I had in mind:
module.exports = function (socket) {
  var express = require('express');
  var router = express.Router();
  var Crawler = require('simplecrawler');
  var crawler;

  router.post('/', function (req, res, next) {
    // Check if URL exists
    request(url.href, function (error, response, body) {
      if (!error && response.statusCode == 200) {
        // Stop any crawler that's already running
        if (crawler instanceof Crawler) {
          crawler.stop();
        }

        // Crawler configuration
        crawler = new Crawler(url.host);
        crawler.initialPort = 80;
        crawler.initialPath = url.pathname;

        // Run the crawler
        crawler.start();

        // Execute for each URL, on fetchcomplete
        crawler.on('fetchcomplete', function (item, responseBuffer, response) {
          // [Do stuff with the scraped page]
        });

        // Completed crawling. Now let's get to work!
        crawler.on('complete', function () {
          // [Get all scraped pages and do something with them]
        });
      } else if (error) {
        console.log(error);
      }
    });
  });

  return router;
}
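One small addition that may be worth making (an assumption on my part: simplecrawler's Crawler is an EventEmitter, so removeAllListeners() should be available): when you throw away the old instance, also detach its handlers, so pending events from the stopped crawl don't fire into stale callbacks and the old object can be garbage collected:
// Stop any crawler that's already running and drop its listeners
if (crawler instanceof Crawler) {
  crawler.stop();
  crawler.removeAllListeners();
}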
