How ports work when scaling a server using Node.js and PM2

I'm learning how to scale servers in a little sandbox I've setup. Here's the very simple code:
'use strict';
const express = require('express');
const bodyParser = require('body-parser');

const app = express();
// Random id so we can tell the instances apart in the logs
const instanceId = Math.floor(Math.random() * 1000);

// Allow all requests from all domains & localhost
app.all('/*', function(req, res, next) {
  res.header("Access-Control-Allow-Origin", "*");
  res.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type, Accept");
  res.header("Access-Control-Allow-Methods", "POST, GET");
  next();
});

app.use(bodyParser.json());
app.use(bodyParser.urlencoded({ extended: false }));

app.get('/', function(req, res) {
  console.log(`[${new Date()}] ${req.method} ${req.originalUrl} from ${req.ip} at ${instanceId}`);
  res.send(`received at ${Date.now()} from ${instanceId}`);
});

app.listen(6069);
Nothing crazy, just spits out the date and the instance the request was received at.
The PM2 docs for scaling a Node.js server advised me to run:
pm2 start server.js -i 5
which worked perfectly fine. Here's an example of the output when I stress-tested it using the npm module loadtest:
server-0 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 847
server-1 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 261
server-3 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 328
server-2 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 163
server-4 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 351
server-0 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 847
server-3 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 328
server-1 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 261
server-2 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 163
server-4 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 351
server-0 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 847
server-3 [Sun Aug 07 2016 00:13:53 GMT-0400 (EDT)] GET / from ::ffff:127.0.0.1 at 328
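For reference, traffic like the above can be generated with a loadtest invocation along these lines (the request count and concurrency here are arbitrary choices, not the exact command used):
loadtest -n 1000 -c 10 http://localhost:6069/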
Here's my question: why didn't Node throw an EADDRINUSE error saying port 6069 is already in use? Multiple server instances are attempting to listen on the same port, yet nothing complains. Why?

PM2 creates its own "embedded load-balancer which uses Round-robin algorithm to better distribute load among the workers". So it basically wraps a load-balancer around your app and proxies requests to each node it creates. Under the hood this is Node's cluster module: only the master process actually binds port 6069, which is why no error is thrown.
When using the round-robin scheduling policy, the master accepts() all incoming connections and sends the TCP handle for that particular connection to the chosen worker (via IPC).
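For intuition, here is a minimal sketch of that mechanism using Node's built-in cluster module. This illustrates what PM2's cluster mode relies on; it is not PM2's actual code:
const cluster = require('cluster');
const http = require('http');

if (cluster.isMaster) {
  // The master is the only process that actually binds the port; it forks
  // the workers and passes each accepted connection to one of them over IPC.
  for (let i = 0; i < 5; i++) {
    cluster.fork();
  }
} else {
  // Each worker's listen(6069) call registers it with the master rather than
  // binding the port itself, which is why no EADDRINUSE error is thrown.
  http.createServer(function(req, res) {
    res.end('handled by worker ' + cluster.worker.id);
  }).listen(6069);
}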

Related

Server under attack caused node.js time out error

Last night, my server was under attack. This happens a lot and usually isn't a reason to panic. However, my Node.js application sets up a connection with MongoDB, and this connection timed out. So this morning MongoDB was unreachable until I restarted the Node.js process. I first figured out what was wrong from the nginx error.log file, which contained lots of these errors:
upstream timed out (110: Connection timed out) while reading response header from upstream
Then I started inspecting my Node.js console logs, and found the following:
[Sat, 09 May 2020 00:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.514 ms
[Sat, 09 May 2020 01:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.328 ms
[Sat, 09 May 2020 01:31:31 GMT] GET /solr/admin/info/system?wt=json 404 161 - ::ffff:127.0.0.1 - 0.283 ms
[Sat, 09 May 2020 01:39:47 GMT] GET /?a=fetch&content=<php>die(#md5(HelloThinkCMF))</php> 200 382 - ::ffff:127.0.0.1 - 0.543 ms
[Sat, 09 May 2020 01:39:47 GMT] GET /?XDEBUG_SESSION_START=phpstorm 200 382 - ::ffff:127.0.0.1 - 0.501 ms
[Sat, 09 May 2020 01:42:40 GMT] GET /.git/config 404 150 - ::ffff:127.0.0.1 - 0.241 ms
[Sat, 09 May 2020 01:48:04 GMT] POST /GponForm/diag_Form?style/ 404 158 - ::ffff:127.0.0.1 - 0.499 ms
[Sat, 09 May 2020 01:54:18 GMT] GET /index.php?s=/Index/\think\app/invokefunction&function=call_user_func_array&vars[0]=md5&vars[1][]=HelloThinkPHP 404 148 - ::ffff:127.0.0.1 - 0.2$
[Sat, 09 May 2020 02:01:37 GMT] GET / 200 382 - ::ffff:127.0.0.1 - 0.516 ms
[Sat, 09 May 2020 02:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.396 ms
[Sat, 09 May 2020 02:49:15 GMT] GET / 200 382 - ::ffff:127.0.0.1 - 1.597 ms
[Sat, 09 May 2020 02:58:03 GMT] GET / 200 382 - ::ffff:127.0.0.1 - 1.016 ms
events.js:183
throw er; // Unhandled 'error' event
^
Error: read ETIMEDOUT
at _errnoException (util.js:1022:11)
at TCP.onread (net.js:628:25)
error: Forever detected script exited with code: 1
error: Script restart attempt #127
Server is up and running
MongoDB connection error: { MongooseTimeoutError: Server selection timed out after 30000 ms
at new MongooseTimeoutError (/root/node_modules/mongoose/lib/error/timeout.js:22:11)
at NativeConnection.Connection.openUri (/root/node_modules/mongoose/lib/connection.js:803:19)
at Mongoose.connect (/root/node_modules/mongoose/lib/index.js:332:15)
at Object.<anonymous> (/root/index.js:131:10)
at Module._compile (module.js:652:30)
at Object.Module._extensions..js (module.js:663:10)
at Module.load (module.js:565:32)
at tryModuleLoad (module.js:505:12)
at Function.Module._load (module.js:497:3)
at Function.Module.runMain (module.js:693:10)
at startup (bootstrap_node.js:188:16)
at bootstrap_node.js:609:3
message: 'Server selection timed out after 30000 ms',
name: 'MongooseTimeoutError',
[Symbol(mongoErrorContextSymbol)]: {} }
(node:7896) UnhandledPromiseRejectionWarning: MongooseTimeoutError: Server selection timed out after 30000 ms
at new MongooseTimeoutError (/root/node_modules/mongoose/lib/error/timeout.js:22:11)
at NativeConnection.Connection.openUri (/root/node_modules/mongoose/lib/connection.js:803:19)
at Mongoose.connect (/root/node_modules/mongoose/lib/index.js:332:15)
at Object.<anonymous> (/root/index.js:131:10)
at Module._compile (module.js:652:30)
at Object.Module._extensions..js (module.js:663:10)
at Module.load (module.js:565:32)
at tryModuleLoad (module.js:505:12)
at Function.Module._load (module.js:497:3)
at Function.Module.runMain (module.js:693:10)
at startup (bootstrap_node.js:188:16)
at bootstrap_node.js:609:3
(node:7896) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecti$
(node:7896) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process wit$
[Sat, 09 May 2020 03:19:59 GMT] GET / 200 382 - ::ffff:127.0.0.1 - 7.701 ms
[Sat, 09 May 2020 03:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 1.523 ms
[Sat, 09 May 2020 04:06:50 GMT] POST /api/jsonws/invoke 404 157 - ::ffff:127.0.0.1 - 20.066 ms
[Sat, 09 May 2020 04:06:50 GMT] GET /?XDEBUG_SESSION_START=phpstorm 200 382 - ::ffff:127.0.0.1 - 0.729 ms
[Sat, 09 May 2020 04:06:50 GMT] GET /index.php?s=/Index/\think\app/invokefunction&function=call_user_func_array&vars[0]=md5&vars[1][]=HelloThinkPHP 404 148 - ::ffff:127.0.0.1 - 0.6$
[Sat, 09 May 2020 04:07:17 GMT] GET /vendor/phpunit/phpunit/src/Util/PHP/eval-stdin.php 404 189 - ::ffff:127.0.0.1 - 1.356 ms
[Sat, 09 May 2020 04:07:19 GMT] POST /vendor/phpunit/phpunit/src/Util/PHP/eval-stdin.php 404 190 - ::ffff:127.0.0.1 - 1.735 ms
[Sat, 09 May 2020 04:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.846 ms
[Sat, 09 May 2020 04:35:36 GMT] GET /api/jsonws/invoke 404 156 - ::ffff:127.0.0.1 - 0.365 ms
[Sat, 09 May 2020 05:06:04 GMT] GET / 200 382 - ::ffff:127.0.0.1 - 0.706 ms
[Sat, 09 May 2020 05:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.366 ms
[Sat, 09 May 2020 05:26:19 GMT] GET /wp-login.php 404 151 - ::ffff:127.0.0.1 - 0.394 ms
[Sat, 09 May 2020 06:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.352 ms
[Sat, 09 May 2020 07:23:55 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.413 ms
[Sat, 09 May 2020 07:42:09 GMT] GET / 200 382 - ::ffff:127.0.0.1 - 1.998 ms
[Sat, 09 May 2020 07:54:13 GMT] HEAD / 200 382 - ::ffff:127.0.0.1 - 0.339 ms
So, while nothing was compromised, my MongoDB connection timed out and couldn't be re-established for a couple of hours, until I restarted the Node.js process myself. For your information, I am using 3 droplets: one for Node.js (1 GB Memory / 25 GB Disk / AMS3 - Ubuntu 18.04.3 (LTS) x64), one for the MySQL database (2 GB Memory / 50 GB Disk / AMS3 - Ubuntu MySQL on 18.04) and one for the MongoDB database (1 GB Memory / 25 GB Disk / AMS3 - Ubuntu MongoDB 4.0.3 on 18.04).
Also, my application kept running; I just wasn't able to connect to MongoDB and retrieve data from there. Everything from MySQL could still be fetched, but as soon as I wanted to retrieve data from MongoDB, nothing showed up in my application, so I guess the process froze or something.
How could I avoid this in the future?
You should use nginx rate limiting.
Alternatively, you can apply rate limiting at the application level with a package like rate-limiter-flexible, or any other suitable package.
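For illustration, a minimal nginx rate-limiting sketch looks like this (the zone name, rate, burst, and upstream port below are assumptions you should tune to your own traffic):
# http block: track clients by IP, allow roughly 10 requests/second each
limit_req_zone $binary_remote_addr zone=perip:10m rate=10r/s;
server {
    location / {
        # allow short bursts of 20 requests, reject the rest
        limit_req zone=perip burst=20 nodelay;
        proxy_pass http://127.0.0.1:3000;  # assumed Node.js upstream
    }
}
And an application-level sketch with rate-limiter-flexible in an Express app (the points and duration are again assumptions):
const express = require('express');
const { RateLimiterMemory } = require('rate-limiter-flexible');

const app = express();
// at most 10 requests per second per client IP
const limiter = new RateLimiterMemory({ points: 10, duration: 1 });

app.use(function(req, res, next) {
  limiter.consume(req.ip)
    .then(function() { next(); })  // within the limit
    .catch(function() { res.status(429).send('Too Many Requests'); });  // limit exceeded
});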

Solid server (project by Sir Tim Berners-Lee and MIT) not loading correct configuration

I was trying to spin up a Solid server. I followed the instructions from the Solid project (https://solid.inrupt.com/docs) to set up a server. However, when the service runs, it fails to start and complains that a path is not defined:
● solid.service - solid - Social Linked Data
Loaded: loaded (/lib/systemd/system/solid.service; enabled; vendor preset: enabled)
Active: inactive (dead) since Mon 2018-10-01 21:47:49 UTC; 4s ago
Docs: https://solid.inrupt.com/docs/
Process: 2519 ExecStart=/usr/bin/solid start -v (code=exited, status=0/SUCCESS)
Main PID: 2519 (code=exited, status=0/SUCCESS)
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Config path: ./config
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Suffix Acl: .acl
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Suffix Meta: .meta
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Filesystem Root: /var/www/my.server.com/
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Allow WebID authentication: true
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Live-updates: true
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Multi-user: false
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Suppress default data browser app: undefined
Oct 01 21:47:49 Mars solid[2519]: Mon, 01 Oct 2018 21:47:49 GMT solid:settings Default data browser app file path: default
Oct 01 21:47:49 Mars solid[2519]: ERROR Path must be a string. Received undefined
From this log, it seems like solid is still trying to read its config from the ~/.config path, even though I changed the location to /var/www/my.server.com/config/config.json when I ran the solid init command. So the question is: how do I resolve this issue? If the server is not loading the current config, or is loading a cached one, how do I clear it?
I would really appreciate any help on this. Thanks.
I ran into the exact same issue (and more afterwards...).
What fixed it for me was to move config.json into /var/www/my.solid.server itself, not into the config/ subdirectory. The service definition points to /var/www/my.solid.server as its working directory, and the solid executable tries to find config.json in that folder.
The documentation is pretty minimal at this point. Hope they keep growing this great idea, and hope this helps a little.
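Translated to the asker's paths, and assuming the solid.service working directory matches the filesystem root /var/www/my.server.com from the log above, the fix would be something like:
mv /var/www/my.server.com/config/config.json /var/www/my.server.com/config.json
sudo systemctl restart solid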

ASP.NET Core Angular app fails to run on Ubuntu 16.04 with Nginx and Systemd

I have an ASP.NET Core Angular application targeting .NET Core 1.1.0.
I installed nginx on my Ubuntu 16.04 server and configured the nginx config file as follows:
server {
    listen 80;
    location / {
        proxy_pass http://localhost:5000;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection keep-alive;
        proxy_set_header Host $host;
        proxy_cache_bypass $http_upgrade;
    }
}
and my myapp.service file as follows:
[Unit]
Description=Sample application.
[Service]
Type=simple
WorkingDirectory=/var/myappfolder
ExecStart=/usr/bin/dotnet /var/myappfolder/myapp.dll
#User=web
[Install]
WantedBy=multi-user.target
I tested this setup with a simple sample app and it worked fine. However, as soon as I deploy my proper app to /var/myappfolder and run
systemctl start mywebsite
systemctl daemon-reload
and then check
systemctl status mywebsite
I get this error:
jtrade.service - Sample application.
Loaded: loaded (/lib/systemd/system/jtrade.service; disabled; vendor preset: enabled)
Active: failed (Result: signal) since Wed 2017-08-30 18:08:08 UTC; 9s ago
Process: 4640 ExecStart=/usr/bin/dotnet /var/jtrade/jtradep.dll (code=killed, signal=ABRT)
Main PID: 4640 (code=killed, signal=ABRT)
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Configurer.NuGetCacheSentinel.get_NuGetCachePath()
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Configurer.NuGetCacheSentinel.Exists()
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Configurer.DotnetFirstTimeUseConfigurer.ShouldPrimeNugetCache()
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Configurer.DotnetFirstTimeUseConfigurer.Configure()
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Cli.Program.ConfigureDotNetForFirstTimeUse(INuGetCacheSentinel nugetCacheSentinel)
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Cli.Program.ProcessArgs(String[] args, ITelemetry telemetryClient)
Aug 30 18:08:08 localhost dotnet[4640]: at Microsoft.DotNet.Cli.Program.Main(String[] args)
Aug 30 18:08:08 localhost systemd[1]: jtrade.service: Main process exited, code=killed, status=6/ABRT
Aug 30 18:08:08 localhost systemd[1]: jtrade.service: Unit entered failed state.
Aug 30 18:08:08 localhost systemd[1]: jtrade.service: Failed with result 'signal'.
So I dug deeper into this error with journalctl -u myappname and got some more useful info:
Started Sample application..
Aug 31 05:13:34 localhost dotnet[10290]: Unhandled Exception: System.InvalidOperationException: Required environment variable 'HOME' is not set. Try setting 'HOME' and running the operation again.
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Common.NuGetEnvironment.GetValueOrThrowMissingEnvVar(Func`1 getValue, String name)
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Common.NuGetEnvironment.GetHome()
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Common.NuGetEnvironment.<>c.<.cctor>b__12_0()
Aug 31 05:13:34 localhost dotnet[10290]: at System.Lazy`1.CreateValue()
Aug 31 05:13:34 localhost dotnet[10290]: --- End of stack trace from previous location where exception was thrown ---
Aug 31 05:13:34 localhost dotnet[10290]: at System.Runtime.ExceptionServices.ExceptionDispatchInfo.Throw()
Aug 31 05:13:34 localhost dotnet[10290]: at System.Lazy`1.get_Value()
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Common.NuGetEnvironment.GetFolderPath(SpecialFolder folder)
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Common.NuGetEnvironment.GetFolderPath(NuGetFolderPath folder)
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Configuration.SettingsUtility.GetGlobalPackagesFolder(ISettings settings)
Aug 31 05:13:34 localhost dotnet[10290]: at NuGet.Configuration.NuGetPathContext.Create(ISettings settings)
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Configurer.NuGetCacheSentinel.get_NuGetCachePath()
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Configurer.NuGetCacheSentinel.Exists()
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Configurer.DotnetFirstTimeUseConfigurer.ShouldPrimeNugetCache()
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Configurer.DotnetFirstTimeUseConfigurer.Configure()
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Cli.Program.ConfigureDotNetForFirstTimeUse(INuGetCacheSentinel nugetCacheSentinel)
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Cli.Program.ProcessArgs(String[] args, ITelemetry telemetryClient)
Aug 31 05:13:34 localhost dotnet[10290]: at Microsoft.DotNet.Cli.Program.Main(String[] args)
Aug 31 05:13:34 localhost systemd[1]: jtrade.service: Main process exited, code=killed, status=6/ABRT
Aug 31 05:13:34 localhost systemd[1]: jtrade.service: Unit entered failed state.
Aug 31 05:13:34 localhost systemd[1]: jtrade.service: Failed with result 'signal'.
From here, if I check my environment variables with printenv, I find that HOME=/root.
Maybe it should be set to something else?
Apparently I just had to add
Environment=HOME=/root
to the .service file, and everything started working.
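Putting it together, the working unit file would look like this (the same unit as in the question, with the one HOME line added):
[Unit]
Description=Sample application.
[Service]
Type=simple
WorkingDirectory=/var/myappfolder
# systemd services don't inherit a login environment, so dotnet/NuGet
# can't find HOME unless it's set explicitly
Environment=HOME=/root
ExecStart=/usr/bin/dotnet /var/myappfolder/myapp.dll
[Install]
WantedBy=multi-user.target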

Meteor app crashes after being deployed but works on localhost

I deployed my Meteor app to a .meteor.com server and it states that it is crashing. However, the site works fine on my local machine. Here are the logs:
[Tue Mar 17 2015 20:58:58 GMT+0000 (UTC)] NOTICE Starting application on port
25107
[Tue Mar 17 2015 20:58:58 GMT+0000 (UTC)] INFO STATUS null -> starting
[Tue Mar 17 2015 20:58:58 GMT+0000 (UTC)] INFO STATUS killed -> sleeping
[Tue Mar 17 2015 20:58:58 GMT+0000 (UTC)] INFO STATUS starting -> killed
[Tue Mar 17 2015 20:58:58 GMT+0000 (UTC)] NOTICE Starting application on port
9393
[Tue Mar 17 2015 20:58:58 GMT+0000 (UTC)] INFO STATUS null -> starting
[Tue Mar 17 2015 20:58:59 GMT+0000 (UTC)] INFO STATUS starting -> crashed
[Tue Mar 17 2015 20:59:31 GMT+0000 (UTC)] INFO HIT / 174.100.99.102
[Tue Mar 17 2015 20:59:37 GMT+0000 (UTC)] INFO HIT / 174.100.99.102
[Tue Mar 17 2015 20:59:37 GMT+0000 (UTC)] INFO HIT /favicon.ico 174.100.99.102
[Tue Mar 17 2015 20:59:43 GMT+0000 (UTC)] INFO HIT /favicon.ico 174.100.99.102
[Tue Mar 17 2015 20:59:53 GMT+0000 (UTC)] INFO HIT /_GALAXY_ 174.100.99.102
[Tue Mar 17 2015 20:59:59 GMT+0000 (UTC)] INFO HIT /_GALAXY_ 174.100.99.102
OK, I forgot to comment out the email package code and remove the test key.
Once I did that, it deployed fine.

Is it possible to prevent fetching of the remote design document in CouchDB

Update
As @AkshatJiwanSharma suggested, I have tried a few things while replicating locally. Very instructive! I have renamed the question, since the problem is not that the design document gets replicated; in fact it isn't replicated, but it is fetched via an HTTP GET as part of the initial replication "negotiation" phase.
I've moved the original question to the bottom to make the new question clearer. The new question is:
It seems inefficient (particularly in the case of CouchApps) to fetch the entire design document, i.e. the entire remote app, when initiating a replication with a remote source. Can this be avoided?
It is particularly problematic in our case, on high-latency links (less than 7.2 kbps), with relatively large design documents (3 MB).
Remote Target
I first tried using a "remote" target, by setting the replication target to http://127.0.0.1:5984/emr_replica.
[Fri, 08 Aug 2014 08:36:20 GMT] [info] [<0.18947.7>] Document `88fa1b1a1315d27ded663466c6003578` triggered replication `e8e66a554d198b88b6263a572a072fd3+continuous`
[Fri, 08 Aug 2014 08:36:20 GMT] [info] [<0.18946.7>] starting new replication `e8e66a554d198b88b6263a572a072fd3+continuous` at <0.18947.7> (`emr_demo` -> `http://127.0.0.1:5984/emr_replica/`)
[Fri, 08 Aug 2014 08:36:20 GMT] [info] [<0.18928.7>] 127.0.0.1 - - POST /emr_replica/_revs_diff 200
[Fri, 08 Aug 2014 08:36:20 GMT] [info] [<0.18915.7>] y.y.y.y - - GET /_utils/_sidebar.html 200
[Fri, 08 Aug 2014 08:36:20 GMT] [info] [<0.18916.7>] y.y.y.y - - GET /_replicator/88fa1b1a1315d27ded663466c6003578?revs_info=true 200
In that case the design document doesn't seem to be fetched.
Remote Source
Then I set the source as "remote", like this:
{
    "_id": "88fa1b1a1315d27ded663466c6003a4a",
    "_rev": "3-b6408e98acafe729da0153c35d9df113",
    "source": "http://127.0.0.1:5984/emr_demo",
    "target": "emr_replica",
    "continuous": true,
    "filter": "emr/user_data",
    "owner": "jun"
}
Then the server fetches the remote design document before starting the replication (GET /emr_demo/_design/emr 200).
[Fri, 08 Aug 2014 08:42:17 GMT] [info] [<0.19687.7>] Document `88fa1b1a1315d27ded663466c6003a4a` triggered replication `bd8f6288970bca974dba36dbc6e5353b+continuous`
[Fri, 08 Aug 2014 08:42:17 GMT] [info] [<0.19686.7>] starting new replication `bd8f6288970bca974dba36dbc6e5353b+continuous` at <0.19687.7> (`http://127.0.0.1:5984/emr_demo/` -> `emr_replica`)
[Fri, 08 Aug 2014 08:42:17 GMT] [info] [<0.19648.7>] 127.0.0.1 - - HEAD /emr_demo/ 200
[Fri, 08 Aug 2014 08:42:17 GMT] [info] [<0.19648.7>] 127.0.0.1 - - GET /emr_demo/_design/emr 200
[Fri, 08 Aug 2014 08:42:18 GMT] [info] [<0.19656.7>] 127.0.0.1 - - GET /emr_demo/5cc2db69a32a84091b96c244273fda0e?revs=true&open_revs=%5B%221-ef8967557f2e99eb137f963daccddb3f%22%5D&latest=true 200
Further testing shows that this fetching of the design document is only done once. Subsequent replications (including after restarting the server) only fetch the changes, with the appropriate filter:
[Fri, 08 Aug 2014 09:06:36 GMT] [info] [<0.520.0>] Document `88fa1b1a1315d27ded663466c6003a4a` triggered replication `bd8f6288970bca974dba36dbc6e5353b+continuous`
[Fri, 08 Aug 2014 09:06:36 GMT] [info] [<0.519.0>] starting new replication `bd8f6288970bca974dba36dbc6e5353b+continuous` at <0.520.0> (`http://127.0.0.1:5984/emr_demo/` -> `emr_replica`)
[Fri, 08 Aug 2014 09:06:36 GMT] [info] [<0.335.0>] 127.0.0.1 - - GET /emr_demo/_changes?filter=emr%2Fuser_data&feed=continuous&style=all_docs&since=1607&heartbeat=1666 200
[Fri, 08 Aug 2014 09:06:36 GMT] [info] [<0.334.0>] 127.0.0.1 - - GET /emr_demo/5cc2db69a32a84091b96c24427560310?atts_since=%5B%2218-b613d3160bd09c45ac07a5485c9c7bce%22%5D&revs=true&open_revs=%5B%2219-d50438143337a3a0af5ed8ceb75b42f5%22%5D&latest=true 200
Former question
We're trying to use CouchDB replication over a very high-latency link (slow, with frequent disconnections, ...). We want to avoid replicating the design document, which is heavy. We have a filter in place, and when using the following curl command the design document doesn't appear, as expected:
curl http://x.x.x.x:5984/emr/_changes?filter=emr/user_data
Our replication document is:
{
    "_id": "e0e38be8cc0b11356dfb03bc8400074d",
    "_rev": "1-d77117f03d63099e1e505b9f9de3371d",
    "source": "http://x.x.x.x:5984/emr",
    "target": "emr",
    "continuous": true,
    "filter": "emr/user_data",
    "create_target": true,
    "owner": "jun"
}
We have deactivated authentication while we're debugging. When using an existing database and removing create_target, the same problem occurs.
The source server outputs the following:
[Mon, 10 Mar 2014 21:22:03 GMT] [info] [<0.135.0>] Retrying HEAD request to http://x.x.x.x:5984/emr/ in 0.25 seconds due to error {conn_failed,{error,etimedout}}
[Mon, 10 Mar 2014 21:23:47 GMT] [info] [<0.135.0>] Retrying GET request to http://x.x.x.x:5984/emr/_design/emr in 0.25 seconds due to error req_timedout
[Mon, 10 Mar 2014 21:24:14 GMT] [error] [<0.135.0>] Replicator, request GET to "http://x.x.x.x:5984/emr/_design/emr" failed due to error {error,req_timedout}
[Mon, 10 Mar 2014 21:24:14 GMT] [error] [<0.135.0>] Replication manager, error processing document `e0e38be8cc0b11356dfb03bc8400074d`: Couldn't open document `_design/emr` from source database `http://x.x.x.x:5984/emr/`: {'EXIT',{http_request_failed,"GET","http://x.x.x.x:5984/emr/_design/emr",
{error,{error,req_timedout}}}}
When using tcpdump, it's clear that the replication fails because the replication manager attempts to download the heavy design document (http://x.x.x.x:5984/emr/_design/emr).
FYI the replicator's configuration is:
replicator connection_timeout 5000
db _replicator
http_connections 1
max_replication_retry_count 3
retries_per_request 1
socket_options [{keepalive, true}, {nodelay, true}]
ssl_certificate_max_depth 3
verify_ssl_certificates false
worker_batch_size 1
worker_processes 1
EDIT: The user_data function (which correctly hides the design document when run through curl as above) is:
exports.user_data = function(doc, req) {
    if (doc.collection == "visits" || doc.collection == "patients" || doc.collection == "reports") {
        return true;
    }
    return false;
}
Hope someone can help!
Suggestion
Try defining a filter function in another, small, dedicated design document and see if that fixes your problem.
// replicator document:
{
    "_id": "e0e38be8cc0b11356dfb03bc8400074d",
    "_rev": "1-d77117f03d63099e1e505b9f9de3371d",
    "source": "http://x.x.x.x:5984/emr",
    "target": "emr",
    "continuous": true,
    "filter": "small-design-doc/user_data",
    "create_target": true,
    "owner": "jun"
}
// _design/small-design-doc
// -- will be replicated, but is quite small:
{
    "_id": "_design/small-design-doc",
    "_rev": "1-...",
    "filters": {
        "user_data": "function(doc, req) { ... }"
    }
}
Explanation
According to a current snapshot of the source code, it seems the replicator is trying to fetch the design document (_design/emr) from the source database, simply because the filter function is defined there (emr/user_data).
If you specify a filter function in another design document, the replicator should try to download that very document before executing replication. So you cannot quite circumvent downloading any design document, but you are able to select which one.
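To sanity-check this before wiring up the replication, you could query the _changes feed with the new filter, mirroring the curl test from the question (the design document and filter names here follow the sketch above):
curl http://x.x.x.x:5984/emr/_changes?filter=small-design-doc/user_data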
Great question by the way. And very thoroughly investigated!
