Presto - get sum of array elements

I'm using Athena and trying to get the sum of array elements that are doubles, but the reduce function seems to only work with integers:
SELECT reduce(ARRAY [5.0, 20.0, 50.4], 0, (s, x) -> s + x, s -> s);
This throws an error:
Unexpected parameters (array(double), integer, com.facebook.presto.sql.analyzer.TypeSignatureProvider#762f0fa7, com.facebook.presto.sql.analyzer.TypeSignatureProvider#29dfe267) for function reduce. Expected: reduce(array(T), S, function(S,T,S), function(S,R)) T, S, R
Is there a way to do it?

This is a known bug in Presto. You can track https://github.com/prestosql/presto/issues/2760.
As a workaround, you can cast your array(decimal(..)) to either array(decimal(38,..)) or array(double):
presto> SELECT reduce(cast(ARRAY[5.0, 20.0, 50.4] as array(decimal(38,5))), 0, (s, x) -> s + x, s -> s);
_col0
----------
75.40000
presto> SELECT reduce(cast(ARRAY[5.0, 20.0, 50.4] as array(double)), 0, (s, x) -> s + x, s -> s);
_col0
-------
75.4
Athena is based on an older Presto version (currently 0.172, three years old), and apparently none of the above works in Athena (given that your array is already array(double)).

It doesn't look clean, but I have found a way to make it work:
SELECT reduce(ARRAY [5.0, 20.0, 50.4],
CAST(ROW(0.0, 0) AS ROW(sum DOUBLE, count INTEGER)),
(s, x) -> CAST(ROW(x + s.sum, s.count + 1) AS ROW(sum DOUBLE, count INTEGER)),
s -> IF(s.count = 0, NULL, s.sum));
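If the root cause is simply that the initial state 0 is typed as INTEGER while the lambda returns DOUBLE, a simpler variant may also work; this is an untested sketch against Athena's Presto 0.172:
SELECT reduce(ARRAY [5.0, 20.0, 50.4], CAST(0 AS DOUBLE), (s, x) -> s + x, s -> s);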

Related

Rust signed modulo unsigned -> unsigned

In (stable) Rust, is there a relatively straightforward method of implementing the following function?
fn mod_euclid(val: i128, modulo: u128) -> u128;
Note the types! That is, 'standard' Euclidean modulus (the result is always in the range [0, mod)), avoiding spurious overflow/underflow in the intermediate calculation. Some test cases:
// don't-care, just no panic or UB.
// Mild preference for treating this as though it was mod=1<<128 instead of 0.
assert_dc!(mod_euclid(i128::MAX, 0));
assert_dc!(mod_euclid( 0, 0));
assert_dc!(mod_euclid(i128::MIN, 0));
assert_eq!(mod_euclid( 1, 10), 1);
assert_eq!(mod_euclid( -1, 10), 9);
assert_eq!(mod_euclid( 11, 10), 1);
assert_eq!(mod_euclid( -11, 10), 9);
assert_eq!(mod_euclid(i128::MAX, 1), 0);
assert_eq!(mod_euclid( 0, 1), 0);
assert_eq!(mod_euclid(i128::MIN, 1), 0);
assert_eq!(mod_euclid(i128::MAX, u128::MAX), i128::MAX as u128);
assert_eq!(mod_euclid( 0, u128::MAX), 0);
assert_eq!(mod_euclid(i128::MIN, u128::MAX), i128::MAX as u128);
For signed%signed->signed, or unsigned%unsigned->unsigned, this is relatively straightforward. However, I can't find a good way of calculating signed % unsigned -> unsigned without converting one of the arguments - and as the last example illustrates, this may overflow or underflow no matter which direction you choose.
As far as I can tell, there is no such function in the standard library, but it's not very difficult to write one yourself:
fn mod_euclid(a: i128, b: u128) -> u128 {
    // note: like the % operator, this panics if b == 0
    if a >= 0 {
        (a as u128) % b
    } else {
        // !a == -a - 1 is non-negative, so the cast never overflows
        let r = (!a as u128) % b;
        b - r - 1
    }
}
Playground link
How it works:
If a is non-negative then it's straightforward - just use the unsigned remainder operator.
Otherwise, the bitwise complement !a is non-negative (because the sign bit is flipped), and numerically equal to -a - 1. This means r is equivalent to b - a - 1 modulo b, and hence b - r - 1 is equivalent to a modulo b. Conveniently, b - r - 1 is in the expected range 0..b.
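A quick numeric check of the complement trick (a tiny test sketch, assuming the mod_euclid above):
// a = -11, b = 10: !(-11) = 10, so r = 10 % 10 = 0
// and b - r - 1 = 10 - 0 - 1 = 9, matching -11 mod 10 == 9
assert_eq!(mod_euclid(-11, 10), 9);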
Maybe a little more straightforward: use rem_euclid where possible, and otherwise return the positive value equivalent to a:
pub fn mod_euclid(a: i128, b: u128) -> u128 {
    const UPPER: u128 = i128::MAX as u128;
    match b {
        // b fits in i128, so rem_euclid can do the work
        1..=UPPER => a.rem_euclid(b as i128) as u128,
        // here b > i128::MAX >= a, so a is already reduced
        _ if a >= 0 => a as u128,
        // a < 0 and b > i128::MAX >= -(a + 1), so a + b is the
        // representative in [0, b); wrapping_add_signed also keeps
        // the b == 0 don't-care case panic-free (it wraps mod 2^128)
        _ => b.wrapping_add_signed(a),
    }
}
(The parser didn't like a cast directly in the match pattern, hence the UPPER constant.)
Playground
It results in slightly fewer instructions and jumps on x86_64 as well.

Python Bit Summation Algorithm

I am trying to implement a function that will be used to judge whether a generator's output is continuous. The method I am gravitating towards is to iterate through the generator. For each value, I right justify the bits of the value (disregarding the 0b), count the number of ones, and shift the number of ones.
#!/usr/bin/python3
from typing import Tuple

def find_bit_sum(top: int, pad_length: int) -> int :
    """."""
    return pad_length * (top + 1)

def find_pad_length(top: int) -> int :
    """."""
    return len(bin(top)) - 2  # -"0b"

def guess_certain(top: int, pad_length: int) -> Tuple[int, int, int] :
    """."""
    _both: int = find_bit_sum(top, pad_length)
    _ones: int = sum(sum(int(_i_in) for _i_in in bin(_i_out)[2 :]) for _i_out in range(1, top + 1))
    return _both - _ones, _ones, _both  # zeros, ones, sum

def guess(top: int, pad_length: int) -> Tuple[int, int, int] :  # zeros then ones then sum
    """."""
    _bit_sum: int = find_bit_sum(top, pad_length)  # number of bits in total
    _zeros: int = _bit_sum  # ones are deducted
    _ones: int = 0  # _bit_sum - _zeros
    # detect ones
    for _indexed in range(pad_length) :
        _ones_found: int = int(top // (2 ** (_indexed + 1)))  # HELP!!!
        _zeros -= _ones_found
        _ones += _ones_found
    #
    return _zeros, _ones, _bit_sum

def test_the_guess(max_value: int) -> bool :  # the range is int [0, max_value + 1)
    pad: int = find_pad_length(max_value)
    _zeros0, _ones0, _total0 = guess_certain(max_value, pad)
    _zeros1, _ones1, _total1 = guess(max_value, pad)
    return all((
        _zeros0 == _zeros1,
        _ones0 == _ones1,
        _total0 == _total1
    ))

if __name__ == '__main__' :  # should produce a lot of True
    for x in range(3000) :
        print(test_the_guess(x))
For the life of me, I cannot make guess() agree with guess_certain(). The time complexity of guess_certain() is my problem: it works for small ranges [0, top], but you can forget about 256-bit numbers (tops). The find_bit_sum() function works perfectly. The find_pad_length() function also works.
top // (2 ** (_indexed + 1))
I've tried 40 or 50 variations of the guess() function. It has thoroughly frustrated me. The guess() function is probabilistic. In its finished state: if it returns False, then the Generator definitely isn't producing every value in range(top + 1); however, if it returns True, then the Generator could be. We already know that the generator range(top + 1) is continuous because it does produce each number between 0 and top inclusively; so, test_the_guess() should be returning True.
I sincerely do apologise for the chaotic explanation. If you have any questions, please don't hesitate to ask.
I adjusted your ones_found assignment statement to account for the number of powers of two per int(top // (2 ** (_indexed + 1))), as well as the additional "rollover" ones that occur before the next power of two. Here is the resulting statement:
_ones_found: int = int(top // (2 ** (_indexed + 1))) * (2 ** (_indexed)) + max(0, (top % (2 ** (_indexed + 1))) - (2 ** _indexed) + 1)
I also took the liberty of converting the statement to bitwise operators for both clarity and speed, as shown below:
_ones_found: int = ((top >> _indexed + 1) << _indexed) + max(0, (top & (1 << _indexed + 1) - 1) - (1 << _indexed) + 1)
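To see why this counts correctly: bit _indexed cycles in blocks of 2 ** (_indexed + 1) consecutive values and is set in the upper half of each block. Here is a standalone closed-form helper (the name is ours) cross-checked against brute force:
def ones_at_bit(top: int, i: int) -> int:
    """Count the integers in [0, top] whose bit i is set (closed form)."""
    half = 1 << i       # 2 ** i
    block = half << 1   # 2 ** (i + 1)
    full = (top >> (i + 1)) * half                     # complete blocks
    partial = max(0, (top & (block - 1)) - half + 1)   # trailing partial block
    return full + partial

# brute-force cross-check over small ranges
assert all(
    ones_at_bit(t, i) == sum((n >> i) & 1 for n in range(t + 1))
    for t in range(512) for i in range(10)
)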

Check if a float can be converted to integer without loss

I wanted to check whether an integer was a power of 2. My standard approach would have been to see if log₂(x) was an integer value; however, I found no elegant way to do this. My approaches were the following:
let x = 65;
let y = (x as f64).log(2.0);

// Compute the difference between y and the result
// of truncating y by casting to int and back
let difference = y - (y as i64 as f64);

// This looks nice, but matches on float values will be phased out
match difference {
    0.0 => println!("{} is a power of 2", x),
    _ => println!("{} is NO power of 2", x),
}

// This seems kind of clunky
if difference == 0.0 {
    println!("{} is a power of 2", x);
} else {
    println!("{} is NO power of 2", x);
}
Is there a builtin option in Rust to check if a float can be converted to an integer without truncation?
Something that behaves like:
42.0f64.is_int() // True/ Ok()
42.23f64.is_int() // False/ Err()
In other words, a method/ macro/ etc. that allows me to check if I will lose information (decimals) by casting to int.
I already found that checking whether an integer is a power of 2 can be done efficiently with x.count_ones() == 1.
You can use fract to check if there is a non-zero fractional part:
42.0f64.fract() == 0.0;
42.23f64.fract() != 0.0;
Note that this only works if you already know that the number is in range. If you need an extra check to test that the floating-point number is between 0 and u32::MAX (or between i32::MIN and i32::MAX), then you might as well do the conversion and check that it didn't lose precision:
x == (x as u32) as f64
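Putting the two checks together, a minimal sketch (the helper name is ours), assuming a modern Rust where float-to-int casts saturate rather than wrap:
fn is_exact_u32(x: f64) -> bool {
    // exact iff the u32 round trip preserves x;
    // NaN and out-of-range values fail the equality
    x == (x as u32) as f64
}

assert!(is_exact_u32(42.0));
assert!(!is_exact_u32(42.23));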

How to enforce the left-to-right node ordering in GraphViz rank layout?

I am visualizing a collection of processes with GraphViz. Each process consists of some Read or Write operations in program order. Naturally, it is desirable to arrange the operations left to right with respect to each process.
Using GraphViz (version 2.28), my code goes like this:
digraph G
{
    ranksep = 1.0; size = "10,10";
    {
        node [shape = plaintext, fontsize = 20];
        0 -> 1 -> 2 -> 3 -> 4;
    }
    node [shape = box];
    {rank = same; 0; wy1; rf1; rc1; rz1; ry1; ra1; rb1; rx2;}
    {rank = same; 1; wf1;}
    {rank = same; 2; wx2; wc1;}
    {rank = same; 3; wf2; wz2; wx3; wa1;}
    {rank = same; 4; wz1; wy2; wx5; wb1;}
    wy1 -> rf1;
    rf1 -> rc1;
    rc1 -> rz1;
    rz1 -> ry1;
    ry1 -> ra1;
    ra1 -> rb1;
    rb1 -> rx2;
    wx2 -> wc1;
    wf2 -> wz2;
    wz2 -> wx3;
    wx3 -> wa1;
    wz1 -> wy2;
    wy2 -> wx5;
    wx5 -> wb1;
    wf1 -> rf1 [color = blue];
    wc1 -> rc1 [color = blue];
    wz1 -> rz1 [color = blue];
    wy1 -> ry1 [color = blue];
    wa1 -> ra1 [color = blue];
    wb1 -> rb1 [color = blue];
    wx2 -> rx2 [color = blue];
    // W'WR Order:
    wx3 -> wx2 [style = dashed, color = red];
    // W'WR Order:
    wx5 -> wx2 [style = dashed, color = red];
}
I am sorry to say that my reputation is too low to post the output picture. If you run the code, you will see that the result is not satisfying, due to the out-of-order nodes in the process with pid = 3. Specifically, the GraphViz layout algorithm has rearranged the (ideal) order "wf2 -> wz2 -> wx3 -> wa1" into "wx3, wf2, wz2, wa1". Therefore, my problem is:
My Problem: How to enforce the left-to-right node ordering in the rank environment?
Exploring this site, I have found some similar problems and potential solutions; however, they just did not work in my specific example:
Graphviz .dot node ordering: the constraint = false option made my PDF picture worse. I checked the dot User's Manual, which says:
During rank assignment, the head node of an edge is constrained to be on a higher rank than the tail node. If the edge has constraint=false, however, this requirement is not enforced.
Based on the above, I guess the constraint = false option takes effect between different ranks rather than within the same rank.
Graphviz - random node order and edges going through labels: To my surprise, the constraint = false option helped the "finite state machine" a lot within a single rank. Still, it did not get me out of trouble here.
graphviz: circular layout while preserving node order: The process graph is dynamic in both the number of nodes and the number of edges, so it may not be attractive to use absolute positions for the nodes (would that cause many edge crossings?).
Thanks for any suggestions; executable code will be appreciated very much.
digraph G
{
    ranksep = 1.0; size = "10,10";
    {
        node [shape = plaintext, fontsize = 20];
        0 -> 1 -> 2 -> 3 -> 4;
    }
    node [shape = box];
    {
        rank = same;
        0 -> wy1 -> rf1 -> rc1 -> rz1 -> ry1 -> ra1 -> rb1 -> rx2 [color = white];
        rankdir = LR;
    }
    {
        rank = same;
        1 -> wf1 [color = white];
        rankdir = LR;
    }
    {
        rank = same;
        2 -> wx2 -> wc1 [color = white];
        rankdir = LR;
    }
    {
        rank = same;
        3 -> wf2 -> wz2 -> wx3 -> wa1 [color = white];
        rankdir = LR;
    }
    {
        rank = same;
        4 -> wz1 -> wy2 -> wx5 -> wb1 [color = white];
        rankdir = LR;
    }
    wy1 -> rf1;
    rf1 -> rc1;
    rc1 -> rz1;
    rz1 -> ry1;
    ry1 -> ra1;
    ra1 -> rb1;
    rb1 -> rx2;
    wx2 -> wc1;
    wf2 -> wz2;
    wz2 -> wx3;
    wx3 -> wa1;
    wz1 -> wy2;
    wy2 -> wx5;
    wx5 -> wb1;
    wf1 -> rf1 [color = blue];
    wc1 -> rc1 [color = blue];
    wz1 -> rz1 [color = blue];
    wy1 -> ry1 [color = blue];
    wa1 -> ra1 [color = blue];
    wb1 -> rb1 [color = blue];
    wx2 -> rx2 [color = blue];
    // W'WR Order:
    wx3 -> wx2 [style = dashed, color = red];
    // W'WR Order:
    wx5 -> wx2 [style = dashed, color = red];
}
I am not quite sure that I've correctly understood your problem, but try using this, and please comment if it is what you want. I've added invisible edges for correct ordering of the nodes within each rank, and used rankdir for the left-to-right layout.
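Two refinements worth trying. First, style = invis hides the ordering edges entirely, whereas color = white edges may still reserve space or show on non-white backgrounds. Second, rankdir is only honored at the root graph level, so the rankdir = LR lines inside the rank = same groups likely have no effect; the ordering comes from the invisible edge chains. For example:
{
    rank = same;
    // invisible ordering chain: fixes the in-rank order without drawing edges
    3 -> wf2 -> wz2 -> wx3 -> wa1 [style = invis];
}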

How to find "nearest" value in a large list in Erlang

Suppose I have a large collection of integers (say 50,000,000 of them).
I would like to write a function that returns me the largest integer in the collection that doesn't exceed a value passed as a parameter to the function. E.g. if the values were:
Values = [ 10, 20, 30, 40, 50, 60]
then find(Values, 25) should return 20.
The function will be called many times a second and the collection is large. Assuming that the performance of a brute-force search is too slow, what would be an efficient way to do it? The integers would rarely change, so they can be stored in a data structure that would give the fastest access.
I've looked at gb_trees but I don't think you can obtain the "insertion point" and then get the previous entry.
I realise I could do this from scratch by building my own tree structure, or binary chopping a sorted array, but is there some built-in way to do it that I've overlooked?
To find the nearest value in a large unsorted list, I'd suggest a divide-and-conquer strategy: process different parts of the list in parallel, while sufficiently small parts are processed sequentially.
Here is code for you:
-module( finder ).
-export( [ nearest/2 ] ).
-define( THRESHOLD, 1000 ).

%%
%% sequential finding of nearest value
%%
%% if the nearest value doesn't exist - return null
%%
nearest( Val, List ) when length(List) =< ?THRESHOLD ->
    lists:foldl(
        fun
            ( X, null ) when X < Val ->
                X;
            ( _X, null ) ->
                null;
            ( X, Nearest ) when X < Val, X > Nearest ->
                X;
            ( _X, Nearest ) ->
                Nearest
        end,
        null,
        List );
%%
%% split large lists and process each part in parallel
%%
nearest( Val, List ) ->
    { Left, Right } = lists:split( length(List) div 2, List ),
    Ref1 = spawn_nearest( Val, Left ),
    Ref2 = spawn_nearest( Val, Right ),
    Nearest1 = receive_nearest( Ref1 ),
    Nearest2 = receive_nearest( Ref2 ),
    %%
    %% compare nearest values from each part
    %%
    case { Nearest1, Nearest2 } of
        { null, null } ->
            null;
        { null, Nearest2 } ->
            Nearest2;
        { Nearest1, null } ->
            Nearest1;
        { Nearest1, Nearest2 } when Nearest2 > Nearest1 ->
            Nearest2;
        { Nearest1, Nearest2 } when Nearest2 =< Nearest1 ->
            Nearest1
    end.

spawn_nearest( Val, List ) ->
    Ref = make_ref(),
    SelfPid = self(),
    spawn(
        fun() ->
            SelfPid ! { Ref, nearest( Val, List ) }
        end ),
    Ref.

receive_nearest( Ref ) ->
    receive
        { Ref, Nearest } -> Nearest
    end.
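One caveat: receive_nearest/1 blocks forever if a spawned worker crashes before sending its result. A variant with a timeout (the value here is arbitrary) could look like this:
receive_nearest( Ref ) ->
    receive
        { Ref, Nearest } -> Nearest
    after 5000 ->
        %% no answer from the worker within 5 seconds
        exit( timeout )
    end.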
Testing in shell:
1> c(finder).
{ok,finder}
2>
2> List = [ random:uniform(1000) || _X <- lists:seq(1,100000) ].
[444,724,946,502,312,598,916,667,478,597,143,210,698,160,
559,215,458,422,6,563,476,401,310,59,579,990,331,184,203|...]
3>
3> finder:nearest( 500, List ).
499
4>
4> finder:nearest( -100, lists:seq(1,100000) ).
null
5>
5> finder:nearest( 40000, lists:seq(1,100000) ).
39999
6>
6> finder:nearest( 4000000, lists:seq(1,100000) ).
100000
Performance: (single node)
7>
7> timer:tc( finder, nearest, [ 40000, lists:seq(1,10000) ] ).
{3434,10000}
8>
8> timer:tc( finder, nearest, [ 40000, lists:seq(1,100000) ] ).
{21736,39999}
9>
9> timer:tc( finder, nearest, [ 40000, lists:seq(1,1000000) ] ).
{314399,39999}
Versus plain iterating:
1>
1> timer:tc( lists, foldl, [ fun(_X, Acc) -> Acc end, null, lists:seq(1,10000) ] ).
{14994,null}
2>
2> timer:tc( lists, foldl, [ fun(_X, Acc) -> Acc end, null, lists:seq(1,100000) ] ).
{141951,null}
3>
3> timer:tc( lists, foldl, [ fun(_X, Acc) -> Acc end, null, lists:seq(1,1000000) ] ).
{1374426,null}
So, as you may see, on a list with 1000000 elements the function finder:nearest is faster than a plain iteration through the list with lists:foldl.
You may tune the optimal value of THRESHOLD for your case.
You may also improve performance by spawning the processes on different nodes.
Here is another code sample that uses ets. I believe a lookup would be made in near-constant (logarithmic) time:
1> ets:new(tab,[named_table, ordered_set, public]).
2> lists:foreach(fun(N) -> ets:insert(tab,{N,[]}) end, lists:seq(1,50000000)).
3> timer:tc(fun() -> ets:prev(tab, 500000) end).
{21,499999}
4> timer:tc(fun() -> ets:prev(tab, 41230000) end).
{26,41229999}
The surrounding code would be a bit more than this, of course, but it is rather neat.
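A minimal wrapper sketch around this idea (the function name is ours), assuming integer keys inserted as above; ets:prev/2 returns '$end_of_table' when no smaller key exists:
find_floor( Tab, Val ) ->
    %% largest key =< Val: ask for the key before Val + 1
    case ets:prev( Tab, Val + 1 ) of
        '$end_of_table' -> null;
        Key -> Key
    end.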
So if the input isn't sorted, you can get a linear version by doing:
closest(Target, [Hd | Tl]) ->
    closest(Target, Tl, Hd).

closest(_Target, [], Best) -> Best;
closest(Target, [Target | _], _) -> Target;
closest(Target, [N | Rest], Best) ->
    CurEps = erlang:abs(Target - Best),
    NewEps = erlang:abs(Target - N),
    if NewEps < CurEps ->
           closest(Target, Rest, N);
       true ->
           closest(Target, Rest, Best)
    end.
You should be able to do better if the input is sorted.
I invented my own metric for 'closest' here, as I allow the closest value to be higher than the target value; you could change it to 'closest but not greater than' if you liked.
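For the sorted case, here is one sketch, assuming the values fit in a tuple (which gives O(1) element/2 access): a plain binary search for the greatest element not exceeding the target, returning null when every element exceeds it.
nearest_sorted(Target, Tuple) ->
    bsearch(Target, Tuple, 1, tuple_size(Tuple), null).

bsearch(_Target, _Tuple, Lo, Hi, Best) when Lo > Hi ->
    Best;
bsearch(Target, Tuple, Lo, Hi, Best) ->
    Mid = (Lo + Hi) div 2,
    V = element(Mid, Tuple),
    if V =< Target ->
           %% candidate found; look right for a larger one
           bsearch(Target, Tuple, Mid + 1, Hi, V);
       true ->
           bsearch(Target, Tuple, Lo, Mid - 1, Best)
    end.
For the list in the question, nearest_sorted(25, list_to_tuple(lists:sort(Values))) would return 20.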
In my opinion, if you have a huge collection of data that does not change often, you should think about organizing it.
I have written a simple structure based on an ordered list, including insertion and deletion functions. It gives good results for both inserting and searching.
-module(finder).
-export([test/1,find/2,insert/2,remove/2,new/0]).
-compile(export_all).

new() -> [].

insert(V,L) ->
    {R,P} = locate(V,L,undefined,-1),
    insert(V,R,P,L).

find(V,L) ->
    locate(V,L,undefined,-1).

remove(V,L) ->
    {R,P} = locate(V,L,undefined,-1),
    remove(V,R,P,L).

test(Max) ->
    {A,B,C} = erlang:now(),
    random:seed(A,B,C),
    L = lists:seq(0,100*Max,100),
    S = random:uniform(100000000),
    I = random:uniform(100000000),
    io:format("start insert at ~p~n",[erlang:now()]),
    L1 = insert(I,L),
    io:format("start find at ~p~n",[erlang:now()]),
    R = find(S,L1),
    io:format("end at ~p~n result is ~p~n",[erlang:now(),R]).

remove(_,_,-1,L) -> L;
remove(V,V,P,L) ->
    {L1,[V|L2]} = lists:split(P,L),
    L1 ++ L2;
remove(_,_,_,L) -> L.

insert(V,V,_,L) -> L;
insert(V,_,-1,L) -> [V|L];
insert(V,_,P,L) ->
    {L1,L2} = lists:split(P+1,L),
    L1 ++ [V] ++ L2.

locate(_,[],R,P) -> {R,P};
locate(V,L,R,P) ->
    %% io:format("locate, value = ~p, liste = ~p, current result = ~p, current pos = ~p~n",[V,L,R,P]),
    {L1,[M|L2]} = lists:split(Le1 = (length(L) div 2), L),
    locate(V,R,P,Le1+1,L1,M,L2).

locate(V,_,P,Le,_,V,_) -> {V,P+Le};
locate(V,_,P,Le,_,M,L2) when V > M -> locate(V,L2,M,P+Le);
locate(V,R,P,_,L1,_,_) -> locate(V,L1,R,P).
which gives the following results:
(exec#WXFRB1824L)6> finder:test(10000000).
start insert at {1347,28177,618000}
start find at {1347,28178,322000}
end at {1347,28178,728000}
result is {72983500,729836}
That is 704 ms to insert a new value into a list of 10 000 000 elements, and 406 ms to find the nearest value in the same list.
I tried to get more accurate information about the performance of the algorithm I proposed above, and after reading the very interesting solution from Stemm, I decided to use the timer:tc/3 function. Big disappointment :o). On my laptop the timing accuracy was very poor, so I decided to leave my Core i5 (2 cores * 2 threads) + 2 GB DDR3 + Windows XP 32-bit and use my home PC: Phantom (6 cores) + 8 GB + Linux 64-bit.
Now timer:tc works as expected and I am able to manipulate lists of 100 000 000 integers. I could see that I was losing a lot of time calling the length function at each step, so I refactored the code a little to avoid it:
-module(finder).
-export([test/2,find/2,insert/2,remove/2,new/0]).

%% interface
new() -> {0,[]}.

insert(V,{S,L}) ->
    {R,P} = locate(V,L,S,undefined,-1),
    insert(V,R,P,L,S).

find(V,{S,L}) ->
    locate(V,L,S,undefined,-1).

remove(V,{S,L}) ->
    {R,P} = locate(V,L,S,undefined,-1),
    remove(V,R,P,L,S).

remove(_,_,-1,L,S) -> {S,L};
remove(V,V,P,L,S) ->
    {L1,[V|L2]} = lists:split(P,L),
    {S-1,L1 ++ L2};
remove(_,_,_,L,S) -> {S,L}.

%% local
insert(V,V,_,L,S) -> {S,L};
insert(V,_,-1,L,S) -> {S+1,[V|L]};
insert(V,_,P,L,S) ->
    {L1,L2} = lists:split(P+1,L),
    {S+1,L1 ++ [V] ++ L2}.

locate(_,[],_,R,P) -> {R,P};
locate(V,L,S,R,P) ->
    S1 = S div 2,
    S2 = S - S1 - 1,
    {L1,[M|L2]} = lists:split(S1, L),
    locate(V,R,P,S1+1,L1,S1,M,L2,S2).

locate(V,_,P,Le,_,_,V,_,_) -> {V,P+Le};
locate(V,_,P,Le,_,_,M,L2,S2) when V > M -> locate(V,L2,S2,M,P+Le);
locate(V,R,P,_,L1,S1,_,_,_) -> locate(V,L1,S1,R,P).

%% test
test(Max,Iter) ->
    {A,B,C} = erlang:now(),
    random:seed(A,B,C),
    L = {Max+1,lists:seq(0,100*Max,100)},
    Ins = test_insert(L,Iter,[]),
    io:format("insert:~n~s~n",[stat(Ins,Iter)]),
    Fin = test_find(L,Iter,[]),
    io:format("find:~n ~s~n",[stat(Fin,Iter)]).

test_insert(_L,0,Res) -> Res;
test_insert(L,I,Res) ->
    V = random:uniform(1000000000),
    {T,_} = timer:tc(finder,insert,[V,L]),
    test_insert(L,I-1,[T|Res]).

test_find(_L,0,Res) -> Res;
test_find(L,I,Res) ->
    V = random:uniform(1000000000),
    {T,_} = timer:tc(finder,find,[V,L]),
    test_find(L,I-1,[T|Res]).

stat(L,N) ->
    Aver = lists:sum(L)/N,
    {Min,Max,Var} = lists:foldl(fun (X,{Mi,Ma,Va}) -> {min(X,Mi),max(X,Ma),Va+(X-Aver)*(X-Aver)} end, {999999999999999999999999999,0,0}, L),
    Sig = math:sqrt(Var/N),
    io_lib:format(" average: ~p,~n minimum: ~p,~n maximum: ~p,~n sigma : ~p.~n",[Aver,Min,Max,Sig]).
Here are some results.
1> finder:test(1000,10).
insert:
average: 266.7,
minimum: 216,
maximum: 324,
sigma : 36.98121144581393.
find:
average: 136.1,
minimum: 105,
maximum: 162,
sigma : 15.378231367748375.
ok
2> finder:test(100000,10).
insert:
average: 10096.5,
minimum: 9541,
maximum: 12222,
sigma : 762.5642595873478.
find:
average: 5077.4,
minimum: 4666,
maximum: 6937,
sigma : 627.126494417195.
ok
3> finder:test(1000000,10).
insert:
average: 109871.1,
minimum: 94747,
maximum: 139916,
sigma : 13852.211285206417.
find:
average: 40428.0,
minimum: 31297,
maximum: 56965,
sigma : 7797.425562325042.
ok
4> finder:test(100000000,10).
insert:
average: 8067547.8,
minimum: 6265625,
maximum: 16590349,
sigma : 3199868.809140206.
find:
average: 8484876.4,
minimum: 5158504,
maximum: 15950944,
sigma : 4044848.707872872.
ok
On the 100 000 000 element list it is slow, and the multi-process solution cannot help with this dichotomy algorithm... That is a weak point of this solution, but if several processes request a nearest value in parallel, it will still be able to use the multiple cores.
Pascal.
