Rcpp and the %*% operator, revisited

I'm trying to decide whether it makes sense to implement R's %*% operator in Rcpp when my dataset is huge, but I am really having trouble getting an Rcpp implementation to work.
Here is my example R code:
# remove everything in the global environment
rm(list = ls())

n_states <- 4                    # number of states
v_n <- c("H", "S1", "S2", "D")   # the 4 states of the model
n_t <- 100                       # number of transitions

# create transition matrix with random numbers. This transition matrix is constant.
m_P <- matrix(runif(n_states * n_t),  # insert n_states * n_t random numbers (can change this later)
              nrow = n_states,
              ncol = n_states,
              dimnames = list(v_n, v_n))

# create Markov trace: the proportion of the population in each state at each period
# (won't make sense due to the random numbers, but that is fine)
m_TR <- matrix(NA,
               nrow = n_t + 1,
               ncol = n_states,
               dimnames = list(0:n_t, v_n))  # n_t + 1 rows because R doesn't understand Cycle 0

# initialize Markov trace
m_TR[1, ] <- c(1, 0, 0, 0)

# run the loop
microbenchmark::microbenchmark(  # function from the microbenchmark library used to time this
  for (t in 1:n_t) {             # throughout the number of cycles
    m_TR[t + 1, ] <- m_TR[t, ] %*% m_P  # estimate the Markov trace for the next cycle (t + 1)
  }
)  # end of microbenchmark call

print(m_TR)  # print the result
And here is my replacement for the %*% operator, which doesn't seem to work correctly at all, although I can't figure out why:
library(Rcpp)

cppFunction(
'void estimate_markov(int n_t, NumericMatrix m_P, NumericMatrix m_TR)
{
    // We want to reproduce this:
    //   matrix_A[X+1,] <- matrix_A[X,] %*% matrix_B
    //
    // The %*% operation behaves as follows for a vector_V %*% matrix_M.
    // Here the matrix M is populated with letters so that you can
    // more easily see how the operation is performed.
    // So, a multiplication like this:
    //
    //    V          M
    //   {1}  %*%  {A D}
    //   {2}       {B E}
    //   {3}       {C F}
    //
    // results in a vector:
    //
    //    V_result
    //   {1*A + 1*D}
    //   {2*B + 2*E}
    //   {3*C + 3*F}
    //
    // Now use values instead of letters for M (e.g. A=1, B=2, ..., F=6):
    //
    //    V_result
    //   {1*1 + 1*4}      {1 +  4}      { 5}
    //   {2*2 + 2*5}  =>  {4 + 10}  =>  {14}
    //   {3*3 + 3*6}      {9 + 18}      {27}
    //
    // Note that the RHS matrix may contain any number of columns,
    // but *MUST* contain the same number of rows as the LHS vector.

    // Get dimensions of matrices, and sanity check:
    // the number of elements in a vector from the LHS matrix must equal the number of rows on the RHS
    if (m_TR.cols() != m_P.rows())
        throw std::range_error("Matrix mismatch, m_TR.cols != m_P.rows");

    // we want to know these dimensions, and there is no reason to call these functions in a loop;
    // store the values once
    int cnt_P_cols  = m_P.cols();
    int cnt_TR_cols = m_TR.cols();

    for (int Index = 1; Index <= n_t; ++Index)
    {
        // iterate over the columns in m_TR
        for (int col_iter = 0; col_iter < cnt_TR_cols; ++col_iter)
        {
            // an accumulator for the vector multiplication
            double sum = 0;
            // The new value comes from the previous row (Index-1)
            double orig_TR = m_TR(col_iter, Index - 1);
            // iterate over the columns in m_P corresponding to this Index
            for (int p_iter = 0; p_iter < cnt_P_cols; ++p_iter)
            {
                // accumulate the value of this TR scalar * the m_P vector
                sum += orig_TR * m_P(p_iter, Index);
            }
            m_TR(col_iter, Index) = sum;
        }
    }
}'
)
Can someone point me to where my logic is going wrong?
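For comparison, here is a minimal sketch (my own illustration, not the question's code; the name estimate_markov_fixed is made up) of what the row-vector times matrix update looks like when each output entry is the dot product of the previous row of m_TR with one column of m_P, indexing matrices as (row, column):
library(Rcpp)

cppFunction(
'void estimate_markov_fixed(int n_t, NumericMatrix m_P, NumericMatrix m_TR)
{
    if (m_TR.cols() != m_P.rows())
        throw std::range_error("Matrix mismatch, m_TR.cols != m_P.rows");

    int n_states = m_P.cols();
    for (int t = 0; t < n_t; ++t)           // rows of m_TR, i.e. cycles
    {
        for (int j = 0; j < n_states; ++j)  // one output per column of m_P
        {
            double sum = 0;
            for (int k = 0; k < n_states; ++k)
                sum += m_TR(t, k) * m_P(k, j);  // row t of m_TR dotted with column j of m_P
            m_TR(t + 1, j) = sum;
        }
    }
}'
)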

Related

Sequence Of Zero

Consider the sequence of numbers from 1 to N. For example, for N = 9,
we have 1, 2, 3, 4, 5, 6, 7, 8, 9.
Now, place between the numbers one of the following three operators:
"+" addition
"-" subtraction
"#" paste operator: pastes the previous and the next operands. For example, 1#2 = 12.
How can I calculate the number of possible sequences that evaluate to zero?
Example for N = 7:
1+2-3+4-5-6+7
1+2-3-4+5+6-7
1-2#3+4+5+6+7
1-2#3-4#5+6#7
1-2+3+4-5+6-7
1-2-3-4-5+6+7
See the fourth sequence: it is the same as 1-23-45+67, and the result is 0.
All of the above sequences evaluate to zero.
Here is my recursion-based solution (implemented in C++), just to build your intuition so that you can approach and improve it using dynamic programming on your own:
#include <iostream>
using namespace std;

// N is the input
// index_count is the index count in the given sequence
// sum is the running total of a given sequence
int isEvaluateToZero(int N, int index_count, int sum){
    // if N==1, the sequence only contains 1, which is not 0, so return 0
    if(N==1){
        return 0;
    }
    // Base case:
    // if index_count is equal to N and the total sum is 0, return 1, else 0
    if(index_count==N){
        if(sum==0){
            return 1;
        }
        return 0;
    }
    // recursively call by considering '+' between index_count and index_count+1
    // increase index_count by 1
    int placeAdd = isEvaluateToZero(N, index_count+1, sum+index_count+1);
    // recursively call by considering '-' between index_count and index_count+1
    // increase index_count by 1
    int placeMinus = isEvaluateToZero(N, index_count+1, sum-index_count-1);
    // place '#'
    int placePaste;
    if(index_count+2<=N){
        // paste the previous and the next operands, shifting the first
        // operand left by the number of digits in the second:
        // e.g., (8#9)      = 8*(10^1)+9      = 89
        //       (9#10)     = 9*(10^2)+10     = 910
        //       (99#100)   = 99*(10^3)+100   = 99100
        //       (999#1000) = 999*(10^4)+1000 = 9991000
        int num1 = index_count+1;
        int num2 = index_count+2;
        int pow10 = 1;
        for(int t = num2; t > 0; t /= 10){  // 10^(digit count of num2)
            pow10 *= 10;
        }
        int concat_num = num1*pow10 + num2;
        placePaste = isEvaluateToZero(N, index_count+2, sum+concat_num)
                   + isEvaluateToZero(N, index_count+2, sum-concat_num);
    }else{
        // in case index_count+2>N
        placePaste = 0;
    }
    return placeAdd+placeMinus+placePaste;
}

int main(){
    int N, sum=1, index_count=1;  // the sequence starts with 1, so the initial sum is 1
    cout<<"Enter N:";
    cin>>N;
    cout<<isEvaluateToZero(N, index_count, sum)<<endl;
    return 0;
}
output:
N=1 output=0
N=2 output=0
N=3 output=1
N=4 output=1
N=7 output=6
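As a hint toward the dynamic-programming version: the recursion above depends only on the state (index_count, sum), so its results can be memoized. Below is a minimal sketch of that idea (the function name countZero and the map-based cache are my additions, not part of the solution above):
#include <iostream>
#include <map>
using namespace std;

// Memoized variant: cache results keyed on (index_count, sum).
// Sums can be negative, so std::map handles the keys directly.
map<pair<int,int>, int> memo;

int countZero(int N, int index_count, int sum){
    if(N==1) return 0;
    if(index_count==N) return sum==0 ? 1 : 0;
    pair<int,int> key(index_count, sum);
    auto it = memo.find(key);
    if(it != memo.end()) return it->second;
    // '+' and '-' branches, as in isEvaluateToZero above
    int total = countZero(N, index_count+1, sum+index_count+1)
              + countZero(N, index_count+1, sum-index_count-1);
    // '#' branch, using the same digit-count paste logic as above
    if(index_count+2<=N){
        int num1 = index_count+1;
        int num2 = index_count+2;
        int pow10 = 1;
        for(int t = num2; t > 0; t /= 10) pow10 *= 10;
        int concat_num = num1*pow10 + num2;
        total += countZero(N, index_count+2, sum+concat_num)
               + countZero(N, index_count+2, sum-concat_num);
    }
    return memo[key] = total;
}

int main(){
    cout << countZero(7, 1, 1) << endl;  // prints 6, matching the output above
    return 0;
}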

Optimizing a primality test based on runtime in Python

I'm pretty new to algorithms and runtimes, and I'm trying to optimise a bit of my code for a personal project.
import math

for num in range(0, 10000000000000000000000):
    if all((num**(num+1)+(num+1)**(num))%i!=0 for i in range(2,int(math.sqrt((num**(num+1)+(num+1)**(num))))+1)):
        print(num)
What can I do to speed this up? I know that num=80 should work, but my code isn't getting past num=0, 1, 2 (it's not fast enough).
First I define my range; then I check whether 'such-and-such' has any divisor from 2 to sqrt(such-and-such) + 1, and if not, print that number. Checking divisors from 2 up to sqrt(n) is sufficient to establish the primality of n.
This is a primality test for sequence A051442.
You would probably get a minor boost from computing (num**(num+1)+(num+1)**(num)) only once per iteration instead of sqrt(num**(num+1)+(num+1)**(num)) times. As you can see, this greatly reduces the constant factor in your complexity. It won't change the fundamental complexity, because you still need to compute each remainder. Change
if all((num**(num+1)+(num+1)**(num))%i!=0 for i in range(2,int(math.sqrt((num**(num+1)+(num+1)**(num))))+1)):
to
k = num**(num+1)+(num+1)**(num)
if all(k%i for i in range(2,int(math.sqrt(k))+1)):
The != 0 is implicit in Python.
Update
All this is just a trivial improvement to an extremely inefficient algorithm. The biggest speedup I can think of is to reduce the check k % i to only prime i. For any composite i = a * b such that k % i == 0, it must be the case that k % a == 0 and k % b == 0 (if k is divisible by i, it must also be divisible by the factors of i).
I am assuming that you don't want to use any kind of pre-computed prime tables in your code. In that case, you can compute the table yourself. This will involve checking all the numbers up to a given sqrt(k) only once ever, instead of once per iteration of num, since we can stash the previously computed primes in say a list. That will effectively increase the lower limit of the range in your current all from 2 to the square root of the previous k.
Let's define a function that extends our set of primes by trial division against the primes found so far (in the spirit of an incremental sieve of Eratosthenes):
from math import sqrt

def extend(primes, from_, to):
    """
    primes: a sequence containing prime numbers from 2 to `from_ - 1`, in order
    from_: the number to start checking with
    to: the number to end with (inclusive)
    """
    if not primes:
        primes.extend([2, 3])
    for k in range(max(from_, 5), to + 1):
        s = int(sqrt(k))  # No need to compute this more than once per k
        for p in primes:
            if p > s:
                # Reached sqrt(k) -> short circuit success
                primes.append(k)
                break
            elif not k % p:
                # Found factor -> short circuit failure
                break
Now we can use this function to extend our list of primes at every iteration of the original loop. This allows us to check the divisibility of k only against the slowly growing list of primes, not against all numbers:
primes = []
prev = 0
for num in range(10000000000000000000000):
    k = num**(num + 1) + (num + 1)**num
    lim = int(sqrt(k)) + 1
    extend(primes, prev, lim)
    #print('Num={}, K={}, checking {}-{}, p={}'.format(num, k, prev, lim, primes), end='... ')
    if k <= 3 and k in primes or all(k % i for i in primes):
        print('{}: {} Prime!'.format(num, k))
    else:
        print('{}: {} Nope'.format(num, k))
    prev = lim + 1
I am not 100% sure that my extend function is optimal, but I am able to get to num == 13, k == 4731091158953433 in <10 minutes on my ridiculously old and slow laptop, so I guess it's not too bad. That means that the algorithm builds a complete table of primes up to ~7e7 in that time.
Update #2
A sort-of-but-not-really optimization you could do would be to check all(k % i for i in primes) before calling extend. This would save you a lot of cycles for numbers that have small prime factors, but would probably catch up to you later on, when you would end up having to compute all the primes up to some enormous number. Here is a sample of how you could do that:
primes = []
prev = 0
for num in range(10000000000000000000000):
    k = num**(num + 1) + (num + 1)**num
    lim = int(sqrt(k)) + 1
    if not all(k % i for i in primes):
        print('{}: {} Nope'.format(num, k))
        continue
    start = len(primes)
    extend(primes, prev, lim)
    if all(k % i for i in primes[start:]):
        print('{}: {} Prime!'.format(num, k))
    else:
        print('{}: {} Nope'.format(num, k))
    prev = lim + 1
While this version does not do much for the long run, it does explain why you were able to get to 15 so quickly in your original run. The prime table does not get extended after num == 3 until num == 16, which is when the terrible delay occurs in this version as well. The net runtime to 16 should be identical in both versions.
Update #3
As @paxdiablo suggests, the only numbers we need to consider are 2, 3, and numbers of the form 6n-1 and 6n+1. We can combine that with the fact that only a small number of primes generally need to be tested, and convert the functionality of extend into a generator that will only compute as many primes as absolutely necessary. Using Python's lazy generation should help. Here is a completely rewritten version:
from itertools import count
from math import ceil, sqrt

prime_table = [2, 3]

def prime_candidates(start=0):
    """
    Infinite generator of prime number candidates starting with the
    specified number.
    Candidates are 2, 3 and all numbers of the form 6n-1 and 6n+1.
    """
    if start <= 3:
        if start <= 2:
            yield 2
        yield 3
        start = 5
        delta = 2
    else:
        m = start % 6
        if m < 2:
            start += 1 - m
            delta = 4
        else:
            start += 5 - m
            delta = 2
    while True:
        yield start
        start += delta
        delta = 6 - delta

def isprime(n):
    """
    Checks if `n` is prime.
    All primes up to sqrt(n) are expected to already be present in
    the generated `prime_table`.
    """
    s = int(ceil(sqrt(n)))
    for p in prime_table:
        if p > s:
            break
        if not n % p:
            return False
    return True

def generate_primes(max):
    """
    Generates primes up to the specified maximum.
    First the existing table is yielded. Then, the new primes are
    found in the sequence generated by `prime_candidates`. All verified
    primes are added to the existing cache.
    """
    for p in prime_table:
        if p > max:
            return
        yield p
    for k in prime_candidates(prime_table[-1] + 1):
        if isprime(k):
            prime_table.append(k)
            if k > max:
                # Putting the return here ensures that we always stop on
                # a prime and therefore don't do any extra work
                return
            else:
                yield k

for num in count():
    k = num**(num + 1) + (num + 1)**num
    lim = int(ceil(sqrt(k)))
    b = all(k % i for i in generate_primes(lim))
    print('n={}, k={} is {}prime'.format(num, k, '' if b else 'not '))
This version gets to 15 almost instantly. It gets stuck at 16 because the smallest prime factor for k=343809097055019694337 is 573645313. Some future expectations:
17 should be a breeze: 16248996011806421522977 has factor 19
18 will take a while: 812362695653248917890473 has factor 22156214713
19 is easy: 42832853457545958193355601 is divisible by 3
20 also easy: 2375370429446951548637196401 is divisible by 58967
21: 138213776357206521921578463913 is divisible by 13
22: 8419259736788826438132968480177 is divisible by 103
etc... (link to sequence)
So in terms of instant gratification, this method will get you much further if you can make it past 18 (which will take >100 times longer than getting past 16, which in my case took ~1.25hrs).
That being said, your greatest speedup at this point would be re-writing this in C or some similar low-level language that does not have as much overhead for loops.
Update #4
Just for giggles, here is an implementation of the latest Python version in C. I chose to go with GMP for arbitrary precision integers, because it is easy to use and install on my Red Hat system, and the docs are very clear:
#include <stdio.h>
#include <stdlib.h>
#include <gmp.h>

typedef struct {
    size_t alloc;
    size_t size;
    mpz_t *numbers;
} PrimeTable;

void init_table(PrimeTable *buf)
{
    buf->alloc = 0x100000L;
    buf->size = 2;
    buf->numbers = malloc(buf->alloc * sizeof(mpz_t));
    if(buf->numbers == NULL) {
        fprintf(stderr, "No memory for prime table\n");
        exit(1);
    }
    mpz_init_set_si(buf->numbers[0], 2);
    mpz_init_set_si(buf->numbers[1], 3);
    return;
}

void append_table(PrimeTable *buf, mpz_t number)
{
    if(buf->size == buf->alloc) {
        size_t new = 2 * buf->alloc;
        mpz_t *tmp = realloc(buf->numbers, new * sizeof(mpz_t));
        if(tmp == NULL) {
            fprintf(stderr, "Ran out of memory for prime table\n");
            exit(1);
        }
        buf->alloc = new;
        buf->numbers = tmp;
    }
    // the new slot is uninitialized, so init-and-set rather than just set
    mpz_init_set(buf->numbers[buf->size], number);
    buf->size++;
    return;
}

size_t print_table(PrimeTable *buf, FILE *file)
{
    size_t i, n;
    n = fprintf(file, "Array contents = [");
    for(i = 0; i < buf->size; i++) {
        n += mpz_out_str(file, 10, buf->numbers[i]);
        if(i < buf->size - 1)
            n += fprintf(file, ", ");
    }
    n += fprintf(file, "]\n");
    return n;
}

void free_table(PrimeTable *buf)
{
    for(buf->size--; ((signed)(buf->size)) >= 0; buf->size--)
        mpz_clear(buf->numbers[buf->size]);
    free(buf->numbers);
    return;
}

int isprime(mpz_t num, PrimeTable *table)
{
    mpz_t max, rem, next;
    size_t i, d, r;

    mpz_inits(max, rem, NULL);
    mpz_sqrtrem(max, rem, num);
    // Check if perfect square: definitely not prime
    if(!mpz_cmp_si(rem, 0)) {
        mpz_clears(rem, max, NULL);
        return 0;
    }
    /* Normal table lookup */
    for(i = 0; i < table->size; i++) {
        // Got to sqrt(n) -> prime
        if(mpz_cmp(max, table->numbers[i]) < 0) {
            mpz_clears(rem, max, NULL);
            return 1;
        }
        // Found a factor -> not prime
        if(mpz_divisible_p(num, table->numbers[i])) {
            mpz_clears(rem, max, NULL);
            return 0;
        }
    }
    /* Extend table and do lookup */
    // Start with last found prime + 2
    mpz_init_set(next, table->numbers[i - 1]);
    mpz_add_ui(next, next, 2);
    // Find nearest number of form 6n-1 or 6n+1
    r = mpz_fdiv_ui(next, 6);
    if(r < 2) {
        mpz_add_ui(next, next, 1 - r);
        d = 4;
    } else {
        mpz_add_ui(next, next, 5 - r);
        d = 2;
    }
    // Step along numbers of form 6n-1/6n+1. Check each candidate for
    // primality. Don't stop until the next prime after sqrt(n) to avoid
    // duplication.
    for(;;) {
        if(isprime(next, table)) {
            append_table(table, next);
            if(mpz_divisible_p(num, next)) {
                mpz_clears(next, rem, max, NULL);
                return 0;
            }
            if(mpz_cmp(max, next) <= 0) {
                mpz_clears(next, rem, max, NULL);
                return 1;
            }
        }
        mpz_add_ui(next, next, d);
        d = 6 - d;
    }
    // Return can only happen from within the loop.
}

int main(int argc, char *argv[])
{
    PrimeTable table;
    mpz_t k, a, b;
    size_t n, next;
    int p;

    init_table(&table);
    mpz_inits(k, a, b, NULL);
    for(n = 0; ; n = next) {
        next = n + 1;
        mpz_set_ui(a, n);
        mpz_pow_ui(a, a, next);
        mpz_set_ui(b, next);
        mpz_pow_ui(b, b, n);
        mpz_add(k, a, b);
        p = isprime(k, &table);
        printf("n=%zu k=", n);
        mpz_out_str(stdout, 10, k);
        printf(" p=%d\n", p);
        //print_table(&table, stdout);
    }
    mpz_clears(b, a, k, NULL);
    free_table(&table);
    return 0;
}
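For reference, a GMP program like this is typically built by linking against libgmp, e.g. (the file name primes.c is my placeholder):
gcc -O2 -o primes primes.c -lgmp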
While this version has the exact same algorithmic complexity as the Python one, I expect it to run a few orders of magnitude faster because of the relatively minimal overhead incurred in C. And indeed, it took about 15 minutes to get stuck at n == 18, which is ~5 times faster than the Python version so far.
Update #5
This is going to be the last one, I promise.
GMP has a function called mpz_nextprime, which offers a potentially much faster implementation of this algorithm, especially with caching. According to the docs:
This function uses a probabilistic algorithm to identify primes. For practical purposes it’s adequate, the chance of a composite passing will be extremely small.
This means that it is probably much faster than the current prime generator I implemented, with a slight cost offset of some false primes being added to the cache. This cost should be minimal: even adding a few thousand extra modulo operations should be fine if the prime generator is faster than it is now.
The only part that needs to be replaced/modified is the portion of isprime below the comment /* Extend table and do lookup */. Basically that whole section just becomes a series of calls to mpz_nextprime instead of recursion.
At that point, you may as well adapt isprime to use mpz_probab_prime_p when possible. You only need to check for sure if the result of mpz_probab_prime_p is uncertain:
int isprime(mpz_t num, PrimeTable *table)
{
    mpz_t max, rem, next;
    size_t i, r;
    int status;

    status = mpz_probab_prime_p(num, 50);
    // Status = 2 -> definite yes, Status = 0 -> definite no
    if(status != 1)
        return status != 0;

    mpz_inits(max, rem, NULL);
    mpz_sqrtrem(max, rem, num);
    // Check if perfect square: definitely not prime
    if(!mpz_cmp_si(rem, 0)) {
        mpz_clears(rem, max, NULL);
        return 0;
    }
    mpz_clear(rem);
    /* Normal table lookup */
    for(i = 0; i < table->size; i++) {
        // Got to sqrt(n) -> prime
        if(mpz_cmp(max, table->numbers[i]) < 0) {
            mpz_clear(max);
            return 1;
        }
        // Found a factor -> not prime
        if(mpz_divisible_p(num, table->numbers[i])) {
            mpz_clear(max);
            return 0;
        }
    }
    /* Extend table and do lookup */
    // Start from the last found prime; mpz_nextprime advances past its
    // argument, so no primes are skipped
    mpz_init_set(next, table->numbers[i - 1]);
    // Step along probable primes
    for(;;) {
        mpz_nextprime(next, next);
        append_table(table, next);
        if(mpz_divisible_p(num, next)) {
            r = 0;
            break;
        }
        if(mpz_cmp(max, next) <= 0) {
            r = 1;
            break;
        }
    }
    mpz_clears(next, max, NULL);
    return r;
}
Sure enough, this version makes it to n == 79 in a couple of seconds at most. It appears to get stuck on n == 80, probably because mpz_probab_prime_p can't determine for sure whether k is prime. I doubt that computing all the primes up to ~10^80 is going to take a trivial amount of time.

Maximum element in array which is equal to product of two elements in array

We need to find the maximum element in an array which is also equal to the product of two elements in the same array. For example, in [2,3,6,8], 6 = 2*3, so the answer is 6.
My approach was to sort the array, followed by a two-pointer method which checked whether the product exists for each element. This is O(n log n) + O(n^2) = O(n^2). Is there a faster way to do this?
There is a slightly better solution, O(n * sqrt(M)), if you are allowed to use O(M) memory, where M = the maximum number among the A[i].
Use an array of size M to mark every number while you traverse them from the smaller to the bigger number.
For each number, try all of its factor pairs and see whether both were already marked in the array map.
Here is a pseudo code for that:
#define M 1000000

int array_map[M+2];
int ans = -1;
sort(A, A+n);
for(int i = 0; i < n; i++) {
    for(int j = 1; j*j <= A[i]; j++) {
        int num1 = j;
        if(A[i] % num1 == 0) {
            int num2 = A[i] / num1;
            if(array_map[num1] && array_map[num2]) {
                if(num1 == num2) {
                    // the same factor must appear twice in the array
                    if(array_map[num1] >= 2) ans = A[i];
                } else {
                    ans = A[i];
                }
            }
        }
    }
    array_map[A[i]]++;
}
There is an even better approach if you can find the factors of a number quickly: with a smallest-prime-factor sieve you can recover the prime factorization of each A[i] in O(log M) divisions, which brings this down to about O(n * log M) plus the cost of enumerating divisor pairs. You have to use a sieve and backtracking for that; a sketch of the sieve part follows.
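As an illustration of that idea (my own sketch, not part of the original answer), here is a smallest-prime-factor sieve: spf[x] holds the smallest prime dividing x, so repeated division by spf[x] factorizes x in O(log x) steps.
#include <vector>

// Build the smallest-prime-factor table for 2..M.
std::vector<int> build_spf(int M) {
    std::vector<int> spf(M + 1, 0);
    for (int i = 2; i <= M; ++i) {
        if (spf[i] == 0) {                    // i is prime
            for (long long j = i; j <= M; j += i)
                if (spf[j] == 0) spf[j] = i;  // keep only the smallest factor
        }
    }
    return spf;
}

// Read off the prime factorization, e.g. 84 -> {2, 2, 3, 7}.
std::vector<int> factorize(int x, const std::vector<int> &spf) {
    std::vector<int> factors;
    while (x > 1) {
        factors.push_back(spf[x]);
        x /= spf[x];
    }
    return factors;
}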
@JerryGoyal's solution is correct. However, I think it can be optimized even further if, instead of using the B pointer, we use binary search to find the other factor of the product whenever arr[c] is divisible by arr[a]. Here's the modification to his code:
for(c=n-1; (c>1) && (max==-1); c--){       // loop through C
    for(a=0; (a<c-1) && (max==-1); a++){   // loop through A
        if(arr[c] % arr[a] == 0)           // if arr[c] is divisible by arr[a]
        {
            // search indices a+1 .. c-1 for the other factor x such that arr[c] = arr[a] * x
            if(Arrays.binarySearch(arr, a+1, c, arr[c]/arr[a]) >= 0)  // java.util.Arrays
            {
                max = arr[c];
                break;
            }
        }
    }
}
I would have commented this on his solution; unfortunately, I lack the reputation to do so.
Try this one, written in C++:
#include <vector>
#include <algorithm>
using namespace std;

int MaxElement(vector< int > Input)
{
    sort(Input.begin(), Input.end());
    int LargestElementOfInput = 0;
    int i = 0;
    while (i < (int)Input.size() - 1)
    {
        if (LargestElementOfInput == Input[Input.size() - (i + 1)])
        {
            i++;
            continue;
        }
        else
        {
            if (Input[i] != 0)
            {
                LargestElementOfInput = Input[Input.size() - (i + 1)];
                int AllowedValue = LargestElementOfInput / Input[i];
                int j = 0;
                while (j < (int)Input.size())
                {
                    if (Input[j] > AllowedValue)
                        break;
                    else if (j == i)
                    {
                        j++;
                        continue;
                    }
                    else
                    {
                        int Product = Input[i] * Input[j++];
                        if (Product == LargestElementOfInput)
                            return Product;
                    }
                }
            }
            i++;
        }
    }
    return -1;
}
Once you have sorted the array, you can use it to your advantage as below.
One improvement I can see, since you want to find the max element that meets the criteria (a sketch follows this list):
Start from the rightmost element of the array (8).
Divide it by the first element of the array (8/2 = 4).
Now continue with the double-pointer approach till the element at the second pointer is less than the value from step 2 above, or the match is found (i.e., till the second pointer's value is < 4 or the match is found).
If the match is found, then you have got the max element.
Else, continue the loop with the next highest element from the array (6).
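A rough sketch of that bounded scan (my own illustration; it assumes positive integers, and the early bound from step 2 falls out of the product comparison):
#include <vector>
#include <algorithm>

// For each candidate (scanned from the largest), run two pointers over the
// smaller elements; comparing the running product against the candidate
// stops the scan early, which has the same effect as the division bound.
int maxPairProduct(std::vector<int> v) {
    std::sort(v.begin(), v.end());
    for (int c = (int)v.size() - 1; c >= 2; --c) {  // candidate v[c]
        int lo = 0, hi = c - 1;
        while (lo < hi) {
            long long p = 1LL * v[lo] * v[hi];
            if (p == v[c]) return v[c];             // found a factor pair
            if (p < v[c]) ++lo; else --hi;          // move toward the target
        }
    }
    return -1;                                      // no such element
}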
Efficient solution:
2 3 8 6
Sort the array.
Keep 3 pointers C, B and A.
Keep C at the last index, A at index 0, and B at index 1.
Traverse the array using pointers A and B till C, and check whether A*B = C exists or not.
If it exists, then C is your answer.
Else, move C one position back and traverse again, keeping A at index 0 and B at index 1.
Keep repeating this till you find the product or C reaches index 1.
Here's the complete solution:
int arr[] = new int[]{2, 3, 8, 6};
Arrays.sort(arr);
int n = arr.length;
int a, b, c, prod, max = -1;
for(c=n-1; (c>1) && (max==-1); c--){       // loop through C
    for(a=0; (a<c-1) && (max==-1); a++){   // loop through A
        for(b=a+1; b<c; b++){              // loop through B
            prod = arr[a]*arr[b];
            if(prod == arr[c]){
                System.out.println("A: " + arr[a] + " B: " + arr[b]);
                max = arr[c];
                break;
            }
            if(prod > arr[c]){             // no need to go further
                break;
            }
        }
    }
}
System.out.println(max);
I came up with the solution below, where I am using one array list and following one formula (a C++ sketch of the idea follows these steps):
divisor (a or b) x quotient (b or a) = dividend (c)
Sort the array.
Put the array into a collection Col (e.g. one which has faster lookup and maintains insertion order).
Have 2 pointers a, c.
Keep c at the last index, and a at 0.
Try to follow (divisor (a or b) x quotient (b or a) = dividend (c)).
Check if a is a divisor of c; if yes, then check for the corresponding b in Col.
If a is a divisor and the collection has b, then c is the answer.
Else increase a by 1, and follow steps 5 and 6 till c-1.
If the max is not found, then decrease the c index, and follow steps 4 and 5.
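A sketch of that lookup-based approach in C++ (my own illustration; std::unordered_multiset stands in for the "collection with faster lookup", and a factor is allowed to coincide with the candidate itself unless the problem forbids it):
#include <vector>
#include <algorithm>
#include <unordered_set>

int maxByLookup(std::vector<int> v) {
    std::sort(v.begin(), v.end());
    std::unordered_multiset<int> seen(v.begin(), v.end());  // fast membership lookup
    for (int c = (int)v.size() - 1; c >= 2; --c) {          // candidate v[c], from the right
        for (int a = 0; a < c; ++a) {
            if (v[a] != 0 && v[c] % v[a] == 0 && seen.count(v[c] / v[a]))
                return v[c];                                // v[c] = v[a] * (v[c]/v[a])
        }
    }
    return -1;
}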
Check this C# solution:
-Loop through each element,
-loop through and multiply each element with the other elements,
-verify whether the product exists in the array and is the max
private static int GetGreatest(int[] input)
{
    int max = 0;
    int p = 0; // product of pairs
    // loop through the input array
    for (int i = 0; i < input.Length; i++)
    {
        for (int j = i + 1; j < input.Length; j++)
        {
            p = input[i] * input[j];
            if (p > max && Array.IndexOf(input, p) != -1)
            {
                max = p;
            }
        }
    }
    return max;
}
Time complexity: O(n^2) pairs, though each Array.IndexOf lookup is itself O(n).

Generate string permutations recursively; each character appears n times

I'm trying to write an algorithm that will generate all strings of length nm, with exactly n of each of the numbers 1, 2, ..., m.
For instance, all strings of length 6 with exactly two 1's, two 2's and two 3's, e.g. 112233, 121233, ...
I managed to do this with just 1's and 2's using a recursive method, but can't seem to get something that works when I introduce 3's.
When m = 2, the algorithm I have is:
generateAllStrings(int len, int K, String str)
{
    if(len == 0)
    {
        output(str);
    }
    if(K > 0)
    {
        generateAllStrings(len - 1, K - 1, str + '2');
    }
    if(len > K)
    {
        generateAllStrings(len - 1, K, str + '1');
    }
}
I've tried inserting similar conditions for the third number, but the algorithm doesn't give a correct output. After that, I wouldn't even know how to generalise to 4 numbers and above.
Is recursion the right thing to do? Any help would be appreciated.
One option would be to list off all distinct permutations of the string 111...1222...2...mmm...m. There are nice algorithms for enumerating all distinct permutations of a string in time proportional to the length of the string per permutation, and they'd probably be a good way to go about solving this problem (see the sketch below).
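A minimal sketch of that option using std::next_permutation (my own illustration; the values n=2, m=3 are just the example from the question): starting from the sorted string, std::next_permutation visits each distinct arrangement exactly once.
#include <algorithm>
#include <iostream>
#include <string>

int main() {
    int n = 2, m = 3;                                     // two of each of 1..3
    std::string s;
    for (char d = '1'; d < '1' + m; ++d) s.append(n, d);  // "112233", already sorted
    do {
        std::cout << s << '\n';                           // one distinct permutation per line
    } while (std::next_permutation(s.begin(), s.end()));
}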
To use a simple recursive algorithm, give each recursion the permutation so far (variable perm), and the number of occurrences of each digit that is still available (array count).
Run the code snippet to generate all unique permutations for n=2 and m=4 (set: 11223344).
function permutations(n, m) {
    var perm = "", count = [];                   // start with empty permutation
    for (var i = 0; i < m; i++) count[i] = n;    // set available number for each digit = n
    permute(perm, count);                        // start recursion with "" and [n,n,n...]

    function permute(perm, count) {
        var done = true;
        for (var i = 0; i < count.length; i++) { // iterate over all digits
            if (count[i] > 0) {                  // more instances of digit i available
                var c = count.slice();           // create hard copy of count array
                --c[i];                          // decrement count of digit i
                permute(perm + (i + 1), c);      // add digit to permutation and recurse
                done = false;                    // digits left over: not the last step
            }
        }
        if (done) document.write(perm + "<BR>"); // no digits left: complete permutation
    }
}
permutations(2, 4);
You can easily do this using DFS (or BFS, alternatively). We can define a graph such that each node holds one string, and a node is connected to any node that holds a string with one pair of positions swapped relative to it. This graph is connected, thus we can easily generate a set of all nodes, which will contain all the strings that are searched for:
set generated_strings
list nodes

nodes.add(generateInitialString(N, M))
generated_strings.add(generateInitialString(N, M))

while(!nodes.empty())
    string tmp = nodes.remove(0)
    for (int i in [0, N * M))
        for (int j in distinct([0, N * M), i))
            string new = swap(tmp, i, j)
            if (!generated_strings.contains(new))
                nodes.add(new)
                generated_strings.add(new)

// generated_strings now contains all strings that can possibly be generated

Thrust vectorized search: Efficiently combine lower_bound and binary_search to find both position and existence

I'm trying to use Thrust to detect whether each element of an array can be found in another array, and where (both arrays are sorted). I came across the vectorized search routines (lower_bound and binary_search).
lower_bound will return, for each value, the index where it could be inserted in a list while respecting its ordering.
I also need to know if the value is found or not (which can be done with binary_search), not just its position.
Is it possible to achieve both efficiently without making two searches (calling binary_search and then lower_bound)?
I know that in the scalar case, lower_bound will return a pointer to the end of the array if a value cannot be found, but this does not happen in the vectorized version.
Thanks!
You can check that the element that lower_bound returns is the same as the one you searched for. E.g. given a = {1,3,5} and searching for b = {1,4}, the result will be c = {0,2}. We have a[c[0]] == b[0], so b[0] is in a; but a[c[1]] != b[1], so b[1] is not in a.
(Note that you will need to ensure that you don't make any out-of-bounds memory accesses, since lower_bound can return an index that is beyond the end of the array.)
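A minimal sketch of that check (my own illustration, not from the answer above; the functor name matches_query is made up): one vectorized lower_bound, then a gather-and-compare per query, with the index guarded so an end-of-array result is never dereferenced.
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct matches_query {
    const float *a;  // raw pointer to the sorted array
    int n;           // its length
    matches_query(const float *a_, int n_) : a(a_), n(n_) {}
    __host__ __device__ bool operator()(int idx, float v) const {
        return idx < n && a[idx] == v;  // idx == n means "past the end"
    }
};

void search_with_flags(const thrust::device_vector<float> &a,  // sorted haystack
                       const thrust::device_vector<float> &b,  // queries
                       thrust::device_vector<int> &pos,        // insertion positions
                       thrust::device_vector<bool> &found)     // existence flags
{
    pos.resize(b.size());
    found.resize(b.size());
    // one vectorized search for all query elements
    thrust::lower_bound(a.begin(), a.end(), b.begin(), b.end(), pos.begin());
    // compare the element at each returned position with the query value
    thrust::transform(pos.begin(), pos.end(), b.begin(), found.begin(),
                      matches_query(thrust::raw_pointer_cast(a.data()), (int)a.size()));
}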
@tat0: you can also play around with ArrayFire. A vectorized search using lower_bound() does not give you the answer immediately, while with setintersect() in ArrayFire you get the "intersection" of the two arrays directly:
float A_host[] = {3,22,4,5,2,9,234,11,6,17,7,873,23,45,454};
int szA = sizeof(A_host) / sizeof(float);
float B_host[] = {345,5,55,6,7,8,19,2,63};
int szB = sizeof(B_host) / sizeof(float);

// initialize arrays from host data
array A(szA, 1, A_host);
array B(szB, 1, B_host);

array U = setintersect(A, B); // compute intersection of the 2 arrays
int n_common = U.elements();
std::cout << "common: ";
print(U);
the output is:
common: U = 2.0000
5.0000
6.0000
7.0000
to get the actual locations of these elements in array A, you can use the following construct (provided that the elements in A are unique):
int n_common = U.elements();
array loc = zeros(n_common); // empty array
gfor(array i, n_common)      // parallel for loop
    loc(i) = sum((A == U(i)) * seq(szA));
print(loc);
then: loc =
4.0000
3.0000
8.0000
10.0000
Furthermore, thrust::lower_bound() seems to be slower than setintersect(); I benchmarked it with the following program:
int *g_data = 0;
int g_N = 0;

void thrust_test() {
    thrust::device_ptr<int> A = thrust::device_pointer_cast((int *)g_data),
                            B = thrust::device_pointer_cast((int *)g_data + g_N);
    thrust::device_vector<int> output(g_N);
    thrust::lower_bound(A, A + g_N, B, B + g_N,
                        output.begin(),
                        thrust::less<int>());
    std::cout << "thrust: " << output.size() << "\n";
}

void af_test()
{
    array A(g_N, 1, g_data, afDevicePointer);
    array B(g_N, 1, g_data + g_N, afDevicePointer);
    array U = setintersect(A, B);
    std::cout << "intersection sz: " << U.elements() << "\n";
}

int main()
{
    g_N = 3e6; // 3M entries
    thrust::host_vector< int > input(g_N*2);
    for(int i = 0; i < g_N*2; i++) { // generate some input
        if(i & 1)
            input[i] = (i*i) % 1131;
        else
            input[i] = (i*i*i-1) % 1223;
    }
    thrust::device_vector< int > dev_input = input;
    // sort the vector A
    thrust::sort(dev_input.begin(), dev_input.begin() + g_N);
    // sort the vector B
    thrust::sort(dev_input.begin() + g_N, dev_input.begin() + g_N*2);
    g_data = thrust::raw_pointer_cast(dev_input.data());
    try {
        info();
        printf("thrust: %.5f seconds\n", timeit(thrust_test));
        printf("af: %.5f seconds\n", timeit(af_test));
    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
    }
    return 0;
}
and the results:
CUDA toolkit 4.2, driver 295.59
GPU0 GeForce GT 650M, 2048 MB, Compute 3.0 (single,double)
Memory Usage: 1937 MB free (2048 MB total)
thrust: 0.13008 seconds
arrayfire: 0.06702 seconds
