RcppAramadillo Cube::operator() : index out of bounds - rcpp

I have been fiddling with the following C++ code for integration with R code that I have written (too much to include here), but keep getting an error that the Cube::operator() index is out of bounds and I am unsure as to why this is occurring. My suspicion is that the 3D array is not being filled correctly as described in
making 3d array with arma::cube in Rcpp shows cube error
but I am uncertain how to properly solve the issue.
Below is my full C++ code:
// [[Rcpp::depends(RcppArmadillo)]]
#define ARMA_DONT_PRINT_OPENMP_WARNING
#include <RcppArmadillo.h>
#include <RcppArmadilloExtensions/sample.h>
#include <set>
using namespace Rcpp;
int sample_one(int n) {
return n * unif_rand();
}
int sample_n_distinct(const IntegerVector& x,
int k,
const int * pop_ptr) {
IntegerVector ind_index = RcppArmadillo::sample(x, k, false);
std::set<int> distinct_container;
for (int i = 0; i < k; i++) {
distinct_container.insert(pop_ptr[ind_index[i]]);
}
return distinct_container.size();
}
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int num_specs = specs.size();
arma::Cube<int> res(perms, num_specs, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < num_specs; k++) {
for (j = 0; j < perms; j++) {
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}
Does someone have an idea as to what may be producing the said error?
Below is the R code with a call to the C++ function (including a commented-out triply-nested 'for' loop that the C++ code reproduces).
## Set up container(s) to hold the identity of each individual from each permutation ##
num.specs <- ceiling(N / K)
## Create an ID for each haplotype ##
haps <- 1:Hstar
## Assign individuals (N) to each subpopulation (K) ##
specs <- 1:num.specs
## Generate permutations, assume each permutation has N individuals, and sample those individuals' haplotypes from the probabilities ##
gen.perms <- function() {
sample(haps, size = num.specs, replace = TRUE, prob = probs)
}
pop <- array(dim = c(perms, num.specs, K))
for (i in 1:K) {
pop[,, i] <- replicate(perms, gen.perms())
}
## Make a matrix to hold individuals from each permutation ##
# HAC.mat <- array(dim = c(perms, num.specs, K))
## Perform haplotype accumulation ##
# for (k in specs) {
# for (j in 1:perms) {
# for (i in 1:K) {
# select.perm <- sample(1:nrow(pop), size = 1, replace = TRUE) # randomly sample a permutation
# ind.index <- sample(specs, size = k, replace = FALSE) # randomly sample individuals
# select.subpop <- sample(i, size = 1, replace = TRUE) # randomly sample a subpopulation
# hap.plot <- pop[select.perm, ind.index, select.subpop] # extract data
# HAC.mat[j, k, i] <- length(unique(hap.plot)) # how many haplotypes are recovered
# }
# }
# }
HAC.mat <- fillCube(pop, specs, perms, K)

This is an out-of-bounds error. The gist of problem is the call
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
since
sample_one(perms)
is being placed as an access index where the max length is num_specs. This is seen by how res is defined:
arma::Cube<int> res(perms, num_specs, K);
Thus, moving out perms out of num_specs place should resolve the issue.
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int num_specs = specs.size();
arma::Cube<int> res(perms, num_specs, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < num_specs; k++) {
for (j = 0; j < perms; j++) {
// swapped location
pop_ptr = &(pop(sample_one(perms), 0, sample_one(K)));
// should the middle index be 0?
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}

Related

Rcpp nested loops over lists of vectors of different types: Efficient template solution?

Another interesting Rcpp template programming problem: I want to compute the pairwise complete observations of all columns in an R data.frame or matrix. For matrices the code is easy thanks to RCPP_RETURN macros:
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
using namespace Rcpp;
template <int RTYPE>
IntegerMatrix pwNobsmCppImpl(const Matrix<RTYPE>& x) {
int l = x.nrow(), col = x.ncol();
auto isnnanT = (RTYPE == REALSXP) ? [](typename Rcpp::traits::storage_type<RTYPE>::type x) { return x == x; } :
[](typename Rcpp::traits::storage_type<RTYPE>::type x) { return x != Vector<RTYPE>::get_na(); };
IntegerMatrix out = no_init_matrix(col, col);
for(int j = 0; j != col; ++j) {
ConstMatrixColumn<RTYPE> colj = x( _ , j);
int nj = std::count_if(colj.begin(), colj.end(), isnnanT);
out(j, j) = nj;
for(int k = j+1; k != col; ++k) {
ConstMatrixColumn<RTYPE> colk = x( _ , k);
int njk = 0;
for(int i = l; i--; ) if(isnnanT(colj[i]) && isnnanT(colk[i])) ++njk; // fastest? or save logical vector with colj Non-missing?
out(j, k) = out(k, j) = njk;
}
}
out.attr("dimnames") = List::create(colnames(x), colnames(x));
return out;
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<CPLXSXP>& x) {
stop("Not supported SEXP type!");
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<VECSXP>& x) {
stop("Not supported SEXP type!");
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<RAWSXP>& x) {
stop("Not supported SEXP type!");
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<EXPRSXP>& x) {
stop("Not supported SEXP type!");
}
// [[Rcpp::export]]
IntegerMatrix pwNobsmCpp(SEXP x){
RCPP_RETURN_MATRIX(pwNobsmCppImpl, x);
}
Now the tricky but comes for data.frames / lists, whose colums can be of different types. It seems to me the native Rcpp solution is a switch(TYPEOF(x[j]), ...) statement that handles differently typed columns of a list on a case-by-case basis. Hoever in this case this would require 2 nested switches, leading to an enormous amount of code duplication. I am wondering if this can be avoided through some smart template programming. The basic code for Lists is:
// [[Rcpp::export]]
IntegerMatrix pwNobslCpp(const List& x) {
int l = x.size();
IntegerMatrix out = no_init_matrix(l, l);
for(int j = 0; j != l; ++j) {
int RTYPEj = TYPEOF(x[j]); // The code fails because the SEXPTYPE of x[j] is unknown beforehand!
auto isnnanTj = (RTYPEj == REALSXP) ? [](typename Rcpp::traits::storage_type<RTYPEj>::type x) { return x == x; } :
[](typename Rcpp::traits::storage_type<RTYPEj>::type x) { return x != Vector<RTYPEj>::get_na(); };
Vector<RTYPEj> colj = x[j];
int nj = std::count_if(colj.begin(), colj.end(), isnnanTj);
int rowj = colj.size();
out(j, j) = nj;
for(int k = j+1; k != l; ++k) {
int RTYPEk = TYPEOF(x[k]); // The code fails because the SEXPTYPE of x[k] is unknown beforehand!
auto isnnanTk = (RTYPEk == REALSXP) ? [](typename Rcpp::traits::storage_type<RTYPEk>::type x) { return x == x; } :
[](typename Rcpp::traits::storage_type<RTYPEk>::type x) { return x != Vector<RTYPEk>::get_na(); };
Vector<RTYPEk> colk = x[k];
if(colk.size() != rowj) stop("All columns need to have the same length!");
int njk = 0;
for(int i = rowj; i--; ) if(isnnanTj(colj[i]) && isnnanTk(colk[i])) ++njk; // fastest? or save logical vector with
j Non-missing?
out(j, k) = out(k, j) = njk;
}
}
out.attr("dimnames") = List::create(names(x), names(x));
return out;
}
This code of course fails because the Rcpp templates contained in it are not activated with known values. But templating the whole code does not make sense either because of the pairwise nature of the computation. Is there any way one can do this without the feared nested switch?

subset NumericMatrix by row and column names in Rcpp

I am trying to create a function in Rcpp that will take as input a pairwise numeric matrix, as well as a list of vectors, each element being a subset of row/column names. I would like this function identify the subset of the matrix that matches those names, and return the mean of the values.
Below I generated some dummy data that resembles the sort of data I have, and follow with an attempt of a Rcpp function.
library(Rcpp)
dat <- c(spA = 4, spB = 10, spC = 8, spD = 1, spE = 5, spF = 9)
pdist <- as.matrix(dist(dat))
pdist[upper.tri(pdist, diag = TRUE)] <- NA
Here I have a list made up of character vectors of various subsets of the row/column names in pdist
subsetList <- replicate(10, sample(names(dat), 4), simplify=FALSE)
For each of these sets of names, I would like to identify the subset of the pairwise matrix and take the mean of the values
Here is what I have so far, which does not work, but I think it illustrates where I am trying to get.
cppFunction('
List meanDistByCell(List input, NumericMatrix pairmat) {
int n = input.size();
List out(n);
List dimnames = pairmat.attr( "dimnames" );
CharacterVector colnames = dimnames[1];
for (int i = 0; i < n; i++) {
CharacterVector sp = as< CharacterVector >(input[i]);
if (sp.size() > 0) {
out[i] = double(mean(pairmat(sp, sp)));
} else {
out[i] = NA_REAL;
}
}
return out;
}
')
Any help would be greatly appreciated! Thanks!
Although (contiguous) range-based subsetting is available (e.g. x(Range(first_row, last_row), Range(first_col, last_col))), as coatless pointed out, subsetting by CharacterVector is not currently supported, so you will have to roll your own for the time being. A general-ish approach might look something like this:
template <int RTYPE> inline Matrix<RTYPE>
Subset2D(const Matrix<RTYPE>& x, CharacterVector crows, CharacterVector ccols) {
R_xlen_t i = 0, j = 0, rr = crows.length(), rc = ccols.length(), pos;
Matrix<RTYPE> res(rr, rc);
CharacterVector xrows = rownames(x), xcols = colnames(x);
IntegerVector rows = match(crows, xrows), cols = match(ccols, xcols);
for (; j < rc; j++) {
// NB: match returns 1-based indices
pos = cols[j] - 1;
for (i = 0; i < rr; i++) {
res(i, j) = x(rows[i] - 1, pos);
}
}
rownames(res) = crows;
colnames(res) = ccols;
return res;
}
// [[Rcpp::export]]
NumericMatrix subset2d(NumericMatrix x, CharacterVector rows, CharacterVector cols) {
return Subset2D(x, rows, cols);
}
This assumes that the input matrix has both row and column names, and that the row and column lookup vectors are valid subsets of those dimnames; additional defensive code could be added to make this more robust. To demonstrate,
subset2d(pdist, subsetList[[1]], subsetList[[1]])
# spB spD spE spC
# spB NA NA NA NA
# spD 9 NA NA 7
# spE 5 4 NA 3
# spC 2 NA NA NA
pdist[subsetList[[1]], subsetList[[1]]]
# spB spD spE spC
# spB NA NA NA NA
# spD 9 NA NA 7
# spE 5 4 NA 3
# spC 2 NA NA NA
Subset2D takes care of most of the boilerplate involved in implementing meanDistByCell; all that remains is to loop over the input list, apply this to each list element, and store the mean of the result in the output list:
// [[Rcpp::export]]
List meanDistByCell(List keys, NumericMatrix x, bool na_rm = false) {
R_xlen_t i = 0, sz = keys.size();
List res(sz);
if (!na_rm) {
for (; i < sz; i++) {
res[i] = NumericVector::create(
mean(Subset2D(x, keys[i], keys[i]))
);
}
} else {
for (; i < sz; i++) {
res[i] = NumericVector::create(
mean(na_omit(Subset2D(x, keys[i], keys[i])))
);
}
}
return res;
}
all.equal(
lapply(subsetList, function(x) mean(pdist[x, x], na.rm = TRUE)),
meanDistByCell2(subsetList, pdist, TRUE)
)
# [1] TRUE
Although the use of Subset2D allows for a much cleaner implementation of meanDistByCell, in this situation it is inefficient for at least a couple of reasons:
It sets the dimnames of the return object (rownames(res) = crows;, colnames(res) = ccols;), which you have no need for here.
It makes a call to match to obtain indices for each of rownames and colnames, which is unnecessary since you know in advance that rownames(x) == colnames(x).
You will incur the cost of both of these points k times, for an input list with length k.
A more efficient -- but consequently less concise -- approach would be to essentially implement only the aspects of Subset2D that are needed, inline inside of meanDistByCell:
// [[Rcpp::export]]
List meanDistByCell2(List keys, NumericMatrix x, bool na_rm = false) {
R_xlen_t k = 0, sz = keys.size(), i = 0, j = 0, nidx, pos;
List res(sz);
CharacterVector cx = colnames(x);
if (!na_rm) {
for (; k < sz; k++) {
// NB: match returns 1-based indices
IntegerVector idx = match(as<CharacterVector>(keys[k]), cx) - 1;
nidx = idx.size();
NumericVector tmp(nidx * nidx);
for (j = 0; j < nidx; j++) {
pos = idx[j];
for (i = 0; i < nidx; i++) {
tmp[nidx * j + i] = x(idx[i], pos);
}
}
res[k] = NumericVector::create(mean(tmp));
}
} else {
for (; k < sz; k++) {
IntegerVector idx = match(as<CharacterVector>(keys[k]), cx) - 1;
nidx = idx.size();
NumericVector tmp(nidx * nidx);
for (j = 0; j < nidx; j++) {
pos = idx[j];
for (i = 0; i < nidx; i++) {
tmp[nidx * j + i] = x(idx[i], pos);
}
}
res[k] = NumericVector::create(mean(na_omit(tmp)));
}
}
return res;
}
all.equal(
meanDistByCell(subsetList, pdist, TRUE),
meanDistByCell2(subsetList, pdist, TRUE)
)
# [1] TRUE

Errors with repeated FFTW calls

I'm having a strange issue that I can't resolve. I made this as a simple example that demonstrates the problem. I have a sine wave defined between [0, 2*pi]. I take the Fourier transform using FFTW. Then I have a for loop where I repeatedly take the inverse Fourier transform. In each iteration, I take the average of my solution and print the results. I expect that the average stays the same with each iteration because there is no change to solution, y. However, when I pick N = 256 and other even values of N, I note that the average grows as if there are numerical errors. However, if I choose, say, N = 255 or N = 257, this is not the case and I get what is expect (avg = 0.0 for each iteration).
Code:
#include <stdio.h>
#include <stdlib.h>
#include <fftw3.h>
#include <math.h>
int main(void)
{
int N = 256;
double dx = 2.0 * M_PI / (double)N, dt = 1.0e-3;
double *x, *y;
x = (double *) malloc (sizeof (double) * N);
y = (double *) malloc (sizeof (double) * N);
// initial conditions
for (int i = 0; i < N; i++) {
x[i] = (double)i * dx;
y[i] = sin(x[i]);
}
fftw_complex yhat[N/2 + 1];
fftw_plan fftwplan, fftwplan2;
// forward plan
fftwplan = fftw_plan_dft_r2c_1d(N, y, yhat, FFTW_ESTIMATE);
fftw_execute(fftwplan);
// set N/2th mode to zero if N is even
if (N % 2 < 1.0e-13) {
yhat[N/2][0] = 0.0;
yhat[N/2][1] = 0.0;
}
// backward plan
fftwplan2 = fftw_plan_dft_c2r_1d(N, yhat, y, FFTW_ESTIMATE);
for (int i = 0; i < 50; i++) {
// yhat to y
fftw_execute(fftwplan2);
// rescale
for (int j = 0; j < N; j++) {
y[j] = y[j] / (double)N;
}
double avg = 0.0;
for (int j = 0; j < N; j++) {
avg += y[j];
}
printf("%.15f\n", avg/N);
}
fftw_destroy_plan(fftwplan);
fftw_destroy_plan(fftwplan2);
void fftw_cleanup(void);
free(x);
free(y);
return 0;
}
Output for N = 256:
0.000000000000000
0.000000000000000
0.000000000000000
-0.000000000000000
0.000000000000000
0.000000000000022
-0.000000000000007
-0.000000000000039
0.000000000000161
-0.000000000000314
0.000000000000369
0.000000000004775
-0.000000000007390
-0.000000000079126
-0.000000000009457
-0.000000000462023
0.000000000900855
-0.000000000196451
0.000000000931323
-0.000000009895302
0.000000039348379
0.000000133179128
0.000000260770321
-0.000003233551979
0.000008285045624
-0.000016331672668
0.000067450106144
-0.000166893005371
0.001059055328369
-0.002521514892578
0.005493164062500
-0.029907226562500
0.093383789062500
-0.339111328125000
1.208251953125000
-3.937500000000000
13.654296875000000
-43.812500000000000
161.109375000000000
-479.250000000000000
1785.500000000000000
-5369.000000000000000
19376.000000000000000
-66372.000000000000000
221104.000000000000000
-753792.000000000000000
2387712.000000000000000
-8603776.000000000000000
29706240.000000000000000
-96833536.000000000000000
Any ideas?
libfftw has the odious habit of modifying its inputs. Back up yhat if you want to do repeated inverse transforms.
OTOH, it's perverse, but why are you repeating the same operation if you don't expect it give different results? (Despite this being the case)
As indicated in comments: "if you want to keep the input data unchanged, use the FFTW_PRESERVE_INPUT flag. Per http://www.fftw.org/doc/Planner-Flags.html"
For example:
// backward plan
fftwplan2 = fftw_plan_dft_c2r_1d(N, yhat, y, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT);

CodeJam 2014: How to solve task "New Lottery Game"?

I want to know efficient approach for the New Lottery Game problem.
The Lottery is changing! The Lottery used to have a machine to generate a random winning number. But due to cheating problems, the Lottery has decided to add another machine. The new winning number will be the result of the bitwise-AND operation between the two random numbers generated by the two machines.
To find the bitwise-AND of X and Y, write them both in binary; then a bit in the result in binary has a 1 if the corresponding bits of X and Y were both 1, and a 0 otherwise. In most programming languages, the bitwise-AND of X and Y is written X&Y.
For example:
The old machine generates the number 7 = 0111.
The new machine generates the number 11 = 1011.
The winning number will be (7 AND 11) = (0111 AND 1011) = 0011 = 3.
With this measure, the Lottery expects to reduce the cases of fraudulent claims, but unfortunately an employee from the Lottery company has leaked the following information: the old machine will always generate a non-negative integer less than A and the new one will always generate a non-negative integer less than B.
Catalina wants to win this lottery and to give it a try she decided to buy all non-negative integers less than K.
Given A, B and K, Catalina would like to know in how many different ways the machines can generate a pair of numbers that will make her a winner.
For small input we can check all possible pairs but how to do it with large inputs. I guess we represent the binary number into string first and then check permutations which would give answer less than K. But I can't seem to figure out how to calculate possible permutations of 2 binary strings.
I used a general DP technique that I described in a lot of detail in another answer.
We want to count the pairs (a, b) such that a < A, b < B and a & b < K.
The first step is to convert the numbers to binary and to pad them to the same size by adding leading zeroes. I just padded them to a fixed size of 40. The idea is to build up the valid a and b bit by bit.
Let f(i, loA, loB, loK) be the number of valid suffix pairs of a and b of size 40 - i. If loA is true, it means that the prefix up to i is already strictly smaller than the corresponding prefix of A. In that case there is no restriction on the next possible bit for a. If loA ist false, A[i] is an upper bound on the next bit we can place at the end of the current prefix. loB and loK have an analogous meaning.
Now we have the following transition:
long long f(int i, bool loA, bool loB, bool loK) {
// TODO add memoization
if (i == 40)
return loA && loB && loK;
int hiA = loA ? 1: A[i]-'0'; // upper bound on the next bit in a
int hiB = loB ? 1: B[i]-'0'; // upper bound on the next bit in b
int hiK = loK ? 1: K[i]-'0'; // upper bound on the next bit in a & b
long long res = 0;
for (int a = 0; a <= hiA; ++a)
for (int b = 0; b <= hiB; ++b) {
int k = a & b;
if (k > hiK) continue;
res += f(i+1, loA || a < A[i]-'0',
loB || b < B[i]-'0',
loK || k < K[i]-'0');
}
return res;
}
The result is f(0, false, false, false).
The runtime is O(max(log A, log B)) if memoization is added to ensure that every subproblem is only solved once.
What I did was just to identify when the answer is A * B.
Otherwise, just brute force the rest, this code passed the large input.
// for each test cases
long count = 0;
if ((K > A) || (K > B)) {
count = A * B;
continue; // print count and go to the next test case
}
count = A * B - (A-K) * (B-K);
for (int i = K; i < A; i++) {
for (int j = K; j < B; j++) {
if ((i&j) < K) count++;
}
}
I hope this helps!
just as Niklas B. said.
the whole answer is.
#include <algorithm>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
#define MAX_SIZE 32
int A, B, K;
int arr_a[MAX_SIZE];
int arr_b[MAX_SIZE];
int arr_k[MAX_SIZE];
bool flag [MAX_SIZE][2][2][2];
long long matrix[MAX_SIZE][2][2][2];
long long
get_result();
int main(int argc, char *argv[])
{
int case_amount = 0;
cin >> case_amount;
for (int i = 0; i < case_amount; ++i)
{
const long long result = get_result();
cout << "Case #" << 1 + i << ": " << result << endl;
}
return 0;
}
long long
dp(const int h,
const bool can_A_choose_1,
const bool can_B_choose_1,
const bool can_K_choose_1)
{
if (MAX_SIZE == h)
return can_A_choose_1 && can_B_choose_1 && can_K_choose_1;
if (flag[h][can_A_choose_1][can_B_choose_1][can_K_choose_1])
return matrix[h][can_A_choose_1][can_B_choose_1][can_K_choose_1];
int cnt_A_max = arr_a[h];
int cnt_B_max = arr_b[h];
int cnt_K_max = arr_k[h];
if (can_A_choose_1)
cnt_A_max = 1;
if (can_B_choose_1)
cnt_B_max = 1;
if (can_K_choose_1)
cnt_K_max = 1;
long long res = 0;
for (int i = 0; i <= cnt_A_max; ++i)
{
for (int j = 0; j <= cnt_B_max; ++j)
{
int k = i & j;
if (k > cnt_K_max)
continue;
res += dp(h + 1,
can_A_choose_1 || (i < cnt_A_max),
can_B_choose_1 || (j < cnt_B_max),
can_K_choose_1 || (k < cnt_K_max));
}
}
flag[h][can_A_choose_1][can_B_choose_1][can_K_choose_1] = true;
matrix[h][can_A_choose_1][can_B_choose_1][can_K_choose_1] = res;
return res;
}
long long
get_result()
{
cin >> A >> B >> K;
memset(arr_a, 0, sizeof(arr_a));
memset(arr_b, 0, sizeof(arr_b));
memset(arr_k, 0, sizeof(arr_k));
memset(flag, 0, sizeof(flag));
memset(matrix, 0, sizeof(matrix));
int i = 31;
while (i >= 1)
{
arr_a[i] = A % 2;
A /= 2;
arr_b[i] = B % 2;
B /= 2;
arr_k[i] = K % 2;
K /= 2;
i--;
}
return dp(1, 0, 0, 0);
}

Finding the ranking of a word (permutations) with duplicate letters

I'm posting this although much has already been posted about this question. I didn't want to post as an answer since it's not working. The answer to this post (Finding the rank of the Given string in list of all possible permutations with Duplicates) did not work for me.
So I tried this (which is a compilation of code I've plagiarized and my attempt to deal with repetitions). The non-repeating cases work fine. BOOKKEEPER generates 83863, not the desired 10743.
(The factorial function and letter counter array 'repeats' are working correctly. I didn't post to save space.)
while (pointer != length)
{
if (sortedWordChars[pointer] != wordArray[pointer])
{
// Swap the current character with the one after that
char temp = sortedWordChars[pointer];
sortedWordChars[pointer] = sortedWordChars[next];
sortedWordChars[next] = temp;
next++;
//For each position check how many characters left have duplicates,
//and use the logic that if you need to permute n things and if 'a' things
//are similar the number of permutations is n!/a!
int ct = repeats[(sortedWordChars[pointer]-64)];
// Increment the rank
if (ct>1) { //repeats?
System.out.println("repeating " + (sortedWordChars[pointer]-64));
//In case of repetition of any character use: (n-1)!/(times)!
//e.g. if there is 1 character which is repeating twice,
//x* (n-1)!/2!
int dividend = getFactorialIter(length - pointer - 1);
int divisor = getFactorialIter(ct);
int quo = dividend/divisor;
rank += quo;
} else {
rank += getFactorialIter(length - pointer - 1);
}
} else
{
pointer++;
next = pointer + 1;
}
}
Note: this answer is for 1-based rankings, as specified implicitly by example. Here's some Python that works at least for the two examples provided. The key fact is that suffixperms * ctr[y] // ctr[x] is the number of permutations whose first letter is y of the length-(i + 1) suffix of perm.
from collections import Counter
def rankperm(perm):
rank = 1
suffixperms = 1
ctr = Counter()
for i in range(len(perm)):
x = perm[((len(perm) - 1) - i)]
ctr[x] += 1
for y in ctr:
if (y < x):
rank += ((suffixperms * ctr[y]) // ctr[x])
suffixperms = ((suffixperms * (i + 1)) // ctr[x])
return rank
print(rankperm('QUESTION'))
print(rankperm('BOOKKEEPER'))
Java version:
public static long rankPerm(String perm) {
long rank = 1;
long suffixPermCount = 1;
java.util.Map<Character, Integer> charCounts =
new java.util.HashMap<Character, Integer>();
for (int i = perm.length() - 1; i > -1; i--) {
char x = perm.charAt(i);
int xCount = charCounts.containsKey(x) ? charCounts.get(x) + 1 : 1;
charCounts.put(x, xCount);
for (java.util.Map.Entry<Character, Integer> e : charCounts.entrySet()) {
if (e.getKey() < x) {
rank += suffixPermCount * e.getValue() / xCount;
}
}
suffixPermCount *= perm.length() - i;
suffixPermCount /= xCount;
}
return rank;
}
Unranking permutations:
from collections import Counter
def unrankperm(letters, rank):
ctr = Counter()
permcount = 1
for i in range(len(letters)):
x = letters[i]
ctr[x] += 1
permcount = (permcount * (i + 1)) // ctr[x]
# ctr is the histogram of letters
# permcount is the number of distinct perms of letters
perm = []
for i in range(len(letters)):
for x in sorted(ctr.keys()):
# suffixcount is the number of distinct perms that begin with x
suffixcount = permcount * ctr[x] // (len(letters) - i)
if rank <= suffixcount:
perm.append(x)
permcount = suffixcount
ctr[x] -= 1
if ctr[x] == 0:
del ctr[x]
break
rank -= suffixcount
return ''.join(perm)
If we use mathematics, the complexity will come down and will be able to find rank quicker. This will be particularly helpful for large strings.
(more details can be found here)
Suggest to programmatically define the approach shown here (screenshot attached below) given below)
I would say David post (the accepted answer) is super cool. However, I would like to improve it further for speed. The inner loop is trying to find inverse order pairs, and for each such inverse order, it tries to contribute to the increment of rank. If we use an ordered map structure (binary search tree or BST) in that place, we can simply do an inorder traversal from the first node (left-bottom) until it reaches the current character in the BST, rather than traversal for the whole map(BST). In C++, std::map is a perfect one for BST implementation. The following code reduces the necessary iterations in loop and removes the if check.
long long rankofword(string s)
{
long long rank = 1;
long long suffixPermCount = 1;
map<char, int> m;
int size = s.size();
for (int i = size - 1; i > -1; i--)
{
char x = s[i];
m[x]++;
for (auto it = m.begin(); it != m.find(x); it++)
rank += suffixPermCount * it->second / m[x];
suffixPermCount *= (size - i);
suffixPermCount /= m[x];
}
return rank;
}
#Dvaid Einstat, this was really helpful. It took me a WHILE to figure out what you were doing as I am still learning my first language(C#). I translated it into C# and figured that I'd give that solution as well since this listing helped me so much!
Thanks!
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
namespace CsharpVersion
{
class Program
{
//Takes in the word and checks to make sure that the word
//is between 1 and 25 charaters inclusive and only
//letters are used
static string readWord(string prompt, int high)
{
Regex rgx = new Regex("^[a-zA-Z]+$");
string word;
string result;
do
{
Console.WriteLine(prompt);
word = Console.ReadLine();
} while (word == "" | word.Length > high | rgx.IsMatch(word) == false);
result = word.ToUpper();
return result;
}
//Creates a sorted dictionary containing distinct letters
//initialized with 0 frequency
static SortedDictionary<char,int> Counter(string word)
{
char[] wordArray = word.ToCharArray();
int len = word.Length;
SortedDictionary<char,int> count = new SortedDictionary<char,int>();
foreach(char c in word)
{
if(count.ContainsKey(c))
{
}
else
{
count.Add(c, 0);
}
}
return count;
}
//Creates a factorial function
static int Factorial(int n)
{
if (n <= 1)
{
return 1;
}
else
{
return n * Factorial(n - 1);
}
}
//Ranks the word input if there are no repeated charaters
//in the word
static Int64 rankWord(char[] wordArray)
{
int n = wordArray.Length;
Int64 rank = 1;
//loops through the array of letters
for (int i = 0; i < n-1; i++)
{
int x=0;
//loops all letters after i and compares them for factorial calculation
for (int j = i+1; j<n ; j++)
{
if (wordArray[i] > wordArray[j])
{
x++;
}
}
rank = rank + x * (Factorial(n - i - 1));
}
return rank;
}
//Ranks the word input if there are repeated charaters
//in the word
static Int64 rankPerm(String word)
{
Int64 rank = 1;
Int64 suffixPermCount = 1;
SortedDictionary<char, int> counter = Counter(word);
for (int i = word.Length - 1; i > -1; i--)
{
char x = Convert.ToChar(word.Substring(i,1));
int xCount;
if(counter[x] != 0)
{
xCount = counter[x] + 1;
}
else
{
xCount = 1;
}
counter[x] = xCount;
foreach (KeyValuePair<char,int> e in counter)
{
if (e.Key < x)
{
rank += suffixPermCount * e.Value / xCount;
}
}
suffixPermCount *= word.Length - i;
suffixPermCount /= xCount;
}
return rank;
}
static void Main(string[] args)
{
Console.WriteLine("Type Exit to end the program.");
string prompt = "Please enter a word using only letters:";
const int MAX_VALUE = 25;
Int64 rank = new Int64();
string theWord;
do
{
theWord = readWord(prompt, MAX_VALUE);
char[] wordLetters = theWord.ToCharArray();
Array.Sort(wordLetters);
bool duplicate = false;
for(int i = 0; i< theWord.Length - 1; i++)
{
if(wordLetters[i] < wordLetters[i+1])
{
duplicate = true;
}
}
if(duplicate)
{
SortedDictionary<char, int> counter = Counter(theWord);
rank = rankPerm(theWord);
Console.WriteLine("\n" + theWord + " = " + rank);
}
else
{
char[] letters = theWord.ToCharArray();
rank = rankWord(letters);
Console.WriteLine("\n" + theWord + " = " + rank);
}
} while (theWord != "EXIT");
Console.WriteLine("\nPress enter to escape..");
Console.Read();
}
}
}
If there are k distinct characters, the i^th character repeated n_i times, then the total number of permutations is given by
(n_1 + n_2 + ..+ n_k)!
------------------------------------------------
n_1! n_2! ... n_k!
which is the multinomial coefficient.
Now we can use this to compute the rank of a given permutation as follows:
Consider the first character(leftmost). say it was the r^th one in the sorted order of characters.
Now if you replace the first character by any of the 1,2,3,..,(r-1)^th character and consider all possible permutations, each of these permutations will precede the given permutation. The total number can be computed using the above formula.
Once you compute the number for the first character, fix the first character, and repeat the same with the second character and so on.
Here's the C++ implementation to your question
#include<iostream>
using namespace std;
int fact(int f) {
if (f == 0) return 1;
if (f <= 2) return f;
return (f * fact(f - 1));
}
int solve(string s,int n) {
int ans = 1;
int arr[26] = {0};
int len = n - 1;
for (int i = 0; i < n; i++) {
s[i] = toupper(s[i]);
arr[s[i] - 'A']++;
}
for(int i = 0; i < n; i++) {
int temp = 0;
int x = 1;
char c = s[i];
for(int j = 0; j < c - 'A'; j++) temp += arr[j];
for (int j = 0; j < 26; j++) x = (x * fact(arr[j]));
arr[c - 'A']--;
ans = ans + (temp * ((fact(len)) / x));
len--;
}
return ans;
}
int main() {
int i,n;
string s;
cin>>s;
n=s.size();
cout << solve(s,n);
return 0;
}
Java version of unrank for a String:
public static String unrankperm(String letters, int rank) {
Map<Character, Integer> charCounts = new java.util.HashMap<>();
int permcount = 1;
for(int i = 0; i < letters.length(); i++) {
char x = letters.charAt(i);
int xCount = charCounts.containsKey(x) ? charCounts.get(x) + 1 : 1;
charCounts.put(x, xCount);
permcount = (permcount * (i + 1)) / xCount;
}
// charCounts is the histogram of letters
// permcount is the number of distinct perms of letters
StringBuilder perm = new StringBuilder();
for(int i = 0; i < letters.length(); i++) {
List<Character> sorted = new ArrayList<>(charCounts.keySet());
Collections.sort(sorted);
for(Character x : sorted) {
// suffixcount is the number of distinct perms that begin with x
Integer frequency = charCounts.get(x);
int suffixcount = permcount * frequency / (letters.length() - i);
if (rank <= suffixcount) {
perm.append(x);
permcount = suffixcount;
if(frequency == 1) {
charCounts.remove(x);
} else {
charCounts.put(x, frequency - 1);
}
break;
}
rank -= suffixcount;
}
}
return perm.toString();
}
See also n-th-permutation-algorithm-for-use-in-brute-force-bin-packaging-parallelization.

Resources