Rcpp nested loops over lists of vectors of different types: Efficient template solution? - rcpp

Another interesting Rcpp template programming problem: I want to compute the pairwise complete observations of all columns in an R data.frame or matrix. For matrices the code is easy thanks to RCPP_RETURN macros:
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
using namespace Rcpp;
template <int RTYPE>
IntegerMatrix pwNobsmCppImpl(const Matrix<RTYPE>& x) {
int l = x.nrow(), col = x.ncol();
auto isnnanT = (RTYPE == REALSXP) ? [](typename Rcpp::traits::storage_type<RTYPE>::type x) { return x == x; } :
[](typename Rcpp::traits::storage_type<RTYPE>::type x) { return x != Vector<RTYPE>::get_na(); };
IntegerMatrix out = no_init_matrix(col, col);
for(int j = 0; j != col; ++j) {
ConstMatrixColumn<RTYPE> colj = x( _ , j);
int nj = std::count_if(colj.begin(), colj.end(), isnnanT);
out(j, j) = nj;
for(int k = j+1; k != col; ++k) {
ConstMatrixColumn<RTYPE> colk = x( _ , k);
int njk = 0;
for(int i = l; i--; ) if(isnnanT(colj[i]) && isnnanT(colk[i])) ++njk; // fastest? or save logical vector with colj Non-missing?
out(j, k) = out(k, j) = njk;
}
}
out.attr("dimnames") = List::create(colnames(x), colnames(x));
return out;
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<CPLXSXP>& x) {
stop("Not supported SEXP type!");
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<VECSXP>& x) {
stop("Not supported SEXP type!");
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<RAWSXP>& x) {
stop("Not supported SEXP type!");
}
template <>
IntegerMatrix pwNobsmCppImpl(const Matrix<EXPRSXP>& x) {
stop("Not supported SEXP type!");
}
// [[Rcpp::export]]
IntegerMatrix pwNobsmCpp(SEXP x){
RCPP_RETURN_MATRIX(pwNobsmCppImpl, x);
}
Now the tricky but comes for data.frames / lists, whose colums can be of different types. It seems to me the native Rcpp solution is a switch(TYPEOF(x[j]), ...) statement that handles differently typed columns of a list on a case-by-case basis. Hoever in this case this would require 2 nested switches, leading to an enormous amount of code duplication. I am wondering if this can be avoided through some smart template programming. The basic code for Lists is:
// [[Rcpp::export]]
IntegerMatrix pwNobslCpp(const List& x) {
int l = x.size();
IntegerMatrix out = no_init_matrix(l, l);
for(int j = 0; j != l; ++j) {
int RTYPEj = TYPEOF(x[j]); // The code fails because the SEXPTYPE of x[j] is unknown beforehand!
auto isnnanTj = (RTYPEj == REALSXP) ? [](typename Rcpp::traits::storage_type<RTYPEj>::type x) { return x == x; } :
[](typename Rcpp::traits::storage_type<RTYPEj>::type x) { return x != Vector<RTYPEj>::get_na(); };
Vector<RTYPEj> colj = x[j];
int nj = std::count_if(colj.begin(), colj.end(), isnnanTj);
int rowj = colj.size();
out(j, j) = nj;
for(int k = j+1; k != l; ++k) {
int RTYPEk = TYPEOF(x[k]); // The code fails because the SEXPTYPE of x[k] is unknown beforehand!
auto isnnanTk = (RTYPEk == REALSXP) ? [](typename Rcpp::traits::storage_type<RTYPEk>::type x) { return x == x; } :
[](typename Rcpp::traits::storage_type<RTYPEk>::type x) { return x != Vector<RTYPEk>::get_na(); };
Vector<RTYPEk> colk = x[k];
if(colk.size() != rowj) stop("All columns need to have the same length!");
int njk = 0;
for(int i = rowj; i--; ) if(isnnanTj(colj[i]) && isnnanTk(colk[i])) ++njk; // fastest? or save logical vector with
j Non-missing?
out(j, k) = out(k, j) = njk;
}
}
out.attr("dimnames") = List::create(names(x), names(x));
return out;
}
This code of course fails because the Rcpp templates contained in it are not activated with known values. But templating the whole code does not make sense either because of the pairwise nature of the computation. Is there any way one can do this without the feared nested switch?

Related

Extract code between the function using regxr or any suitable way in python

Goal / Need help to: extract code inside the Functions as well as Class using regex (code inside the curly brackets {} )
However, its okay to use some other library other than regex to find the solution
Programming language: Python
Issues: not all the code inside the function is extracted (it partially provides the code inside the class.
Test File: JAVA code (merge sort algorithm)
My python code: ( in the code, I am trying to extract code between class but not working and also want to do for functions)
f=open("MergeSort.java","r")
code=f.read()
className="MergeSort"
results = set()
regxStr=className+' \{.*?\}'
codeTraverse=code.replace("\n", " ")+""
codeTraverse=codeTraverse.replace("\t"," ")
re.findall(regxStr, codeTraverse)
print("------------------**************------------answers",re.findall(regxStr, codeTraverse))
print("finish*******")
Output picture:
enter image description here
Target File: JAVAFILE that i am trying to extract the code from is follows
/* Java program for Merge Sort Code taken from geeks for geeks*/
class MergeSort
{
void merge(int arr[], int l, int m, int r)
{
int n1 = m - l + 1;
int n2 = r - m;
int L[] = new int[n1];
int R[] = new int[n2];
for (int i = 0; i < n1; ++i)
L[i] = arr[l + i];
for (int j = 0; j < n2; ++j)
R[j] = arr[m + 1 + j];
int i = 0, j = 0;
int k = l;
while (i < n1 && j < n2) {
if (L[i] <= R[j]) {
arr[k] = L[i];
i++;
}
else {
arr[k] = R[j];
j++;
}
k++;
}
while (i < n1) {
arr[k] = L[i];
i++;
k++;
}
while (j < n2) {
arr[k] = R[j];
j++;
k++;
}
}
void sort(int arr[], int l, int r)
{
if (l < r) {
int m =l+ (r-l)/2;
sort(arr, l, m);
sort(arr, m + 1, r);
merge(arr, l, m, r);
}
}
static void printArray(int arr[])
{
int n = arr.length;
for (int i = 0; i < n; ++i)
System.out.print(arr[i] + " ");
System.out.println();
}
public static void main(String args[])
{
int arr[] = { 12, 11, 13, 5, 6, 7 };
System.out.println("Given Array");
printArray(arr);
MergeSort ob = new MergeSort();
ob.sort(arr, 0, arr.length - 1);
System.out.println("\nSorted array");
printArray(arr);
}
}
Notes:
So lets say if I request for function "printArray" from python. I am trying to get all the code inside the function "printArray" using regxr and similarily, when I request code inside the class name, I should be able to get code inside the class.
I did take a look over regex code on stackoverflow like following,https://stackoverflow.com/questions/38456603/extract-string-inside-nested-brackets
But I am still not able to use it efficiently.
I am stuck and need some help from stackoverflow community. Thank you and please.
r"(?<=\{)(\D*|\d*)(?=\})"gm
This should grab everything that's not a digit \D (including new lines), and everything that is a digit \d, between curly braces. This also excludes the curly braces themselves with positive look behind (?<=\{) and positive look ahead (?=\}).

RcppAramadillo Cube::operator() : index out of bounds

I have been fiddling with the following C++ code for integration with R code that I have written (too much to include here), but keep getting an error that the Cube::operator() index is out of bounds and I am unsure as to why this is occurring. My suspicion is that the 3D array is not being filled correctly as described in
making 3d array with arma::cube in Rcpp shows cube error
but I am uncertain how to properly solve the issue.
Below is my full C++ code:
// [[Rcpp::depends(RcppArmadillo)]]
#define ARMA_DONT_PRINT_OPENMP_WARNING
#include <RcppArmadillo.h>
#include <RcppArmadilloExtensions/sample.h>
#include <set>
using namespace Rcpp;
int sample_one(int n) {
return n * unif_rand();
}
int sample_n_distinct(const IntegerVector& x,
int k,
const int * pop_ptr) {
IntegerVector ind_index = RcppArmadillo::sample(x, k, false);
std::set<int> distinct_container;
for (int i = 0; i < k; i++) {
distinct_container.insert(pop_ptr[ind_index[i]]);
}
return distinct_container.size();
}
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int num_specs = specs.size();
arma::Cube<int> res(perms, num_specs, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < num_specs; k++) {
for (j = 0; j < perms; j++) {
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}
Does someone have an idea as to what may be producing the said error?
Below is the R code with a call to the C++ function (including a commented-out triply-nested 'for' loop that the C++ code reproduces).
## Set up container(s) to hold the identity of each individual from each permutation ##
num.specs <- ceiling(N / K)
## Create an ID for each haplotype ##
haps <- 1:Hstar
## Assign individuals (N) to each subpopulation (K) ##
specs <- 1:num.specs
## Generate permutations, assume each permutation has N individuals, and sample those individuals' haplotypes from the probabilities ##
gen.perms <- function() {
sample(haps, size = num.specs, replace = TRUE, prob = probs)
}
pop <- array(dim = c(perms, num.specs, K))
for (i in 1:K) {
pop[,, i] <- replicate(perms, gen.perms())
}
## Make a matrix to hold individuals from each permutation ##
# HAC.mat <- array(dim = c(perms, num.specs, K))
## Perform haplotype accumulation ##
# for (k in specs) {
# for (j in 1:perms) {
# for (i in 1:K) {
# select.perm <- sample(1:nrow(pop), size = 1, replace = TRUE) # randomly sample a permutation
# ind.index <- sample(specs, size = k, replace = FALSE) # randomly sample individuals
# select.subpop <- sample(i, size = 1, replace = TRUE) # randomly sample a subpopulation
# hap.plot <- pop[select.perm, ind.index, select.subpop] # extract data
# HAC.mat[j, k, i] <- length(unique(hap.plot)) # how many haplotypes are recovered
# }
# }
# }
HAC.mat <- fillCube(pop, specs, perms, K)
This is an out-of-bounds error. The gist of problem is the call
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
since
sample_one(perms)
is being placed as an access index where the max length is num_specs. This is seen by how res is defined:
arma::Cube<int> res(perms, num_specs, K);
Thus, moving out perms out of num_specs place should resolve the issue.
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int num_specs = specs.size();
arma::Cube<int> res(perms, num_specs, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < num_specs; k++) {
for (j = 0; j < perms; j++) {
// swapped location
pop_ptr = &(pop(sample_one(perms), 0, sample_one(K)));
// should the middle index be 0?
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}

Finding the ranking of a word (permutations) with duplicate letters

I'm posting this although much has already been posted about this question. I didn't want to post as an answer since it's not working. The answer to this post (Finding the rank of the Given string in list of all possible permutations with Duplicates) did not work for me.
So I tried this (which is a compilation of code I've plagiarized and my attempt to deal with repetitions). The non-repeating cases work fine. BOOKKEEPER generates 83863, not the desired 10743.
(The factorial function and letter counter array 'repeats' are working correctly. I didn't post to save space.)
while (pointer != length)
{
if (sortedWordChars[pointer] != wordArray[pointer])
{
// Swap the current character with the one after that
char temp = sortedWordChars[pointer];
sortedWordChars[pointer] = sortedWordChars[next];
sortedWordChars[next] = temp;
next++;
//For each position check how many characters left have duplicates,
//and use the logic that if you need to permute n things and if 'a' things
//are similar the number of permutations is n!/a!
int ct = repeats[(sortedWordChars[pointer]-64)];
// Increment the rank
if (ct>1) { //repeats?
System.out.println("repeating " + (sortedWordChars[pointer]-64));
//In case of repetition of any character use: (n-1)!/(times)!
//e.g. if there is 1 character which is repeating twice,
//x* (n-1)!/2!
int dividend = getFactorialIter(length - pointer - 1);
int divisor = getFactorialIter(ct);
int quo = dividend/divisor;
rank += quo;
} else {
rank += getFactorialIter(length - pointer - 1);
}
} else
{
pointer++;
next = pointer + 1;
}
}
Note: this answer is for 1-based rankings, as specified implicitly by example. Here's some Python that works at least for the two examples provided. The key fact is that suffixperms * ctr[y] // ctr[x] is the number of permutations whose first letter is y of the length-(i + 1) suffix of perm.
from collections import Counter
def rankperm(perm):
rank = 1
suffixperms = 1
ctr = Counter()
for i in range(len(perm)):
x = perm[((len(perm) - 1) - i)]
ctr[x] += 1
for y in ctr:
if (y < x):
rank += ((suffixperms * ctr[y]) // ctr[x])
suffixperms = ((suffixperms * (i + 1)) // ctr[x])
return rank
print(rankperm('QUESTION'))
print(rankperm('BOOKKEEPER'))
Java version:
public static long rankPerm(String perm) {
long rank = 1;
long suffixPermCount = 1;
java.util.Map<Character, Integer> charCounts =
new java.util.HashMap<Character, Integer>();
for (int i = perm.length() - 1; i > -1; i--) {
char x = perm.charAt(i);
int xCount = charCounts.containsKey(x) ? charCounts.get(x) + 1 : 1;
charCounts.put(x, xCount);
for (java.util.Map.Entry<Character, Integer> e : charCounts.entrySet()) {
if (e.getKey() < x) {
rank += suffixPermCount * e.getValue() / xCount;
}
}
suffixPermCount *= perm.length() - i;
suffixPermCount /= xCount;
}
return rank;
}
Unranking permutations:
from collections import Counter
def unrankperm(letters, rank):
ctr = Counter()
permcount = 1
for i in range(len(letters)):
x = letters[i]
ctr[x] += 1
permcount = (permcount * (i + 1)) // ctr[x]
# ctr is the histogram of letters
# permcount is the number of distinct perms of letters
perm = []
for i in range(len(letters)):
for x in sorted(ctr.keys()):
# suffixcount is the number of distinct perms that begin with x
suffixcount = permcount * ctr[x] // (len(letters) - i)
if rank <= suffixcount:
perm.append(x)
permcount = suffixcount
ctr[x] -= 1
if ctr[x] == 0:
del ctr[x]
break
rank -= suffixcount
return ''.join(perm)
If we use mathematics, the complexity will come down and will be able to find rank quicker. This will be particularly helpful for large strings.
(more details can be found here)
Suggest to programmatically define the approach shown here (screenshot attached below) given below)
I would say David post (the accepted answer) is super cool. However, I would like to improve it further for speed. The inner loop is trying to find inverse order pairs, and for each such inverse order, it tries to contribute to the increment of rank. If we use an ordered map structure (binary search tree or BST) in that place, we can simply do an inorder traversal from the first node (left-bottom) until it reaches the current character in the BST, rather than traversal for the whole map(BST). In C++, std::map is a perfect one for BST implementation. The following code reduces the necessary iterations in loop and removes the if check.
long long rankofword(string s)
{
long long rank = 1;
long long suffixPermCount = 1;
map<char, int> m;
int size = s.size();
for (int i = size - 1; i > -1; i--)
{
char x = s[i];
m[x]++;
for (auto it = m.begin(); it != m.find(x); it++)
rank += suffixPermCount * it->second / m[x];
suffixPermCount *= (size - i);
suffixPermCount /= m[x];
}
return rank;
}
#Dvaid Einstat, this was really helpful. It took me a WHILE to figure out what you were doing as I am still learning my first language(C#). I translated it into C# and figured that I'd give that solution as well since this listing helped me so much!
Thanks!
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
namespace CsharpVersion
{
class Program
{
//Takes in the word and checks to make sure that the word
//is between 1 and 25 charaters inclusive and only
//letters are used
static string readWord(string prompt, int high)
{
Regex rgx = new Regex("^[a-zA-Z]+$");
string word;
string result;
do
{
Console.WriteLine(prompt);
word = Console.ReadLine();
} while (word == "" | word.Length > high | rgx.IsMatch(word) == false);
result = word.ToUpper();
return result;
}
//Creates a sorted dictionary containing distinct letters
//initialized with 0 frequency
static SortedDictionary<char,int> Counter(string word)
{
char[] wordArray = word.ToCharArray();
int len = word.Length;
SortedDictionary<char,int> count = new SortedDictionary<char,int>();
foreach(char c in word)
{
if(count.ContainsKey(c))
{
}
else
{
count.Add(c, 0);
}
}
return count;
}
//Creates a factorial function
static int Factorial(int n)
{
if (n <= 1)
{
return 1;
}
else
{
return n * Factorial(n - 1);
}
}
//Ranks the word input if there are no repeated charaters
//in the word
static Int64 rankWord(char[] wordArray)
{
int n = wordArray.Length;
Int64 rank = 1;
//loops through the array of letters
for (int i = 0; i < n-1; i++)
{
int x=0;
//loops all letters after i and compares them for factorial calculation
for (int j = i+1; j<n ; j++)
{
if (wordArray[i] > wordArray[j])
{
x++;
}
}
rank = rank + x * (Factorial(n - i - 1));
}
return rank;
}
//Ranks the word input if there are repeated charaters
//in the word
static Int64 rankPerm(String word)
{
Int64 rank = 1;
Int64 suffixPermCount = 1;
SortedDictionary<char, int> counter = Counter(word);
for (int i = word.Length - 1; i > -1; i--)
{
char x = Convert.ToChar(word.Substring(i,1));
int xCount;
if(counter[x] != 0)
{
xCount = counter[x] + 1;
}
else
{
xCount = 1;
}
counter[x] = xCount;
foreach (KeyValuePair<char,int> e in counter)
{
if (e.Key < x)
{
rank += suffixPermCount * e.Value / xCount;
}
}
suffixPermCount *= word.Length - i;
suffixPermCount /= xCount;
}
return rank;
}
static void Main(string[] args)
{
Console.WriteLine("Type Exit to end the program.");
string prompt = "Please enter a word using only letters:";
const int MAX_VALUE = 25;
Int64 rank = new Int64();
string theWord;
do
{
theWord = readWord(prompt, MAX_VALUE);
char[] wordLetters = theWord.ToCharArray();
Array.Sort(wordLetters);
bool duplicate = false;
for(int i = 0; i< theWord.Length - 1; i++)
{
if(wordLetters[i] < wordLetters[i+1])
{
duplicate = true;
}
}
if(duplicate)
{
SortedDictionary<char, int> counter = Counter(theWord);
rank = rankPerm(theWord);
Console.WriteLine("\n" + theWord + " = " + rank);
}
else
{
char[] letters = theWord.ToCharArray();
rank = rankWord(letters);
Console.WriteLine("\n" + theWord + " = " + rank);
}
} while (theWord != "EXIT");
Console.WriteLine("\nPress enter to escape..");
Console.Read();
}
}
}
If there are k distinct characters, the i^th character repeated n_i times, then the total number of permutations is given by
(n_1 + n_2 + ..+ n_k)!
------------------------------------------------
n_1! n_2! ... n_k!
which is the multinomial coefficient.
Now we can use this to compute the rank of a given permutation as follows:
Consider the first character(leftmost). say it was the r^th one in the sorted order of characters.
Now if you replace the first character by any of the 1,2,3,..,(r-1)^th character and consider all possible permutations, each of these permutations will precede the given permutation. The total number can be computed using the above formula.
Once you compute the number for the first character, fix the first character, and repeat the same with the second character and so on.
Here's the C++ implementation to your question
#include<iostream>
using namespace std;
int fact(int f) {
if (f == 0) return 1;
if (f <= 2) return f;
return (f * fact(f - 1));
}
int solve(string s,int n) {
int ans = 1;
int arr[26] = {0};
int len = n - 1;
for (int i = 0; i < n; i++) {
s[i] = toupper(s[i]);
arr[s[i] - 'A']++;
}
for(int i = 0; i < n; i++) {
int temp = 0;
int x = 1;
char c = s[i];
for(int j = 0; j < c - 'A'; j++) temp += arr[j];
for (int j = 0; j < 26; j++) x = (x * fact(arr[j]));
arr[c - 'A']--;
ans = ans + (temp * ((fact(len)) / x));
len--;
}
return ans;
}
int main() {
int i,n;
string s;
cin>>s;
n=s.size();
cout << solve(s,n);
return 0;
}
Java version of unrank for a String:
public static String unrankperm(String letters, int rank) {
Map<Character, Integer> charCounts = new java.util.HashMap<>();
int permcount = 1;
for(int i = 0; i < letters.length(); i++) {
char x = letters.charAt(i);
int xCount = charCounts.containsKey(x) ? charCounts.get(x) + 1 : 1;
charCounts.put(x, xCount);
permcount = (permcount * (i + 1)) / xCount;
}
// charCounts is the histogram of letters
// permcount is the number of distinct perms of letters
StringBuilder perm = new StringBuilder();
for(int i = 0; i < letters.length(); i++) {
List<Character> sorted = new ArrayList<>(charCounts.keySet());
Collections.sort(sorted);
for(Character x : sorted) {
// suffixcount is the number of distinct perms that begin with x
Integer frequency = charCounts.get(x);
int suffixcount = permcount * frequency / (letters.length() - i);
if (rank <= suffixcount) {
perm.append(x);
permcount = suffixcount;
if(frequency == 1) {
charCounts.remove(x);
} else {
charCounts.put(x, frequency - 1);
}
break;
}
rank -= suffixcount;
}
}
return perm.toString();
}
See also n-th-permutation-algorithm-for-use-in-brute-force-bin-packaging-parallelization.

How to find smallest substring which contains all characters from a given string?

I have recently come across an interesting question on strings. Suppose you are given following:
Input string1: "this is a test string"
Input string2: "tist"
Output string: "t stri"
So, given above, how can I approach towards finding smallest substring of string1 that contains all the characters from string 2?
To see more details including working code, check my blog post at:
http://www.leetcode.com/2010/11/finding-minimum-window-in-s-which.html
To help illustrate this approach, I use an example: string1 = "acbbaca" and string2 = "aba". Here, we also use the term "window", which means a contiguous block of characters from string1 (could be interchanged with the term substring).
i) string1 = "acbbaca" and string2 = "aba".
ii) The first minimum window is found.
Notice that we cannot advance begin
pointer as hasFound['a'] ==
needToFind['a'] == 2. Advancing would
mean breaking the constraint.
iii) The second window is found. begin
pointer still points to the first
element 'a'. hasFound['a'] (3) is
greater than needToFind['a'] (2). We
decrement hasFound['a'] by one and
advance begin pointer to the right.
iv) We skip 'c' since it is not found
in string2. Begin pointer now points to 'b'.
hasFound['b'] (2) is greater than
needToFind['b'] (1). We decrement
hasFound['b'] by one and advance begin
pointer to the right.
v) Begin pointer now points to the
next 'b'. hasFound['b'] (1) is equal
to needToFind['b'] (1). We stop
immediately and this is our newly
found minimum window.
The idea is mainly based on the help of two pointers (begin and end position of the window) and two tables (needToFind and hasFound) while traversing string1. needToFind stores the total count of a character in string2 and hasFound stores the total count of a character met so far. We also use a count variable to store the total characters in string2 that's met so far (not counting characters where hasFound[x] exceeds needToFind[x]). When count equals string2's length, we know a valid window is found.
Each time we advance the end pointer (pointing to an element x), we increment hasFound[x] by one. We also increment count by one if hasFound[x] is less than or equal to needToFind[x]. Why? When the constraint is met (that is, count equals to string2's size), we immediately advance begin pointer as far right as possible while maintaining the constraint.
How do we check if it is maintaining the constraint? Assume that begin points to an element x, we check if hasFound[x] is greater than needToFind[x]. If it is, we can decrement hasFound[x] by one and advancing begin pointer without breaking the constraint. On the other hand, if it is not, we stop immediately as advancing begin pointer breaks the window constraint.
Finally, we check if the minimum window length is less than the current minimum. Update the current minimum if a new minimum is found.
Essentially, the algorithm finds the first window that satisfies the constraint, then continue maintaining the constraint throughout.
You can do a histogram sweep in O(N+M) time and O(1) space where N is the number of characters in the first string and M is the number of characters in the second.
It works like this:
Make a histogram of the second string's characters (key operation is hist2[ s2[i] ]++).
Make a cumulative histogram of the first string's characters until that histogram contains every character that the second string's histogram contains (which I will call "the histogram condition").
Then move forwards on the first string, subtracting from the histogram, until it fails to meet the histogram condition. Mark that bit of the first string (before the final move) as your tentative substring.
Move the front of the substring forwards again until you meet the histogram condition again. Move the end forwards until it fails again. If this is a shorter substring than the first, mark that as your tentative substring.
Repeat until you've passed through the entire first string.
The marked substring is your answer.
Note that by varying the check you use on the histogram condition, you can choose either to have the same set of characters as the second string, or at least as many characters of each type. (Its just the difference between a[i]>0 && b[i]>0 and a[i]>=b[i].)
You can speed up the histogram checks if you keep a track of which condition is not satisfied when you're trying to satisfy it, and checking only the thing that you decrement when you're trying to break it. (On the initial buildup, you count how many items you've satisfied, and increment that count every time you add a new character that takes the condition from false to true.)
Here's an O(n) solution. The basic idea is simple: for each starting index, find the least ending index such that the substring contains all of the necessary letters. The trick is that the least ending index increases over the course of the function, so with a little data structure support, we consider each character at most twice.
In Python:
from collections import defaultdict
def smallest(s1, s2):
assert s2 != ''
d = defaultdict(int)
nneg = [0] # number of negative entries in d
def incr(c):
d[c] += 1
if d[c] == 0:
nneg[0] -= 1
def decr(c):
if d[c] == 0:
nneg[0] += 1
d[c] -= 1
for c in s2:
decr(c)
minlen = len(s1) + 1
j = 0
for i in xrange(len(s1)):
while nneg[0] > 0:
if j >= len(s1):
return minlen
incr(s1[j])
j += 1
minlen = min(minlen, j - i)
decr(s1[i])
return minlen
I received the same interview question. I am a C++ candidate but I was in a position to code relatively fast in JAVA.
Java [Courtesy : Sumod Mathilakath]
import java.io.*;
import java.util.*;
class UserMainCode
{
public String GetSubString(String input1,String input2){
// Write code here...
return find(input1, input2);
}
private static boolean containsPatternChar(int[] sCount, int[] pCount) {
for(int i=0;i<256;i++) {
if(pCount[i]>sCount[i])
return false;
}
return true;
}
public static String find(String s, String p) {
if (p.length() > s.length())
return null;
int[] pCount = new int[256];
int[] sCount = new int[256];
// Time: O(p.lenght)
for(int i=0;i<p.length();i++) {
pCount[(int)(p.charAt(i))]++;
sCount[(int)(s.charAt(i))]++;
}
int i = 0, j = p.length(), min = Integer.MAX_VALUE;
String res = null;
// Time: O(s.lenght)
while (j < s.length()) {
if (containsPatternChar(sCount, pCount)) {
if ((j - i) < min) {
min = j - i;
res = s.substring(i, j);
// This is the smallest possible substring.
if(min==p.length())
break;
// Reduce the window size.
sCount[(int)(s.charAt(i))]--;
i++;
}
} else {
sCount[(int)(s.charAt(j))]++;
// Increase the window size.
j++;
}
}
System.out.println(res);
return res;
}
}
C++ [Courtesy : sundeepblue]
#include <iostream>
#include <vector>
#include <string>
#include <climits>
using namespace std;
string find_minimum_window(string s, string t) {
if(s.empty() || t.empty()) return;
int ns = s.size(), nt = t.size();
vector<int> total(256, 0);
vector<int> sofar(256, 0);
for(int i=0; i<nt; i++)
total[t[i]]++;
int L = 0, R;
int minL = 0; //gist2
int count = 0;
int min_win_len = INT_MAX;
for(R=0; R<ns; R++) { // gist0, a big for loop
if(total[s[R]] == 0) continue;
else sofar[s[R]]++;
if(sofar[s[R]] <= total[s[R]]) // gist1, <= not <
count++;
if(count == nt) { // POS1
while(true) {
char c = s[L];
if(total[c] == 0) { L++; }
else if(sofar[c] > total[c]) {
sofar[c]--;
L++;
}
else break;
}
if(R - L + 1 < min_win_len) { // this judge should be inside POS1
min_win_len = R - L + 1;
minL = L;
}
}
}
string res;
if(count == nt) // gist3, cannot forget this.
res = s.substr(minL, min_win_len); // gist4, start from "minL" not "L"
return res;
}
int main() {
string s = "abdccdedca";
cout << find_minimum_window(s, "acd");
}
Erlang [Courtesy : wardbekker]
-module(leetcode).
-export([min_window/0]).
%% Given a string S and a string T, find the minimum window in S which will contain all the characters in T in complexity O(n).
%% For example,
%% S = "ADOBECODEBANC"
%% T = "ABC"
%% Minimum window is "BANC".
%% Note:
%% If there is no such window in S that covers all characters in T, return the emtpy string "".
%% If there are multiple such windows, you are guaranteed that there will always be only one unique minimum window in S.
min_window() ->
"eca" = min_window("cabeca", "cae"),
"eca" = min_window("cfabeca", "cae"),
"aec" = min_window("cabefgecdaecf", "cae"),
"cwae" = min_window("cabwefgewcwaefcf", "cae"),
"BANC" = min_window("ADOBECODEBANC", "ABC"),
ok.
min_window(T, S) ->
min_window(T, S, []).
min_window([], _T, MinWindow) ->
MinWindow;
min_window([H | Rest], T, MinWindow) ->
NewMinWindow = case lists:member(H, T) of
true ->
MinWindowFound = fullfill_window(Rest, lists:delete(H, T), [H]),
case length(MinWindow) == 0 orelse (length(MinWindow) > length(MinWindowFound)
andalso length(MinWindowFound) > 0) of
true ->
MinWindowFound;
false ->
MinWindow
end;
false ->
MinWindow
end,
min_window(Rest, T, NewMinWindow).
fullfill_window(_, [], Acc) ->
%% window completed
Acc;
fullfill_window([], _T, _Acc) ->
%% no window found
"";
fullfill_window([H | Rest], T, Acc) ->
%% completing window
case lists:member(H, T) of
true ->
fullfill_window(Rest, lists:delete(H, T), Acc ++ [H]);
false ->
fullfill_window(Rest, T, Acc ++ [H])
end.
REF:
http://articles.leetcode.com/finding-minimum-window-in-s-which/#comment-511216
http://www.mif.vu.lt/~valdas/ALGORITMAI/LITERATURA/Cormen/Cormen.pdf
Please have a look at this as well:
//-----------------------------------------------------------------------
bool IsInSet(char ch, char* cSet)
{
char* cSetptr = cSet;
int index = 0;
while (*(cSet+ index) != '\0')
{
if(ch == *(cSet+ index))
{
return true;
}
++index;
}
return false;
}
void removeChar(char ch, char* cSet)
{
bool bShift = false;
int index = 0;
while (*(cSet + index) != '\0')
{
if( (ch == *(cSet + index)) || bShift)
{
*(cSet + index) = *(cSet + index + 1);
bShift = true;
}
++index;
}
}
typedef struct subStr
{
short iStart;
short iEnd;
short szStr;
}ss;
char* subStringSmallest(char* testStr, char* cSet)
{
char* subString = NULL;
int iSzSet = strlen(cSet) + 1;
int iSzString = strlen(testStr)+ 1;
char* cSetBackUp = new char[iSzSet];
memcpy((void*)cSetBackUp, (void*)cSet, iSzSet);
int iStartIndx = -1;
int iEndIndx = -1;
int iIndexStartNext = -1;
std::vector<ss> subStrVec;
int index = 0;
while( *(testStr+index) != '\0' )
{
if (IsInSet(*(testStr+index), cSetBackUp))
{
removeChar(*(testStr+index), cSetBackUp);
if(iStartIndx < 0)
{
iStartIndx = index;
}
else if( iIndexStartNext < 0)
iIndexStartNext = index;
else
;
if (strlen(cSetBackUp) == 0 )
{
iEndIndx = index;
if( iIndexStartNext == -1)
break;
else
{
index = iIndexStartNext;
ss stemp = {iStartIndx, iEndIndx, (iEndIndx-iStartIndx + 1)};
subStrVec.push_back(stemp);
iStartIndx = iEndIndx = iIndexStartNext = -1;
memcpy((void*)cSetBackUp, (void*)cSet, iSzSet);
continue;
}
}
}
else
{
if (IsInSet(*(testStr+index), cSet))
{
if(iIndexStartNext < 0)
iIndexStartNext = index;
}
}
++index;
}
int indexSmallest = 0;
for(int indexVec = 0; indexVec < subStrVec.size(); ++indexVec)
{
if(subStrVec[indexSmallest].szStr > subStrVec[indexVec].szStr)
indexSmallest = indexVec;
}
subString = new char[(subStrVec[indexSmallest].szStr) + 1];
memcpy((void*)subString, (void*)(testStr+ subStrVec[indexSmallest].iStart), subStrVec[indexSmallest].szStr);
memset((void*)(subString + subStrVec[indexSmallest].szStr), 0, 1);
delete[] cSetBackUp;
return subString;
}
//--------------------------------------------------------------------
Edit: apparently there's an O(n) algorithm (cf. algorithmist's answer). Obviously this have this will beat the [naive] baseline described below!
Too bad I gotta go... I'm a bit suspicious that we can get O(n). I'll check in tomorrow to see the winner ;-) Have fun!
Tentative algorithm:
The general idea is to sequentially try and use a character from str2 found in str1 as the start of a search (in either/both directions) of all the other letters of str2. By keeping a "length of best match so far" value, we can abort searches when they exceed this. Other heuristics can probably be used to further abort suboptimal (so far) solutions. The choice of the order of the starting letters in str1 matters much; it is suggested to start with the letter(s) of str1 which have the lowest count and to try with the other letters, of an increasing count, in subsequent attempts.
[loose pseudo-code]
- get count for each letter/character in str1 (number of As, Bs etc.)
- get count for each letter in str2
- minLen = length(str1) + 1 (the +1 indicates you're not sure all chars of
str2 are in str1)
- Starting with the letter from string2 which is found the least in string1,
look for other letters of Str2, in either direction of str1, until you've
found them all (or not, at which case response = impossible => done!).
set x = length(corresponding substring of str1).
- if (x < minLen),
set minlen = x,
also memorize the start/len of the str1 substring.
- continue trying with other letters of str1 (going the up the frequency
list in str1), but abort search as soon as length(substring of strl)
reaches or exceed minLen.
We can find a few other heuristics that would allow aborting a
particular search, based on [pre-calculated ?] distance between a given
letter in str1 and some (all?) of the letters in str2.
- the overall search terminates when minLen = length(str2) or when
we've used all letters of str1 (which match one letter of str2)
as a starting point for the search
Here is Java implementation
public static String shortestSubstrContainingAllChars(String input, String target) {
int needToFind[] = new int[256];
int hasFound[] = new int[256];
int totalCharCount = 0;
String result = null;
char[] targetCharArray = target.toCharArray();
for (int i = 0; i < targetCharArray.length; i++) {
needToFind[targetCharArray[i]]++;
}
char[] inputCharArray = input.toCharArray();
for (int begin = 0, end = 0; end < inputCharArray.length; end++) {
if (needToFind[inputCharArray[end]] == 0) {
continue;
}
hasFound[inputCharArray[end]]++;
if (hasFound[inputCharArray[end]] <= needToFind[inputCharArray[end]]) {
totalCharCount ++;
}
if (totalCharCount == target.length()) {
while (needToFind[inputCharArray[begin]] == 0
|| hasFound[inputCharArray[begin]] > needToFind[inputCharArray[begin]]) {
if (hasFound[inputCharArray[begin]] > needToFind[inputCharArray[begin]]) {
hasFound[inputCharArray[begin]]--;
}
begin++;
}
String substring = input.substring(begin, end + 1);
if (result == null || result.length() > substring.length()) {
result = substring;
}
}
}
return result;
}
Here is the Junit Test
#Test
public void shortestSubstringContainingAllCharsTest() {
String result = StringUtil.shortestSubstrContainingAllChars("acbbaca", "aba");
assertThat(result, equalTo("baca"));
result = StringUtil.shortestSubstrContainingAllChars("acbbADOBECODEBANCaca", "ABC");
assertThat(result, equalTo("BANC"));
result = StringUtil.shortestSubstrContainingAllChars("this is a test string", "tist");
assertThat(result, equalTo("t stri"));
}
//[ShortestSubstring.java][1]
public class ShortestSubstring {
public static void main(String[] args) {
String input1 = "My name is Fran";
String input2 = "rim";
System.out.println(getShortestSubstring(input1, input2));
}
private static String getShortestSubstring(String mainString, String toBeSearched) {
int mainStringLength = mainString.length();
int toBeSearchedLength = toBeSearched.length();
if (toBeSearchedLength > mainStringLength) {
throw new IllegalArgumentException("search string cannot be larger than main string");
}
for (int j = 0; j < mainStringLength; j++) {
for (int i = 0; i <= mainStringLength - toBeSearchedLength; i++) {
String substring = mainString.substring(i, i + toBeSearchedLength);
if (checkIfMatchFound(substring, toBeSearched)) {
return substring;
}
}
toBeSearchedLength++;
}
return null;
}
private static boolean checkIfMatchFound(String substring, String toBeSearched) {
char[] charArraySubstring = substring.toCharArray();
char[] charArrayToBeSearched = toBeSearched.toCharArray();
int count = 0;
for (int i = 0; i < charArraySubstring.length; i++) {
for (int j = 0; j < charArrayToBeSearched.length; j++) {
if (String.valueOf(charArraySubstring[i]).equalsIgnoreCase(String.valueOf(charArrayToBeSearched[j]))) {
count++;
}
}
}
return count == charArrayToBeSearched.length;
}
}
This is an approach using prime numbers to avoid one loop, and replace it with multiplications. Several other minor optimizations can be made.
Assign a unique prime number to any of the characters that you want to find, and 1 to the uninteresting characters.
Find the product of a matching string by multiplying the prime number with the number of occurrences it should have. Now this product can only be found if the same prime factors are used.
Search the string from the beginning, multiplying the respective prime number as you move into a running product.
If the number is greater than the correct sum, remove the first character and divide its prime number out of your running product.
If the number is less than the correct sum, include the next character and multiply it into your running product.
If the number is the same as the correct sum you have found a match, slide beginning and end to next character and continue searching for other matches.
Decide which of the matches is the shortest.
Gist
charcount = { 'a': 3, 'b' : 1 };
str = "kjhdfsbabasdadaaaaasdkaaajbajerhhayeom"
def find (c, s):
Ns = len (s)
C = list (c.keys ())
D = list (c.values ())
# prime numbers assigned to the first 25 chars
prmsi = [ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89 , 97]
# primes used in the key, all other set to 1
prms = []
Cord = [ord(c) - ord('a') for c in C]
for e,p in enumerate(prmsi):
if e in Cord:
prms.append (p)
else:
prms.append (1)
# Product of match
T = 1
for c,d in zip(C,D):
p = prms[ord (c) - ord('a')]
T *= p**d
print ("T=", T)
t = 1 # product of current string
f = 0
i = 0
matches = []
mi = 0
mn = Ns
mm = 0
while i < Ns:
k = prms[ord(s[i]) - ord ('a')]
t *= k
print ("testing:", s[f:i+1])
if (t > T):
# included too many chars: move start
t /= prms[ord(s[f]) - ord('a')] # remove first char, usually division by 1
f += 1 # increment start position
t /= k # will be retested, could be replaced with bool
elif t == T:
# found match
print ("FOUND match:", s[f:i+1])
matches.append (s[f:i+1])
if (i - f) < mn:
mm = mi
mn = i - f
mi += 1
t /= prms[ord(s[f]) - ord('a')] # remove first matching char
# look for next match
i += 1
f += 1
else:
# no match yet, keep searching
i += 1
return (mm, matches)
print (find (charcount, str))
(note: this answer was originally posted to a duplicate question, the original answer is now deleted.)
C# Implementation:
public static Tuple<int, int> FindMinSubstringWindow(string input, string pattern)
{
Tuple<int, int> windowCoords = new Tuple<int, int>(0, input.Length - 1);
int[] patternHist = new int[256];
for (int i = 0; i < pattern.Length; i++)
{
patternHist[pattern[i]]++;
}
int[] inputHist = new int[256];
int minWindowLength = int.MaxValue;
int count = 0;
for (int begin = 0, end = 0; end < input.Length; end++)
{
// Skip what's not in pattern.
if (patternHist[input[end]] == 0)
{
continue;
}
inputHist[input[end]]++;
// Count letters that are in pattern.
if (inputHist[input[end]] <= patternHist[input[end]])
{
count++;
}
// Window found.
if (count == pattern.Length)
{
// Remove extra instances of letters from pattern
// or just letters that aren't part of the pattern
// from the beginning.
while (patternHist[input[begin]] == 0 ||
inputHist[input[begin]] > patternHist[input[begin]])
{
if (inputHist[input[begin]] > patternHist[input[begin]])
{
inputHist[input[begin]]--;
}
begin++;
}
// Current window found.
int windowLength = end - begin + 1;
if (windowLength < minWindowLength)
{
windowCoords = new Tuple<int, int>(begin, end);
minWindowLength = windowLength;
}
}
}
if (count == pattern.Length)
{
return windowCoords;
}
return null;
}
I've implemented it using Python3 at O(N) efficiency:
def get(s, alphabet="abc"):
seen = {}
for c in alphabet:
seen[c] = 0
seen[s[0]] = 1
start = 0
end = 0
shortest_s = 0
shortest_e = 99999
while end + 1 < len(s):
while seen[s[start]] > 1:
seen[s[start]] -= 1
start += 1
# Constant time check:
if sum(seen.values()) == len(alphabet) and all(v == 1 for v in seen.values()) and \
shortest_e - shortest_s > end - start:
shortest_s = start
shortest_e = end
end += 1
seen[s[end]] += 1
return s[shortest_s: shortest_e + 1]
print(get("abbcac")) # Expected to return "bca"
String s = "xyyzyzyx";
String s1 = "xyz";
String finalString ="";
Map<Character,Integer> hm = new HashMap<>();
if(s1!=null && s!=null && s.length()>s1.length()){
for(int i =0;i<s1.length();i++){
if(hm.get(s1.charAt(i))!=null){
int k = hm.get(s1.charAt(i))+1;
hm.put(s1.charAt(i), k);
}else
hm.put(s1.charAt(i), 1);
}
Map<Character,Integer> t = new HashMap<>();
int start =-1;
for(int j=0;j<s.length();j++){
if(hm.get(s.charAt(j))!=null){
if(t.get(s.charAt(j))!=null){
if(t.get(s.charAt(j))!=hm.get(s.charAt(j))){
int k = t.get(s.charAt(j))+1;
t.put(s.charAt(j), k);
}
}else{
t.put(s.charAt(j), 1);
if(start==-1){
if(j+s1.length()>s.length()){
break;
}
start = j;
}
}
if(hm.equals(t)){
t = new HashMap<>();
if(finalString.length()<s.substring(start,j+1).length());
{
finalString=s.substring(start,j+1);
}
j=start;
start=-1;
}
}
}
JavaScript solution in bruteforce way:
function shortestSubStringOfUniqueChars(s){
var uniqueArr = [];
for(let i=0; i<s.length; i++){
if(uniqueArr.indexOf(s.charAt(i)) <0){
uniqueArr.push(s.charAt(i));
}
}
let windoww = uniqueArr.length;
while(windoww < s.length){
for(let i=0; i<s.length - windoww; i++){
let match = true;
let tempArr = [];
for(let j=0; j<uniqueArr.length; j++){
if(uniqueArr.indexOf(s.charAt(i+j))<0){
match = false;
break;
}
}
let checkStr
if(match){
checkStr = s.substr(i, windoww);
for(let j=0; j<uniqueArr.length; j++){
if(uniqueArr.indexOf(checkStr.charAt(j))<0){
match = false;
break;
}
}
}
if(match){
return checkStr;
}
}
windoww = windoww + 1;
}
}
console.log(shortestSubStringOfUniqueChars("ABA"));
# Python implementation
s = input('Enter the string : ')
s1 = input('Enter the substring to search : ')
l = [] # List to record all the matching combinations
check = all([char in s for char in s1])
if check == True:
for i in range(len(s1),len(s)+1) :
for j in range(0,i+len(s1)+2):
if (i+j) < len(s)+1:
cnt = 0
b = all([char in s[j:i+j] for char in s1])
if (b == True) :
l.append(s[j:i+j])
print('The smallest substring containing',s1,'is',l[0])
else:
print('Please enter a valid substring')
Java code for the approach discussed above:
private static Map<Character, Integer> frequency;
private static Set<Character> charsCovered;
private static Map<Character, Integer> encountered;
/**
* To set the first match index as an intial start point
*/
private static boolean hasStarted = false;
private static int currentStartIndex = 0;
private static int finalStartIndex = 0;
private static int finalEndIndex = 0;
private static int minLen = Integer.MAX_VALUE;
private static int currentLen = 0;
/**
* Whether we have already found the match and now looking for other
* alternatives.
*/
private static boolean isFound = false;
private static char currentChar;
public static String findSmallestSubStringWithAllChars(String big, String small) {
if (null == big || null == small || big.isEmpty() || small.isEmpty()) {
return null;
}
frequency = new HashMap<Character, Integer>();
instantiateFrequencyMap(small);
charsCovered = new HashSet<Character>();
int charsToBeCovered = frequency.size();
encountered = new HashMap<Character, Integer>();
for (int i = 0; i < big.length(); i++) {
currentChar = big.charAt(i);
if (frequency.containsKey(currentChar) && !isFound) {
if (!hasStarted && !isFound) {
hasStarted = true;
currentStartIndex = i;
}
updateEncounteredMapAndCharsCoveredSet(currentChar);
if (charsCovered.size() == charsToBeCovered) {
currentLen = i - currentStartIndex;
isFound = true;
updateMinLength(i);
}
} else if (frequency.containsKey(currentChar) && isFound) {
updateEncounteredMapAndCharsCoveredSet(currentChar);
if (currentChar == big.charAt(currentStartIndex)) {
encountered.put(currentChar, encountered.get(currentChar) - 1);
currentStartIndex++;
while (currentStartIndex < i) {
if (encountered.containsKey(big.charAt(currentStartIndex))
&& encountered.get(big.charAt(currentStartIndex)) > frequency.get(big
.charAt(currentStartIndex))) {
encountered.put(big.charAt(currentStartIndex),
encountered.get(big.charAt(currentStartIndex)) - 1);
} else if (encountered.containsKey(big.charAt(currentStartIndex))) {
break;
}
currentStartIndex++;
}
}
currentLen = i - currentStartIndex;
updateMinLength(i);
}
}
System.out.println("start: " + finalStartIndex + " finalEnd : " + finalEndIndex);
return big.substring(finalStartIndex, finalEndIndex + 1);
}
private static void updateMinLength(int index) {
if (minLen > currentLen) {
minLen = currentLen;
finalStartIndex = currentStartIndex;
finalEndIndex = index;
}
}
private static void updateEncounteredMapAndCharsCoveredSet(Character currentChar) {
if (encountered.containsKey(currentChar)) {
encountered.put(currentChar, encountered.get(currentChar) + 1);
} else {
encountered.put(currentChar, 1);
}
if (encountered.get(currentChar) >= frequency.get(currentChar)) {
charsCovered.add(currentChar);
}
}
private static void instantiateFrequencyMap(String str) {
for (char c : str.toCharArray()) {
if (frequency.containsKey(c)) {
frequency.put(c, frequency.get(c) + 1);
} else {
frequency.put(c, 1);
}
}
}
public static void main(String[] args) {
String big = "this is a test string";
String small = "tist";
System.out.println("len: " + big.length());
System.out.println(findSmallestSubStringWithAllChars(big, small));
}
def minimum_window(s, t, min_length = 100000):
d = {}
for x in t:
if x in d:
d[x]+= 1
else:
d[x] = 1
tot = sum([y for x,y in d.iteritems()])
l = []
ind = 0
for i,x in enumerate(s):
if ind == 1:
l = l + [x]
if x in d:
tot-=1
if not l:
ind = 1
l = [x]
if tot == 0:
if len(l)<min_length:
min_length = len(l)
min_length = minimum_window(s[i+1:], t, min_length)
return min_length
l_s = "ADOBECODEBANC"
t_s = "ABC"
min_length = minimum_window(l_s, t_s)
if min_length == 100000:
print "Not found"
else:
print min_length

Generate list of all possible permutations of a string

How would I go about generating a list of all possible permutations of a string between x and y characters in length, containing a variable list of characters.
Any language would work, but it should be portable.
There are several ways to do this. Common methods use recursion, memoization, or dynamic programming. The basic idea is that you produce a list of all strings of length 1, then in each iteration, for all strings produced in the last iteration, add that string concatenated with each character in the string individually. (the variable index in the code below keeps track of the start of the last and the next iteration)
Some pseudocode:
list = originalString.split('')
index = (0,0)
list = [""]
for iteration n in 1 to y:
index = (index[1], len(list))
for string s in list.subset(index[0] to end):
for character c in originalString:
list.add(s + c)
you'd then need to remove all strings less than x in length, they'll be the first (x-1) * len(originalString) entries in the list.
It's better to use backtracking
#include <stdio.h>
#include <string.h>
void swap(char *a, char *b) {
char temp;
temp = *a;
*a = *b;
*b = temp;
}
void print(char *a, int i, int n) {
int j;
if(i == n) {
printf("%s\n", a);
} else {
for(j = i; j <= n; j++) {
swap(a + i, a + j);
print(a, i + 1, n);
swap(a + i, a + j);
}
}
}
int main(void) {
char a[100];
gets(a);
print(a, 0, strlen(a) - 1);
return 0;
}
You are going to get a lot of strings, that's for sure...
Where x and y is how you define them and r is the number of characters we are selecting from --if I am understanding you correctly. You should definitely generate these as needed and not get sloppy and say, generate a powerset and then filter the length of strings.
The following definitely isn't the best way to generate these, but it's an interesting aside, none-the-less.
Knuth (volume 4, fascicle 2, 7.2.1.3) tells us that (s,t)-combination is equivalent to s+1 things taken t at a time with repetition -- an (s,t)-combination is notation used by Knuth that is equal to . We can figure this out by first generating each (s,t)-combination in binary form (so, of length (s+t)) and counting the number of 0's to the left of each 1.
10001000011101 --> becomes the permutation: {0, 3, 4, 4, 4, 1}
Non recursive solution according to Knuth, Python example:
def nextPermutation(perm):
k0 = None
for i in range(len(perm)-1):
if perm[i]<perm[i+1]:
k0=i
if k0 == None:
return None
l0 = k0+1
for i in range(k0+1, len(perm)):
if perm[k0] < perm[i]:
l0 = i
perm[k0], perm[l0] = perm[l0], perm[k0]
perm[k0+1:] = reversed(perm[k0+1:])
return perm
perm=list("12345")
while perm:
print perm
perm = nextPermutation(perm)
You might look at "Efficiently Enumerating the Subsets of a Set", which describes an algorithm to do part of what you want - quickly generate all subsets of N characters from length x to y. It contains an implementation in C.
For each subset, you'd still have to generate all the permutations. For instance if you wanted 3 characters from "abcde", this algorithm would give you "abc","abd", "abe"...
but you'd have to permute each one to get "acb", "bac", "bca", etc.
Some working Java code based on Sarp's answer:
public class permute {
static void permute(int level, String permuted,
boolean used[], String original) {
int length = original.length();
if (level == length) {
System.out.println(permuted);
} else {
for (int i = 0; i < length; i++) {
if (!used[i]) {
used[i] = true;
permute(level + 1, permuted + original.charAt(i),
used, original);
used[i] = false;
}
}
}
}
public static void main(String[] args) {
String s = "hello";
boolean used[] = {false, false, false, false, false};
permute(0, "", used, s);
}
}
Here is a simple solution in C#.
It generates only the distinct permutations of a given string.
static public IEnumerable<string> permute(string word)
{
if (word.Length > 1)
{
char character = word[0];
foreach (string subPermute in permute(word.Substring(1)))
{
for (int index = 0; index <= subPermute.Length; index++)
{
string pre = subPermute.Substring(0, index);
string post = subPermute.Substring(index);
if (post.Contains(character))
continue;
yield return pre + character + post;
}
}
}
else
{
yield return word;
}
}
There are a lot of good answers here. I also suggest a very simple recursive solution in C++.
#include <string>
#include <iostream>
template<typename Consume>
void permutations(std::string s, Consume consume, std::size_t start = 0) {
if (start == s.length()) consume(s);
for (std::size_t i = start; i < s.length(); i++) {
std::swap(s[start], s[i]);
permutations(s, consume, start + 1);
}
}
int main(void) {
std::string s = "abcd";
permutations(s, [](std::string s) {
std::cout << s << std::endl;
});
}
Note: strings with repeated characters will not produce unique permutations.
I just whipped this up quick in Ruby:
def perms(x, y, possible_characters)
all = [""]
current_array = all.clone
1.upto(y) { |iteration|
next_array = []
current_array.each { |string|
possible_characters.each { |c|
value = string + c
next_array.insert next_array.length, value
all.insert all.length, value
}
}
current_array = next_array
}
all.delete_if { |string| string.length < x }
end
You might look into language API for built in permutation type functions, and you might be able to write more optimized code, but if the numbers are all that high, I'm not sure there is much of a way around having a lot of results.
Anyways, the idea behind the code is start with string of length 0, then keep track of all the strings of length Z where Z is the current size in the iteration. Then, go through each string and append each character onto each string. Finally at the end, remove any that were below the x threshold and return the result.
I didn't test it with potentially meaningless input (null character list, weird values of x and y, etc).
This is a translation of Mike's Ruby version, into Common Lisp:
(defun perms (x y original-string)
(loop with all = (list "")
with current-array = (list "")
for iteration from 1 to y
do (loop with next-array = nil
for string in current-array
do (loop for c across original-string
for value = (concatenate 'string string (string c))
do (push value next-array)
(push value all))
(setf current-array (reverse next-array)))
finally (return (nreverse (delete-if #'(lambda (el) (< (length el) x)) all)))))
And another version, slightly shorter and using more loop facility features:
(defun perms (x y original-string)
(loop repeat y
collect (loop for string in (or (car (last sets)) (list ""))
append (loop for c across original-string
collect (concatenate 'string string (string c)))) into sets
finally (return (loop for set in sets
append (loop for el in set when (>= (length el) x) collect el)))))
Here is a simple word C# recursive solution:
Method:
public ArrayList CalculateWordPermutations(string[] letters, ArrayList words, int index)
{
bool finished = true;
ArrayList newWords = new ArrayList();
if (words.Count == 0)
{
foreach (string letter in letters)
{
words.Add(letter);
}
}
for(int j=index; j<words.Count; j++)
{
string word = (string)words[j];
for(int i =0; i<letters.Length; i++)
{
if(!word.Contains(letters[i]))
{
finished = false;
string newWord = (string)word.Clone();
newWord += letters[i];
newWords.Add(newWord);
}
}
}
foreach (string newWord in newWords)
{
words.Add(newWord);
}
if(finished == false)
{
CalculateWordPermutations(letters, words, words.Count - newWords.Count);
}
return words;
}
Calling:
string[] letters = new string[]{"a","b","c"};
ArrayList words = CalculateWordPermutations(letters, new ArrayList(), 0);
... and here is the C version:
void permute(const char *s, char *out, int *used, int len, int lev)
{
if (len == lev) {
out[lev] = '\0';
puts(out);
return;
}
int i;
for (i = 0; i < len; ++i) {
if (! used[i])
continue;
used[i] = 1;
out[lev] = s[i];
permute(s, out, used, len, lev + 1);
used[i] = 0;
}
return;
}
permute (ABC) -> A.perm(BC) -> A.perm[B.perm(C)] -> A.perm[(*BC), (CB*)] -> [(*ABC), (BAC), (BCA*), (*ACB), (CAB), (CBA*)]
To remove duplicates when inserting each alphabet check to see if previous string ends with the same alphabet (why? -exercise)
public static void main(String[] args) {
for (String str : permStr("ABBB")){
System.out.println(str);
}
}
static Vector<String> permStr(String str){
if (str.length() == 1){
Vector<String> ret = new Vector<String>();
ret.add(str);
return ret;
}
char start = str.charAt(0);
Vector<String> endStrs = permStr(str.substring(1));
Vector<String> newEndStrs = new Vector<String>();
for (String endStr : endStrs){
for (int j = 0; j <= endStr.length(); j++){
if (endStr.substring(0, j).endsWith(String.valueOf(start)))
break;
newEndStrs.add(endStr.substring(0, j) + String.valueOf(start) + endStr.substring(j));
}
}
return newEndStrs;
}
Prints all permutations sans duplicates
Recursive solution in C++
int main (int argc, char * const argv[]) {
string s = "sarp";
bool used [4];
permute(0, "", used, s);
}
void permute(int level, string permuted, bool used [], string &original) {
int length = original.length();
if(level == length) { // permutation complete, display
cout << permuted << endl;
} else {
for(int i=0; i<length; i++) { // try to add an unused character
if(!used[i]) {
used[i] = true;
permute(level+1, original[i] + permuted, used, original); // find the permutations starting with this string
used[i] = false;
}
}
}
In Perl, if you want to restrict yourself to the lowercase alphabet, you can do this:
my #result = ("a" .. "zzzz");
This gives all possible strings between 1 and 4 characters using lowercase characters. For uppercase, change "a" to "A" and "zzzz" to "ZZZZ".
For mixed-case it gets much harder, and probably not doable with one of Perl's builtin operators like that.
Ruby answer that works:
class String
def each_char_with_index
0.upto(size - 1) do |index|
yield(self[index..index], index)
end
end
def remove_char_at(index)
return self[1..-1] if index == 0
self[0..(index-1)] + self[(index+1)..-1]
end
end
def permute(str, prefix = '')
if str.size == 0
puts prefix
return
end
str.each_char_with_index do |char, index|
permute(str.remove_char_at(index), prefix + char)
end
end
# example
# permute("abc")
The following Java recursion prints all permutations of a given string:
//call it as permut("",str);
public void permut(String str1,String str2){
if(str2.length() != 0){
char ch = str2.charAt(0);
for(int i = 0; i <= str1.length();i++)
permut(str1.substring(0,i) + ch + str1.substring(i,str1.length()),
str2.substring(1,str2.length()));
}else{
System.out.println(str1);
}
}
Following is the updated version of above "permut" method which makes n! (n factorial) less recursive calls compared to the above method
//call it as permut("",str);
public void permut(String str1,String str2){
if(str2.length() > 1){
char ch = str2.charAt(0);
for(int i = 0; i <= str1.length();i++)
permut(str1.substring(0,i) + ch + str1.substring(i,str1.length()),
str2.substring(1,str2.length()));
}else{
char ch = str2.charAt(0);
for(int i = 0; i <= str1.length();i++)
System.out.println(str1.substring(0,i) + ch + str1.substring(i,str1.length()),
str2.substring(1,str2.length()));
}
}
import java.util.*;
public class all_subsets {
public static void main(String[] args) {
String a = "abcd";
for(String s: all_perm(a)) {
System.out.println(s);
}
}
public static Set<String> concat(String c, Set<String> lst) {
HashSet<String> ret_set = new HashSet<String>();
for(String s: lst) {
ret_set.add(c+s);
}
return ret_set;
}
public static HashSet<String> all_perm(String a) {
HashSet<String> set = new HashSet<String>();
if(a.length() == 1) {
set.add(a);
} else {
for(int i=0; i<a.length(); i++) {
set.addAll(concat(a.charAt(i)+"", all_perm(a.substring(0, i)+a.substring(i+1, a.length()))));
}
}
return set;
}
}
I'm not sure why you would want to do this in the first place. The resulting set for any moderately large values of x and y will be huge, and will grow exponentially as x and/or y get bigger.
Lets say your set of possible characters is the 26 lowercase letters of the alphabet, and you ask your application to generate all permutations where length = 5. Assuming you don't run out of memory you'll get 11,881,376 (i.e. 26 to the power of 5) strings back. Bump that length up to 6, and you'll get 308,915,776 strings back. These numbers get painfully large, very quickly.
Here's a solution I put together in Java. You'll need to provide two runtime arguments (corresponding to x and y). Have fun.
public class GeneratePermutations {
public static void main(String[] args) {
int lower = Integer.parseInt(args[0]);
int upper = Integer.parseInt(args[1]);
if (upper < lower || upper == 0 || lower == 0) {
System.exit(0);
}
for (int length = lower; length <= upper; length++) {
generate(length, "");
}
}
private static void generate(int length, String partial) {
if (length <= 0) {
System.out.println(partial);
} else {
for (char c = 'a'; c <= 'z'; c++) {
generate(length - 1, partial + c);
}
}
}
}
Here's a non-recursive version I came up with, in javascript.
It's not based on Knuth's non-recursive one above, although it has some similarities in element swapping.
I've verified its correctness for input arrays of up to 8 elements.
A quick optimization would be pre-flighting the out array and avoiding push().
The basic idea is:
Given a single source array, generate a first new set of arrays which swap the first element with each subsequent element in turn, each time leaving the other elements unperturbed.
eg: given 1234, generate 1234, 2134, 3214, 4231.
Use each array from the previous pass as the seed for a new pass,
but instead of swapping the first element, swap the second element with each subsequent element. Also, this time, don't include the original array in the output.
Repeat step 2 until done.
Here is the code sample:
function oxe_perm(src, depth, index)
{
var perm = src.slice(); // duplicates src.
perm = perm.split("");
perm[depth] = src[index];
perm[index] = src[depth];
perm = perm.join("");
return perm;
}
function oxe_permutations(src)
{
out = new Array();
out.push(src);
for (depth = 0; depth < src.length; depth++) {
var numInPreviousPass = out.length;
for (var m = 0; m < numInPreviousPass; ++m) {
for (var n = depth + 1; n < src.length; ++n) {
out.push(oxe_perm(out[m], depth, n));
}
}
}
return out;
}
In ruby:
str = "a"
100_000_000.times {puts str.next!}
It is quite fast, but it is going to take some time =). Of course, you can start at "aaaaaaaa" if the short strings aren't interesting to you.
I might have misinterpreted the actual question though - in one of the posts it sounded as if you just needed a bruteforce library of strings, but in the main question it sounds like you need to permutate a particular string.
Your problem is somewhat similar to this one: http://beust.com/weblog/archives/000491.html (list all integers in which none of the digits repeat themselves, which resulted in a whole lot of languages solving it, with the ocaml guy using permutations, and some java guy using yet another solution).
I needed this today, and although the answers already given pointed me in the right direction, they weren't quite what I wanted.
Here's an implementation using Heap's method. The length of the array must be at least 3 and for practical considerations not be bigger than 10 or so, depending on what you want to do, patience and clock speed.
Before you enter your loop, initialise Perm(1 To N) with the first permutation, Stack(3 To N) with zeroes*, and Level with 2**. At the end of the loop call NextPerm, which will return false when we're done.
* VB will do that for you.
** You can change NextPerm a little to make this unnecessary, but it's clearer like this.
Option Explicit
Function NextPerm(Perm() As Long, Stack() As Long, Level As Long) As Boolean
Dim N As Long
If Level = 2 Then
Swap Perm(1), Perm(2)
Level = 3
Else
While Stack(Level) = Level - 1
Stack(Level) = 0
If Level = UBound(Stack) Then Exit Function
Level = Level + 1
Wend
Stack(Level) = Stack(Level) + 1
If Level And 1 Then N = 1 Else N = Stack(Level)
Swap Perm(N), Perm(Level)
Level = 2
End If
NextPerm = True
End Function
Sub Swap(A As Long, B As Long)
A = A Xor B
B = A Xor B
A = A Xor B
End Sub
'This is just for testing.
Private Sub Form_Paint()
Const Max = 8
Dim A(1 To Max) As Long, I As Long
Dim S(3 To Max) As Long, J As Long
Dim Test As New Collection, T As String
For I = 1 To UBound(A)
A(I) = I
Next
Cls
ScaleLeft = 0
J = 2
Do
If CurrentY + TextHeight("0") > ScaleHeight Then
ScaleLeft = ScaleLeft - TextWidth(" 0 ") * (UBound(A) + 1)
CurrentY = 0
CurrentX = 0
End If
T = vbNullString
For I = 1 To UBound(A)
Print A(I);
T = T & Hex(A(I))
Next
Print
Test.Add Null, T
Loop While NextPerm(A, S, J)
J = 1
For I = 2 To UBound(A)
J = J * I
Next
If J <> Test.Count Then Stop
End Sub
Other methods are described by various authors. Knuth describes two, one gives lexical order, but is complex and slow, the other is known as the method of plain changes. Jie Gao and Dianjun Wang also wrote an interesting paper.
Here is a link that describes how to print permutations of a string.
http://nipun-linuxtips.blogspot.in/2012/11/print-all-permutations-of-characters-in.html
This code in python, when called with allowed_characters set to [0,1] and 4 character max, would generate 2^4 results:
['0000', '0001', '0010', '0011', '0100', '0101', '0110', '0111', '1000', '1001', '1010', '1011', '1100', '1101', '1110', '1111']
def generate_permutations(chars = 4) :
#modify if in need!
allowed_chars = [
'0',
'1',
]
status = []
for tmp in range(chars) :
status.append(0)
last_char = len(allowed_chars)
rows = []
for x in xrange(last_char ** chars) :
rows.append("")
for y in range(chars - 1 , -1, -1) :
key = status[y]
rows[x] = allowed_chars[key] + rows[x]
for pos in range(chars - 1, -1, -1) :
if(status[pos] == last_char - 1) :
status[pos] = 0
else :
status[pos] += 1
break;
return rows
import sys
print generate_permutations()
Hope this is of use to you. Works with any character, not only numbers
Many of the previous answers used backtracking. This is the asymptotically optimal way O(n*n!) of generating permutations after initial sorting
class Permutation {
/* runtime -O(n) for generating nextPermutaion
* and O(n*n!) for generating all n! permutations with increasing sorted array as start
* return true, if there exists next lexicographical sequence
* e.g [a,b,c],3-> true, modifies array to [a,c,b]
* e.g [c,b,a],3-> false, as it is largest lexicographic possible */
public static boolean nextPermutation(char[] seq, int len) {
// 1
if (len <= 1)
return false;// no more perm
// 2: Find last j such that seq[j] <= seq[j+1]. Terminate if no such j exists
int j = len - 2;
while (j >= 0 && seq[j] >= seq[j + 1]) {
--j;
}
if (j == -1)
return false;// no more perm
// 3: Find last l such that seq[j] <= seq[l], then exchange elements j and l
int l = len - 1;
while (seq[j] >= seq[l]) {
--l;
}
swap(seq, j, l);
// 4: Reverse elements j+1 ... count-1:
reverseSubArray(seq, j + 1, len - 1);
// return seq, add store next perm
return true;
}
private static void swap(char[] a, int i, int j) {
char temp = a[i];
a[i] = a[j];
a[j] = temp;
}
private static void reverseSubArray(char[] a, int lo, int hi) {
while (lo < hi) {
swap(a, lo, hi);
++lo;
--hi;
}
}
public static void main(String[] args) {
String str = "abcdefg";
char[] array = str.toCharArray();
Arrays.sort(array);
int cnt=0;
do {
System.out.println(new String(array));
cnt++;
}while(nextPermutation(array, array.length));
System.out.println(cnt);//5040=7!
}
//if we use "bab"-> "abb", "bab", "bba", 3(#permutations)
}
Recursive Approach
func StringPermutations(inputStr string) (permutations []string) {
for i := 0; i < len(inputStr); i++ {
inputStr = inputStr[1:] + inputStr[0:1]
if len(inputStr) <= 2 {
permutations = append(permutations, inputStr)
continue
}
leftPermutations := StringPermutations(inputStr[0 : len(inputStr)-1])
for _, leftPermutation := range leftPermutations {
permutations = append(permutations, leftPermutation+inputStr[len(inputStr)-1:])
}
}
return
}
Though this doesn't answer your question exactly, here's one way to generate every permutation of the letters from a number of strings of the same length: eg, if your words were "coffee", "joomla" and "moodle", you can expect output like "coodle", "joodee", "joffle", etc.
Basically, the number of combinations is the (number of words) to the power of (number of letters per word). So, choose a random number between 0 and the number of combinations - 1, convert that number to base (number of words), then use each digit of that number as the indicator for which word to take the next letter from.
eg: in the above example. 3 words, 6 letters = 729 combinations. Choose a random number: 465. Convert to base 3: 122020. Take the first letter from word 1, 2nd from word 2, 3rd from word 2, 4th from word 0... and you get... "joofle".
If you wanted all the permutations, just loop from 0 to 728. Of course, if you're just choosing one random value, a much simpler less-confusing way would be to loop over the letters. This method lets you avoid recursion, should you want all the permutations, plus it makes you look like you know Maths(tm)!
If the number of combinations is excessive, you can break it up into a series of smaller words and concatenate them at the end.
c# iterative:
public List<string> Permutations(char[] chars)
{
List<string> words = new List<string>();
words.Add(chars[0].ToString());
for (int i = 1; i < chars.Length; ++i)
{
int currLen = words.Count;
for (int j = 0; j < currLen; ++j)
{
var w = words[j];
for (int k = 0; k <= w.Length; ++k)
{
var nstr = w.Insert(k, chars[i].ToString());
if (k == 0)
words[j] = nstr;
else
words.Add(nstr);
}
}
}
return words;
}
def gen( x,y,list): #to generate all strings inserting y at different positions
list = []
list.append( y+x )
for i in range( len(x) ):
list.append( func(x,0,i) + y + func(x,i+1,len(x)-1) )
return list
def func( x,i,j ): #returns x[i..j]
z = ''
for i in range(i,j+1):
z = z+x[i]
return z
def perm( x , length , list ): #perm function
if length == 1 : # base case
list.append( x[len(x)-1] )
return list
else:
lists = perm( x , length-1 ,list )
lists_temp = lists #temporarily storing the list
lists = []
for i in range( len(lists_temp) ) :
list_temp = gen(lists_temp[i],x[length-2],lists)
lists += list_temp
return lists
def permutation(str)
posibilities = []
str.split('').each do |char|
if posibilities.size == 0
posibilities[0] = char.downcase
posibilities[1] = char.upcase
else
posibilities_count = posibilities.length
posibilities = posibilities + posibilities
posibilities_count.times do |i|
posibilities[i] += char.downcase
posibilities[i+posibilities_count] += char.upcase
end
end
end
posibilities
end
Here is my take on a non recursive version

Resources