NSData from hex String? - string

I know this question has been asked for ObjectiveC, but my Swift isn't strong enough yet to transliterate the char *bytes stuff.
So given
let string = "600DBEEF"
How do I create an NSData which represents those 4 bytes: 60 0D BE EF?

With the arrival of Swift3 and the new Foundation Data type, I finally circled back to this:
extension UnicodeScalar {
var hexNibble:UInt8 {
let value = self.value
if 48 <= value && value <= 57 {
return UInt8(value - 48)
}
else if 65 <= value && value <= 70 {
return UInt8(value - 55)
}
else if 97 <= value && value <= 102 {
return UInt8(value - 87)
}
fatalError("\(self) not a legal hex nibble")
}
}
extension Data {
init(hex:String) {
let scalars = hex.unicodeScalars
var bytes = Array<UInt8>(repeating: 0, count: (scalars.count + 1) >> 1)
for (index, scalar) in scalars.enumerated() {
var nibble = scalar.hexNibble
if index & 1 == 0 {
nibble <<= 4
}
bytes[index >> 1] |= nibble
}
self = Data(bytes: bytes)
}
}
Now I can construct Data objects in a fashion similar to their printed form:
Data(hex: "600dBeef")

Answer in swift, XCode beta 6
let string = "600DBEEF"
let length = string.characters.count
let rawData = UnsafeMutablePointer<CUnsignedChar>.alloc(length/2)
var rawIndex = 0
for var index = 0; index < length; index+=2{
let single = NSMutableString()
single.appendString(string.substringWithRange(Range(start:string.startIndex.advancedBy(index), end:string.startIndex.advancedBy(index+2))))
rawData[rawIndex] = UInt8(single as String, radix:16)!
rawIndex++
}
let data:NSData = NSData(bytes: rawData, length: length/2)
rawData.dealloc(length/2)

Related

Trouble with indices

I am writing a Maximum Value Knapsack algorithm. It takes in a Knapsack object with Items that have a value and cost. I declare a 2D array for calculating the max value. For the base cases I have set the zeroth row values to 0 and zeroth column values to 0. I am running into trouble when I grab an item in the knapsack because when I want to grab the zeroth item, I am really grabbing the first item in the knapsack and am consequently getting the wrong values in the 2D array. Can someone check out my code and see what I am missing?
public static double MaximumKnapsack(Knapsack knapsack) {
int numItems = knapsack.getNumOfItems();
int budget = (int) knapsack.getBudget();
double[][] DP = new double[numItems+1][budget+1];
boolean taken = false;
for (int i = 0; i < numItems + 1; i++) {
for (int b = 0; b < budget + 1; b++) {
if (i == 0 || b == 0) {
DP[i][b] = 0;
}
else
{
Item item = knapsack.getItem(i);
if (item.getCost() > b) {
DP[i][b] = DP[i-1][b];
}
else
{
DP[i][b] = Math.max(DP[i-1][b-(int) item.getCost()] + item.getValue(),
DP[i-1][b]);
if (DP[i][b] == DP[i-1][b-(int) item.getCost()] + item.getValue() && item.getCost() != 0.0) {
taken = true;
}
}
}
}
taken = false;
}
return DP[numItems][budget];
}
I think the problem is in
Item item = knapsack.getItem(i);
beacuse your loop will start with i = 1. You should use:
Item item = knapsack.getItem(i-1);

how to find decode way to decode a USSD Command's result in c#?

I'm working on my GSM modem (Huawei E171) to send USSD commands.
to do this i use this commands at the first:
AT+CMGF=1
AT+CSCS=? ----> result is "IRA" this is my modem default
after that i sent these commands and i have got these results and everything works fine.
//*141*1# ----->to check my balance
+CUSD:
0,"457A591C96EB40B41A8D0692A6C36C17688A2E9FCB667AD87D4EEB4130103D
0C8281E4753D0B1926E7CB2018881E06C140F2BADE5583819A4250D24D2FC
BDD653A485AD787DD65504C068381A8EF76D80D2287E53A55AD5653D554
31956D04",15
//*100# ----> this command give me some options to charge my mobile
+CUSD:
1,"06280627062C06470020062706CC06310627064606330644000A0030002E062E0
63106CC062F00200634062706310698000A0031002E067E062706330627063106A
F0627062F000A0032002E0622067E000A0033002E06450644062A000A003
4002E06330627064506270646000A0035002E067E0627063106330
6CC06270646000A002300200028006E0065007800740029000A",72
i found some codes to decode these result:
to decode checking balance result i used:
string result141="457A591C96EB40B41A8D0692A6C36C17688A......."
byte[] packedBytes = ConvertHexToBytes(result141);
byte[] unpackedBytes = UnpackBytes(packedBytes);
//gahi in kar mikone gahi balkaee nafahmidam chera
string o = Encoding.Default.GetString(unpackedBytes);
my function's codes are:
public static byte[] ConvertHexToBytes(string hexString)
{
if (hexString.Length % 2 != 0)
return null;
int len = hexString.Length / 2;
byte[] array = new byte[len];
for (int i = 0; i < array.Length; i++)
{
string tmp = hexString.Substring(i * 2, 2);
array[i] =
byte.Parse(tmp, System.Globalization.NumberStyles.HexNumber);
}
return array;
}
public static byte[] UnpackBytes(byte[] packedBytes)
{
byte[] shiftedBytes = new byte[(packedBytes.Length * 8) / 7];
int shiftOffset = 0;
int shiftIndex = 0;
// Shift the packed bytes to the left according
//to the offset (position of the byte)
foreach (byte b in packedBytes)
{
if (shiftOffset == 7)
{
shiftedBytes[shiftIndex] = 0;
shiftOffset = 0;
shiftIndex++;
}
shiftedBytes[shiftIndex] = (byte)((b << shiftOffset) & 127);
shiftOffset++;
shiftIndex++;
}
int moveOffset = 0;
int moveIndex = 0;
int unpackIndex = 1;
byte[] unpackedBytes = new byte[shiftedBytes.Length];
//
if (shiftedBytes.Length > 0)
{
unpackedBytes[unpackIndex - 1] =
shiftedBytes[unpackIndex - 1];
}
// Move the bits to the appropriate byte (unpack the bits)
foreach (byte b in packedBytes)
{
if (unpackIndex != shiftedBytes.Length)
{
if (moveOffset == 7)
{
moveOffset = 0;
unpackIndex++;
unpackedBytes[unpackIndex - 1] =
shiftedBytes[unpackIndex - 1];
}
if (unpackIndex != shiftedBytes.Length)
{
// Extract the bits to be moved
int extractedBitsByte = (packedBytes[moveIndex] &
_decodeMask[moveOffset]);
// Shift the extracted bits to the proper offset
extractedBitsByte =
(extractedBitsByte >> (7 - moveOffset));
// Move the bits to the appropriate byte
//(unpack the bits)
int movedBitsByte =
(extractedBitsByte | shiftedBytes[unpackIndex]);
unpackedBytes[unpackIndex] = (byte)movedBitsByte;
moveOffset++;
unpackIndex++;
moveIndex++;
}
}
}
// Remove the padding if exists
if (unpackedBytes[unpackedBytes.Length - 1] == 0)
{
byte[] finalResultBytes = new byte[unpackedBytes.Length - 1];
Array.Copy(unpackedBytes, 0,
finalResultBytes, 0, finalResultBytes.Length);
return finalResultBytes;
}
return unpackedBytes;
}
but to decode second result i used:
string strHex= "06280627062C06470020062706CC06310......";
strHex = strHex.Replace(" ", "");
int nNumberChars = strHex.Length / 2;
byte[] aBytes = new byte[nNumberChars];
using (var sr = new StringReader(strHex))
{
for (int i = 0; i < nNumberChars; i++)
aBytes[i] = Convert.ToByte(
new String(new char[2] {
(char)sr.Read(), (char)sr.Read() }), 16);
}
string decodedmessage= Encoding.BigEndianUnicode.
GetString(aBytes, 0, aBytes.Length);
both of theme works current but why i should different decoding way to decode these results?
from where i can find, i should use which one of these two types of decoding?
USSD command responses +CUSD unsolicited responses are formatted as follows:
+CUSD: <m>[<str_urc>[<dcs>]]
Where "m" is the type of action required, "str_urc" is the response string, and "dcs" is the response string encoding.
This quote is from a Siemens Cinterion MC55i manual but applies generally to other modem manufacturers:
If dcs indicates that GSM 03.38 default alphabet is used TA converts GSM alphabet into current TE character
set according to rules of GSM 07.05 Annex A. Otherwise in case of invalid or omitted dcs conversion of
str_urc is not possible.
USSD's can be sent in 7-Bit encoded format or UC2 hence when looking at your two example responses you can see either a DCS of 15 or 72.
GSM 03.38 Cell Broadcast Data Coding Scheme in integer format (default 15). In case of an invalid or omitted
dcs from the network side (MT) will not be given out.
So if you get a DCS of 15 then it is 7-Bit encoded. And if it's 72 then it will be UC2. So from this you can easily select either your first decoding routine or second.

Algorithm for finding continuous repeated sequences

I'm looking for an algorithm that finds short tandem repeats in a genome sequence.
Basically, given a really long string which can only consist of the 4 characters 'ATCG', I need to find short repeats between 2-5 characters long that are next to each other.
ex:
TACATGAGATCATGATGATGATGATGGAGCTGTGAGATC
would give ATGATGATG or ATG repeated 3 times
The algorithm needs to scale up to a string of 1 million characters so I'm trying to get as close to linear runtime as possible.
My current algorithm:
Since the repeats can be 2-5 characters long, I check the string character by character and see if the Nth character is the same as the N+Xth character, with X being 2 through 5. With a counter for each X that counts sequential matches and resets at a mismatch, we know if there is a repeat when X = the counter. The subsequent repeats can then be checked manually.
You are looking at each character which gives you O(n), since you compare on each character the next (maximum) five characters this gives you a constant c:
var data = get_input();
var compare = { `A`, `T`, `G`, `A`, `T` } // or whatever
var MAX_LOOKAHEAD = compare.length
var n
var c
for(n = data_array.length; n < size; i++) { // Has runtime O(n)
for(c = 0; c < MAX_LOOKAHEAD; c++) { // Maximum O(c)
if( compare[c] != data[i+c] ) {
break;
} else {
report( "found match at position " + i )
}
}
}
It is easy to see that this runs O(n*c) times. Since c is very small it can be ignored - and I think one can not get rid of that constant - which results in a total runtime of O(n).
The good news:
You can speed this up with parallelization. E.g. you could split this up in k intervals and let multiple threads do the job for you by giving them appropriate start and end indices. This could give you a linear speedup.
If you do that make sure that you treat the intersections as special cases since you could miss a match if your intervals split a match in two.
E.g. n = 50000:
Partition for 4 threads: (n/10000) - 1 = 4. The 5th thread won't have a lot to do since it just handles the intersections which is why we don't need to consider its (in our case tiny) overhead.
1 10000 20000 40000 50000
|-------------------|-------------------|-------------------|-------------------|
| <- thread 1 -> | <- thread 2 -> | <- thread 3 -> | <- thread 4 -> |
|---| |---| |---|
|___________________|___________________|
|
thread 5
And this is how it could look like:
var data;
var compare = { `A`, `T`, `G`, `A`, `T` };
var MAX_LOOKAHEAD = compare.length;
thread_function(args[]) {
var from = args[0];
var to = args[1];
for(n = from ; n < to ; i++) {
for(c = 0; c < MAX_LOOKAHEAD; c++) {
if( compare[c] != data[i+c] ) {
break;
} else {
report( "found match at position " + i )
}
}
}
}
main() {
var data_size = 50000;
var thread_count = 4;
var interval_size = data_size / ( thread_count + 1) ;
var tid[]
// This loop starts the threads for us:
for( var i = 0; i < thread_count; i++ ) {
var args = { interval_size * i, (interval_size * i) + interval_size };
tid.add( create_thread( thread_function, args ) );
}
// And this handles the intersections:
for( var i = 1; i < thread_count - 1; i++ ) {
var args = { interval_size * i, (interval_size * i) + interval_size };
from = (interval_size * i) - compare.length + 1;
to = (interval_size * i) + compare.length - 1;
for(j = from; j < to ; j++) {
for(k = 0; k < MAX_LOOKAHEAD; k++) {
if( compare[k] != data[j+k] ) {
break;
} else {
report( "found match at position " + j )
}
}
}
}
wait_for_multiple_threads( tid );
}

What does following code means int val = str.charAt(i) - 'a';?

The code is taken from career cup book
public static boolean isUniqueChars(String str) {
if (str.length() > 256) {
return false;`
}
int checker = 0;
for (int i = 0; i < str.length(); i++) {
int val = str.charAt(i) - 'a';
if ((checker & (1 << val)) > 0) return false;
checker |= (1 << val);
}
return true;
}
Thank you for explanation and I am not sure what do I get. Lets look at the following code-
public class ConvertAscii {
public static void main(String args[]){
String str ="Hello How are you";
int i =0;
for(i=0;i<str.length();i++){
System.out.println(str.charAt(i)-'a');
}
}
}
It gives me following output-
-24
12
32
34
etc
Also as in the above example we have
For example if str is "fbhsdsbfid" and i is 4 then val is equal to 3. What does subtracting ascii value of character 'a' from another character results in? Please explain more
It takes the character which is at index i in str and substracts the ASCII value of the character 'a'.
For example if str is "fbhsdsbfid" and i is 4 then val is equal to 3.
To answer your question for index i = 4, the character at index 4 is 'd' and it's corresponding ASCII value is 64.
The ASCII value of 'a' is 61.Therefore, str.charAt(i) - 'a' gives 64 - 61 = 3.

How to find smallest substring which contains all characters from a given string?

I have recently come across an interesting question on strings. Suppose you are given following:
Input string1: "this is a test string"
Input string2: "tist"
Output string: "t stri"
So, given above, how can I approach towards finding smallest substring of string1 that contains all the characters from string 2?
To see more details including working code, check my blog post at:
http://www.leetcode.com/2010/11/finding-minimum-window-in-s-which.html
To help illustrate this approach, I use an example: string1 = "acbbaca" and string2 = "aba". Here, we also use the term "window", which means a contiguous block of characters from string1 (could be interchanged with the term substring).
i) string1 = "acbbaca" and string2 = "aba".
ii) The first minimum window is found.
Notice that we cannot advance begin
pointer as hasFound['a'] ==
needToFind['a'] == 2. Advancing would
mean breaking the constraint.
iii) The second window is found. begin
pointer still points to the first
element 'a'. hasFound['a'] (3) is
greater than needToFind['a'] (2). We
decrement hasFound['a'] by one and
advance begin pointer to the right.
iv) We skip 'c' since it is not found
in string2. Begin pointer now points to 'b'.
hasFound['b'] (2) is greater than
needToFind['b'] (1). We decrement
hasFound['b'] by one and advance begin
pointer to the right.
v) Begin pointer now points to the
next 'b'. hasFound['b'] (1) is equal
to needToFind['b'] (1). We stop
immediately and this is our newly
found minimum window.
The idea is mainly based on the help of two pointers (begin and end position of the window) and two tables (needToFind and hasFound) while traversing string1. needToFind stores the total count of a character in string2 and hasFound stores the total count of a character met so far. We also use a count variable to store the total characters in string2 that's met so far (not counting characters where hasFound[x] exceeds needToFind[x]). When count equals string2's length, we know a valid window is found.
Each time we advance the end pointer (pointing to an element x), we increment hasFound[x] by one. We also increment count by one if hasFound[x] is less than or equal to needToFind[x]. Why? When the constraint is met (that is, count equals to string2's size), we immediately advance begin pointer as far right as possible while maintaining the constraint.
How do we check if it is maintaining the constraint? Assume that begin points to an element x, we check if hasFound[x] is greater than needToFind[x]. If it is, we can decrement hasFound[x] by one and advancing begin pointer without breaking the constraint. On the other hand, if it is not, we stop immediately as advancing begin pointer breaks the window constraint.
Finally, we check if the minimum window length is less than the current minimum. Update the current minimum if a new minimum is found.
Essentially, the algorithm finds the first window that satisfies the constraint, then continue maintaining the constraint throughout.
You can do a histogram sweep in O(N+M) time and O(1) space where N is the number of characters in the first string and M is the number of characters in the second.
It works like this:
Make a histogram of the second string's characters (key operation is hist2[ s2[i] ]++).
Make a cumulative histogram of the first string's characters until that histogram contains every character that the second string's histogram contains (which I will call "the histogram condition").
Then move forwards on the first string, subtracting from the histogram, until it fails to meet the histogram condition. Mark that bit of the first string (before the final move) as your tentative substring.
Move the front of the substring forwards again until you meet the histogram condition again. Move the end forwards until it fails again. If this is a shorter substring than the first, mark that as your tentative substring.
Repeat until you've passed through the entire first string.
The marked substring is your answer.
Note that by varying the check you use on the histogram condition, you can choose either to have the same set of characters as the second string, or at least as many characters of each type. (Its just the difference between a[i]>0 && b[i]>0 and a[i]>=b[i].)
You can speed up the histogram checks if you keep a track of which condition is not satisfied when you're trying to satisfy it, and checking only the thing that you decrement when you're trying to break it. (On the initial buildup, you count how many items you've satisfied, and increment that count every time you add a new character that takes the condition from false to true.)
Here's an O(n) solution. The basic idea is simple: for each starting index, find the least ending index such that the substring contains all of the necessary letters. The trick is that the least ending index increases over the course of the function, so with a little data structure support, we consider each character at most twice.
In Python:
from collections import defaultdict
def smallest(s1, s2):
assert s2 != ''
d = defaultdict(int)
nneg = [0] # number of negative entries in d
def incr(c):
d[c] += 1
if d[c] == 0:
nneg[0] -= 1
def decr(c):
if d[c] == 0:
nneg[0] += 1
d[c] -= 1
for c in s2:
decr(c)
minlen = len(s1) + 1
j = 0
for i in xrange(len(s1)):
while nneg[0] > 0:
if j >= len(s1):
return minlen
incr(s1[j])
j += 1
minlen = min(minlen, j - i)
decr(s1[i])
return minlen
I received the same interview question. I am a C++ candidate but I was in a position to code relatively fast in JAVA.
Java [Courtesy : Sumod Mathilakath]
import java.io.*;
import java.util.*;
class UserMainCode
{
public String GetSubString(String input1,String input2){
// Write code here...
return find(input1, input2);
}
private static boolean containsPatternChar(int[] sCount, int[] pCount) {
for(int i=0;i<256;i++) {
if(pCount[i]>sCount[i])
return false;
}
return true;
}
public static String find(String s, String p) {
if (p.length() > s.length())
return null;
int[] pCount = new int[256];
int[] sCount = new int[256];
// Time: O(p.lenght)
for(int i=0;i<p.length();i++) {
pCount[(int)(p.charAt(i))]++;
sCount[(int)(s.charAt(i))]++;
}
int i = 0, j = p.length(), min = Integer.MAX_VALUE;
String res = null;
// Time: O(s.lenght)
while (j < s.length()) {
if (containsPatternChar(sCount, pCount)) {
if ((j - i) < min) {
min = j - i;
res = s.substring(i, j);
// This is the smallest possible substring.
if(min==p.length())
break;
// Reduce the window size.
sCount[(int)(s.charAt(i))]--;
i++;
}
} else {
sCount[(int)(s.charAt(j))]++;
// Increase the window size.
j++;
}
}
System.out.println(res);
return res;
}
}
C++ [Courtesy : sundeepblue]
#include <iostream>
#include <vector>
#include <string>
#include <climits>
using namespace std;
string find_minimum_window(string s, string t) {
if(s.empty() || t.empty()) return;
int ns = s.size(), nt = t.size();
vector<int> total(256, 0);
vector<int> sofar(256, 0);
for(int i=0; i<nt; i++)
total[t[i]]++;
int L = 0, R;
int minL = 0; //gist2
int count = 0;
int min_win_len = INT_MAX;
for(R=0; R<ns; R++) { // gist0, a big for loop
if(total[s[R]] == 0) continue;
else sofar[s[R]]++;
if(sofar[s[R]] <= total[s[R]]) // gist1, <= not <
count++;
if(count == nt) { // POS1
while(true) {
char c = s[L];
if(total[c] == 0) { L++; }
else if(sofar[c] > total[c]) {
sofar[c]--;
L++;
}
else break;
}
if(R - L + 1 < min_win_len) { // this judge should be inside POS1
min_win_len = R - L + 1;
minL = L;
}
}
}
string res;
if(count == nt) // gist3, cannot forget this.
res = s.substr(minL, min_win_len); // gist4, start from "minL" not "L"
return res;
}
int main() {
string s = "abdccdedca";
cout << find_minimum_window(s, "acd");
}
Erlang [Courtesy : wardbekker]
-module(leetcode).
-export([min_window/0]).
%% Given a string S and a string T, find the minimum window in S which will contain all the characters in T in complexity O(n).
%% For example,
%% S = "ADOBECODEBANC"
%% T = "ABC"
%% Minimum window is "BANC".
%% Note:
%% If there is no such window in S that covers all characters in T, return the emtpy string "".
%% If there are multiple such windows, you are guaranteed that there will always be only one unique minimum window in S.
min_window() ->
"eca" = min_window("cabeca", "cae"),
"eca" = min_window("cfabeca", "cae"),
"aec" = min_window("cabefgecdaecf", "cae"),
"cwae" = min_window("cabwefgewcwaefcf", "cae"),
"BANC" = min_window("ADOBECODEBANC", "ABC"),
ok.
min_window(T, S) ->
min_window(T, S, []).
min_window([], _T, MinWindow) ->
MinWindow;
min_window([H | Rest], T, MinWindow) ->
NewMinWindow = case lists:member(H, T) of
true ->
MinWindowFound = fullfill_window(Rest, lists:delete(H, T), [H]),
case length(MinWindow) == 0 orelse (length(MinWindow) > length(MinWindowFound)
andalso length(MinWindowFound) > 0) of
true ->
MinWindowFound;
false ->
MinWindow
end;
false ->
MinWindow
end,
min_window(Rest, T, NewMinWindow).
fullfill_window(_, [], Acc) ->
%% window completed
Acc;
fullfill_window([], _T, _Acc) ->
%% no window found
"";
fullfill_window([H | Rest], T, Acc) ->
%% completing window
case lists:member(H, T) of
true ->
fullfill_window(Rest, lists:delete(H, T), Acc ++ [H]);
false ->
fullfill_window(Rest, T, Acc ++ [H])
end.
REF:
http://articles.leetcode.com/finding-minimum-window-in-s-which/#comment-511216
http://www.mif.vu.lt/~valdas/ALGORITMAI/LITERATURA/Cormen/Cormen.pdf
Please have a look at this as well:
//-----------------------------------------------------------------------
bool IsInSet(char ch, char* cSet)
{
char* cSetptr = cSet;
int index = 0;
while (*(cSet+ index) != '\0')
{
if(ch == *(cSet+ index))
{
return true;
}
++index;
}
return false;
}
void removeChar(char ch, char* cSet)
{
bool bShift = false;
int index = 0;
while (*(cSet + index) != '\0')
{
if( (ch == *(cSet + index)) || bShift)
{
*(cSet + index) = *(cSet + index + 1);
bShift = true;
}
++index;
}
}
typedef struct subStr
{
short iStart;
short iEnd;
short szStr;
}ss;
char* subStringSmallest(char* testStr, char* cSet)
{
char* subString = NULL;
int iSzSet = strlen(cSet) + 1;
int iSzString = strlen(testStr)+ 1;
char* cSetBackUp = new char[iSzSet];
memcpy((void*)cSetBackUp, (void*)cSet, iSzSet);
int iStartIndx = -1;
int iEndIndx = -1;
int iIndexStartNext = -1;
std::vector<ss> subStrVec;
int index = 0;
while( *(testStr+index) != '\0' )
{
if (IsInSet(*(testStr+index), cSetBackUp))
{
removeChar(*(testStr+index), cSetBackUp);
if(iStartIndx < 0)
{
iStartIndx = index;
}
else if( iIndexStartNext < 0)
iIndexStartNext = index;
else
;
if (strlen(cSetBackUp) == 0 )
{
iEndIndx = index;
if( iIndexStartNext == -1)
break;
else
{
index = iIndexStartNext;
ss stemp = {iStartIndx, iEndIndx, (iEndIndx-iStartIndx + 1)};
subStrVec.push_back(stemp);
iStartIndx = iEndIndx = iIndexStartNext = -1;
memcpy((void*)cSetBackUp, (void*)cSet, iSzSet);
continue;
}
}
}
else
{
if (IsInSet(*(testStr+index), cSet))
{
if(iIndexStartNext < 0)
iIndexStartNext = index;
}
}
++index;
}
int indexSmallest = 0;
for(int indexVec = 0; indexVec < subStrVec.size(); ++indexVec)
{
if(subStrVec[indexSmallest].szStr > subStrVec[indexVec].szStr)
indexSmallest = indexVec;
}
subString = new char[(subStrVec[indexSmallest].szStr) + 1];
memcpy((void*)subString, (void*)(testStr+ subStrVec[indexSmallest].iStart), subStrVec[indexSmallest].szStr);
memset((void*)(subString + subStrVec[indexSmallest].szStr), 0, 1);
delete[] cSetBackUp;
return subString;
}
//--------------------------------------------------------------------
Edit: apparently there's an O(n) algorithm (cf. algorithmist's answer). Obviously this have this will beat the [naive] baseline described below!
Too bad I gotta go... I'm a bit suspicious that we can get O(n). I'll check in tomorrow to see the winner ;-) Have fun!
Tentative algorithm:
The general idea is to sequentially try and use a character from str2 found in str1 as the start of a search (in either/both directions) of all the other letters of str2. By keeping a "length of best match so far" value, we can abort searches when they exceed this. Other heuristics can probably be used to further abort suboptimal (so far) solutions. The choice of the order of the starting letters in str1 matters much; it is suggested to start with the letter(s) of str1 which have the lowest count and to try with the other letters, of an increasing count, in subsequent attempts.
[loose pseudo-code]
- get count for each letter/character in str1 (number of As, Bs etc.)
- get count for each letter in str2
- minLen = length(str1) + 1 (the +1 indicates you're not sure all chars of
str2 are in str1)
- Starting with the letter from string2 which is found the least in string1,
look for other letters of Str2, in either direction of str1, until you've
found them all (or not, at which case response = impossible => done!).
set x = length(corresponding substring of str1).
- if (x < minLen),
set minlen = x,
also memorize the start/len of the str1 substring.
- continue trying with other letters of str1 (going the up the frequency
list in str1), but abort search as soon as length(substring of strl)
reaches or exceed minLen.
We can find a few other heuristics that would allow aborting a
particular search, based on [pre-calculated ?] distance between a given
letter in str1 and some (all?) of the letters in str2.
- the overall search terminates when minLen = length(str2) or when
we've used all letters of str1 (which match one letter of str2)
as a starting point for the search
Here is Java implementation
public static String shortestSubstrContainingAllChars(String input, String target) {
int needToFind[] = new int[256];
int hasFound[] = new int[256];
int totalCharCount = 0;
String result = null;
char[] targetCharArray = target.toCharArray();
for (int i = 0; i < targetCharArray.length; i++) {
needToFind[targetCharArray[i]]++;
}
char[] inputCharArray = input.toCharArray();
for (int begin = 0, end = 0; end < inputCharArray.length; end++) {
if (needToFind[inputCharArray[end]] == 0) {
continue;
}
hasFound[inputCharArray[end]]++;
if (hasFound[inputCharArray[end]] <= needToFind[inputCharArray[end]]) {
totalCharCount ++;
}
if (totalCharCount == target.length()) {
while (needToFind[inputCharArray[begin]] == 0
|| hasFound[inputCharArray[begin]] > needToFind[inputCharArray[begin]]) {
if (hasFound[inputCharArray[begin]] > needToFind[inputCharArray[begin]]) {
hasFound[inputCharArray[begin]]--;
}
begin++;
}
String substring = input.substring(begin, end + 1);
if (result == null || result.length() > substring.length()) {
result = substring;
}
}
}
return result;
}
Here is the Junit Test
#Test
public void shortestSubstringContainingAllCharsTest() {
String result = StringUtil.shortestSubstrContainingAllChars("acbbaca", "aba");
assertThat(result, equalTo("baca"));
result = StringUtil.shortestSubstrContainingAllChars("acbbADOBECODEBANCaca", "ABC");
assertThat(result, equalTo("BANC"));
result = StringUtil.shortestSubstrContainingAllChars("this is a test string", "tist");
assertThat(result, equalTo("t stri"));
}
//[ShortestSubstring.java][1]
public class ShortestSubstring {
public static void main(String[] args) {
String input1 = "My name is Fran";
String input2 = "rim";
System.out.println(getShortestSubstring(input1, input2));
}
private static String getShortestSubstring(String mainString, String toBeSearched) {
int mainStringLength = mainString.length();
int toBeSearchedLength = toBeSearched.length();
if (toBeSearchedLength > mainStringLength) {
throw new IllegalArgumentException("search string cannot be larger than main string");
}
for (int j = 0; j < mainStringLength; j++) {
for (int i = 0; i <= mainStringLength - toBeSearchedLength; i++) {
String substring = mainString.substring(i, i + toBeSearchedLength);
if (checkIfMatchFound(substring, toBeSearched)) {
return substring;
}
}
toBeSearchedLength++;
}
return null;
}
private static boolean checkIfMatchFound(String substring, String toBeSearched) {
char[] charArraySubstring = substring.toCharArray();
char[] charArrayToBeSearched = toBeSearched.toCharArray();
int count = 0;
for (int i = 0; i < charArraySubstring.length; i++) {
for (int j = 0; j < charArrayToBeSearched.length; j++) {
if (String.valueOf(charArraySubstring[i]).equalsIgnoreCase(String.valueOf(charArrayToBeSearched[j]))) {
count++;
}
}
}
return count == charArrayToBeSearched.length;
}
}
This is an approach using prime numbers to avoid one loop, and replace it with multiplications. Several other minor optimizations can be made.
Assign a unique prime number to any of the characters that you want to find, and 1 to the uninteresting characters.
Find the product of a matching string by multiplying the prime number with the number of occurrences it should have. Now this product can only be found if the same prime factors are used.
Search the string from the beginning, multiplying the respective prime number as you move into a running product.
If the number is greater than the correct sum, remove the first character and divide its prime number out of your running product.
If the number is less than the correct sum, include the next character and multiply it into your running product.
If the number is the same as the correct sum you have found a match, slide beginning and end to next character and continue searching for other matches.
Decide which of the matches is the shortest.
Gist
charcount = { 'a': 3, 'b' : 1 };
str = "kjhdfsbabasdadaaaaasdkaaajbajerhhayeom"
def find (c, s):
Ns = len (s)
C = list (c.keys ())
D = list (c.values ())
# prime numbers assigned to the first 25 chars
prmsi = [ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89 , 97]
# primes used in the key, all other set to 1
prms = []
Cord = [ord(c) - ord('a') for c in C]
for e,p in enumerate(prmsi):
if e in Cord:
prms.append (p)
else:
prms.append (1)
# Product of match
T = 1
for c,d in zip(C,D):
p = prms[ord (c) - ord('a')]
T *= p**d
print ("T=", T)
t = 1 # product of current string
f = 0
i = 0
matches = []
mi = 0
mn = Ns
mm = 0
while i < Ns:
k = prms[ord(s[i]) - ord ('a')]
t *= k
print ("testing:", s[f:i+1])
if (t > T):
# included too many chars: move start
t /= prms[ord(s[f]) - ord('a')] # remove first char, usually division by 1
f += 1 # increment start position
t /= k # will be retested, could be replaced with bool
elif t == T:
# found match
print ("FOUND match:", s[f:i+1])
matches.append (s[f:i+1])
if (i - f) < mn:
mm = mi
mn = i - f
mi += 1
t /= prms[ord(s[f]) - ord('a')] # remove first matching char
# look for next match
i += 1
f += 1
else:
# no match yet, keep searching
i += 1
return (mm, matches)
print (find (charcount, str))
(note: this answer was originally posted to a duplicate question, the original answer is now deleted.)
C# Implementation:
public static Tuple<int, int> FindMinSubstringWindow(string input, string pattern)
{
Tuple<int, int> windowCoords = new Tuple<int, int>(0, input.Length - 1);
int[] patternHist = new int[256];
for (int i = 0; i < pattern.Length; i++)
{
patternHist[pattern[i]]++;
}
int[] inputHist = new int[256];
int minWindowLength = int.MaxValue;
int count = 0;
for (int begin = 0, end = 0; end < input.Length; end++)
{
// Skip what's not in pattern.
if (patternHist[input[end]] == 0)
{
continue;
}
inputHist[input[end]]++;
// Count letters that are in pattern.
if (inputHist[input[end]] <= patternHist[input[end]])
{
count++;
}
// Window found.
if (count == pattern.Length)
{
// Remove extra instances of letters from pattern
// or just letters that aren't part of the pattern
// from the beginning.
while (patternHist[input[begin]] == 0 ||
inputHist[input[begin]] > patternHist[input[begin]])
{
if (inputHist[input[begin]] > patternHist[input[begin]])
{
inputHist[input[begin]]--;
}
begin++;
}
// Current window found.
int windowLength = end - begin + 1;
if (windowLength < minWindowLength)
{
windowCoords = new Tuple<int, int>(begin, end);
minWindowLength = windowLength;
}
}
}
if (count == pattern.Length)
{
return windowCoords;
}
return null;
}
I've implemented it using Python3 at O(N) efficiency:
def get(s, alphabet="abc"):
seen = {}
for c in alphabet:
seen[c] = 0
seen[s[0]] = 1
start = 0
end = 0
shortest_s = 0
shortest_e = 99999
while end + 1 < len(s):
while seen[s[start]] > 1:
seen[s[start]] -= 1
start += 1
# Constant time check:
if sum(seen.values()) == len(alphabet) and all(v == 1 for v in seen.values()) and \
shortest_e - shortest_s > end - start:
shortest_s = start
shortest_e = end
end += 1
seen[s[end]] += 1
return s[shortest_s: shortest_e + 1]
print(get("abbcac")) # Expected to return "bca"
String s = "xyyzyzyx";
String s1 = "xyz";
String finalString ="";
Map<Character,Integer> hm = new HashMap<>();
if(s1!=null && s!=null && s.length()>s1.length()){
for(int i =0;i<s1.length();i++){
if(hm.get(s1.charAt(i))!=null){
int k = hm.get(s1.charAt(i))+1;
hm.put(s1.charAt(i), k);
}else
hm.put(s1.charAt(i), 1);
}
Map<Character,Integer> t = new HashMap<>();
int start =-1;
for(int j=0;j<s.length();j++){
if(hm.get(s.charAt(j))!=null){
if(t.get(s.charAt(j))!=null){
if(t.get(s.charAt(j))!=hm.get(s.charAt(j))){
int k = t.get(s.charAt(j))+1;
t.put(s.charAt(j), k);
}
}else{
t.put(s.charAt(j), 1);
if(start==-1){
if(j+s1.length()>s.length()){
break;
}
start = j;
}
}
if(hm.equals(t)){
t = new HashMap<>();
if(finalString.length()<s.substring(start,j+1).length());
{
finalString=s.substring(start,j+1);
}
j=start;
start=-1;
}
}
}
JavaScript solution in bruteforce way:
function shortestSubStringOfUniqueChars(s){
var uniqueArr = [];
for(let i=0; i<s.length; i++){
if(uniqueArr.indexOf(s.charAt(i)) <0){
uniqueArr.push(s.charAt(i));
}
}
let windoww = uniqueArr.length;
while(windoww < s.length){
for(let i=0; i<s.length - windoww; i++){
let match = true;
let tempArr = [];
for(let j=0; j<uniqueArr.length; j++){
if(uniqueArr.indexOf(s.charAt(i+j))<0){
match = false;
break;
}
}
let checkStr
if(match){
checkStr = s.substr(i, windoww);
for(let j=0; j<uniqueArr.length; j++){
if(uniqueArr.indexOf(checkStr.charAt(j))<0){
match = false;
break;
}
}
}
if(match){
return checkStr;
}
}
windoww = windoww + 1;
}
}
console.log(shortestSubStringOfUniqueChars("ABA"));
# Python implementation
s = input('Enter the string : ')
s1 = input('Enter the substring to search : ')
l = [] # List to record all the matching combinations
check = all([char in s for char in s1])
if check == True:
for i in range(len(s1),len(s)+1) :
for j in range(0,i+len(s1)+2):
if (i+j) < len(s)+1:
cnt = 0
b = all([char in s[j:i+j] for char in s1])
if (b == True) :
l.append(s[j:i+j])
print('The smallest substring containing',s1,'is',l[0])
else:
print('Please enter a valid substring')
Java code for the approach discussed above:
private static Map<Character, Integer> frequency;
private static Set<Character> charsCovered;
private static Map<Character, Integer> encountered;
/**
* To set the first match index as an intial start point
*/
private static boolean hasStarted = false;
private static int currentStartIndex = 0;
private static int finalStartIndex = 0;
private static int finalEndIndex = 0;
private static int minLen = Integer.MAX_VALUE;
private static int currentLen = 0;
/**
* Whether we have already found the match and now looking for other
* alternatives.
*/
private static boolean isFound = false;
private static char currentChar;
public static String findSmallestSubStringWithAllChars(String big, String small) {
if (null == big || null == small || big.isEmpty() || small.isEmpty()) {
return null;
}
frequency = new HashMap<Character, Integer>();
instantiateFrequencyMap(small);
charsCovered = new HashSet<Character>();
int charsToBeCovered = frequency.size();
encountered = new HashMap<Character, Integer>();
for (int i = 0; i < big.length(); i++) {
currentChar = big.charAt(i);
if (frequency.containsKey(currentChar) && !isFound) {
if (!hasStarted && !isFound) {
hasStarted = true;
currentStartIndex = i;
}
updateEncounteredMapAndCharsCoveredSet(currentChar);
if (charsCovered.size() == charsToBeCovered) {
currentLen = i - currentStartIndex;
isFound = true;
updateMinLength(i);
}
} else if (frequency.containsKey(currentChar) && isFound) {
updateEncounteredMapAndCharsCoveredSet(currentChar);
if (currentChar == big.charAt(currentStartIndex)) {
encountered.put(currentChar, encountered.get(currentChar) - 1);
currentStartIndex++;
while (currentStartIndex < i) {
if (encountered.containsKey(big.charAt(currentStartIndex))
&& encountered.get(big.charAt(currentStartIndex)) > frequency.get(big
.charAt(currentStartIndex))) {
encountered.put(big.charAt(currentStartIndex),
encountered.get(big.charAt(currentStartIndex)) - 1);
} else if (encountered.containsKey(big.charAt(currentStartIndex))) {
break;
}
currentStartIndex++;
}
}
currentLen = i - currentStartIndex;
updateMinLength(i);
}
}
System.out.println("start: " + finalStartIndex + " finalEnd : " + finalEndIndex);
return big.substring(finalStartIndex, finalEndIndex + 1);
}
private static void updateMinLength(int index) {
if (minLen > currentLen) {
minLen = currentLen;
finalStartIndex = currentStartIndex;
finalEndIndex = index;
}
}
private static void updateEncounteredMapAndCharsCoveredSet(Character currentChar) {
if (encountered.containsKey(currentChar)) {
encountered.put(currentChar, encountered.get(currentChar) + 1);
} else {
encountered.put(currentChar, 1);
}
if (encountered.get(currentChar) >= frequency.get(currentChar)) {
charsCovered.add(currentChar);
}
}
private static void instantiateFrequencyMap(String str) {
for (char c : str.toCharArray()) {
if (frequency.containsKey(c)) {
frequency.put(c, frequency.get(c) + 1);
} else {
frequency.put(c, 1);
}
}
}
public static void main(String[] args) {
String big = "this is a test string";
String small = "tist";
System.out.println("len: " + big.length());
System.out.println(findSmallestSubStringWithAllChars(big, small));
}
def minimum_window(s, t, min_length = 100000):
d = {}
for x in t:
if x in d:
d[x]+= 1
else:
d[x] = 1
tot = sum([y for x,y in d.iteritems()])
l = []
ind = 0
for i,x in enumerate(s):
if ind == 1:
l = l + [x]
if x in d:
tot-=1
if not l:
ind = 1
l = [x]
if tot == 0:
if len(l)<min_length:
min_length = len(l)
min_length = minimum_window(s[i+1:], t, min_length)
return min_length
l_s = "ADOBECODEBANC"
t_s = "ABC"
min_length = minimum_window(l_s, t_s)
if min_length == 100000:
print "Not found"
else:
print min_length

Resources