Here is a trie made to work on alphabets/scripts from any language.
class TrieNode {
constructor(key) {
// the "key" value will be the character in sequence
this.key = key;
// we keep a reference to parent
this.parent = null;
// we have hash of children
this.children = {};
// check to see if the node is at the end
this.end = false;
}
getWord() {
let output = [];
let node = this;
while (node !== null) {
output.unshift(node.key)
node = node.parent
}
return output.join('')
}
}
class Trie {
constructor() {
this.base = new TrieNode(null)
}
insert(word) {
let node = this.base
const points = Array.from(word)
for (const i in points) {
const point = points[i]
if (!node.children[point]) {
const child = node.children[point] = new TrieNode(point)
child.parent = node
}
node = node.children[point]
if (i == word.length - 1) {
node.end = true
}
}
}
contains(word) {
let node = this.base
const points = Array.from(word)
for (const i in points) {
const point = points[i]
if (node.children[point]) {
node = node.children[point]
} else {
return false
}
}
return node.end;
}
find(prefix) {
let node = this.base
let output = []
const points = Array.from(prefix)
for (const i in points) {
const point = points[i]
// make sure prefix actually has words
if (node.children[point]) {
node = node.children[point]
} else {
// there's none. just return it.
return output
}
}
const stack = [node]
while (stack.length) {
node = stack.shift()
// base case, if node is at a word, push to output
if (node.end) {
output.unshift(node.getWord())
}
// iterate through each children, call recursive findAllWords
for (var child in node.children) {
stack.push(node.children[child])
}
}
return output
}
}
After asking How to efficiently store 1 million words and query them by starts_with, contains, or ends_with? and getting some answers, I am wondering how the "contains" and "unscrambles" parts can be implemented as a trie. The prefix ("starts with") search is easily handled by the trie.
const fs = require('fs')
const Trie = require('./Trie')
const words = fs.readFileSync('tmp/scrabble.csv', 'utf-8')
.trim()
.split(/\n+/)
.map(x => x.trim())
const trie = new Trie()
words.forEach(word => trie.insert(word))
console.log(trie.find('zy'))
[
'zygodactylous', 'zygomorphies', 'zygapophysis',
'zygapophyses', 'zygomorphic', 'zymologies',
'zygospores', 'zygosities', 'zygomorphy',
'zygodactyl', 'zymurgies', 'zymograms',
'zymogenes', 'zygotenes', 'zygospore',
'zygomatic', 'zyzzyvas', 'zymosans',
'zymology', 'zymogram', 'zymogens',
'zymogene', 'zygotene', 'zygosity',
'zygomata', 'zyzzyva', 'zymurgy',
'zymotic', 'zymosis', 'zymoses',
'zymosan', 'zymogen', 'zymases',
'zygotic', 'zygotes', 'zygosis',
'zygoses', 'zygomas', 'zydecos',
'zymase', 'zygote', 'zygose',
'zygoma', 'zygoid', 'zydeco',
'zymes', 'zyme'
]
Here is the tmp/scrabble.csv I used.
The question is, can a trie be used to find all the words which "contain" some input string, or find all the words which "unscramble" the input string? I am curious how to accomplish this with a Trie. If a trie cannot do this efficiently, then knowing why not, and potentially what I should be looking at instead, will be a great answer as well.
From my initial thought-attempts at solving "contains", it seems I would have to create a trie which maps all possible combinations of substrings to the final word, but that seems like it will be an explosion of memory, so not sure how to better reason about this.
For "unscrambles", where you put in "caldku" and it finds "duck", amongst other possible words, I think possibly, similar to the linked answer to the other question, maybe sorting "duck" into "cdku", and then storing that in the trie, and then sorting the input to "acdklu", and then searching for "contains" using the previous algorithm, but hmm, no that will break in a few cases. So maybe a trie is not the right approach for these two problems? If it is, roughly how would you do it (you don't need to provide a full implementation, unless you'd like and it's straightforward enough).
Here is an answer to unscrambles that ALSO handles pagination. It actually returns [words, startOfNextPage]. So to find the next page you call it starting at startOfNextPage.
The idea is to normalize words by sorting the letters, and then storing the word in the trie once. When searching we can dig into the tree both for the next letter we want, and any letter before it (because the letter we want may come after). So we also store a lookup to know what letters might be found, so that we can break out early without having to look at a lot of bad options. And, of course, we include counts so that we can calculate how much of the trie we have explored and/or skipped over.
class Trie {
constructor(depth=0) {
this.depth = depth;
this.children = {};
this.words = [];
this.count = 0;
this.isSorted = false;
this.hasChildWith = {};
}
insert (word) {
if (word != undefined) {
this._insert(Array.from(word).sort(), word);
}
}
_insert (key, word) {
this.count++;
this.isSorted = false;
if (key.length == this.depth) {
this.words.push(word);
}
else {
for (let i = this.depth+1; i < key.length; i++) {
this.hasChildWith[key[i]] = true;
}
if (! this.children[key[this.depth]] ) {
this.children[key[this.depth]] = new Trie(this.depth+1);
}
this.children[key[this.depth]]._insert(key, word);
}
}
sort () {
if (! this.isSorted) {
this.words.sort();
let letters = Object.keys(this.children);
letters.sort();
// Keys come out of a hash in insertion order.
let orderedChildren = {};
for (let i = 0; i < letters.length; i++) {
orderedChildren[letters[i]] = this.children[letters[i]];
}
this.children = orderedChildren;
}
}
find (letters, startAt=0, maxCount=null) {
return this._find(Array.from(letters).sort(), 0, startAt, maxCount || this.count, 0);
}
_find (key, keyPos, startAt, maxCount, curPos) {
if (curPos + this.count < startAt) {
return [[], curPos + this.count];
}
this.sort(); // Make sure we are sorted.
let answer = [];
if (keyPos < key.length) {
// We have not yet found all the letters.
// tmpPos will track how much we looked at in detail
let tmpPos = curPos;
tmpPos += this.words.length;
for (const [k, v] of Object.entries(this.children)) {
if (k < key[keyPos]) {
// The next letter we want can be deeper in the trie?
if (v.hasChildWith[key[keyPos]]) {
// It is! Let's find it.
let result = v._find(key, keyPos, startAt, maxCount - answer.length, tmpPos);
answer = answer.concat(result[0]);
if (maxCount <= answer.length) {
// We finished our answer, return it and what we found.
return [answer, result[1]];
}
}
tmpPos += v.count; // Didn't find it, but track that we've seen this.
}
else if (k == key[keyPos]) {
// We found our next letter! Search deeper.
let result = v._find(key, keyPos+1, startAt, maxCount - answer.length, tmpPos);
answer = answer.concat(result[0]);
if (maxCount <= answer.length) {
// We finished the search.
return [answer, result[1]];
}
else {
// No other letter can match.
break;
}
}
else {
// Neither this or any later letter can match.
break;
}
}
// Return our partial answer and mark that we went
// through this whole node.
return [answer, curPos + this.count];
}
else {
// We have our letters and are recursively finding words.
if (startAt <= curPos + this.words.length) {
answer = this.words.slice(startAt - curPos);
if (maxCount <= answer.length) {
// No need to search deeper, we're done.
answer = answer.slice(0, maxCount);
return [answer, curPos + answer.length];
}
curPos += answer.length;
}
for (const child of Object.values(this.children)) {
let result = child._find(key, keyPos, startAt, maxCount - answer.length, curPos);
answer = answer.concat(result[0])
if (maxCount <= answer.length) {
return [answer, result[1]];
}
else {
curPos += child.count;
}
}
return [answer, curPos];
}
}
}
exports.Trie = Trie
And here is some code to test it with.
const fs = require('fs')
const AnagramTrie = require('./AnagramTrie')
const words = fs.readFileSync('tmp/scrabble.csv', 'utf-8')
.trim()
.split(/\n+/)
.map(x => x.trim())
const trie = new AnagramTrie.Trie()
words.forEach(word => trie.insert(word))
console.log(trie.find('b', 0, 12))
console.log(trie.find('b', 0, 6))
console.log(trie.find('b', 11, 6))
console.log(trie.find('zx', 0, 12))
console.log(trie.find('zx', 0, 6))
console.log(trie.find('zx', 60875, 6))
It will return words in the order of their sorted anagram, followed by the word itself. So if you search for admired you'll find unadmired first because the n in un comes before the r in admired. And you'll find that disembarrassed comes before either because it has 2 a's in it.
And here is an answer to contains. Note how, despite using a Trie both times, it needs a lot of customization to the problem we are solving.
Sample code to use it is like the other one. You can see the deduplication at work if you search for a. The answer aa (correctly) only comes up once. Also it is a little slower to start up because you have to put words into the data structure multiple times.
class Trie {
constructor(depth=0, key=[]) {
this.depth = depth;
this.children = {};
this.words = [];
this.count = 0;
this.isSorted = false;
this.path = key.slice(0, depth).join("");
this.char = key[this.depth-1];
}
insert (word) {
if (word != undefined) {
const key = Array.from(word);
for (let i = 0; i < key.length; i++) {
this._insert(key.slice(i), word);
}
}
}
_insert (key, word) {
this.count++;
if (this.depth == key.length) {
this.words.push(word);
}
else {
if (! this.children[key[this.depth]] ) {
this.children[key[this.depth]] = new Trie(this.depth+1, key);
}
this.children[key[this.depth]]._insert(key, word);
}
}
sort () {
if (! this.isSorted) {
this.words.sort();
let letters = Object.keys(this.children);
letters.sort();
// Keys come out of a hash in insertion order.
let orderedChildren = {};
for (let i = 0; i < letters.length; i++) {
orderedChildren[letters[i]] = this.children[letters[i]];
}
this.children = orderedChildren;
this.isSorted
}
}
find (letters, startAt=0, maxCount=null) {
// Defaults, special cases, etc.
if (this.count <= startAt) {
return [[], startAt];
}
if (maxCount == null) {
maxCount = this.count;
}
if (letters == "") {
// Special case.
this.sort();
answer = this.words.slice(startAt, startAt + maxCount);
return [answer, startAt + answer.length];
}
// We will do the recursive search.
const key = Array.from(letters);
// The challenge is that each word is stored multiple times.
// We want to only find them once. So we will stop searching as soon
// as we match our search string twice.
//
// This requires keeping track of where we are in trying to match our
// search string again. And if we fail, backtracking in our attempt
// to match.
//
// Here is the complication. Partial matches can overlap.
//
// Consider the following search string:
//
// search: a b a a b a c a b
// 0 1 2 3 4 5 6 7 8
//
// Now suppose that I'm looking for "c" at 6 next and it isn't an
// "c". Well I know I matched "aba" and should next check if I got
// character 3, "a". We can encode this logic in an array.
//
// a b a a b a c a b
// backtrackTo: [-1, 0,-1, 1, 0,-1, 3,-1, 0]
//
// So if I've matched through 5, I check "c", then "a", then "b"
// before deciding that I'm not still partway through a match.
//
// Let's calculate that backtrackTo. We will also calculate a
// variable, matchAtAtEnd. Which is how long our longest
// running match is at a full match.
// We start by assuming that we go back to the beginning.
let backtrackTo = [-1];
let rematches = [];
let matchAtEnd = 0;
for (let i = 1; i < key.length; i++) {
if (key[i] == key[0]) {
backtrackTo[i] = -1;
rematches.push(i);
}
else {
backtrackTo[i] = 0;
}
}
// In our example we have:
//
// backtrackTo = [-1, 0, -1, -1, 0, -1, 0, -1, 0]
// rematches = [2, 3, 5, 7]
//
// Now let `k` be the length of the current rematch.
let k = 1;
while (0 < rematches.length) {
let nextRematches = [];
for (let i = 0; i < rematches.length; i++) {
if (i + k == key.length) {
matchAtEnd = k-1;
}
else {
if (key[k] == key[i+k]) {
nextRematches.push(i);
}
else {
backtrackTo[i+k] = k;
}
}
}
rematches = nextRematches;
k++;
// In our example we get:
//
// k = 1
// backtrackTo = [-1, 0, -1, -1, 0, -1, 0, -1, 0]
// rematches = [2, 3, 5, 7]
// matchAtEnd = -1
//
// k = 2
// backtrackTo = [-1, 0, -1, 1, 0, -1, 0, -1, 0]
// rematches = [3, 7]
// matchAtEnd = -1
//
// k = 3
// backtrackTo = [-1, 0, -1, 1, 0, -1, 3, -1, 0]
// rematches = []
// matchAtEnd = 2
//
// and we see that we got the expected backtrackTo AND
// we have recorded the fact that at matching
// abaabacab we are currently also matching the ab at the
// start.
//
// Now let's find the first match.
let node = this;
for (let i = 0; i < key.length; i++) {
node = node.children[key[i]];
if (!node) {
return [[], startAt];
}
}
return node._find(key, startAt, maxCount, 0, backtrackTo, matchAtEnd)
}
_find (key, startAt, maxCount, curPos, backtrackTo, nextMatch) {
// console.log([key, startAt, maxCount, curPos, backtrackTo, nextMatch, [this.path, this.count]]);
// Skip me?
if ((curPos + this.count <= startAt) || (key.length <= nextMatch)) {
return [[], curPos + this.count];
}
this.sort();
let answer = [];
if (curPos < startAt) {
if (startAt < curPos + this.words.length) {
answer = this.words.slice(startAt-curPos, startAt-curPos+maxCount);
}
else {
curPos += this.words.length; // Count the words we are skipping.
}
}
else {
answer = this.words.slice(0, maxCount);
}
curPos += answer.length;
if (maxCount <= answer.length) {
return [answer, curPos];
}
for (const [k, v] of Object.entries(this.children)) {
let thisMatch = nextMatch;
while ((-1 < thisMatch) && (key[thisMatch] != k)) {
thisMatch = backtrackTo[thisMatch];
}
thisMatch++;
let partialAnswer = null;
[partialAnswer, curPos] = v._find(key, startAt, maxCount - answer.length, curPos, backtrackTo, thisMatch);
answer = answer.concat(partialAnswer);
if (maxCount <= answer.length) {
break; // We are done.
}
}
return [answer, curPos];
}
}
exports.Trie = Trie
How to decode String from percent encoded windowsCP1251?
replacingPercentEscapes(using: String.Encoding.windowsCP1251) now is obsolete
removingPercentEncoding uses utf8
I have found the solution. It works for me. Welcome to refactor my example.
extension String {
func removingPercentEncoding(using encoding: String.Encoding) -> String {
let firstChar = self.first
let percentCharacter = Character("%")
var encodedPrefix: String.SubSequence? = nil
var encodedSuffix = self
if firstChar != percentCharacter {
if let indexOfFirstPercentChar = index(of: percentCharacter) {
encodedPrefix = self[..<indexOfFirstPercentChar]
encodedSuffix = String(self[indexOfFirstPercentChar...])
} else {
//no % char at all. Nothing encoded
return self
}
}
let substrings = encodedSuffix.components(separatedBy: "%")
let arr = substrings.map{ substring -> (String) in
switch substring.count {
case let count where count < 2:
return substring
case let count where count == 2:
let decodedArr = substring.hexa2Bytes
let data = Data(decodedArr)
if let decodedStr = String(data: data, encoding: encoding) {
return decodedStr
}
return substring
default: //>2
let thirdSymbolIndex = index(startIndex, offsetBy: 2)
let firstTwo = substring[..<thirdSymbolIndex]
let furhter = substring[thirdSymbolIndex...]
let decodedArr = String(firstTwo).hexa2Bytes
let data = Data(decodedArr)
if let decodedStr = String(data: data, encoding: encoding) {
return decodedStr + furhter
}
return substring
}
}
let result = arr.joined()
return String(encodedPrefix ?? "") + result
}
var hexa2Bytes: [UInt8] {
let hexa = Array(characters)
return stride(from: 0, to: characters.count, by: 2).flatMap { UInt8(String(hexa[$0..<$0.advanced(by: 2)]), radix: 16) }
}
}
A little example which is expected to work with multi-byte string encodings.
extension UInt8 {
//returns 0...15 when '0'...'9', 'A'...'F', 'a'...'f', otherwise returns nil
var hexValue: UInt8? {
if UInt8(ascii: "0") <= self && self <= UInt8(ascii: "9") {
return self - UInt8(ascii: "0")
} else if UInt8(ascii: "A") <= self && self <= UInt8(ascii: "F") {
return self - UInt8(ascii: "A") + 10
} else if UInt8(ascii: "a") <= self && self <= UInt8(ascii: "f") {
return self - UInt8(ascii: "a") + 10
} else {
return nil
}
}
}
extension String {
func removingPercentEncoding(using encoding: String.Encoding) -> String? {
guard let percentEncodedData = self.data(using: encoding) else {return nil}
var byteIterator = percentEncodedData.makeIterator()
var percentDecodedData = Data()
while let b0 = byteIterator.next() {
guard b0 == UInt8(ascii: "%"), let b1 = byteIterator.next() else {
//Non percent character
percentDecodedData.append(b0)
continue
}
guard let h1 = b1.hexValue, let b2 = byteIterator.next() else {
//Keep it as is, when invalid hex-sequece appeared
percentDecodedData.append(b0)
percentDecodedData.append(b1)
continue
}
guard let h2 = b2.hexValue else {
//Keep it as is, when invalid hex-sequece appeared
percentDecodedData.append(b0)
percentDecodedData.append(b1)
percentDecodedData.append(b2)
continue
}
percentDecodedData.append((h1<<4) + h2)
}
return String(data: percentDecodedData, encoding: encoding)
}
}
In my opinion, you should think non-UTF8 percent encoding now is obsolete, and should fix the part which is generating the percent encoded string with CP1251.
I want to concatenate a bunch of different files of a single type into one large file. For example, many javascript files into one large file, many css files down to one etc. I want to create a sourcemap of the files pre concatenation, but I do not know where to start. I am working in Node, but I am also open to solutions in other environments.
I know there are tools that can do this, but they seem to be on a language by language basis (uglifyjs, cssmin or whatever its called these days), but I want a tool that is not language specific.
Also, I would like to define how the files are bound. For example, in javascript I want to give each file its own closure with an IIFE. Such as:
(function () {
// File
}());
I can also think of other wrappers I would like to implement for different files.
Here are my options as I see it right now. However, I don't know which is best or how to start any of them.
Find a module that does this (I'm working in a Node.js environment)
Create an algorithm with Mozilla's source-map module. For that I also see a couple options.
Only map each line to the new line location
Map every single character to the new location
Map every word to its new location (this options seems way out of scope)
Don't even worry about source maps
What do you guys think about these options. I've already tried options 2.1 and 2.2, but the solution seemed way too complicated for a concatenation algorithm and it did not perform perfectly in the Google Chrome browser tools.
I implemented code without any dependencies like this:
export interface SourceMap {
version: number; // always 3
file?: string;
sourceRoot?: string;
sources: string[];
sourcesContent?: string[];
names?: string[];
mappings: string | Buffer;
}
const emptySourceMap: SourceMap = { version: 3, sources: [], mappings: new Buffer(0) }
var charToInteger = new Buffer(256);
var integerToChar = new Buffer(64);
charToInteger.fill(255);
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/='.split('').forEach((char, i) => {
charToInteger[char.charCodeAt(0)] = i;
integerToChar[i] = char.charCodeAt(0);
});
class DynamicBuffer {
buffer: Buffer;
size: number;
constructor() {
this.buffer = new Buffer(512);
this.size = 0;
}
ensureCapacity(capacity: number) {
if (this.buffer.length >= capacity)
return;
let oldBuffer = this.buffer;
this.buffer = new Buffer(Math.max(oldBuffer.length * 2, capacity));
oldBuffer.copy(this.buffer);
}
addByte(b: number) {
this.ensureCapacity(this.size + 1);
this.buffer[this.size++] = b;
}
addVLQ(num: number) {
var clamped: number;
if (num < 0) {
num = (-num << 1) | 1;
} else {
num <<= 1;
}
do {
clamped = num & 31;
num >>= 5;
if (num > 0) {
clamped |= 32;
}
this.addByte(integerToChar[clamped]);
} while (num > 0);
}
addString(s: string) {
let l = Buffer.byteLength(s);
this.ensureCapacity(this.size + l);
this.buffer.write(s, this.size);
this.size += l;
}
addBuffer(b: Buffer) {
this.ensureCapacity(this.size + b.length);
b.copy(this.buffer, this.size);
this.size += b.length;
}
toBuffer(): Buffer {
return this.buffer.slice(0, this.size);
}
}
function countNL(b: Buffer): number {
let res = 0;
for (let i = 0; i < b.length; i++) {
if (b[i] === 10) res++;
}
return res;
}
export class SourceMapBuilder {
outputBuffer: DynamicBuffer;
sources: string[];
mappings: DynamicBuffer;
lastSourceIndex = 0;
lastSourceLine = 0;
lastSourceCol = 0;
constructor() {
this.outputBuffer = new DynamicBuffer();
this.mappings = new DynamicBuffer();
this.sources = [];
}
addLine(text: string) {
this.outputBuffer.addString(text);
this.outputBuffer.addByte(10);
this.mappings.addByte(59); // ;
}
addSource(content: Buffer, sourceMap?: SourceMap) {
if (sourceMap == null) sourceMap = emptySourceMap;
this.outputBuffer.addBuffer(content);
let sourceLines = countNL(content);
if (content.length > 0 && content[content.length - 1] !== 10) {
sourceLines++;
this.outputBuffer.addByte(10);
}
let sourceRemap = [];
sourceMap.sources.forEach((v) => {
let pos = this.sources.indexOf(v);
if (pos < 0) {
pos = this.sources.length;
this.sources.push(v);
}
sourceRemap.push(pos);
});
let lastOutputCol = 0;
let inputMappings = (typeof sourceMap.mappings === "string") ? new Buffer(<string>sourceMap.mappings) : <Buffer>sourceMap.mappings;
let outputLine = 0;
let ip = 0;
let inOutputCol = 0;
let inSourceIndex = 0;
let inSourceLine = 0;
let inSourceCol = 0;
let shift = 0;
let value = 0;
let valpos = 0;
const commit = () => {
if (valpos === 0) return;
this.mappings.addVLQ(inOutputCol - lastOutputCol);
lastOutputCol = inOutputCol;
if (valpos === 1) {
valpos = 0;
return;
}
let outSourceIndex = sourceRemap[inSourceIndex];
this.mappings.addVLQ(outSourceIndex - this.lastSourceIndex);
this.lastSourceIndex = outSourceIndex;
this.mappings.addVLQ(inSourceLine - this.lastSourceLine);
this.lastSourceLine = inSourceLine;
this.mappings.addVLQ(inSourceCol - this.lastSourceCol);
this.lastSourceCol = inSourceCol;
valpos = 0;
}
while (ip < inputMappings.length) {
let b = inputMappings[ip++];
if (b === 59) { // ;
commit();
this.mappings.addByte(59);
inOutputCol = 0;
lastOutputCol = 0;
outputLine++;
} else if (b === 44) { // ,
commit();
this.mappings.addByte(44);
} else {
b = charToInteger[b];
if (b === 255) throw new Error("Invalid sourceMap");
value += (b & 31) << shift;
if (b & 32) {
shift += 5;
} else {
let shouldNegate = value & 1;
value >>= 1;
if (shouldNegate) value = -value;
switch (valpos) {
case 0: inOutputCol += value; break;
case 1: inSourceIndex += value; break;
case 2: inSourceLine += value; break;
case 3: inSourceCol += value; break;
}
valpos++;
value = shift = 0;
}
}
}
commit();
while (outputLine < sourceLines) {
this.mappings.addByte(59);
outputLine++;
}
}
toContent(): Buffer {
return this.outputBuffer.toBuffer();
}
toSourceMap(sourceRoot?: string): Buffer {
return new Buffer(JSON.stringify({ version: 3, sourceRoot, sources: this.sources, mappings: this.mappings.toBuffer().toString() }));
}
}
I, at first, implemented "index map" from that spec, only to find out that it is not supported by any browser.
Another project that could be useful to look at is magic string.