Here is a trie made to work on alphabets/scripts from any language.
/**
 * One node of a prefix trie. Each node holds a single character (`key`),
 * a link to its parent, and a map from character to child node.
 */
class TrieNode {
  /**
   * @param {?string} key - The character this node stands for
   *   (the trie's base node is created with `null`).
   */
  constructor(key) {
    this.key = key;       // character at this position of the sequence
    this.parent = null;   // set by whoever attaches this node to the trie
    this.children = {};   // character -> TrieNode
    this.end = false;     // true when a stored word finishes on this node
  }

  /**
   * Rebuilds the string spelled by the path from the top of the trie down
   * to this node.
   *
   * @returns {string} The keys along the path, concatenated. A `null` key
   *   contributes nothing because `join` renders it as an empty string.
   */
  getWord() {
    // Collect keys leaf-to-root, then flip them into reading order.
    const keys = [];
    for (let node = this; node !== null; node = node.parent) {
      keys.push(node.key);
    }
    return keys.reverse().join('');
  }
}
/**
 * Prefix trie keyed by Unicode code points, so it works for any script.
 *
 * Words are split with `Array.from`, which iterates by code point rather
 * than by UTF-16 code unit; a character such as an emoji is therefore one
 * edge in the trie, not two.
 */
class Trie {
  constructor() {
    // The base node carries no character of its own.
    this.base = new TrieNode(null);
  }

  /**
   * Adds a word to the trie. Inserting an empty string is a no-op.
   *
   * @param {string} word - The word to store.
   */
  insert(word) {
    const points = Array.from(word);
    let node = this.base;
    for (const point of points) {
      let child = node.children[point];
      if (!child) {
        child = new TrieNode(point);
        child.parent = node;
        node.children[point] = child;
      }
      node = child;
    }
    // Mark the end on the node reached after the last code point. Comparing
    // a loop index with `word.length` would be wrong here: `length` counts
    // UTF-16 units, so words with astral characters would never be marked.
    if (points.length > 0) {
      node.end = true;
    }
  }

  /**
   * Tests whether exactly this word was inserted.
   *
   * @param {string} word - The word to look up.
   * @returns {boolean} True only if the word itself (not merely a longer
   *   word starting with it) is stored.
   */
  contains(word) {
    let node = this.base;
    for (const point of Array.from(word)) {
      node = node.children[point];
      if (!node) {
        return false;
      }
    }
    return node.end;
  }

  /**
   * Lists every stored word that starts with the given prefix.
   *
   * @param {string} prefix - The prefix to search for.
   * @returns {string[]} Matching words in reverse breadth-first order, so
   *   words found deeper in the trie (longer words) come first. Empty when
   *   no stored word has the prefix.
   */
  find(prefix) {
    let node = this.base;
    for (const point of Array.from(prefix)) {
      node = node.children[point];
      if (!node) {
        // Nothing is stored under this prefix.
        return [];
      }
    }

    // Breadth-first walk below the prefix node. A moving head index is used
    // instead of Array#shift so the walk stays linear in the nodes visited.
    const queue = [node];
    const found = [];
    for (let head = 0; head < queue.length; head++) {
      const current = queue[head];
      if (current.end) {
        found.push(current.getWord());
      }
      for (const child of Object.values(current.children)) {
        queue.push(child);
      }
    }
    return found.reverse();
  }
}
After asking "How to efficiently store 1 million words and query them by starts_with, contains, or ends_with?" and getting some answers, I am wondering how the "contains" and "unscrambles" parts can be implemented with a trie. The prefix ("starts with") search is easily handled by the trie.
const fs = require('fs')
const Trie = require('./Trie')
// Load the word list: one word per line, surrounding whitespace ignored.
const words = fs
  .readFileSync('tmp/scrabble.csv', 'utf-8')
  .trim()
  .split(/\n+/)
  .map((line) => line.trim());

// Index every word, then print everything stored under the prefix "zy".
const trie = new Trie();
for (const word of words) {
  trie.insert(word);
}
console.log(trie.find('zy'));
[
'zygodactylous', 'zygomorphies', 'zygapophysis',
'zygapophyses', 'zygomorphic', 'zymologies',
'zygospores', 'zygosities', 'zygomorphy',
'zygodactyl', 'zymurgies', 'zymograms',
'zymogenes', 'zygotenes', 'zygospore',
'zygomatic', 'zyzzyvas', 'zymosans',
'zymology', 'zymogram', 'zymogens',
'zymogene', 'zygotene', 'zygosity',
'zygomata', 'zyzzyva', 'zymurgy',
'zymotic', 'zymosis', 'zymoses',
'zymosan', 'zymogen', 'zymases',
'zygotic', 'zygotes', 'zygosis',
'zygoses', 'zygomas', 'zydecos',
'zymase', 'zygote', 'zygose',
'zygoma', 'zygoid', 'zydeco',
'zymes', 'zyme'
]
Here is the tmp/scrabble.csv I used.
The question is, can a trie be used to find all the words which "contain" some input string, or find all the words which "unscramble" the input string? I am curious how to accomplish this with a Trie. If a trie cannot do this efficiently, then knowing why not, and potentially what I should be looking at instead, will be a great answer as well.
From my initial attempts at solving "contains", it seems I would have to create a trie which maps every possible substring to the final word, but that seems like it would cause an explosion of memory, so I am not sure how to reason about this better.
For "unscrambles", where you put in "caldku" and it finds "duck" amongst other possible words, my first idea (similar to the linked answer to the other question) was to sort "duck" into "cdku" and store that in the trie, then sort the input into "acdklu" and search for it using the previous "contains" algorithm. But that will break in a few cases. So maybe a trie is not the right approach for these two problems? If it is, roughly how would you do it? (You don't need to provide a full implementation, unless you'd like to and it's straightforward enough.)
Here is an answer to unscrambles that ALSO handles pagination. It actually returns [words, startOfNextPage]. So to find the next page you call it starting at startOfNextPage.
The idea is to normalize words by sorting the letters, and then storing the word in the trie once. When searching we can dig into the tree both for the next letter we want, and any letter before it (because the letter we want may come after). So we also store a lookup to know what letters might be found, so that we can break out early without having to look at a lot of bad options. And, of course, we include counts so that we can calculate how much of the trie we have explored and/or skipped over.
/**
 * Trie keyed by the *sorted* letters of each word, used to find every word
 * that contains all of a given set of letters, with pagination.
 *
 * Every word is stored exactly once, at the node reached by walking its
 * sorted letters. Words have a fixed global position: a node's own words
 * (sorted) come first, followed by each child subtree in sorted letter
 * order. `find` reports positions in that numbering, so a caller can resume
 * a search from where the previous page stopped.
 */
class Trie {
  /**
   * @param {number} [depth=0] - Number of letters consumed to reach this node.
   */
  constructor(depth = 0) {
    this.depth = depth;
    this.children = {};      // letter -> Trie
    this.childOrder = [];    // child letters in sorted order, filled by sort()
    this.words = [];         // words whose sorted letters end exactly here
    this.count = 0;          // number of words stored in this whole subtree
    this.isSorted = false;   // false whenever an insert touched this node
    this.hasChildWith = {};  // letters that occur on some edge below this node
  }

  /**
   * Stores a word. `null` and `undefined` are ignored.
   *
   * @param {string} word - The word to store.
   */
  insert(word) {
    if (word != null) {
      this._insert(Array.from(word).sort(), word);
    }
  }

  /**
   * Recursive worker for `insert`.
   *
   * @param {string[]} key - The word's letters in sorted order.
   * @param {string} word - The original word.
   */
  _insert(key, word) {
    this.count++;
    this.isSorted = false;
    if (key.length === this.depth) {
      this.words.push(word);
      return;
    }
    // Record every letter that will sit on an edge below this node,
    // INCLUDING the edge to the direct child (index `depth`). Starting at
    // `depth + 1` would leave out direct children, and `_find` would then
    // prune subtrees that do contain the wanted letter.
    for (let i = this.depth; i < key.length; i++) {
      this.hasChildWith[key[i]] = true;
    }
    const letter = key[this.depth];
    if (!this.children[letter]) {
      this.children[letter] = new Trie(this.depth + 1);
    }
    this.children[letter]._insert(key, word);
  }

  /**
   * Sorts this node's words and child letters. The result is cached until
   * the next insert passes through this node.
   */
  sort() {
    if (this.isSorted) {
      return;
    }
    this.words.sort();
    // Object enumeration lists integer-like keys ("0".."9") first, whatever
    // the insertion order, so the sorted order is kept in an explicit array
    // instead of being read back from the object.
    this.childOrder = Object.keys(this.children).sort();
    const orderedChildren = {};
    for (const letter of this.childOrder) {
      orderedChildren[letter] = this.children[letter];
    }
    this.children = orderedChildren;
    this.isSorted = true;
  }

  /**
   * Finds words that contain all of the given letters (respecting repeats).
   *
   * @param {string} letters - Letters every result must contain.
   * @param {number} [startAt=0] - Global position to resume from; pass the
   *   second element of a previous result to get the next page.
   * @param {?number} [maxCount=null] - Maximum number of words to return;
   *   `null` (or 0) means no limit.
   * @returns {Array} A pair `[words, startOfNextPage]`.
   */
  find(letters, startAt = 0, maxCount = null) {
    return this._find(Array.from(letters).sort(), 0, startAt, maxCount || this.count, 0);
  }

  /**
   * Recursive worker for `find`.
   *
   * @param {string[]} key - Sorted letters being searched for.
   * @param {number} keyPos - Index of the next letter of `key` still to match.
   * @param {number} startAt - Global position of the first acceptable word.
   * @param {number} maxCount - Number of words still wanted.
   * @param {number} curPos - Global position of this subtree's first word.
   * @returns {Array} `[words, position]`: the position just after the last
   *   returned word when the page is full, otherwise the position just
   *   after this subtree.
   */
  _find(key, keyPos, startAt, maxCount, curPos) {
    if (curPos + this.count <= startAt) {
      // The whole subtree lies before the requested page.
      return [[], curPos + this.count];
    }
    this.sort();
    let answer = [];

    if (keyPos < key.length) {
      // Still matching letters. Words stored on this node cannot match,
      // because they have no letters beyond the path to this node.
      const wanted = key[keyPos];
      let childPos = curPos + this.words.length;
      for (const letter of this.childOrder) {
        if (wanted < letter) {
          // Keys are sorted: neither this nor any later child can match.
          break;
        }
        const child = this.children[letter];
        let result = null;
        if (letter === wanted) {
          // Found the next letter; continue with the letter after it.
          result = child._find(key, keyPos + 1, startAt, maxCount - answer.length, childPos);
        } else if (child.hasChildWith[wanted]) {
          // A smaller letter, but the wanted one occurs further down.
          result = child._find(key, keyPos, startAt, maxCount - answer.length, childPos);
        }
        if (result !== null) {
          answer = answer.concat(result[0]);
          if (maxCount <= answer.length) {
            return [answer, result[1]];
          }
        }
        if (letter === wanted) {
          // No later child can match.
          break;
        }
        childPos += child.count;
      }
      // Page not full: this whole subtree has been accounted for.
      return [answer, curPos + this.count];
    }

    // All letters matched: every word in this subtree is a result.
    // `skip` is clamped at zero because a negative slice index would count
    // from the end of the array and silently drop words.
    const skip = Math.max(0, startAt - curPos);
    answer = this.words.slice(skip, skip + maxCount);
    if (maxCount <= answer.length) {
      return [answer, curPos + skip + answer.length];
    }
    let childPos = curPos + this.words.length;
    for (const letter of this.childOrder) {
      const child = this.children[letter];
      const result = child._find(key, keyPos, startAt, maxCount - answer.length, childPos);
      answer = answer.concat(result[0]);
      if (maxCount <= answer.length) {
        return [answer, result[1]];
      }
      childPos += child.count;
    }
    return [answer, childPos];
  }
}
exports.Trie = Trie
And here is some code to test it with.
const fs = require('fs')
const AnagramTrie = require('./AnagramTrie')
// Load the word list: one word per line, surrounding whitespace ignored.
const words = fs
  .readFileSync('tmp/scrabble.csv', 'utf-8')
  .trim()
  .split(/\n+/)
  .map((line) => line.trim());

// Index every word in the anagram trie.
const trie = new AnagramTrie.Trie();
for (const word of words) {
  trie.insert(word);
}

// Each query is [letters, startAt, maxCount]; results are printed in order.
const queries = [
  ['b', 0, 12],
  ['b', 0, 6],
  ['b', 11, 6],
  ['zx', 0, 12],
  ['zx', 0, 6],
  ['zx', 60875, 6],
];
for (const [letters, startAt, maxCount] of queries) {
  console.log(trie.find(letters, startAt, maxCount));
}
It will return words in the order of their sorted anagram, followed by the word itself. So if you search for admired you'll find unadmired first because the n in un comes before the r in admired. And you'll find that disembarrassed comes before either because it has 2 a's in it.
And here is an answer to contains. Note how, despite using a Trie both times, it needs a lot of customization to the problem we are solving.
Sample code to use it is like the other one. You can see the deduplication at work if you search for a. The answer aa (correctly) only comes up once. Also it is a little slower to start up because you have to put words into the data structure multiple times.
/**
 * Suffix trie used to find every word that contains a given substring,
 * with pagination and without reporting a word twice.
 *
 * Each word is inserted once per suffix, so a node's `words` are the words
 * that END with the node's path, and its subtree holds every word that
 * contains the path somewhere. A search walks to the node for the search
 * string and lists that subtree.
 *
 * A word that contains the search string several times is reachable through
 * several suffixes. To report it only once, the listing stops descending as
 * soon as the path below the first match completes the search string a
 * second time; the word is then reported only through its last occurrence.
 *
 * Positions used for paging are relative to the matched subtree: a node's
 * own words (sorted) first, then each child subtree in the order `sort()`
 * leaves the children in. Pruned subtrees still occupy their positions.
 */
class Trie {
  /**
   * @param {number} [depth=0] - Number of characters consumed to reach this node.
   * @param {string[]} [key=[]] - Key being inserted when the node was created;
   *   only used to record `path` and `char` for debugging.
   */
  constructor(depth = 0, key = []) {
    this.depth = depth;
    this.children = {};     // character -> Trie
    this.words = [];        // words that end with this node's path
    this.count = 0;         // number of (suffix, word) insertions through here
    this.isSorted = false;  // false whenever an insert touched this node
    this.path = key.slice(0, depth).join("");
    this.char = key[this.depth - 1];
  }

  /**
   * Stores a word under each of its suffixes. `null` and `undefined` are
   * ignored.
   *
   * @param {string} word - The word to store.
   */
  insert(word) {
    if (word != null) {
      const key = Array.from(word);
      for (let i = 0; i < key.length; i++) {
        this._insert(key.slice(i), word);
      }
    }
  }

  /**
   * Recursive worker for `insert`.
   *
   * @param {string[]} key - One suffix of the word, as characters.
   * @param {string} word - The original word.
   */
  _insert(key, word) {
    this.count++;
    // Invalidate the sort cache so words and children added after a search
    // are put in order again.
    this.isSorted = false;
    if (this.depth === key.length) {
      this.words.push(word);
      return;
    }
    const ch = key[this.depth];
    if (!this.children[ch]) {
      this.children[ch] = new Trie(this.depth + 1, key);
    }
    this.children[ch]._insert(key, word);
  }

  /**
   * Sorts this node's words and rebuilds `children` with sorted keys. The
   * result is cached until the next insert passes through this node.
   *
   * Note: object enumeration lists integer-like keys ("0".."9") first, so
   * the traversal order is stable between calls but is not strictly
   * code-unit order when digits are mixed with characters that sort before
   * them.
   */
  sort() {
    if (this.isSorted) {
      return;
    }
    this.words.sort();
    const orderedChildren = {};
    for (const letter of Object.keys(this.children).sort()) {
      orderedChildren[letter] = this.children[letter];
    }
    this.children = orderedChildren;
    this.isSorted = true;
  }

  /**
   * Finds words that contain `letters` as a contiguous substring.
   *
   * @param {string} letters - The substring to search for. An empty string
   *   returns only words stored on the root node; `insert` never stores a
   *   word there, so the result is empty.
   * @param {number} [startAt=0] - Position to resume from; pass the second
   *   element of a previous result to get the next page.
   * @param {?number} [maxCount=null] - Maximum number of words to return;
   *   `null` means no limit.
   * @returns {Array} A pair `[words, startOfNextPage]`.
   */
  find(letters, startAt = 0, maxCount = null) {
    // Defaults, special cases, etc.
    if (this.count <= startAt) {
      return [[], startAt];
    }
    if (maxCount == null) {
      maxCount = this.count;
    }
    const key = Array.from(letters);
    if (key.length === 0) {
      // Special case: there is nothing to match.
      this.sort();
      const answer = this.words.slice(startAt, startAt + maxCount);
      return [answer, startAt + answer.length];
    }

    // Each word is stored several times but must be found only once, so the
    // listing stops as soon as the search string is matched a second time.
    // That requires tracking how much of the search string the current path
    // has re-matched, and backtracking on a mismatch. Partial matches can
    // overlap, so this is the Knuth-Morris-Pratt failure table:
    //
    //      search:      a  b  a  a  b  a  c  a  b
    //                   0  1  2  3  4  5  6  7  8
    //      backtrackTo: [-1, 0,-1, 1, 0,-1, 3,-1, 0]
    //
    // If "c" is expected at 6 and something else arrives, "aba" is still
    // matched, so position 3 is tried next, then 1, then 0. Entries are -1
    // where retrying would compare against the same character again.
    //
    // `matchAtEnd` is how much of the search string is already re-matched
    // at the moment of a full match (2 in the example: the trailing "ab").
    const backtrackTo = [-1];
    let border = 0;
    for (let i = 1; i < key.length; i++) {
      if (key[i] === key[border]) {
        backtrackTo[i] = backtrackTo[border];
      } else {
        backtrackTo[i] = border;
        while (border >= 0 && key[i] !== key[border]) {
          border = backtrackTo[border];
        }
      }
      border++;
    }
    const matchAtEnd = border;

    // Walk to the node for the first match.
    let node = this;
    for (let i = 0; i < key.length; i++) {
      node = node.children[key[i]];
      if (!node) {
        return [[], startAt];
      }
    }
    return node._find(key, startAt, maxCount, 0, backtrackTo, matchAtEnd);
  }

  /**
   * Recursive worker for `find`: lists the words of this subtree.
   *
   * @param {string[]} key - The search string as characters.
   * @param {number} startAt - Position of the first acceptable word.
   * @param {number} maxCount - Number of words still wanted.
   * @param {number} curPos - Position of this subtree's first word.
   * @param {number[]} backtrackTo - Failure table for `key`.
   * @param {number} nextMatch - How many characters of `key` the path below
   *   the first match has re-matched so far.
   * @returns {Array} `[words, position]`: the position just after the last
   *   returned word when the page is full, otherwise the position just
   *   after this subtree.
   */
  _find(key, startAt, maxCount, curPos, backtrackTo, nextMatch) {
    // Skip subtrees that lie entirely before the page, and subtrees where
    // the search string was matched again (their words are reported through
    // a later occurrence instead).
    if (curPos + this.count <= startAt || key.length <= nextMatch) {
      return [[], curPos + this.count];
    }
    this.sort();

    // Take this node's words from the requested position onwards.
    const skip = Math.max(0, startAt - curPos);
    let answer = this.words.slice(skip, skip + maxCount);
    if (maxCount <= answer.length) {
      return [answer, curPos + skip + answer.length];
    }
    // Page not full: all of this node's words are behind us, including any
    // that were skipped.
    curPos += this.words.length;

    for (const [letter, child] of Object.entries(this.children)) {
      // Advance the re-match state by this child's character.
      let thisMatch = nextMatch;
      while (-1 < thisMatch && key[thisMatch] !== letter) {
        thisMatch = backtrackTo[thisMatch];
      }
      thisMatch++;

      let partialAnswer = null;
      [partialAnswer, curPos] = child._find(
        key, startAt, maxCount - answer.length, curPos, backtrackTo, thisMatch);
      answer = answer.concat(partialAnswer);
      if (maxCount <= answer.length) {
        break; // The page is full.
      }
    }
    return [answer, curPos];
  }
}
exports.Trie = Trie