/*
 * AhoCorasickDoubleArrayTrie Project
 *      https://github.com/hankcs/AhoCorasickDoubleArrayTrie
 *
 * Copyright 2008-2018 hankcs
 * You may modify and redistribute as long as this attribution remains.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
@file:Suppress("unused")

package dorkbox.collections.ahoCorasick

import java.io.IOException
import java.io.ObjectInputStream
import java.io.ObjectOutputStream
import java.io.Serializable
import java.util.*

/**
 * An implementation of the Aho-Corasick algorithm based on a Double Array Trie.
 *
 * The trie is created either from a [Map] of keyword to value, or from an [ObjectInputStream] holding data
 * previously written by [save]. When both arguments are supplied the map takes precedence.
 *
 * NOTE(review): the stream variant uses Java deserialization (`readObject`); only feed it trusted data.
 *
 * @param V the type of the value associated with every keyword
 * @param map keyword to value mapping used to build the trie
 * @param inputStream stream holding a trie previously written by [save]
 *
 * @throws NullPointerException if neither `map` nor `inputStream` is specified
 *
 * @author hankcs
 */
class DoubleArrayTrie<V>(map: Map<String, V>? = null, inputStream: ObjectInputStream? = null) : Serializable {
    /**
     * check array of the Double Array Trie structure
     */
    private val check: IntArray

    /**
     * base array of the Double Array Trie structure
     */
    private val base: IntArray

    /**
     * fail table of the Aho Corasick automata
     */
    private val fail: IntArray

    /**
     * output table of the Aho Corasick automata (indexed by state, `null` when the state emits nothing)
     */
    private val output: Array<IntArray?>

    /**
     * outer value array
     */
    internal val v: Array<V>

    /**
     * the length of every key
     */
    private val l: IntArray

    /**
     * the size of base and check array
     */
    private val size: Int

    init {
        when {
            map != null -> {
                @Suppress("UNCHECKED_CAST")
                v = kotlin.jvm.internal.collectionToArray(map.values) as Array<V>

                // must exist before the builder runs: the builder records every keyword length in here
                l = IntArray(map.size)

                val builder = Builder()
                builder.build(map)

                fail = builder.fail
                base = builder.base
                check = builder.check
                size = builder.size
                output = builder.output
            }

            inputStream != null -> {
                // the read order mirrors the write order of save()
                @Suppress("UNCHECKED_CAST")
                v = inputStream.readObject() as Array<V>
                l = inputStream.readObject() as IntArray
                fail = inputStream.readObject() as IntArray
                base = inputStream.readObject() as IntArray
                check = inputStream.readObject() as IntArray
                size = inputStream.readObject() as Int

                @Suppress("UNCHECKED_CAST")
                output = inputStream.readObject() as Array<IntArray?>
            }

            else -> throw NullPointerException("Map or InputStream must be specified!")
        }
    }

    /**
     * Save the trie, so it can be restored later by passing an [ObjectInputStream] to the constructor.
     *
     * @param out the stream the internal tables are written to
     */
    @Throws(IOException::class)
    fun save(out: ObjectOutputStream) {
        out.writeObject(v)
        out.writeObject(l)
        out.writeObject(fail)
        out.writeObject(base)
        out.writeObject(check)
        out.writeObject(size)
        out.writeObject(output)
    }

    /**
     * Parse text
     *
     * @param text The text
     *
     * @return a list of outputs
     */
    fun parseText(text: CharSequence): List<Hit<V>> {
        // unknown size, so a linked list is used
        val collectedEmits = LinkedList<Hit<V>>()
        var currentState = 0

        for (i in text.indices) {
            currentState = getState(currentState, text[i])
            // the reported end position is exclusive, hence i + 1
            storeEmits(i + 1, currentState, collectedEmits)
        }

        return collectedEmits
    }

    /**
     * Parse text
     *
     * @param text The text
     * @param processor A processor which handles the output
     */
    fun parseText(text: CharSequence, processor: IHit<V>) {
        var currentState = 0

        for (i in text.indices) {
            val position = i + 1
            currentState = getState(currentState, text[i])

            val hitArray = output[currentState] ?: continue
            for (hit in hitArray) {
                processor.hit(position - l[hit], position, v[hit])
            }
        }
    }

    /**
     * Parse text, the processor can stop the search at any hit
     *
     * @param text The text
     * @param processor A processor which handles the output
     */
    fun parseText(text: CharSequence, processor: IHitCancellable<V>) {
        var currentState = 0

        for (i in text.indices) {
            val position = i + 1
            currentState = getState(currentState, text[i])

            val hitArray = output[currentState] ?: continue
            for (hit in hitArray) {
                val proceed = processor.hit(position - l[hit], position, v[hit])
                if (!proceed) {
                    return
                }
            }
        }
    }

    /**
     * Parse text
     *
     * @param text The text
     * @param processor A processor which handles the output
     */
    fun parseText(text: CharArray, processor: IHit<V>) {
        var currentState = 0

        for (i in text.indices) {
            val position = i + 1
            currentState = getState(currentState, text[i])

            val hitArray = output[currentState] ?: continue
            for (hit in hitArray) {
                processor.hit(position - l[hit], position, v[hit])
            }
        }
    }

    /**
     * Parse text
     *
     * @param text The text
     * @param processor A processor which handles the output, it also receives the index of the value
     */
    fun parseText(text: CharArray, processor: IHitFull<V>) {
        var currentState = 0

        for (i in text.indices) {
            val position = i + 1
            currentState = getState(currentState, text[i])

            val hitArray = output[currentState] ?: continue
            for (hit in hitArray) {
                processor.hit(position - l[hit], position, v[hit], hit)
            }
        }
    }

    /**
     * Checks that string contains at least one substring
     *
     * @param text source text to check
     *
     * @return `true` if string contains at least one substring
     */
    fun matches(text: String): Boolean {
        var currentState = 0

        for (i in text.indices) {
            currentState = getState(currentState, text[i])
            if (output[currentState] != null) {
                return true
            }
        }

        return false
    }

    /**
     * Search first match in string
     *
     * @param text source text to check
     *
     * @return first match or `null` if there are no matches
     */
    fun findFirst(text: String): Hit<V>? {
        var currentState = 0

        for (i in text.indices) {
            currentState = getState(currentState, text[i])

            val hitArray = output[currentState] ?: continue
            val position = i + 1
            val hitIndex = hitArray[0]
            return Hit(position - l[hitIndex], position, v[hitIndex])
        }

        return null
    }

    /**
     * Get value by a String key, just like a map.get() method
     *
     * @param key The key
     *
     * @return the value, or `null` when the key is not present
     */
    operator fun get(key: String): V? {
        val index = exactMatchSearch(key)
        return if (index >= 0) v[index] else null
    }

    /**
     * Update a value corresponding to a key
     *
     * @param key the key
     * @param value the value
     *
     * @return successful or not (failure if there is no key)
     */
    operator fun set(key: String, value: V): Boolean {
        val index = exactMatchSearch(key)
        if (index < 0) {
            return false
        }

        v[index] = value
        return true
    }

    /**
     * Pick the value by index in value array
     *
     * Notice that to be more efficient, this method DOES NOT check the parameter
     *
     * @param index The index
     *
     * @return The value
     */
    operator fun get(index: Int): V {
        return v[index]
    }

    /**
     * Processor handles the output when hit a keyword
     */
    interface IHit<V> {
        /**
         * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
         *
         * @param begin the beginning index, inclusive.
         * @param end the ending index, exclusive.
         * @param value the value assigned to the keyword
         */
        fun hit(begin: Int, end: Int, value: V)
    }

    /**
     * Processor handles the output when hit a keyword, with more detail
     */
    interface IHitFull<V> {
        /**
         * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
         *
         * @param begin the beginning index, inclusive.
         * @param end the ending index, exclusive.
         * @param value the value assigned to the keyword
         * @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value
         */
        fun hit(begin: Int, end: Int, value: V, index: Int)
    }

    /**
     * Callback that allows to cancel the search process.
     */
    interface IHitCancellable<V> {
        /**
         * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
         *
         * @param begin the beginning index, inclusive.
         * @param end the ending index, exclusive.
         * @param value the value assigned to the keyword
         *
         * @return Return true for continuing the search and false for stopping it.
         */
        fun hit(begin: Int, end: Int, value: V): Boolean
    }

    /**
     * A result output
     *
     * @param V the value type
     */
    class Hit<V> internal constructor(
        /**
         * the beginning index, inclusive.
         */
        val begin: Int,
        /**
         * the ending index, exclusive.
         */
        val end: Int,
        /**
         * the value assigned to the keyword
         */
        val value: V
    ) {
        override fun toString(): String {
            return String.format("[%d:%d]=%s", begin, end, value)
        }
    }

    /**
     * transmit state, supports failure function
     *
     * @param currentState the state the automaton is in
     * @param character the next character of the text
     *
     * @return the state after consuming the character
     */
    private fun getState(currentState: Int, character: Char): Int {
        var state = currentState
        var nextState = transitionWithRoot(state, character) // first try the direct transition

        // on failure follow the fail table and retry (the root never yields -1, so this terminates there)
        while (nextState == -1) {
            state = fail[state]
            nextState = transitionWithRoot(state, character)
        }

        return nextState
    }

    /**
     * store output
     *
     * @param position the (exclusive) end position of the hits
     * @param currentState the state whose output is collected
     * @param collectedEmits the list the hits are appended to
     */
    private fun storeEmits(position: Int, currentState: Int, collectedEmits: MutableList<Hit<V>>) {
        val hitArray = output[currentState] ?: return
        for (hit in hitArray) {
            collectedEmits.add(Hit(position - l[hit], position, v[hit]))
        }
    }

    /**
     * transition of a state
     *
     * @return the base value of the target slot, or -1 when there is no such transition
     */
    private fun transition(current: Int, c: Char): Int {
        val p = current + c.code + 1

        // the slot can lie beyond the padded arrays for the highest char values, that is "no transition"
        return if (p in check.indices && current == check[p]) base[p] else -1
    }

    /**
     * transition of a state, if the state is root and it failed, then returns the root
     *
     * @return the target state, 0 when a transition from the root failed, otherwise -1 on failure
     */
    private fun transitionWithRoot(nodePos: Int, c: Char): Int {
        val b = base[nodePos]
        val p = b + c.code + 1

        return when {
            // the slot can lie beyond the padded arrays for the highest char values, that is "no transition"
            p in check.indices && b == check[p] -> p
            nodePos == 0 -> 0
            else -> -1
        }
    }

    /**
     * match exactly by a key
     *
     * @param key the key
     *
     * @return the index of the key, you can use it as a perfect hash function (negative when there is no such key)
     */
    fun exactMatchSearch(key: String): Int {
        return exactMatchSearch(key, 0, 0, 0)
    }

    /**
     * match exactly by a key-char array
     *
     * @param keyChars the key (as a Character array)
     *
     * @return the index of the key, you can use it as a perfect hash function (negative when there is no such key)
     */
    fun exactMatchSearch(keyChars: CharArray): Int {
        // the char array variant takes the length literally (0 would compare nothing), so pass the real one
        return exactMatchSearch(keyChars, 0, keyChars.size, 0)
    }

    /**
     * match exactly by a key
     *
     * @param key the key
     * @param pos the begin index inside the key
     * @param len the end index inside the key, values <= 0 select the whole key
     * @param nodePos the starting position of the node for searching, values <= 0 select the root
     *
     * @return the value index of the key, minus indicates null
     */
    private fun exactMatchSearch(key: String, pos: Int, len: Int, nodePos: Int): Int {
        val effectiveLen = if (len <= 0) key.length else len
        val effectiveNodePos = if (nodePos <= 0) 0 else nodePos

        return exactMatchSearch(key.toCharArray(), pos, effectiveLen, effectiveNodePos)
    }

    /**
     * match exactly by a key
     *
     * @param keyChars the char array of the key
     * @param pos the begin index of char array
     * @param len the length of the key (used as the exclusive end index of the walk)
     * @param nodePos the starting position of the node for searching
     *
     * @return the value index of the key, minus indicates null
     */
    private fun exactMatchSearch(keyChars: CharArray, pos: Int, len: Int, nodePos: Int): Int {
        var b = base[nodePos]

        for (i in pos until len) {
            val p = b + keyChars[i].code + 1

            // out of range slots (possible for the highest char values) simply mean "no such key"
            if (p !in check.indices || b != check[p]) {
                return -1
            }
            b = base[p]
        }

        // a complete key owns a terminal slot at offset 0 that stores the negated value index
        val p = b
        val n = base[p]
        return if (b == check[p] && n < 0) -n - 1 else -1
    }

    //    /**
    //     * Just for debug when I wrote it
    //     */
    //    public void debug()
    //    {
    //        System.out.println("base:");
    //        for (int i = 0; i < base.length; i++)
    //        {
    //            if (base[i] < 0)
    //            {
    //                System.out.println(i + " : " + -base[i]);
    //            }
    //        }
    //
    //        System.out.println("output:");
    //        for (int i = 0; i < output.length; i++)
    //        {
    //            if (output[i] != null)
    //            {
    //                System.out.println(i + " : " + Arrays.toString(output[i]));
    //            }
    //        }
    //
    //        System.out.println("fail:");
    //        for (int i = 0; i < fail.length; i++)
    //        {
    //            if (fail[i] != 0)
    //            {
    //                System.out.println(i + " : " + fail[i]);
    //            }
    //        }
    //
    //        System.out.println(this);
    //    }
    //
    //    @Override
    //    public String toString()
    //    {
    //        String infoIndex = "i    = ";
    //        String infoChar = "char = ";
    //        String infoBase = "base = ";
    //        String infoCheck = "check= ";
    //        for (int i = 0; i < Math.min(base.length, 200); ++i)
    //        {
    //            if (base[i] != 0 || check[i] != 0)
    //            {
    //                infoChar += "    " + (i == check[i] ? " ×" : (char) (i - check[i] - 1));
    //                infoIndex += " " + String.format("%5d", i);
    //                infoBase += " " + String.format("%5d", base[i]);
    //                infoCheck += " " + String.format("%5d", check[i]);
    //            }
    //        }
    //        return "DoubleArrayTrie:" +
    //                "\n" + infoChar +
    //                "\n" + infoIndex +
    //                "\n" + infoBase +
    //                "\n" + infoCheck + "\n" +
    ////                "check=" + Arrays.toString(check) +
    ////                ", base=" + Arrays.toString(base) +
    ////                ", used=" + Arrays.toString(used) +
    //                "size=" + size
    ////                ", length=" + Arrays.toString(length) +
    ////                ", value=" + Arrays.toString(value) +
    //                ;
    //    }
    //
    //    /**
    //     * A debug class that sequentially outputs variable names and variable values
    //     */
    //    private static class DebugArray
    //    {
    //        Map<String, String> nameValueMap = new LinkedHashMap<String, String>();
    //
    //        public void add(String name, int value)
    //        {
    //            String valueInMap = nameValueMap.get(name);
    //            if (valueInMap == null)
    //            {
    //                valueInMap = "";
    //            }
    //
    //            valueInMap += " " + String.format("%5d", value);
    //
    //            nameValueMap.put(name, valueInMap);
    //        }
    //
    //        @Override
    //        public String toString()
    //        {
    //            String text = "";
    //            for (Map.Entry<String, String> entry : nameValueMap.entrySet())
    //            {
    //                String name = entry.getKey();
    //                String value = entry.getValue();
    //                text += String.format("%-5s", name) + "= " + value + '\n';
    //            }
    //
    //            return text;
    //        }
    //
    //        public void println()
    //        {
    //            System.out.print(this);
    //        }
    //    }

    /**
     * Get the size of the keywords
     */
    fun size(): Int {
        return v.size
    }

    /**
     * A builder to build the AhoCorasickDoubleArrayTrie
     */
    private inner class Builder {
        /**
         * the root state of trie
         */
        private var rootState: State? = State()

        /**
         * whether the position has been used
         */
        private var used: BooleanArray? = null

        /**
         * the allocSize of the dynamic array
         */
        private var allocSize: Int = 0

        /**
         * a parameter controls the memory growth speed of the dynamic array
         */
        private var progress: Int = 0

        /**
         * the next position to check unused memory
         */
        private var nextCheckPos: Int = 0

        /**
         * the size of the key-pair sets
         */
        private var keySize: Int = 0

        lateinit var output: Array<IntArray?>
        lateinit var fail: IntArray
        lateinit var base: IntArray
        lateinit var check: IntArray

        var size: Int = 0

        /**
         * Build from a map
         *
         * @param map a map containing key-value pairs
         */
        fun build(map: Map<String, V>) {
            val keySet = map.keys

            // Construct the plain (pointer based) trie
            addAllKeyword(keySet)

            // Build the double array trie based on the plain trie
            buildDoubleArrayTrie(keySet.size)
            used = null

            // Build the failure table and merge the output table
            constructFailureStates()
            rootState = null

            loseWeight()
        }

        /**
         * fetch siblings of a parent node
         *
         * @param parent parent node
         * @param siblings parent node's child nodes, i.e. the siblings
         *
         * @return the amount of the siblings
         */
        private fun fetch(parent: State, siblings: MutableList<Pair<Int, State>>): Int {
            if (parent.isAcceptable) {
                // This node is a child of the parent and has the output of the parent.
                val fakeNode = State(-(parent.depth + 1))
                fakeNode.addEmit(parent.largestValueId!!)
                siblings.add(Pair(0, fakeNode))
            }

            // offset 0 is reserved for the fake node above, so real children start at code + 1
            for ((key, value) in parent.getSuccess()) {
                siblings.add(Pair(key.code + 1, value))
            }

            return siblings.size
        }

        /**
         * add a keyword
         *
         * @param keyword a keyword
         * @param index the index of the keyword
         */
        private fun addKeyword(keyword: String, index: Int) {
            var currentState: State? = this.rootState
            for (character in keyword) {
                currentState = currentState!!.addState(character)
            }

            currentState!!.addEmit(index)
            l[index] = keyword.length
        }

        /**
         * add a collection of keywords
         *
         * @param keywordSet the collection holding keywords
         */
        private fun addAllKeyword(keywordSet: Collection<String>) {
            keywordSet.forEachIndexed { index, keyword -> addKeyword(keyword, index) }
        }

        /**
         * construct failure table
         */
        private fun constructFailureStates() {
            fail = IntArray(maxOf(size + 1, 2))
            fail[1] = base[0]
            output = arrayOfNulls(size + 1)

            val queue = java.util.ArrayDeque<State>()

            // The first step is to set the failure of the node with depth 1 to the root node.
            this.rootState!!.states.forEach { depthOneState ->
                depthOneState.setFailure(this.rootState!!, fail)
                queue.add(depthOneState)
                constructOutput(depthOneState)
            }

            // The second step is to create a failure table for nodes with depth > 1, which is a bfs
            while (!queue.isEmpty()) {
                val currentState = queue.remove()

                for (transition in currentState.transitions) {
                    val targetState = currentState.nextState(transition)!!
                    queue.add(targetState)

                    var traceFailureState = currentState.failure()
                    while (traceFailureState!!.nextState(transition) == null) {
                        traceFailureState = traceFailureState.failure()
                    }

                    val newFailureState = traceFailureState.nextState(transition)!!
                    targetState.setFailure(newFailureState, fail)
                    targetState.addEmit(newFailureState.emit())
                    constructOutput(targetState)
                }
            }
        }

        /**
         * construct output table
         *
         * @param targetState the state whose emits are copied into the output table (nothing is stored without emits)
         */
        private fun constructOutput(targetState: State) {
            val emit = targetState.emit()
            if (emit.isEmpty()) {
                return
            }

            val emitIterator = emit.iterator()
            this.output[targetState.index] = IntArray(emit.size) { emitIterator.next() }
        }

        /**
         * build the double array (base/check) from the plain trie
         *
         * @param keySize the amount of keys
         */
        private fun buildDoubleArrayTrie(keySize: Int) {
            progress = 0
            this.keySize = keySize
            resize(65536 * 32) // 32 double bytes

            base[0] = 1
            nextCheckPos = 0

            val rootNode = this.rootState!!
            val siblings = ArrayList<Pair<Int, State>>(rootNode.getSuccess().entries.size)
            fetch(rootNode, siblings)

            if (siblings.isNotEmpty()) {
                insert(siblings)
            }
        }

        /**
         * allocate the memory of the dynamic array
         *
         * @param newSize the new size of the base, check and used arrays
         *
         * @return the new size
         */
        private fun resize(newSize: Int): Int {
            val base2 = IntArray(newSize)
            val check2 = IntArray(newSize)
            val used2 = BooleanArray(newSize)

            // the arrays do not exist before the first allocation, so there is nothing to copy then
            if (allocSize > 0) {
                System.arraycopy(base, 0, base2, 0, allocSize)
                System.arraycopy(check, 0, check2, 0, allocSize)
                System.arraycopy(used!!, 0, used2, 0, allocSize)
            }

            base = base2
            check = check2
            used = used2

            allocSize = newSize
            return newSize
        }

        /**
         * insert the siblings to double array trie
         *
         * @param siblings the siblings being inserted
         *
         * @return the position to insert them
         */
        private fun insert(siblings: List<Pair<Int, State>>): Int {
            var begin: Int
            var pos = maxOf(siblings[0].first + 1, nextCheckPos) - 1
            var nonzeroNum = 0
            var firstFreeSeen = false

            if (allocSize <= pos) {
                resize(pos + 1)
            }

            // The goal of this loop body is to find n free spaces that satisfy check[begin + a1...an] == 0,
            // a1...an are the n nodes in siblings
            outer@
            while (true) {
                pos++

                if (allocSize <= pos) {
                    resize(pos + 1)
                }

                if (check[pos] != 0) {
                    nonzeroNum++
                    continue
                } else if (!firstFreeSeen) {
                    nextCheckPos = pos
                    firstFreeSeen = true
                }

                begin = pos - siblings[0].first // The distance of the current position from the first sibling node
                if (allocSize <= begin + siblings[siblings.size - 1].first) {
                    // progress can be zero, the + 1 prevents a division by zero
                    val growth = maxOf(1.05, 1.0 * keySize / (progress + 1))
                    resize((allocSize * growth).toInt())
                }

                if (used!![begin]) {
                    continue
                }

                for (i in 1 until siblings.size) {
                    if (check[begin + siblings[i].first] != 0) {
                        continue@outer
                    }
                }

                break
            }

            // -- Simple heuristics --
            // if the percentage of non-empty contents in check between the index 'next_check_pos' and 'check' is
            // greater than some constant value (e.g. 0.9), new 'next_check_pos' index is written by 'check'.
            if (1.0 * nonzeroNum / (pos - nextCheckPos + 1) >= 0.95) {
                // From the position next_check_pos to pos, if the occupied space is above 95%, the next
                // time you insert a node, you can start looking directly at the pos position.
                nextCheckPos = pos
            }
            used!![begin] = true // valid because resize is called.

            size = maxOf(size, begin + siblings[siblings.size - 1].first + 1)

            for (sibling in siblings) {
                check[begin + sibling.first] = begin
            }

            for (sibling in siblings) {
                val newSiblings = ArrayList<Pair<Int, State>>(sibling.second.getSuccess().entries.size + 1)

                if (fetch(sibling.second, newSiblings) == 0) {
                    // The termination of a word and not the prefix of other words, in fact, is the leaf node
                    base[begin + sibling.first] = 0 - sibling.second.largestValueId!! - 1
                    progress++
                } else {
                    // depth first search; finish the recursion before touching 'base', it can be re-allocated in there
                    val h = insert(newSiblings)
                    base[begin + sibling.first] = h
                }

                sibling.second.index = begin + sibling.first
            }

            return begin
        }

        /**
         * free the unnecessary memory
         */
        private fun loseWeight() {
            base = base.copyOf(size + 65535)
            check = check.copyOf(size + 65535)
        }
    }

    //    companion object {
    //        @JvmStatic
    //        fun main(args: Array<String>) {
    //            // test outliers
    //            test(hashMapOf<String, String>())
    //
    //            test(hashMapOf("bmw" to "bmw"))
    //
    //
    //            var map = hashMapOf<String, String>()
    //            var keyArray = arrayOf("bmw.com", "cnn.com", "google.com", "reddit.com")
    //            for (key in keyArray) {
    //                map[key] = key
    //            }
    //            test(map)
    //
    //            map = hashMapOf<String, String>()
    //            keyArray = arrayOf("bmw.com", "cnn.com", "google.com", "reddit.com", "reddit.google.com")
    //            for (key in keyArray) {
    //                map[key] = key
    //            }
    //            test(map)
    //        }
    //
    //        fun test(map: Map<String, String>) {
    //            val trie = DoubleArrayTrie(map)
    //
    //            val text = "reddit.google.com"
    //            println(trie.parseText(text).toString())
    //            println(trie.exactMatchSearch(text))
    //        }
    //    }
}