Moved FSM to its own project.

master
Robinson 2023-01-23 11:52:43 +01:00
parent 2ffa58cbbd
commit bb28977529
No known key found for this signature in database
GPG Key ID: 8E7DB78588BD6F5C
18 changed files with 4 additions and 3012 deletions

View File

@ -19,7 +19,7 @@ Maven Info
<dependency>
<groupId>com.dorkbox</groupId>
<artifactId>Collections</artifactId>
<version>1.2</version>
<version>1.3</version>
</dependency>
</dependencies>
```
@ -29,7 +29,7 @@ Gradle Info
```
dependencies {
...
implementation("com.dorkbox:Collections:1.2")
implementation("com.dorkbox:Collections:1.3")
}
```

View File

@ -38,7 +38,7 @@ object Extras {
// set for the project
const val description = "Niche collections to augment what is already available."
const val group = "com.dorkbox"
const val version = "1.2"
const val version = "1.3"
// set as project.ext
const val name = "Collections"
@ -65,12 +65,6 @@ licensing {
author(Extras.vendor)
url(Extras.url)
extra("AhoCorasickDoubleArrayTrie", License.APACHE_2) {
description(Extras.description)
copyright(2018)
author("hankcs <me@hankcs.com>")
url("https://github.com/hankcs/AhoCorasickDoubleArrayTrie")
}
extra("Bias, BinarySearch", License.MIT) {
url(Extras.url)
url("https://github.com/timboudreau/util")

View File

@ -21,7 +21,7 @@ object Collections {
/**
* Gets the version number.
*/
const val version = "1.2"
const val version = "1.3"
init {
// Add this project to the updates system, which verifies this class + UUID + version information

View File

@ -1,348 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.util.*
/**
* A builder to build the AhoCorasickDoubleArrayTrie
*/
internal abstract class BaseByteBuilder<K, V> {
/**
 * Root of the intermediate trie that keywords are added to.
 * Set to `null` at the end of [build].
 */
internal var rootState: StateByte? = StateByte()

/**
 * Marks the `begin` offsets already handed out by `insert`.
 * Released (set to `null`) as soon as the double array has been built.
 */
private var used: BooleanArray? = null

/**
 * Current allocated length of `base`, `check` and `used`.
 */
private var allocSize = 0

/**
 * Number of leaf slots written so far; it feeds the growth factor applied when the arrays are enlarged.
 */
private var progress = 0

/**
 * Position from which the next `insert` starts scanning `check` for a free slot.
 */
private var nextCheckPos = 0

/**
 * Number of keys being built.
 */
private var keySize = 0

/**
 * Output table: for each state index, the ids emitted at that state (`null` when the state emits nothing).
 */
lateinit var output: Array<IntArray?>

/**
 * Failure table; allocated with at least two entries when the failure states are constructed.
 */
lateinit var fail: IntArray

/**
 * `base` array of the double-array trie.
 */
lateinit var base: IntArray

/**
 * `check` array of the double-array trie.
 */
lateinit var check: IntArray

/**
 * One past the highest slot assigned so far in `base`/`check`.
 */
var size: Int = 0
/**
 * Builds every table of the automaton from the keys of [map].
 *
 * The steps run in a fixed order: the keywords go into the intermediate trie, the trie is packed into
 * `base`/`check`, the failure and output tables are derived, and finally the arrays are re-sized.
 *
 * @param map the key-value pairs; only the keys are read here
 */
fun build(map: Map<K, V>) {
    val keywords = map.keys

    // Step 1: intermediate (pointer based) trie
    addAllKeyword(keywords)

    // Step 2: pack the trie into the double array; `used` is only needed while packing
    buildDoubleArrayTrie(keywords.size)
    used = null

    // Step 3: failure table + merged output table; the intermediate trie is dropped afterwards
    constructFailureStates()
    rootState = null

    // Step 4: re-size base/check
    loseWeight()
}
/**
 * Collects the children of [parent] as `(code, state)` pairs, appending them to [siblings].
 *
 * An acceptable parent first contributes a synthetic child with code 0 that carries the parent's largest
 * value id. Every real child follows with code `key + 1`.
 *
 * @param parent   the node whose children are wanted
 * @param siblings receives the children, i.e. the siblings of one another
 *
 * @return the size of [siblings] after appending
 */
private fun fetch(parent: StateByte,
                  siblings: MutableList<Pair<Int, StateByte>>): Int {
    if (parent.isAcceptable) {
        // Synthetic terminal child: negative depth, emits the parent's largest value id.
        val terminal = StateByte(-(parent.depth + 1))
        terminal.addEmit(parent.largestValueId!!)
        siblings += Pair(0, terminal)
    }

    for ((code, child) in parent.getSuccess()) {
        siblings += Pair(code + 1, child)
    }

    return siblings.size
}
/**
* Adds a single keyword; implemented by the concrete builder.
* Presumably the implementation inserts the keyword into the trie under `rootState` — verify in the subclasses.
*
* @param keyword a keyword
* @param index the 0-based position of the keyword in the iteration order of the key collection being added
*/
internal abstract fun addKeyword(keyword: K, index: Int)
/**
 * Adds every keyword of [keywordSet], passing each one its 0-based position in iteration order.
 *
 * @param keywordSet the collection holding the keywords
 */
private fun addAllKeyword(keywordSet: Collection<K>) {
    keywordSet.forEachIndexed { index, keyword -> addKeyword(keyword, index) }
}
/**
* Builds the failure table and the output table from the intermediate trie.
*
* Allocates `fail` (never shorter than 2 entries) and `output` (`size + 1` entries), then walks the trie
* breadth-first: depth-1 states fail to the root; every deeper state fails to the state reached by following its
* parent's failure chain until a state with the same transition is found. A deeper state also takes over the emits
* of its failure state before its output row is written.
*/
private fun constructFailureStates() {
fail = IntArray((size + 1).coerceAtLeast(2))
// NOTE(review): slot 1 is seeded with base[0] before any state is processed; the reason is not visible here — confirm.
fail[1] = base[0]
output = arrayOfNulls(size + 1)
val queue = ArrayDeque<StateByte>()
// The first step is to set the failure of the node with depth 1 to the root node.
this.rootState!!.states.forEach { depthOneState ->
// setFailure also receives the fail table; presumably it records the link there — verify in StateByte.
depthOneState.setFailure(this.rootState!!, fail)
queue.add(depthOneState)
constructOutput(depthOneState)
}
// The second step is to create a failure table for nodes with depth > 1, which is a bfs
while (!queue.isEmpty()) {
val currentState = queue.remove()
for (transition in currentState.transitions) {
val targetState = currentState.nextState(transition)
queue.add(targetState)
// Walk up the failure chain of the parent until some state has a transition for the same symbol.
var traceFailureState = currentState.failure()
while (traceFailureState!!.nextState(transition) == null) {
traceFailureState = traceFailureState.failure()
}
val newFailureState = traceFailureState.nextState(transition)
targetState!!.setFailure(newFailureState!!, fail)
// Merge the emits of the failure state into this state before writing its output row.
targetState.addEmit(newFailureState.emit())
constructOutput(targetState)
}
}
}
/**
 * Writes the output row of [targetState]: its emitted ids, copied in iteration order into an `IntArray`
 * stored at the state's index. States that emit nothing leave their row untouched.
 */
private fun constructOutput(targetState: StateByte) {
    val emitted = targetState.emit()
    if (emitted.isEmpty()) return

    val ids = emitted.iterator()
    val row = IntArray(emitted.size) { ids.next() }
    this.output[targetState.index] = row
}
/**
 * Packs the intermediate trie into the `base`/`check` arrays.
 *
 * Resets the build counters, allocates the initial arrays, and inserts the root's children
 * (which recursively inserts the whole trie).
 *
 * @param keySize the number of keys being built
 */
private fun buildDoubleArrayTrie(keySize: Int) {
    progress = 0
    this.keySize = keySize

    resize(65536 * 32) // initial allocation
    base[0] = 1
    nextCheckPos = 0

    val root = rootState!!
    val rootSiblings = ArrayList<Pair<Int, StateByte>>(root.getSuccess().entries.size)

    // fetch() returns the number of siblings it collected
    if (fetch(root, rootSiblings) > 0) {
        insert(rootSiblings)
    }
}
/**
 * Replaces `base`, `check` and `used` with fresh arrays of [newSize] entries, carrying over the first
 * `allocSize` entries of each when something was allocated before.
 *
 * @param newSize the new length of the three arrays
 *
 * @return [newSize]
 */
private fun resize(newSize: Int): Int {
    val grownBase = IntArray(newSize)
    val grownCheck = IntArray(newSize)
    val grownUsed = BooleanArray(newSize)

    val oldSize = allocSize
    if (oldSize > 0) {
        // On the very first call the lateinit arrays are not set yet, hence the guard.
        base.copyInto(grownBase, 0, 0, oldSize)
        check.copyInto(grownCheck, 0, 0, oldSize)
        used!!.copyInto(grownUsed, 0, 0, oldSize)
    }

    base = grownBase
    check = grownCheck
    used = grownUsed
    allocSize = newSize
    return newSize
}
/**
* Inserts a group of siblings into the double array and then, depth first, the children of every sibling.
*
* A `begin` offset is searched such that `check[begin + code]` is free (zero) for every sibling code and `begin`
* itself has not been handed out before. The slots are then claimed by writing `begin` into `check`, and `base`
* of each slot receives either the `begin` of that sibling's own children or, for a sibling without children,
* the negated value id minus one.
*
* @param siblings the siblings being inserted; must not be empty (the first and last element are read)
*
* @return the `begin` offset chosen for these siblings
*/
private fun insert(siblings: List<Pair<Int, StateByte>>): Int {
var begin: Int
var pos = Math.max(siblings[0].first + 1, nextCheckPos) - 1
var nonzeroNum = 0
var first = 0
if (allocSize <= pos) {
resize(pos + 1)
}
outer@
// The goal of this loop is to find a begin such that check[begin + a1...an] == 0, where a1...an are the codes of the n siblings
while (true) {
pos++
if (allocSize <= pos) {
resize(pos + 1)
}
if (check[pos] != 0) {
// occupied slot: counted for the density heuristic applied after the loop
nonzeroNum++
continue
}
else if (first == 0) {
// first free slot seen in this call becomes the new scan start
nextCheckPos = pos
first = 1
}
begin = pos - siblings[0].first // The distance of the current position from the first sibling node
if (allocSize <= begin + siblings[siblings.size - 1].first) {
// progress can be zero
// Prevent progress from generating zero divide errors
// growth factor: keySize / (progress + 1), but never less than 1.05
val l = if (1.05 > 1.0 * keySize / (progress + 1)) 1.05 else 1.0 * keySize / (progress + 1)
resize((allocSize * l).toInt())
}
if (used!![begin]) {
continue
}
for (i in 1 until siblings.size) {
if (check[begin + siblings[i].first] != 0) {
continue@outer
}
}
break
}
// -- Simple heuristics --
// If the share of occupied check slots seen between nextCheckPos and pos
// is at least 0.95, nextCheckPos is moved forward to pos.
if (1.0 * nonzeroNum / (pos - nextCheckPos + 1) >= 0.95) {
// From the position next_check_pos to pos, if the occupied space is above 95%, the next
// time you insert a node, you can start looking directly at the pos position.
nextCheckPos = pos
}
used!![begin] = true // valid because resize is called.
// size tracks one past the highest slot claimed so far
val sizeLimit = begin + siblings[siblings.size - 1].first + 1
if (size <= sizeLimit) {
size = sizeLimit
}
// claim all slots before recursing, so that nested inserts see them as occupied
for (sibling in siblings) {
check[begin + sibling.first] = begin
}
for (sibling in siblings) {
val newSiblings = ArrayList<Pair<Int, StateByte>>(sibling.second.getSuccess().entries.size + 1)
if (fetch(sibling.second, newSiblings) == 0) {
// The termination of a word and not the prefix of other words, in fact, is the leaf node
base[begin + sibling.first] = 0 - sibling.second.largestValueId!! - 1
progress++
}
else {
val h = insert(newSiblings) // depth first search
base[begin + sibling.first] = h
}
sibling.second.index = begin + sibling.first
}
return begin
}
/**
 * Re-sizes `base` and `check` to `size + 65535` entries, dropping whatever was allocated beyond that.
 * NOTE(review): the 65535 extra entries presumably keep lookups just past the last used slot in bounds — confirm.
 */
private fun loseWeight() {
    val trimmedLength = size + 65535
    base = base.copyOf(trimmedLength)
    check = check.copyOf(trimmedLength)
}
}

View File

@ -1,558 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
@file:Suppress("unused")
package dorkbox.collections.ahoCorasick
import java.io.IOException
import java.io.ObjectInputStream
import java.io.ObjectOutputStream
import java.io.Serializable
import java.util.*
/**
* An implementation of Aho Corasick algorithm based on Double Array Trie
*
* Will create a DoubleArray Trie from a Map or InputStream (if previously saved)
*
* @author hankcs, dorkbox
*/
@Suppress("DuplicatedCode")
abstract class BaseByteTrie<K, V>(map: Map<K, V>?, inputStream: ObjectInputStream?) : Serializable {
/**
* check array of the Double Array Trie structure; a transition into slot `p` from a node with base `b` is valid
* only when `check[p] == b`
*/
private val check: IntArray
/**
* base array of the Double Array Trie structure
*/
private val base: IntArray
/**
* fail table of the Aho Corasick automata, indexed by state
*/
private val fail: IntArray
/**
* output table of the Aho Corasick automata: for each state, the hit ids emitted there, or `null` for none
*/
private val output: Array<IntArray?>
/**
* outer value array, indexed by hit id; taken from the values of the source map in its iteration order
*/
internal val v: Array<V>
/**
* the length of every key, indexed by hit id
*/
internal val l: IntArray
/**
* the size of base and check array, as reported by the builder (or read back from the stream)
*/
private val checkSize: Int
init {
    if (map != null) {
        // Built from scratch: values are taken in the iteration order of the map.
        @Suppress("UNCHECKED_CAST")
        v = kotlin.jvm.internal.collectionToArray(map.values) as Array<V>
        // presumably filled in by the concrete builder while it adds the keywords — verify in the subclasses
        l = IntArray(map.size)

        @Suppress("LeakingThis")
        val tables = builder()
        tables.build(map)

        fail = tables.fail
        base = tables.base
        check = tables.check
        checkSize = tables.size
        output = tables.output
    }
    else if (inputStream != null) {
        // Restored from a stream: the read order mirrors the write order of save().
        @Suppress("UNCHECKED_CAST")
        v = inputStream.readObject() as Array<V>
        l = inputStream.readObject() as IntArray
        fail = inputStream.readObject() as IntArray
        base = inputStream.readObject() as IntArray
        check = inputStream.readObject() as IntArray
        checkSize = inputStream.readObject() as Int
        @Suppress("UNCHECKED_CAST")
        output = inputStream.readObject() as Array<IntArray?>
    }
    else {
        throw NullPointerException("Map or InputStream must be specified!")
    }
}
/**
* Supplies the builder that turns the source map into the trie tables.
*
* Called from the initializer block when a map is given, i.e. before construction of the subclass has finished.
*/
internal abstract fun builder(): BaseByteBuilder<K, V>
/**
 * Writes the trie to [out] as seven consecutive objects: values, key lengths, fail, base, check,
 * the check size and the output table. The stream-based constructor reads them back in the same order.
 *
 * @throws IOException if writing to [out] fails
 */
@Throws(IOException::class)
fun save(out: ObjectOutputStream) {
    val parts = listOf<Any>(v, l, fail, base, check, checkSize, output)
    for (part in parts) {
        out.writeObject(part)
    }
}
/**
 * The backing value array itself (not a copy), in case access to the original FSM data is needed.
 *
 * The order is the iteration order of the values of the map the trie was built from.
 * NOTE(review): earlier documentation calls this "natural order" (populated with [reddit.com, cnn.com],
 * returns [cnn.com, reddit.com]); that holds only when the supplied map iterates in sorted order — confirm.
 */
val keywords: Array<V>
    get() = v
/**
 * Number of entries in the value array, i.e. the number of keywords.
 */
val size: Int
    get() = v.size
/**
 * Tells whether any keyword occurs anywhere inside [byteArray] (a PARTIAL match).
 * For exact matches only it is better to use `matches`.
 *
 * @return true if there is a match or partial match. "fun.reddit.com" will partially match to "reddit.com"
 */
fun hasPartialMatch(byteArray: ByteArray): Boolean = parseBytes(byteArray).isNotEmpty()
/**
 * Finds every keyword occurrence inside [byteArray] (PARTIAL matches included).
 * For exact matches only it is better to use `matches`.
 *
 * @return the hits found; each hit states which span of the input it covers (a full match spans from 0 to the
 * length of the input)
 */
fun partialMatch(byteArray: ByteArray): List<Hit<V>> = parseBytes(byteArray)
/**
 * Runs the automaton over [byteArray] and collects every hit.
 *
 * After each byte the hits of the reached state are stored with the 1-based index of that byte as their end.
 *
 * @return the hits, in the order they were encountered
 */
fun parseBytes(byteArray: ByteArray): List<Hit<V>> {
    val hits = LinkedList<Hit<V>>() // number of hits is not known up front
    var state = 0

    byteArray.forEachIndexed { offset, byte ->
        state = getState(state, byte)
        storeEmits(offset + 1, state, hits)
    }

    return hits
}
/**
 * Runs the automaton over [byteArray] and reports every hit to [processor] until it asks to stop.
 *
 * Each hit is reported as `(end - keyLength, end, value)`, where `end` is the 1-based index of the byte that
 * completed the hit. This is the same convention as the other `parseBytes` overloads. Parsing stops as soon as
 * the processor returns `false`.
 *
 * @param byteArray The text
 * @param processor A processor which handles the output and may cancel the parse
 */
fun parseBytes(byteArray: ByteArray,
               processor: IHitCancellable<V>
) {
    var currentState = 0
    for (index in byteArray.indices) {
        // End position of the byte just consumed. It is derived from the index so that it cannot run one ahead,
        // which happened when the counter was incremented before the hits were reported.
        val position = index + 1
        currentState = getState(currentState, byteArray[index])

        val hitArray = output[currentState] ?: continue
        for (hit in hitArray) {
            val proceed = processor.hit(position - l[hit], position, v[hit])
            if (!proceed) {
                return
            }
        }
    }
}
/**
 * Runs the automaton over [byteArray] and reports every hit to [processor].
 *
 * Each hit is reported as `(end - keyLength, end, value)`, where `end` is the 1-based index of the byte that
 * completed the hit.
 *
 * @param byteArray The text
 * @param processor A processor which handles the output
 */
fun parseBytes(byteArray: ByteArray,
               processor: IHit<V>
) {
    var state = 0
    byteArray.forEachIndexed { offset, byte ->
        state = getState(state, byte)
        val end = offset + 1
        output[state]?.forEach { id ->
            processor.hit(end - l[id], end, v[id])
        }
    }
}
/**
 * Runs the automaton over [byteArray] and reports every hit, together with its hit id, to [processor].
 *
 * Each hit is reported as `(end - keyLength, end, value, id)`, where `end` is the 1-based index of the byte
 * that completed the hit and `id` is the index into the value array.
 *
 * @param byteArray The text
 * @param processor A processor which handles the output
 */
fun parseBytes(byteArray: ByteArray,
               processor: IHitFull<V>
) {
    var state = 0
    byteArray.forEachIndexed { offset, byte ->
        state = getState(state, byte)
        val end = offset + 1
        output[state]?.forEach { id ->
            processor.hit(end - l[id], end, v[id], id)
        }
    }
}
/**
 * Checks whether [byteArray] contains at least one keyword; stops at the first state that has an output row.
 *
 * @param byteArray source byte array to check
 *
 * @return `true` if the input contains at least one keyword
 */
fun matches(byteArray: ByteArray): Boolean {
    var state = 0
    return byteArray.any { byte ->
        state = getState(state, byte)
        output[state] != null
    }
}
/**
 * Searches for the first hit in [byteArray].
 *
 * The scan stops at the first state that has an output row; the first id of that row is returned as the hit.
 *
 * @param byteArray source byte array to check
 *
 * @return first match or `null` if there are no matches
 */
fun findFirst(byteArray: ByteArray): Hit<V>? {
    var state = 0
    for ((offset, byte) in byteArray.withIndex()) {
        state = getState(state, byte)
        val ids = output[state] ?: continue

        val firstId = ids[0]
        val end = offset + 1 // 1-based index of the byte that completed the hit
        return Hit(end - l[firstId], end, v[firstId])
    }
    return null
}
/**
 * Picks a value from the value array by index.
 * To stay cheap this method DOES NOT validate [index]; an invalid index fails with the array's own exception.
 *
 * @param index The index
 *
 * @return The value
 */
operator fun get(index: Int): V = v[index]
/**
 * Advances the automaton by one byte, following the failure table as long as no transition exists.
 *
 * @return the state reached after consuming [character]
 */
private fun getState(currentState: Int,
                     character: Byte): Int {
    var from = currentState
    while (true) {
        // -1 means "no transition from here"; retry from the failure state
        val next = transitionWithRoot(from, character)
        if (next != -1) {
            return next
        }
        from = fail[from]
    }
}
/**
 * Appends one [Hit] per id in the output row of [currentState] to [collectedEmits];
 * does nothing when the state has no output row.
 *
 * @param position 1-based index of the byte that led to [currentState]; used as the end of each hit
 */
private fun storeEmits(position: Int,
                       currentState: Int,
                       collectedEmits: MutableList<Hit<V>>) {
    val ids = output[currentState] ?: return
    ids.mapTo(collectedEmits) { id -> Hit(position - l[id], position, v[id]) }
}
/**
 * Single transition without the root fallback.
 *
 * The slot examined is `current + c.code + 1`, so [current] itself is used as the offset (not `base[current]`).
 * NOTE(review): no caller of this function is visible in this class — confirm whether it is still needed.
 *
 * @return `base` of the slot when its `check` entry equals [current], otherwise -1
 */
private fun transition(current: Int,
                       c: Char): Int {
    val slot = current + c.code + 1
    return if (check[slot] == current) base[slot] else -1
}
/**
 * Single transition of a state, with the root absorbing failures.
 *
 * The target slot is `base[nodePos] + c + 1`. Because [c] is a signed byte, that slot can lie below zero
 * (for example base 1 and byte -128); such a slot, or one past the end of the table, can never hold a
 * transition and is treated as a miss instead of being dereferenced.
 *
 * @return the target slot when the transition exists; otherwise 0 if [nodePos] is the root, or -1
 */
private fun transitionWithRoot(nodePos: Int,
                               c: Byte): Int {
    val b = base[nodePos]
    val p = b + c + 1

    val hasTransition = p >= 0 && p < check.size && check[p] == b
    return when {
        hasTransition -> p
        nodePos == 0  -> 0   // the root stays on the root
        else          -> -1
    }
}
/**
 * Looks up [byteArray] as a whole key, starting at the root.
 *
 * @param byteArray the key
 *
 * @return the index of the key, you can use it as a perfect hash function
 */
fun exactMatchSearch(byteArray: ByteArray): Int = exactMatchSearch(byteArray, 0, 0, 0)
/**
 * Looks up a key exactly, byte by byte, without using the failure table.
 *
 * Slots that fall outside the tables are treated as "not found". This can happen with negative byte values,
 * or after stepping into a leaf slot, and previously ended in an `ArrayIndexOutOfBoundsException`.
 *
 * @param byteArray the key
 * @param pos       index of the first byte to read
 * @param len       exclusive end index of the bytes to read (NOT a count relative to [pos]); a value `<= 0`
 *                  means "up to the end of [byteArray]"
 * @param nodePos   the node to start from; values `<= 0` mean the root
 *
 * @return the index of the key, or -1 when the key is not present
 */
fun exactMatchSearch(byteArray: ByteArray,
                     pos: Int,
                     len: Int,
                     nodePos: Int): Int {
    val end = if (len <= 0) byteArray.size else len
    val startNode = if (nodePos <= 0) 0 else nodePos

    // slots at or beyond this limit exist in neither table
    val limit = minOf(base.size, check.size)

    var b = base[startNode]
    for (i in pos until end) {
        val p = b + byteArray[i] + 1
        if (p < 0 || p >= limit || check[p] != b) {
            return -1
        }
        b = base[p]
    }

    // A stored key ends in a terminal slot at offset 0 of its node, holding the negated (index + 1).
    if (b < 0 || b >= limit || check[b] != b) {
        return -1
    }
    val n = base[b]
    return if (n < 0) -n - 1 else -1
}
// /**
// * Just for debug when I wrote it
// */
// public void debug()
// {
// System.out.println("base:");
// for (int i = 0; i < base.length; i++)
// {
// if (base[i] < 0)
// {
// System.out.println(i + " : " + -base[i]);
// }
// }
//
// System.out.println("output:");
// for (int i = 0; i < output.length; i++)
// {
// if (output[i] != null)
// {
// System.out.println(i + " : " + Arrays.toString(output[i]));
// }
// }
//
// System.out.println("fail:");
// for (int i = 0; i < fail.length; i++)
// {
// if (fail[i] != 0)
// {
// System.out.println(i + " : " + fail[i]);
// }
// }
//
// System.out.println(this);
// }
//
// @Override
// public String toString()
// {
// String infoIndex = "i = ";
// String infoChar = "char = ";
// String infoBase = "base = ";
// String infoCheck = "check= ";
// for (int i = 0; i < Math.min(base.length, 200); ++i)
// {
// if (base[i] != 0 || check[i] != 0)
// {
// infoChar += " " + (i == check[i] ? " ×" : (char) (i - check[i] - 1));
// infoIndex += " " + String.format("%5d", i);
// infoBase += " " + String.format("%5d", base[i]);
// infoCheck += " " + String.format("%5d", check[i]);
// }
// }
// return "DoubleArrayTrie" +
// "\n" + infoChar +
// "\n" + infoIndex +
// "\n" + infoBase +
// "\n" + infoCheck + "\n" +
//// "check=" + Arrays.toString(check) +
//// ", base=" + Arrays.toString(base) +
//// ", used=" + Arrays.toString(used) +
// "size=" + size
//// ", length=" + Arrays.toString(length) +
//// ", value=" + Arrays.toString(value) +
// ;
// }
//
// /**
// * A debug class that sequentially outputs variable names and variable values
// */
// private static class DebugArray
// {
// Map<String, String> nameValueMap = new LinkedHashMap<String, String>();
//
// public void add(String name, int value)
// {
// String valueInMap = nameValueMap.get(name);
// if (valueInMap == null)
// {
// valueInMap = "";
// }
//
// valueInMap += " " + String.format("%5d", value);
//
// nameValueMap.put(name, valueInMap);
// }
//
// @Override
// public String toString()
// {
// String text = "";
// for (Map.Entry<String, String> entry : nameValueMap.entrySet())
// {
// String name = entry.getKey();
// String value = entry.getValue();
// text += String.format("%-5s", name) + "= " + value + '\n';
// }
//
// return text;
// }
//
// public void println()
// {
// System.out.print(this);
// }
// }
}

View File

@ -1,348 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.util.*
/**
* A builder to build the AhoCorasickDoubleArrayTrie
*/
internal abstract class BaseCharBuilder<K, V> {
/**
 * Root of the intermediate trie that keywords are added to.
 * Set to `null` at the end of [build].
 */
internal var rootState: StateChar? = StateChar()

/**
 * Marks the `begin` offsets already handed out by `insert`.
 * Released (set to `null`) as soon as the double array has been built.
 */
private var used: BooleanArray? = null

/**
 * Current allocated length of `base`, `check` and `used`.
 */
private var allocSize = 0

/**
 * Number of leaf slots written so far; it feeds the growth factor applied when the arrays are enlarged.
 */
private var progress = 0

/**
 * Position from which the next `insert` starts scanning `check` for a free slot.
 */
private var nextCheckPos = 0

/**
 * Number of keys being built.
 */
private var keySize = 0

/**
 * Output table: for each state index, the ids emitted at that state (`null` when the state emits nothing).
 */
lateinit var output: Array<IntArray?>

/**
 * Failure table; allocated with at least two entries when the failure states are constructed.
 */
lateinit var fail: IntArray

/**
 * `base` array of the double-array trie.
 */
lateinit var base: IntArray

/**
 * `check` array of the double-array trie.
 */
lateinit var check: IntArray

/**
 * One past the highest slot assigned so far in `base`/`check`.
 */
var size: Int = 0
/**
 * Builds every table of the automaton from the keys of [map].
 *
 * The steps run in a fixed order: the keywords go into the intermediate trie, the trie is packed into
 * `base`/`check`, the failure and output tables are derived, and finally the arrays are re-sized.
 *
 * @param map the key-value pairs; only the keys are read here
 */
fun build(map: Map<K, V>) {
    val keywords = map.keys

    // Step 1: intermediate (pointer based) trie
    addAllKeyword(keywords)

    // Step 2: pack the trie into the double array; `used` is only needed while packing
    buildDoubleArrayTrie(keywords.size)
    used = null

    // Step 3: failure table + merged output table; the intermediate trie is dropped afterwards
    constructFailureStates()
    rootState = null

    // Step 4: re-size base/check
    loseWeight()
}
/**
 * Collects the children of [parent] as `(code, state)` pairs, appending them to [siblings].
 *
 * An acceptable parent first contributes a synthetic child with code 0 that carries the parent's largest
 * value id. Every real child follows with code `key.code + 1`.
 *
 * @param parent   the node whose children are wanted
 * @param siblings receives the children, i.e. the siblings of one another
 *
 * @return the size of [siblings] after appending
 */
private fun fetch(parent: StateChar,
                  siblings: MutableList<Pair<Int, StateChar>>): Int {
    if (parent.isAcceptable) {
        // Synthetic terminal child: negative depth, emits the parent's largest value id.
        val terminal = StateChar(-(parent.depth + 1))
        terminal.addEmit(parent.largestValueId!!)
        siblings += Pair(0, terminal)
    }

    for ((symbol, child) in parent.getSuccess()) {
        siblings += Pair(symbol.code + 1, child)
    }

    return siblings.size
}
/**
* Adds a single keyword; implemented by the concrete builder.
* Presumably the implementation inserts the keyword into the trie under `rootState` — verify in the subclasses.
*
* @param keyword a keyword
* @param index the 0-based position of the keyword in the iteration order of the key collection being added
*/
internal abstract fun addKeyword(keyword: K, index: Int)
/**
 * Adds every keyword of [keywordSet], passing each one its 0-based position in iteration order.
 *
 * @param keywordSet the collection holding the keywords
 */
private fun addAllKeyword(keywordSet: Collection<K>) {
    keywordSet.forEachIndexed { index, keyword -> addKeyword(keyword, index) }
}
/**
* Builds the failure table and the output table from the intermediate trie.
*
* Allocates `fail` (never shorter than 2 entries) and `output` (`size + 1` entries), then walks the trie
* breadth-first: depth-1 states fail to the root; every deeper state fails to the state reached by following its
* parent's failure chain until a state with the same transition is found. A deeper state also takes over the emits
* of its failure state before its output row is written.
*/
private fun constructFailureStates() {
fail = IntArray((size + 1).coerceAtLeast(2))
// NOTE(review): slot 1 is seeded with base[0] before any state is processed; the reason is not visible here — confirm.
fail[1] = base[0]
output = arrayOfNulls(size + 1)
val queue = ArrayDeque<StateChar>()
// The first step is to set the failure of the node with depth 1 to the root node.
this.rootState!!.states.forEach { depthOneState ->
// setFailure also receives the fail table; presumably it records the link there — verify in StateChar.
depthOneState.setFailure(this.rootState!!, fail)
queue.add(depthOneState)
constructOutput(depthOneState)
}
// The second step is to create a failure table for nodes with depth > 1, which is a bfs
while (!queue.isEmpty()) {
val currentState = queue.remove()
for (transition in currentState.transitions) {
val targetState = currentState.nextState(transition)
queue.add(targetState)
// Walk up the failure chain of the parent until some state has a transition for the same symbol.
var traceFailureState = currentState.failure()
while (traceFailureState!!.nextState(transition) == null) {
traceFailureState = traceFailureState.failure()
}
val newFailureState = traceFailureState.nextState(transition)
targetState!!.setFailure(newFailureState!!, fail)
// Merge the emits of the failure state into this state before writing its output row.
targetState.addEmit(newFailureState.emit())
constructOutput(targetState)
}
}
}
/**
 * Writes the output row of [targetState]: its emitted ids, copied in iteration order into an `IntArray`
 * stored at the state's index. States that emit nothing leave their row untouched.
 */
private fun constructOutput(targetState: StateChar) {
    val emitted = targetState.emit()
    if (emitted.isEmpty()) return

    val ids = emitted.iterator()
    val row = IntArray(emitted.size) { ids.next() }
    this.output[targetState.index] = row
}
/**
 * Packs the intermediate trie into the `base`/`check` arrays.
 *
 * Resets the build counters, allocates the initial arrays, and inserts the root's children
 * (which recursively inserts the whole trie).
 *
 * @param keySize the number of keys being built
 */
private fun buildDoubleArrayTrie(keySize: Int) {
    progress = 0
    this.keySize = keySize

    resize(65536 * 32) // initial allocation
    base[0] = 1
    nextCheckPos = 0

    val root = rootState!!
    val rootSiblings = ArrayList<Pair<Int, StateChar>>(root.getSuccess().entries.size)

    // fetch() returns the number of siblings it collected
    if (fetch(root, rootSiblings) > 0) {
        insert(rootSiblings)
    }
}
/**
 * Replaces `base`, `check` and `used` with fresh arrays of [newSize] entries, carrying over the first
 * `allocSize` entries of each when something was allocated before.
 *
 * @param newSize the new length of the three arrays
 *
 * @return [newSize]
 */
private fun resize(newSize: Int): Int {
    val grownBase = IntArray(newSize)
    val grownCheck = IntArray(newSize)
    val grownUsed = BooleanArray(newSize)

    val oldSize = allocSize
    if (oldSize > 0) {
        // On the very first call the lateinit arrays are not set yet, hence the guard.
        base.copyInto(grownBase, 0, 0, oldSize)
        check.copyInto(grownCheck, 0, 0, oldSize)
        used!!.copyInto(grownUsed, 0, 0, oldSize)
    }

    base = grownBase
    check = grownCheck
    used = grownUsed
    allocSize = newSize
    return newSize
}
/**
* Inserts a group of siblings into the double array and then, depth first, the children of every sibling.
*
* A `begin` offset is searched such that `check[begin + code]` is free (zero) for every sibling code and `begin`
* itself has not been handed out before. The slots are then claimed by writing `begin` into `check`, and `base`
* of each slot receives either the `begin` of that sibling's own children or, for a sibling without children,
* the negated value id minus one.
*
* @param siblings the siblings being inserted; must not be empty (the first and last element are read)
*
* @return the `begin` offset chosen for these siblings
*/
private fun insert(siblings: List<Pair<Int, StateChar>>): Int {
var begin: Int
var pos = Math.max(siblings[0].first + 1, nextCheckPos) - 1
var nonzeroNum = 0
var first = 0
if (allocSize <= pos) {
resize(pos + 1)
}
outer@
// The goal of this loop is to find a begin such that check[begin + a1...an] == 0, where a1...an are the codes of the n siblings
while (true) {
pos++
if (allocSize <= pos) {
resize(pos + 1)
}
if (check[pos] != 0) {
// occupied slot: counted for the density heuristic applied after the loop
nonzeroNum++
continue
}
else if (first == 0) {
// first free slot seen in this call becomes the new scan start
nextCheckPos = pos
first = 1
}
begin = pos - siblings[0].first // The distance of the current position from the first sibling node
if (allocSize <= begin + siblings[siblings.size - 1].first) {
// progress can be zero
// Prevent progress from generating zero divide errors
// growth factor: keySize / (progress + 1), but never less than 1.05
val l = if (1.05 > 1.0 * keySize / (progress + 1)) 1.05 else 1.0 * keySize / (progress + 1)
resize((allocSize * l).toInt())
}
if (used!![begin]) {
continue
}
for (i in 1 until siblings.size) {
if (check[begin + siblings[i].first] != 0) {
continue@outer
}
}
break
}
// -- Simple heuristics --
// If the share of occupied check slots seen between nextCheckPos and pos
// is at least 0.95, nextCheckPos is moved forward to pos.
if (1.0 * nonzeroNum / (pos - nextCheckPos + 1) >= 0.95) {
// From the position next_check_pos to pos, if the occupied space is above 95%, the next
// time you insert a node, you can start looking directly at the pos position.
nextCheckPos = pos
}
used!![begin] = true // valid because resize is called.
// size tracks one past the highest slot claimed so far
val sizeLimit = begin + siblings[siblings.size - 1].first + 1
if (size <= sizeLimit) {
size = sizeLimit
}
// claim all slots before recursing, so that nested inserts see them as occupied
for (sibling in siblings) {
check[begin + sibling.first] = begin
}
for (sibling in siblings) {
val newSiblings = ArrayList<Pair<Int, StateChar>>(sibling.second.getSuccess().entries.size + 1)
if (fetch(sibling.second, newSiblings) == 0) {
// The termination of a word and not the prefix of other words, in fact, is the leaf node
base[begin + sibling.first] = 0 - sibling.second.largestValueId!! - 1
progress++
}
else {
val h = insert(newSiblings) // depth first search
base[begin + sibling.first] = h
}
sibling.second.index = begin + sibling.first
}
return begin
}
/**
 * Re-sizes `base` and `check` to `size + 65535` entries, dropping whatever was allocated beyond that.
 * NOTE(review): the 65535 extra entries presumably keep lookups just past the last used slot in bounds — confirm.
 */
private fun loseWeight() {
    val trimmedLength = size + 65535
    base = base.copyOf(trimmedLength)
    check = check.copyOf(trimmedLength)
}
}

View File

@ -1,595 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
@file:Suppress("unused")
package dorkbox.collections.ahoCorasick
import java.io.IOException
import java.io.ObjectInputStream
import java.io.ObjectOutputStream
import java.io.Serializable
import java.util.*
/**
* An implementation of Aho Corasick algorithm based on Double Array Trie
*
* Will create a DoubleArray Trie from a Map or InputStream (if previously saved)
*
* @author hankcs, dorkbox
*/
abstract class BaseCharTrie<K, V>(map: Map<K, V>?, inputStream: ObjectInputStream?) : Serializable {
/**
* check array of the Double Array Trie structure
*/
private val check: IntArray
/**
* base array of the Double Array Trie structure
*/
private val base: IntArray
/**
* fail table of the Aho Corasick automata
*/
private val fail: IntArray
/**
* output table of the Aho Corasick automata; entries are nullable
*/
private val output: Array<IntArray?>
/**
* outer value array; taken from the values of the source map in its iteration order
*/
internal val v: Array<V>
/**
* the length of every key; allocated with one entry per map entry
*/
internal val l: IntArray
/**
* the size of base and check array, as reported by the builder (or read back from the stream)
*/
private val checkSize: Int
// Populates every field from exactly one source. `map` takes precedence when both arguments are non-null;
// when both are null construction fails with a NullPointerException.
init {
when {
map != null -> {
// `v` and `l` share one index space: the same keyword index is used for `l[hit]` and `v[hit]` when reporting hits.
@Suppress("UNCHECKED_CAST")
v = kotlin.jvm.internal.collectionToArray(map.values) as Array<V>
// `l` has to be assigned before the builder runs: the concrete builders write keyword lengths into it from
// addKeyword (presumably invoked by build() - TODO confirm against BaseCharBuilder).
l = IntArray(map.size)
val builder = builder()
builder.build(map)
// copy the finished tables out of the builder
fail = builder.fail
base = builder.base
check = builder.check
checkSize = builder.size
output = builder.output
}
inputStream != null -> {
// The read order below must stay identical to the write order in save().
// NOTE(review): readObject() performs Java deserialization - only use streams from a trusted source.
@Suppress("UNCHECKED_CAST")
v = inputStream.readObject() as Array<V>
l = inputStream.readObject() as IntArray
fail = inputStream.readObject() as IntArray
base = inputStream.readObject() as IntArray
check = inputStream.readObject() as IntArray
checkSize = inputStream.readObject() as Int
@Suppress("UNCHECKED_CAST")
output = inputStream.readObject() as Array<IntArray?>
}
else -> throw NullPointerException("Map or InputStream must be specified!")
}
}
/**
* Supplies the builder that turns the source map into the trie tables.
*
* This is called from the `init` block of this class, i.e. while the base class is still being constructed, so an
* implementation must not rely on state initialised by the subclass.
*
* @return a new builder for keys of type [K] and values of type [V]
*/
internal abstract fun builder(): BaseCharBuilder<K, V>
/**
 * Writes the complete trie to [out] so that it can later be restored through the stream-based constructor.
 *
 * The objects are written in a fixed order (values, key lengths, fail, base, check, checkSize, output); the
 * constructor reads them back in exactly that order. The stream is neither flushed nor closed here.
 *
 * @param out the stream to write to
 * @throws IOException if any of the writes fails
 */
@Throws(IOException::class)
fun save(out: ObjectOutputStream) {
    // order matters: it is the on-stream format
    val parts = arrayOf<Any>(v, l, fail, base, check, checkSize, output)
    for (part in parts) {
        out.writeObject(part)
    }
}
/**
 * Number of entries in the value array, i.e. the number of keywords stored in this trie.
 */
val size: Int get() = v.size
/**
 * Gives access to the backing value array, in case the original FSM data is needed.
 *
 * Note that this is the array of *values* (one per keyword), not the key strings themselves. The internal array
 * itself is returned, not a copy, so modifying it changes what this trie reports.
 *
 * @return the value array, indexed by keyword index
 */
val keywords: Array<V> get() = v
/**
 * Tells whether at least one keyword occurs anywhere inside [text].
 *
 * The whole text is parsed and all hits are collected before the answer is produced.
 *
 * @param text the text to scan
 *
 * @return true if there is a match or partial match. "fun.reddit.com" will partially match to "reddit.com"
 */
fun hasPartialMatch(text: String): Boolean = !parseText(text).isEmpty()
/**
 * Scans [text] and returns every keyword occurrence found in it.
 *
 * @param text the text to scan
 *
 * @return one [Hit] per occurrence; each hit states which part of the text matched (a hit covering the whole text
 * runs from 0 to the length of the text)
 */
fun partialMatch(text: String): List<Hit<V>> = parseText(text)
/**
 * Runs the automaton over [text] and collects every hit.
 *
 * @param text the text to scan
 *
 * @return all hits, in the order in which they were encountered
 */
fun parseText(text: CharSequence): List<Hit<V>> {
    val hits = LinkedList<Hit<V>>() // the number of hits is not known up front
    var state = 0
    text.forEachIndexed { offset, ch ->
        state = getState(state, ch)
        // offset + 1 is the exclusive end index of anything that ends on this character
        storeEmits(offset + 1, state, hits)
    }
    return hits
}
/**
 * Runs the automaton over [text] and reports every hit to [processor] instead of collecting a list.
 *
 * @param text The text
 * @param processor receives (begin inclusive, end exclusive, value) for each keyword occurrence
 */
fun parseText(text: CharSequence, processor: IHit<V>) {
    var state = 0
    for (offset in text.indices) {
        state = getState(state, text[offset])
        // exclusive end index of any keyword that ends on this character
        val end = offset + 1
        output[state]?.forEach { hit ->
            processor.hit(end - l[hit], end, v[hit])
        }
    }
}
/**
 * Runs the automaton over [text] and reports every hit to [processor] until the processor asks to stop.
 *
 * Hit offsets follow the same convention as the other `parseText` overloads: `begin` is inclusive, `end` is
 * exclusive, so `text.substring(begin, end)` is the matched keyword.
 *
 * @param text The text
 * @param processor A processor which handles the output; returning `false` from it aborts the scan immediately
 */
fun parseText(text: CharSequence,
              processor: IHitCancellable<V>
) {
    var currentState = 0
    for (index in text.indices) {
        // Exclusive end index of any keyword ending on this character. It is derived from the index rather than
        // pre-incremented, which previously shifted every reported begin/end by one.
        val position = index + 1
        currentState = getState(currentState, text[index])
        val hitArray = output[currentState] ?: continue
        for (hit in hitArray) {
            val proceed = processor.hit(position - l[hit], position, v[hit])
            if (!proceed) {
                return
            }
        }
    }
}
/**
 * Runs the automaton over a character array and reports every hit to [processor].
 *
 * @param text The text
 * @param processor receives (begin inclusive, end exclusive, value) for each keyword occurrence
 */
fun parseText(text: CharArray, processor: IHit<V>) {
    var state = 0
    text.forEachIndexed { offset, ch ->
        state = getState(state, ch)
        val matched = output[state]
        if (matched != null) {
            val end = offset + 1 // exclusive end index
            for (hit in matched) {
                processor.hit(end - l[hit], end, v[hit])
            }
        }
    }
}
/**
 * Runs the automaton over a character array and reports every hit, including the keyword index, to [processor].
 *
 * @param text The text
 * @param processor receives (begin inclusive, end exclusive, value, keyword index) for each keyword occurrence
 */
fun parseText(text: CharArray, processor: IHitFull<V>) {
    var state = 0
    for (offset in text.indices) {
        state = getState(state, text[offset])
        val matched = output[state] ?: continue
        val end = offset + 1 // exclusive end index
        for (hit in matched) {
            processor.hit(end - l[hit], end, v[hit], hit)
        }
    }
}
/**
 * Checks whether [text] contains at least one keyword as a substring.
 *
 * Scanning stops at the first character on which the automaton reaches a state that has output.
 *
 * @param text source text to check
 *
 * @return `true` if string contains at least one substring
 */
fun matches(text: String): Boolean {
    var state = 0
    return text.any { ch ->
        state = getState(state, ch)
        output[state] != null
    }
}
/**
 * Finds the first keyword occurrence in [text].
 *
 * Scanning stops on the first character where the automaton has output; of the keywords reported there, the first
 * entry of the output table is the one returned.
 *
 * @param text source text to check
 *
 * @return first match or `null` if there are no matches
 */
fun findFirst(text: String): Hit<V>? {
    var state = 0
    for ((offset, ch) in text.withIndex()) {
        state = getState(state, ch)
        val matched = output[state] ?: continue
        val first = matched[0]
        val end = offset + 1 // exclusive end index
        return Hit(end - l[first], end, v[first])
    }
    return null
}
/**
 * Looks up a value directly by its index in the value array.
 *
 * For efficiency the index is NOT validated here; an out-of-range index fails in the array access itself.
 *
 * @param index The index
 *
 * @return The value
 */
operator fun get(index: Int): V = v[index]
/**
 * Computes the state reached from [currentState] on [character], following the failure function when needed.
 *
 * The direct transition is tried first. As long as it fails (-1), the automaton falls back along the fail table and
 * tries again from there.
 */
private fun getState(currentState: Int, character: Char): Int {
    var state = currentState
    while (true) {
        val next = transitionWithRoot(state, character)
        if (next != -1) {
            return next
        }
        // no edge for this character: retry from the fail state
        state = fail[state]
    }
}
/**
 * Appends one [Hit] to [collectedEmits] for every keyword reported by [currentState]; does nothing when the state
 * has no output.
 *
 * @param position exclusive end index of the hits
 */
private fun storeEmits(position: Int, currentState: Int, collectedEmits: MutableList<Hit<V>>) {
    output[currentState]?.mapTo(collectedEmits) { hit ->
        Hit(position - l[hit], position, v[hit])
    }
}
/**
 * Plain transition: looks at slot `current + c.code + 1` and, if that slot is owned by [current] according to the
 * check array, returns the base value stored there.
 *
 * @return the base value of the target slot, or -1 when there is no such transition
 */
private fun transition(current: Int, c: Char): Int {
    val slot = current + c.code + 1
    return if (check[slot] == current) base[slot] else -1
}
/**
 * Transition out of state [nodePos] on character [c]. A failed transition at the root (state 0) stays at the root
 * instead of failing.
 *
 * @return the target state, 0 for a failed transition at the root, or -1 for a failed transition anywhere else
 */
private fun transitionWithRoot(nodePos: Int, c: Char): Int {
    val parentBase = base[nodePos]
    val slot = parentBase + c.code + 1
    return when {
        check[slot] == parentBase -> slot
        nodePos == 0 -> 0
        else -> -1
    }
}
/**
 * Exact lookup of a whole key given as a character array, starting at the root.
 *
 * @param keyChars the key (as a Character array)
 *
 * @return the index of the key, you can use it as a perfect hash function; negative when the key is not present
 */
fun exactMatchSearch(keyChars: CharArray): Int = exactMatchSearch(keyChars, pos = 0, len = 0, nodePos = 0)
/**
 * Exact lookup of a whole key given as a string, starting at the root.
 *
 * @param key the key
 *
 * @return the index of the key, you can use it as a perfect hash function; negative when the key is not present
 */
fun exactMatchSearch(key: String): Int {
    val keyChars = key.toCharArray()
    return exactMatchSearch(keyChars, 0, 0, 0)
}
/**
 * Exact lookup of a key (or a slice of a character array) starting from an arbitrary trie node.
 *
 * @param keyChars the char array of the key
 * @param pos the start index of char array
 * @param len the exclusive end index into [keyChars] at which the walk stops; values `<= 0` mean "up to the end of
 * the array"
 * @param nodePos the starting position of the node for searching; values `<= 0` mean the root
 *
 * @return the value index of the key, or -1 when the key is not present
 */
internal fun exactMatchSearch(keyChars: CharArray, pos: Int, len: Int, nodePos: Int): Int {
    val end = if (len <= 0) keyChars.size else len
    val startNode = maxOf(nodePos, 0)

    var b = base[startNode]
    for (i in pos until end) {
        val p = b + keyChars[i].code + 1
        if (b != check[p]) {
            // no edge for this character: the key is not in the trie
            return -1
        }
        b = base[p]
    }

    // a stored key ends in a slot owned by itself whose base holds the negated (index + 1)
    val n = base[b]
    return if (b == check[b] && n < 0) -n - 1 else -1
}
// /**
// * Just for debug when I wrote it
// */
// public void debug()
// {
// System.out.println("base:");
// for (int i = 0; i < base.length; i++)
// {
// if (base[i] < 0)
// {
// System.out.println(i + " : " + -base[i]);
// }
// }
//
// System.out.println("output:");
// for (int i = 0; i < output.length; i++)
// {
// if (output[i] != null)
// {
// System.out.println(i + " : " + Arrays.toString(output[i]));
// }
// }
//
// System.out.println("fail:");
// for (int i = 0; i < fail.length; i++)
// {
// if (fail[i] != 0)
// {
// System.out.println(i + " : " + fail[i]);
// }
// }
//
// System.out.println(this);
// }
//
// @Override
// public String toString()
// {
// String infoIndex = "i = ";
// String infoChar = "char = ";
// String infoBase = "base = ";
// String infoCheck = "check= ";
// for (int i = 0; i < Math.min(base.length, 200); ++i)
// {
// if (base[i] != 0 || check[i] != 0)
// {
// infoChar += " " + (i == check[i] ? " ×" : (char) (i - check[i] - 1));
// infoIndex += " " + String.format("%5d", i);
// infoBase += " " + String.format("%5d", base[i]);
// infoCheck += " " + String.format("%5d", check[i]);
// }
// }
// return "DoubleArrayTrie" +
// "\n" + infoChar +
// "\n" + infoIndex +
// "\n" + infoBase +
// "\n" + infoCheck + "\n" +
//// "check=" + Arrays.toString(check) +
//// ", base=" + Arrays.toString(base) +
//// ", used=" + Arrays.toString(used) +
// "size=" + size
//// ", length=" + Arrays.toString(length) +
//// ", value=" + Arrays.toString(value) +
// ;
// }
//
// /**
// * A debug class that sequentially outputs variable names and variable values
// */
// private static class DebugArray
// {
// Map<String, String> nameValueMap = new LinkedHashMap<String, String>();
//
// public void add(String name, int value)
// {
// String valueInMap = nameValueMap.get(name);
// if (valueInMap == null)
// {
// valueInMap = "";
// }
//
// valueInMap += " " + String.format("%5d", value);
//
// nameValueMap.put(name, valueInMap);
// }
//
// @Override
// public String toString()
// {
// String text = "";
// for (Map.Entry<String, String> entry : nameValueMap.entrySet())
// {
// String name = entry.getKey();
// String value = entry.getValue();
// text += String.format("%-5s", name) + "= " + value + '\n';
// }
//
// return text;
// }
//
// public void println()
// {
// System.out.print(this);
// }
// }
}

View File

@ -1,97 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.io.ObjectInputStream
/**
 * Double-array trie whose keys are [ByteArray]s.
 *
 * Build it either from a map of keys to values or from a stream previously written by the base class; both
 * arguments are simply handed on to [BaseByteTrie].
 */
class DoubleArrayByteArrayTrie<V>(map: Map<ByteArray, V>? = null, inputStream: ObjectInputStream? = null):
    BaseByteTrie<ByteArray, V>(map, inputStream) {

    /**
     * Creates the builder that knows how to feed [ByteArray] keywords into the trie.
     */
    override fun builder(): BaseByteBuilder<ByteArray, V> {
        return object: BaseByteBuilder<ByteArray, V>() {
            /**
             * add a keyword: walks (creating where necessary) one state per byte starting at the root, marks the
             * final state as emitting [index], and records the keyword's length.
             *
             * @param keyword a keyword
             * @param index the index of the keyword
             */
            override fun addKeyword(keyword: ByteArray, index: Int) {
                var state = this.rootState
                for (byte in keyword) {
                    state = state!!.addState(byte)
                }
                state!!.addEmit(index)

                this@DoubleArrayByteArrayTrie.l[index] = keyword.size
            }
        }
    }

    /**
     * Get value by a ByteArray key, just like a map.get() method
     *
     * @param key The key
     *
     * @return the value stored for [key], or `null` when the exact-match search reports a negative index
     */
    operator fun get(key: ByteArray): V? {
        val index = exactMatchSearch(key)
        if (index < 0) {
            return null
        }
        return v[index]
    }

    /**
     * Update a value corresponding to a key
     *
     * @param key the key
     * @param value the value
     *
     * @return `true` if the key exists and its value was replaced, `false` if there is no such key
     */
    operator fun set(key: ByteArray, value: V): Boolean {
        val index = exactMatchSearch(key)
        val found = index >= 0
        if (found) {
            v[index] = value
        }
        return found
    }
}

View File

@ -1,95 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.io.ObjectInputStream
/**
 * Double-array trie whose keys are [String]s.
 *
 * Build it either from a map of keys to values or from a stream previously written with `save`; both arguments are
 * simply handed on to [BaseCharTrie].
 */
class DoubleArrayStringTrie<V>(map: Map<String, V>? = null,
                               inputStream: ObjectInputStream? = null): BaseCharTrie<String, V>(map, inputStream) {

    /**
     * Creates the builder that knows how to feed [String] keywords into the trie.
     */
    override fun builder(): BaseCharBuilder<String, V> {
        return object: BaseCharBuilder<String, V>() {
            /**
             * add a keyword: walks (creating where necessary) one state per character starting at the root, marks
             * the final state as emitting [index], and records the keyword's length.
             *
             * @param keyword a keyword
             * @param index the index of the keyword
             */
            override fun addKeyword(keyword: String, index: Int) {
                var state = this.rootState
                for (character in keyword) {
                    state = state!!.addState(character)
                }
                state!!.addEmit(index)

                this@DoubleArrayStringTrie.l[index] = keyword.length
            }
        }
    }

    /**
     * Get value by a String key, just like a map.get() method
     *
     * @param key The key
     *
     * @return the value stored for [key], or `null` when the exact-match search reports a negative index
     */
    operator fun get(key: String): V? {
        val index = exactMatchSearch(key)
        if (index < 0) {
            return null
        }
        return v[index]
    }

    /**
     * Update a value corresponding to a key
     *
     * @param key the key
     * @param value the value
     *
     * @return `true` if the key exists and its value was replaced, `false` if there is no such key
     */
    operator fun set(key: String, value: V): Boolean {
        val index = exactMatchSearch(key)
        val found = index >= 0
        if (found) {
            v[index] = value
        }
        return found
    }
}

View File

@ -1,70 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.util.*
/**
 * Creates a Finite State Machine for very fast string matching.
 *
 * This is a convenience wrapper around [DoubleArrayStringTrie] and [DoubleArrayByteArrayTrie], since those classes
 * are awkward to construct directly.
 */
object FiniteStateMachine {
    /**
     * Lexicographic ordering for byte arrays (bytes compared as signed values, shorter array first on a common
     * prefix).
     *
     * `ByteArray` does not implement `Comparable`, so a `TreeMap` keyed by it MUST be given a comparator; without
     * one the very first insertion throws `ClassCastException`.
     */
    private val byteArrayOrder = Comparator<ByteArray> { left, right ->
        val shared = minOf(left.size, right.size)
        var result = 0
        var i = 0
        while (result == 0 && i < shared) {
            result = left[i].compareTo(right[i])
            i++
        }
        if (result != 0) result else left.size.compareTo(right.size)
    }

    /**
     * Builds a string-keyed FSM from an existing key/value map.
     */
    fun <V> build(map: Map<String, V>): DoubleArrayStringTrie<V> {
        return DoubleArrayStringTrie(map)
    }

    /**
     * Builds a byte-array-keyed FSM from an existing key/value map.
     */
    fun <V> build(map: Map<ByteArray, V>): DoubleArrayByteArrayTrie<V> {
        return DoubleArrayByteArrayTrie(map)
    }

    /**
     * Builds a string-keyed FSM in which every given string maps to `true`. Duplicate strings collapse into one key.
     */
    fun build(strings: List<String>): DoubleArrayStringTrie<Boolean> {
        return build(sortedStringKeys(strings))
    }

    /**
     * Builds a byte-array-keyed FSM in which every given array maps to `true`. Arrays with equal content collapse
     * into one key.
     */
    fun build(strings: List<ByteArray>): DoubleArrayByteArrayTrie<Boolean> {
        return build(sortedByteKeys(strings))
    }

    /**
     * Vararg variant of the string-list overload.
     */
    fun build(vararg strings: String): DoubleArrayStringTrie<Boolean> {
        return build(sortedStringKeys(strings.asList()))
    }

    /**
     * Vararg variant of the byte-array-list overload.
     */
    fun build(vararg strings: ByteArray): DoubleArrayByteArrayTrie<Boolean> {
        return build(sortedByteKeys(strings.asList()))
    }

    /** Collects [keys] into a naturally ordered map with every key mapped to `true`. */
    private fun sortedStringKeys(keys: Iterable<String>): TreeMap<String, Boolean> {
        val map = TreeMap<String, Boolean>()
        for (key in keys) {
            map[key] = true
        }
        return map
    }

    /** Collects [keys] into a map ordered by [byteArrayOrder] with every key mapped to `true`. */
    private fun sortedByteKeys(keys: Iterable<ByteArray>): TreeMap<ByteArray, Boolean> {
        val map = TreeMap<ByteArray, Boolean>(byteArrayOrder)
        for (key in keys) {
            map[key] = true
        }
        return map
    }
}

View File

@ -1,61 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
/**
 * A result output: one occurrence of a keyword inside a parsed text.
 *
 * @param V the value type
 */
class Hit<V> internal constructor(
    /**
     * the beginning index, inclusive.
     */
    val begin: Int,
    /**
     * the ending index, exclusive.
     */
    val end: Int,
    /**
     * the value assigned to the keyword
     */
    val value: V) {

    /**
     * Renders the hit as `[begin:end]=value`.
     *
     * A string template is used rather than `String.format`, so the digits do not depend on the default locale.
     */
    override fun toString(): String = "[$begin:$end]=$value"
}

View File

@ -1,54 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
/**
 * Processor handles the output when hit a keyword.
 *
 * Declared as a functional (SAM) interface, so a lambda can be passed wherever an [IHit] is expected; classes and
 * object expressions that implement it keep working unchanged.
 */
fun interface IHit<V> {
    /**
     * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
     *
     * @param begin the beginning index, inclusive.
     * @param end the ending index, exclusive.
     * @param value the value assigned to the keyword
     */
    fun hit(begin: Int,
            end: Int,
            value: V)
}

View File

@ -1,56 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
/**
 * Callback that allows to cancel the search process.
 *
 * Declared as a functional (SAM) interface, so a lambda can be passed wherever an [IHitCancellable] is expected;
 * classes and object expressions that implement it keep working unchanged.
 */
fun interface IHitCancellable<V> {
    /**
     * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
     *
     * @param begin the beginning index, inclusive.
     * @param end the ending index, exclusive.
     * @param value the value assigned to the keyword
     *
     * @return Return true for continuing the search and false for stopping it.
     */
    fun hit(begin: Int,
            end: Int,
            value: V): Boolean
}

View File

@ -1,56 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
/**
 * Processor handles the output when hit a keyword, with more detail.
 *
 * Declared as a functional (SAM) interface, so a lambda can be passed wherever an [IHitFull] is expected; classes
 * and object expressions that implement it keep working unchanged.
 */
fun interface IHitFull<V> {
    /**
     * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
     *
     * @param begin the beginning index, inclusive.
     * @param end the ending index, exclusive.
     * @param value the value assigned to the keyword
     * @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value
     */
    fun hit(begin: Int,
            end: Int,
            value: V,
            index: Int)
}

View File

@ -1,209 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.util.*
/**
 * One node of the byte-keyed keyword trie used while building the automaton.
 *
 * A state offers three things:
 *
 * * success: the goto table, i.e. the transitions to child states
 * * failure: the state to fall back to when no transition matches
 * * emits: the indices of the keywords recognised when this state is reached
 *
 * The root (depth 0) is special: it has no failure state of its own, and a missing transition at the root leads back
 * to the root itself (see [nextState]).
 *
 * @author Robert Bor
 */
class StateByte(
    /**
     * Depth of this state in the trie, which is also the length of the pattern prefix that leads to it
     */
    val depth: Int = 0) {

    /**
     * Fallback state used when no transition matches; `null` until [setFailure] has been called.
     */
    private var failure: StateByte? = null

    /**
     * Keyword indices recognised at this state; created lazily and kept in descending order.
     */
    private var emits: MutableSet<Int>? = null

    /**
     * The goto table: next state for each byte, ordered by byte value.
     */
    private val success = TreeMap<Byte, StateByte>()

    /**
     * Corresponding subscript in double array
     */
    var index: Int = 0

    /**
     * The largest keyword index recorded at this state, or `null` when nothing has been recorded.
     */
    val largestValueId: Int?
        get() = emits?.firstOrNull() // descending order, so the first element is the largest

    /**
     * Whether this is a terminal state: a non-root state on which at least one keyword was recorded.
     */
    val isAcceptable: Boolean
        get() = emits != null && depth > 0

    /**
     * All direct child states.
     */
    val states: Collection<StateByte>
        get() = success.values

    /**
     * All bytes for which this state has a transition.
     */
    val transitions: Collection<Byte>
        get() = success.keys

    /**
     * Records that the keyword with index [keyword] is recognised at this state.
     */
    fun addEmit(keyword: Int) {
        val recorded = emits ?: TreeSet<Int>(Collections.reverseOrder()).also { emits = it }
        recorded.add(keyword)
    }

    /**
     * Records several keyword indices at once.
     */
    fun addEmit(emits: Collection<Int>) {
        emits.forEach { addEmit(it) }
    }

    /**
     * The keyword indices recorded at this state; an empty collection when there are none.
     */
    fun emit(): Collection<Int> {
        val recorded = emits
        return if (recorded != null) recorded else emptyList()
    }

    /**
     * The failure state, or `null` if none has been set.
     */
    fun failure(): StateByte? = failure

    /**
     * Sets the failure state and mirrors it into the fail table: `fail[index]` becomes the index of [failState].
     */
    fun setFailure(failState: StateByte,
                   fail: IntArray) {
        failure = failState
        fail[index] = failState.index
    }

    /**
     * Move to the next state
     *
     * @param character the byte to follow
     * @param ignoreRootState when `false`, a missing transition at the root yields the root itself; when `true`, a
     * missing transition always yields `null`
     *
     * @return the next state, or `null` when there is none
     */
    private fun nextState(character: Byte,
                          ignoreRootState: Boolean): StateByte? {
        val target = success[character]
        if (target != null) {
            return target
        }
        return if (!ignoreRootState && depth == 0) this else null
    }

    /**
     * Follows [character]; at the root a missing transition returns the root itself, elsewhere it returns `null`.
     */
    fun nextState(character: Byte): StateByte? = nextState(character, false)

    /**
     * Follows [character]; a missing transition returns `null` for every state, the root included.
     */
    fun nextStateIgnoreRootState(character: Byte): StateByte? = nextState(character, true)

    /**
     * Returns the child state for [character], creating it one level deeper when it does not exist yet.
     */
    fun addState(character: Byte): StateByte = success.getOrPut(character) { StateByte(depth + 1) }

    override fun toString(): String {
        val failureId: Any = failure?.index ?: "-1"
        return "State{depth=$depth, ID=$index, emits=$emits, success=${success.keys}, " +
                "failureID=$failureId, failure=$failure}"
    }

    /**
     * Get goto table
     */
    fun getSuccess(): Map<Byte, StateByte> = success
}

View File

@ -1,209 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* AhoCorasickDoubleArrayTrie Project
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
*
* Copyright 2008-2018 hankcs <me@hankcs.com>
* You may modify and redistribute as long as this attribution remains.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import java.util.*
/**
 * A single state (node) of the Aho-Corasick trie whose transitions are keyed by [Char].
 *
 * A state has the following functions:
 *
 * * success: successfully transfer to another state
 * * failure: if you cannot move along the string, jump to a shallower node
 * * emits: a pattern string has been hit
 *
 * The root node is slightly different. The root node has no failure function; its "failure" refers to
 * moving to the next state according to the string path. Other nodes have a failure state.
 *
 * @author Robert Bor
 */
class StateChar(
    /**
     * Depth of this state; it is also the length of the pattern string that leads here.
     */
    val depth: Int = 0
) {
    /**
     * Fallback state used when there is no match; stays `null` until [setFailure] is called.
     */
    private var failure: StateChar? = null

    /**
     * Pattern ids recorded for this state. Created on the first [addEmit] and kept in descending order.
     */
    private var emits: MutableSet<Int>? = null

    /**
     * The goto table (transfer function): the next state for each outgoing character.
     */
    private val success = TreeMap<Char, StateChar>()

    /**
     * Corresponding subscript in the double array.
     */
    var index: Int = 0

    /**
     * The largest recorded pattern id, or `null` when nothing has been recorded.
     */
    val largestValueId: Int?
        // the set is ordered largest-first, so its first element is the maximum
        get() = emits?.firstOrNull()

    /**
     * Whether this is a terminating state: a non-root state that has had at least one emit recorded.
     */
    val isAcceptable: Boolean
        get() = emits != null && depth > 0

    /**
     * The states directly reachable from this one.
     */
    val states: Collection<StateChar>
        get() = success.values

    /**
     * The characters that have an outgoing transition from this state.
     */
    val transitions: Collection<Char>
        get() = success.keys

    /**
     * Records a matching pattern id for this state.
     */
    fun addEmit(keyword: Int) {
        val recorded = emits ?: TreeSet<Int>(Collections.reverseOrder()).also { emits = it }
        recorded.add(keyword)
    }

    /**
     * Records several matching pattern ids for this state.
     */
    fun addEmit(emits: Collection<Int>) {
        emits.forEach { id -> addEmit(id) }
    }

    /**
     * Returns the pattern ids recorded for this state (empty when there are none).
     */
    fun emit(): Collection<Int> = emits ?: emptyList()

    /**
     * Returns the failure state, or `null` when none has been set.
     */
    fun failure(): StateChar? = failure

    /**
     * Sets the failure state and stores its [index] in [fail] at this state's own [index].
     */
    fun setFailure(failState: StateChar,
                   fail: IntArray) {
        failure = failState
        fail[index] = failState.index
    }

    /**
     * Moves to the next state.
     *
     * @param character the character to transfer by
     * @param ignoreRootState when `false`, a failed lookup on the root (depth 0) yields the root itself;
     * when `true`, a failed lookup always yields `null`
     *
     * @return the transfer result
     */
    private fun nextState(character: Char,
                          ignoreRootState: Boolean): StateChar? {
        val target = success[character]
        if (target != null) {
            return target
        }

        return if (!ignoreRootState && depth == 0) this else null
    }

    /**
     * Transfers by [character]. A failed transfer on the root returns the root itself; on any other
     * state it returns `null`.
     */
    fun nextState(character: Char): StateChar? = nextState(character, false)

    /**
     * Transfers by [character]; a failed transfer returns `null` on every state, the root included.
     */
    fun nextStateIgnoreRootState(character: Char): StateChar? = nextState(character, true)

    /**
     * Returns the state reached by [character], creating it one level deeper when it does not exist yet.
     */
    fun addState(character: Char): StateChar = success.getOrPut(character) { StateChar(depth + 1) }

    override fun toString(): String {
        val failureId: Any = failure?.index ?: "-1"
        return "State{depth=$depth, ID=$index, emits=$emits, success=${success.keys}, " +
               "failureID=$failureId, failure=$failure}"
    }

    /**
     * Returns the goto table.
     */
    fun getSuccess(): Map<Char, StateChar> = success
}

View File

@ -1,6 +1,5 @@
module dorkbox.collections {
exports dorkbox.collections;
exports dorkbox.collections.ahoCorasick;
requires transitive dorkbox.updates;

View File

@ -1,245 +0,0 @@
/*
* Copyright 2023 dorkbox, llc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dorkbox.collections.ahoCorasick
import org.junit.Test
import java.util.*
/**
 * Smoke tests that build a [FiniteStateMachine] from the same keyword set through each `build`
 * overload used here (map, list and vararg; String and ByteArray keys) and print the results of
 * `partialMatch` / `matches` for a few inputs.
 *
 * NOTE(review): these tests only print; they contain no assertions, so they fail only if building or
 * querying throws. The expected match results cannot be determined from this file.
 */
class TestTrie {
    /**
     * Keywords every test builds its state machine from.
     */
    private val domains = arrayOf("khanacademy.com", "cnn.com", "google.com", "fun.reddit.com", "reddit.com")

    /**
     * Inputs that every search test queries, in reporting order.
     */
    private val searchTexts = listOf("reddit.google.com", "reddit.com", "fun.reddit.com")

    /**
     * Lexicographic ordering for byte arrays. A [TreeMap] needs this for ByteArray keys, because
     * ByteArray is not [Comparable] and inserting into a comparator-less TreeMap throws
     * ClassCastException.
     */
    private val byteArrayOrder = Comparator<ByteArray> { left, right ->
        val shared = minOf(left.size, right.size)
        for (i in 0 until shared) {
            val diff = left[i].compareTo(right[i])
            if (diff != 0) {
                return@Comparator diff
            }
        }

        // equal up to the shorter length: the shorter array sorts first
        left.size.compareTo(right.size)
    }

    /**
     * Returns the keywords encoded as byte arrays (default charset of `String.toByteArray()`).
     */
    private fun domainsAsBytes(): Array<ByteArray> = domains.map { it.toByteArray() }.toTypedArray()

    /**
     * Runs [report] for each search text, printing the "Searching" header first and a blank line
     * between (not after) consecutive reports.
     */
    private fun forEachSearch(report: (text: String) -> Unit) {
        searchTexts.forEachIndexed { position, text ->
            if (position > 0) {
                println()
            }
            println("Searching : $text")
            report(text)
        }
    }

    @Test
    fun trieFromStringMap() {
        val map = TreeMap<String, String>()
        for (key in domains) {
            map[key] = key
        }

        val fsm = FiniteStateMachine.build(map)
        forEachSearch { text ->
            println(fsm.partialMatch(text))
            println("Found: " + fsm.matches(text))
        }
    }

    @Test
    fun trieFromByteArrayMap() {
        // an explicit comparator is required: ByteArray has no natural ordering
        val map = TreeMap<ByteArray, String>(byteArrayOrder)
        for (key in domainsAsBytes()) {
            map[key] = String(key)
        }

        val fsm = FiniteStateMachine.build(map)
        forEachSearch { text ->
            fsm.partialMatch(text.toByteArray()).forEach { hit ->
                println(hit.toString())
            }
            println("Found: " + fsm.matches(text.toByteArray()))
        }
    }

    @Test
    fun trieFromStringList() {
        val keys = Arrays.asList(*domains)

        val fsm = FiniteStateMachine.build(keys)
        forEachSearch { text ->
            println(fsm.partialMatch(text))
            println("Found: " + fsm.matches(text))
        }
    }

    @Test
    fun trieFromByteArrayList() {
        val keys = Arrays.asList(*domainsAsBytes())

        val fsm = FiniteStateMachine.build(keys)
        forEachSearch { text ->
            fsm.partialMatch(text.toByteArray()).forEach { hit ->
                println(hit.toString())
            }
            println("Found: " + fsm.matches(text.toByteArray()))
        }
    }

    @Test
    fun trieFromStringVarArg() {
        val fsm = FiniteStateMachine.build(*domains)
        forEachSearch { text ->
            println(fsm.partialMatch(text))
            println("Found: " + fsm.matches(text))
        }
    }

    @Test
    fun trieFromByteArrayVarArg() {
        val fsm = FiniteStateMachine.build(*domainsAsBytes())
        forEachSearch { text ->
            fsm.partialMatch(text.toByteArray()).forEach { hit ->
                println(hit.toString())
            }
            println("Found: " + fsm.matches(text.toByteArray()))
        }
    }

    /**
     * Prints the original keywords next to the keywords reported by the built state machine.
     */
    @Test
    fun fmsOutput() {
        val fsm = FiniteStateMachine.build(*domains)

        println("Keywords Orig: " + Arrays.toString(domains))
        println("Keywords FSM : " + Arrays.toString(fsm.keywords))
    }
}