Moved FSM to its own project.
parent
2ffa58cbbd
commit
bb28977529
|
@ -19,7 +19,7 @@ Maven Info
|
|||
<dependency>
|
||||
<groupId>com.dorkbox</groupId>
|
||||
<artifactId>Collections</artifactId>
|
||||
<version>1.2</version>
|
||||
<version>1.3</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
```
|
||||
|
@ -29,7 +29,7 @@ Gradle Info
|
|||
```
|
||||
dependencies {
|
||||
...
|
||||
implementation("com.dorkbox:Collections:1.2")
|
||||
implementation("com.dorkbox:Collections:1.3")
|
||||
}
|
||||
```
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ object Extras {
|
|||
// set for the project
|
||||
const val description = "Niche collections to augment what is already available."
|
||||
const val group = "com.dorkbox"
|
||||
const val version = "1.2"
|
||||
const val version = "1.3"
|
||||
|
||||
// set as project.ext
|
||||
const val name = "Collections"
|
||||
|
@ -65,12 +65,6 @@ licensing {
|
|||
author(Extras.vendor)
|
||||
url(Extras.url)
|
||||
|
||||
extra("AhoCorasickDoubleArrayTrie", License.APACHE_2) {
|
||||
description(Extras.description)
|
||||
copyright(2018)
|
||||
author("hankcs <me@hankcs.com>")
|
||||
url("https://github.com/hankcs/AhoCorasickDoubleArrayTrie")
|
||||
}
|
||||
extra("Bias, BinarySearch", License.MIT) {
|
||||
url(Extras.url)
|
||||
url("https://github.com/timboudreau/util")
|
||||
|
|
|
@ -21,7 +21,7 @@ object Collections {
|
|||
/**
|
||||
* Gets the version number.
|
||||
*/
|
||||
const val version = "1.2"
|
||||
const val version = "1.3"
|
||||
|
||||
init {
|
||||
// Add this project to the updates system, which verifies this class + UUID + version information
|
||||
|
|
|
@ -1,348 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.util.*
|
||||
|
||||
/**
|
||||
* A builder to build the AhoCorasickDoubleArrayTrie
|
||||
*/
|
||||
internal abstract class BaseByteBuilder<K, V> {
|
||||
/**
|
||||
* the root state of trie
|
||||
*/
|
||||
internal var rootState: StateByte? = StateByte()
|
||||
|
||||
/**
|
||||
* whether the position has been used
|
||||
*/
|
||||
private var used: BooleanArray? = null
|
||||
|
||||
/**
|
||||
* the allocSize of the dynamic array
|
||||
*/
|
||||
private var allocSize: Int = 0
|
||||
|
||||
/**
|
||||
* a parameter that controls the memory growth speed of the dynamic array
|
||||
*/
|
||||
private var progress: Int = 0
|
||||
|
||||
/**
|
||||
* the next position to check unused memory
|
||||
*/
|
||||
private var nextCheckPos: Int = 0
|
||||
|
||||
/**
|
||||
* the size of the key-pair sets
|
||||
*/
|
||||
private var keySize: Int = 0
|
||||
|
||||
|
||||
lateinit var output: Array<IntArray?>
|
||||
lateinit var fail: IntArray
|
||||
lateinit var base: IntArray
|
||||
lateinit var check: IntArray
|
||||
var size: Int = 0
|
||||
|
||||
/**
 * Builds the double-array trie and the Aho-Corasick tables from the keys of [map].
 *
 * Only `map.keys` is read here; the values are not touched by this method.
 * When the call returns, [base], [check], [fail], [output] and [size] are populated
 * and the temporary state (`used`, `rootState`) has been released.
 *
 * @param map a map containing key-value pairs
 */
fun build(map: Map<K, V>) {
    val keywords = map.keys

    // Step 1: register every key through addKeyword (one call per key, indexed in iteration order).
    addAllKeyword(keywords)

    // Step 2: lay the state trie out into the base/check arrays.
    // The occupancy bitmap is only needed during that layout, so drop it right after.
    buildDoubleArrayTrie(keywords.size)
    used = null

    // Step 3: fill the failure and output tables; this still walks rootState, so release it afterwards.
    constructFailureStates()
    rootState = null

    // Step 4: shrink base/check down to the used size plus padding.
    loseWeight()
}
|
||||
|
||||
/**
 * Collects the children of [parent] into [siblings] as (code, state) pairs.
 *
 * If [parent] is acceptable, a synthetic state built with `-(parent.depth + 1)` and carrying
 * the parent's `largestValueId` is added first under code 0. Every entry of
 * `parent.getSuccess()` is then added under `key + 1`, which keeps code 0 free for that
 * synthetic entry.
 *
 * @param parent parent node
 * @param siblings receives the parent's child nodes, i.e. the siblings; entries are appended
 *
 * @return the size of [siblings] after appending (includes anything it already held)
 */
private fun fetch(parent: StateByte,
                  siblings: MutableList<Pair<Int, StateByte>>): Int {

    if (parent.isAcceptable) {
        // Terminal marker: a child that stands for "a keyword ends at the parent".
        val terminal = StateByte(-(parent.depth + 1))
        terminal.addEmit(parent.largestValueId!!)
        siblings.add(Pair(0, terminal))
    }

    for ((symbol, child) in parent.getSuccess()) {
        val code = symbol + 1
        siblings.add(Pair(code, child))
    }

    return siblings.size
}
|
||||
|
||||
/**
 * Registers a single keyword with this builder.
 *
 * Called by `addAllKeyword` once per key, with [index] counting up from 0 in the
 * iteration order of the key collection.
 * NOTE(review): implementations presumably insert the keyword's states under `rootState`;
 * confirm against the concrete builders.
 *
 * @param keyword a keyword
 * @param index the index of the keyword
 */
|
||||
internal abstract fun addKeyword(keyword: K, index: Int)
|
||||
|
||||
/**
 * Registers every keyword of [keywordSet] via [addKeyword].
 *
 * Each keyword is paired with its zero-based position in the collection's iteration order.
 *
 * @param keywordSet the collection holding keywords
 */
private fun addAllKeyword(keywordSet: Collection<K>) {
    keywordSet.forEachIndexed { position, keyword ->
        addKeyword(keyword, position)
    }
}
|
||||
|
||||
/**
 * Builds the failure table ([fail]) and, through [constructOutput], the output table ([output]).
 *
 * [fail] gets `size + 1` entries (at least 2) and [output] gets `size + 1` entries.
 * The states reachable from `rootState` are visited breadth-first.
 */
private fun constructFailureStates() {
    fail = IntArray((size + 1).coerceAtLeast(2))
    fail[1] = base[0]
    output = arrayOfNulls(size + 1)

    val pending = ArrayDeque<StateByte>()

    // Depth-1 states: their failure target is the root itself.
    for (child in this.rootState!!.states) {
        child.setFailure(this.rootState!!, fail)
        pending.add(child)
        constructOutput(child)
    }

    // Deeper states, in BFS order.
    while (pending.isNotEmpty()) {
        val state = pending.remove()

        for (symbol in state.transitions) {
            val next = state.nextState(symbol)
            pending.add(next)

            // Follow the failure chain until a state that has a transition on `symbol` is found.
            var fallback = state.failure()
            while (fallback!!.nextState(symbol) == null) {
                fallback = fallback.failure()
            }

            val nextFailure = fallback.nextState(symbol)
            next!!.setFailure(nextFailure!!, fail)
            // A state also emits everything its failure target emits.
            next.addEmit(nextFailure.emit())
            constructOutput(next)
        }
    }
}
|
||||
|
||||
/**
 * Copies the emits of [targetState] into the output table.
 *
 * Nothing is written when the state has no emits; otherwise `output[targetState.index]`
 * is set to an array holding the emitted values in iteration order.
 */
private fun constructOutput(targetState: StateByte) {
    val emitted = targetState.emit()
    if (emitted.isEmpty()) {
        return
    }

    val source = emitted.iterator()
    val values = IntArray(emitted.size) { source.next() }

    this.output[targetState.index] = values
}
|
||||
|
||||
/**
 * Lays the state trie rooted at `rootState` out into the [base]/[check] arrays.
 *
 * Resets the progress counter, remembers [keySize] (used by `insert` for its growth factor),
 * allocates the initial arrays and then inserts the root's children, which recursively
 * inserts the rest of the trie.
 *
 * @param keySize number of keys being built
 */
private fun buildDoubleArrayTrie(keySize: Int) {
    progress = 0
    this.keySize = keySize
    resize(65536 * 32) // 32 double bytes

    base[0] = 1
    nextCheckPos = 0

    val root = this.rootState!!
    val rootChildren = ArrayList<Pair<Int, StateByte>>(root.getSuccess().entries.size)
    fetch(root, rootChildren)

    // With no children there is nothing to lay out.
    if (rootChildren.isEmpty()) {
        return
    }
    insert(rootChildren)
}
|
||||
|
||||
/**
 * (Re)allocates `base`, `check` and `used` with [newSize] entries.
 *
 * When arrays were allocated before (`allocSize > 0`), their first `allocSize` entries are
 * carried over; the remaining entries keep their default values (0 / false).
 *
 * @return [newSize]
 */
private fun resize(newSize: Int): Int {
    val grownBase = IntArray(newSize)
    val grownCheck = IntArray(newSize)
    val grownUsed = BooleanArray(newSize)

    val carried = allocSize
    if (carried > 0) {
        base.copyInto(grownBase, 0, 0, carried)
        check.copyInto(grownCheck, 0, 0, carried)
        used!!.copyInto(grownUsed, 0, 0, carried)
    }

    base = grownBase
    check = grownCheck
    used = grownUsed
    allocSize = newSize

    return newSize
}
|
||||
|
||||
/**
 * Places [siblings] into the double array and recursively places their descendants.
 *
 * Searches for an offset `begin` such that `check[begin + code]` is 0 for every sibling code
 * and `used[begin]` is false, growing the arrays when needed. The slots are then claimed
 * (`check[begin + code] = begin`) and `base` is filled for each sibling: either with the
 * negative encoding `-(largestValueId) - 1` when the sibling has nothing below it, or with
 * the offset returned by the recursive call for its children.
 *
 * @param siblings the siblings being inserted; must not be empty
 *
 * @return the position to insert them (the chosen `begin`)
 */
private fun insert(siblings: List<Pair<Int, StateByte>>): Int {
    val firstCode = siblings[0].first
    val lastCode = siblings[siblings.size - 1].first

    var begin: Int
    var pos = maxOf(firstCode + 1, nextCheckPos) - 1
    var occupied = 0              // non-free slots skipped during this search
    var firstFreeSeen = false     // whether nextCheckPos has been moved to the first free slot

    if (allocSize <= pos) {
        resize(pos + 1)
    }

    // Find an offset where every sibling lands on a free slot.
    while (true) {
        pos++

        if (allocSize <= pos) {
            resize(pos + 1)
        }

        if (check[pos] != 0) {
            occupied++
            continue
        }
        if (!firstFreeSeen) {
            nextCheckPos = pos
            firstFreeSeen = true
        }

        // Offset that puts the first sibling exactly on `pos`.
        val candidate = pos - firstCode
        if (allocSize <= candidate + lastCode) {
            // progress can be zero, hence the +1 to avoid dividing by zero
            val growth = maxOf(1.05, 1.0 * keySize / (progress + 1))
            resize((allocSize * growth).toInt())
        }

        if (used!![candidate]) {
            continue
        }

        val collides = (1 until siblings.size).any { i -> check[candidate + siblings[i].first] != 0 }
        if (collides) {
            continue
        }

        begin = candidate
        break
    }

    // Simple heuristic: if at least 95% of the slots between nextCheckPos and pos are taken,
    // later searches start directly at pos.
    if (1.0 * occupied / (pos - nextCheckPos + 1) >= 0.95) {
        nextCheckPos = pos
    }
    used!![begin] = true // valid because resize is called.

    val sizeLimit = begin + lastCode + 1
    if (size <= sizeLimit) {
        size = sizeLimit
    }

    // Claim all slots first so the recursive calls below see them as taken.
    for ((code, _) in siblings) {
        check[begin + code] = begin
    }

    for ((code, state) in siblings) {
        val slot = begin + code
        val children = ArrayList<Pair<Int, StateByte>>(state.getSuccess().entries.size + 1)

        if (fetch(state, children) == 0) {
            // End of a word that is not a prefix of another word, i.e. a leaf.
            base[slot] = 0 - state.largestValueId!! - 1
            progress++
        }
        else {
            // Depth first. The result is stored in a local before indexing `base`,
            // because the recursive call may replace the `base` array via resize.
            val childBegin = insert(children)
            base[slot] = childBegin
        }
        state.index = slot
    }
    return begin
}
|
||||
|
||||
/**
 * Frees unnecessary memory by replacing [base] and [check] with copies of length `size + 65535`.
 */
private fun loseWeight() {
    val trimmedLength = size + 65535
    base = base.copyOf(trimmedLength)
    check = check.copyOf(trimmedLength)
}
|
||||
}
|
|
@ -1,558 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
@file:Suppress("unused")
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.io.IOException
|
||||
import java.io.ObjectInputStream
|
||||
import java.io.ObjectOutputStream
|
||||
import java.io.Serializable
|
||||
import java.util.*
|
||||
|
||||
/**
|
||||
* An implementation of Aho Corasick algorithm based on Double Array Trie
|
||||
*
|
||||
* Will create a DoubleArray Trie from a Map or InputStream (if previously saved)
|
||||
*
|
||||
* @author hankcs, dorkbox
|
||||
*/
|
||||
@Suppress("DuplicatedCode")
|
||||
abstract class BaseByteTrie<K, V>(map: Map<K, V>?, inputStream: ObjectInputStream?) : Serializable {
|
||||
|
||||
/**
|
||||
* check array of the Double Array Trie structure
|
||||
*/
|
||||
private val check: IntArray
|
||||
|
||||
/**
|
||||
* base array of the Double Array Trie structure
|
||||
*/
|
||||
private val base: IntArray
|
||||
|
||||
/**
|
||||
* fail table of the Aho Corasick automata
|
||||
*/
|
||||
private val fail: IntArray
|
||||
|
||||
/**
|
||||
* output table of the Aho Corasick automata
|
||||
*/
|
||||
private val output: Array<IntArray?>
|
||||
|
||||
/**
|
||||
* outer value array
|
||||
*/
|
||||
internal val v: Array<V>
|
||||
|
||||
/**
|
||||
* the length of every key
|
||||
*/
|
||||
internal val l: IntArray
|
||||
|
||||
/**
|
||||
* the size of base and check array
|
||||
*/
|
||||
private val checkSize: Int
|
||||
|
||||
init {
    if (map != null) {
        // Build from scratch: values come from the map, the tables come from the builder.
        @Suppress("UNCHECKED_CAST")
        v = kotlin.jvm.internal.collectionToArray(map.values) as Array<V>
        l = IntArray(map.size)

        @Suppress("LeakingThis")
        val trieBuilder = builder()
        trieBuilder.build(map)

        fail = trieBuilder.fail
        base = trieBuilder.base
        check = trieBuilder.check

        checkSize = trieBuilder.size
        output = trieBuilder.output
    }
    else if (inputStream != null) {
        // Restore a previously saved trie; the read order must mirror the write order in save().
        @Suppress("UNCHECKED_CAST")
        v = inputStream.readObject() as Array<V>
        l = inputStream.readObject() as IntArray

        fail = inputStream.readObject() as IntArray
        base = inputStream.readObject() as IntArray
        check = inputStream.readObject() as IntArray
        checkSize = inputStream.readObject() as Int

        @Suppress("UNCHECKED_CAST")
        output = inputStream.readObject() as Array<IntArray?>
    }
    else {
        throw NullPointerException("Map or InputStream must be specified!")
    }
}
|
||||
|
||||
/**
 * Supplies the builder used when this trie is constructed from a map.
 *
 * The constructor calls this once, runs `build(map)` on the result and then takes the
 * builder's `fail`, `base`, `check`, `size` and `output`.
 * NOTE(review): this is invoked from the base-class `init` block, so subclass properties
 * may not be initialized yet when it runs; confirm implementations do not depend on them.
 */
internal abstract fun builder(): BaseByteBuilder<K, V>
|
||||
|
||||
/**
 * Writes this trie to [out].
 *
 * The parts are written as seven consecutive objects, in this order:
 * values, key lengths, fail table, base array, check array, check size, output table.
 * The stream-based constructor path reads them back in the same order.
 */
@Throws(IOException::class)
fun save(out: ObjectOutputStream) {
    val parts = arrayOf<Any?>(v, l, fail, base, check, checkSize, output)
    for (part in parts) {
        out.writeObject(part)
    }
}
|
||||
|
||||
/**
 * Returns the backing value array, in case access to the original FSM data is needed.
 *
 * This is the same array instance the trie uses internally (no copy is made).
 * NOTE(review): the element order is whatever order the source map's values were iterated in
 * (or the order that was saved to the stream); it is "natural order" only if the map was sorted.
 *
 * @return for example, if the FSM was populated from a sorted map with [reddit.com, cnn.com],
 *         this will return [cnn.com, reddit.com]
 */
val keywords: Array<V>
    get() = v
|
||||
|
||||
/**
 * The number of entries in the value array, i.e. the number of keywords.
 */
val size: Int
    get() = v.size
|
||||
|
||||
/**
 * Parses the bytes and reports whether there is at least one (possibly PARTIAL) match.
 * For exact matches only it is better to use `matches`.
 *
 * All hits are collected via [parseBytes] before the result is computed.
 *
 * @return true if there is a match or partial match. "fun.reddit.com" will partially match to "reddit.com"
 */
fun hasPartialMatch(byteArray: ByteArray): Boolean = parseBytes(byteArray).isNotEmpty()
|
||||
|
||||
/**
 * Parses the bytes and finds PARTIALLY matching results. For exact matches only it is better to use `matches`.
 *
 * This simply delegates to [parseBytes].
 *
 * @return a list of hits that contain matches or partial matches. Each hit specifies HOW MUCH of the
 *         input matched (a full match would span from 0, the start, to N, the length of the input).
 */
fun partialMatch(byteArray: ByteArray): List<Hit<V>> = parseBytes(byteArray)
|
||||
|
||||
/**
 * Runs the automaton over [byteArray] and collects every hit.
 *
 * Positions are 1-based: after consuming the byte at index `i`, hits are stored with
 * position `i + 1`.
 *
 * @return a list of hits, in the order they were encountered
 */
fun parseBytes(byteArray: ByteArray): List<Hit<V>> {
    // The number of hits is not known up front, hence a linked list.
    val hits = LinkedList<Hit<V>>()
    var state = 0

    byteArray.forEachIndexed { index, byte ->
        state = getState(state, byte)
        storeEmits(index + 1, state, hits)
    }

    return hits
}
|
||||
|
||||
/**
 * Runs the automaton over [byteArray], handing every hit to [processor] until it asks to stop.
 *
 * Positions are 1-based: after consuming the byte at index `i`, each hit is reported as
 * `processor.hit(position - l[hit], position, v[hit])` with `position = i + 1`. This matches
 * the other `parseBytes` overloads and `findFirst`; previously this overload advanced the
 * position before reporting, so every reported range was shifted by one.
 *
 * @param byteArray The text
 * @param processor A processor which handles the output; parsing stops as soon as its
 *                  `hit` call returns false
 */
fun parseBytes(byteArray: ByteArray,
               processor: IHitCancellable<V>
) {
    var currentState = 0
    for ((index, element) in byteArray.withIndex()) {
        val position = index + 1
        currentState = getState(currentState, element)

        val hitArray = output[currentState] ?: continue
        for (hit in hitArray) {
            val proceed = processor.hit(position - l[hit], position, v[hit])
            if (!proceed) {
                return
            }
        }
    }
}
|
||||
|
||||
/**
 * Runs the automaton over [byteArray], handing every hit to [processor].
 *
 * Positions are 1-based: after consuming the byte at index `i`, each hit is reported as
 * `processor.hit(position - l[hit], position, v[hit])` with `position = i + 1`.
 *
 * @param byteArray The text
 * @param processor A processor which handles the output
 */
fun parseBytes(byteArray: ByteArray,
               processor: IHit<V>
) {
    var state = 0
    byteArray.forEachIndexed { index, byte ->
        state = getState(state, byte)
        val position = index + 1
        output[state]?.forEach { hit ->
            processor.hit(position - l[hit], position, v[hit])
        }
    }
}
|
||||
|
||||
/**
 * Runs the automaton over [byteArray], handing every hit, together with its index in the
 * value array, to [processor].
 *
 * Positions are 1-based: after consuming the byte at index `i`, each hit is reported as
 * `processor.hit(position - l[hit], position, v[hit], hit)` with `position = i + 1`.
 *
 * @param byteArray The text
 * @param processor A processor which handles the output
 */
fun parseBytes(byteArray: ByteArray,
               processor: IHitFull<V>
) {
    var state = 0
    byteArray.forEachIndexed { index, byte ->
        state = getState(state, byte)
        val position = index + 1
        output[state]?.forEach { hit ->
            processor.hit(position - l[hit], position, v[hit], hit)
        }
    }
}
|
||||
|
||||
/**
 * Checks whether the input contains at least one keyword.
 *
 * Scanning stops at the first state that has an output entry.
 *
 * @param byteArray source byte array to check
 *
 * @return `true` if the input contains at least one keyword
 */
fun matches(byteArray: ByteArray): Boolean {
    var state = 0
    return byteArray.any { byte ->
        state = getState(state, byte)
        output[state] != null
    }
}
|
||||
|
||||
/**
 * Searches for the first match in the input.
 *
 * Scanning stops at the first state that has an output entry; the hit is built from the
 * first element of that entry, using the 1-based position of the byte just consumed.
 *
 * @param byteArray source byte array to check
 *
 * @return first match or `null` if there are no matches
 */
fun findFirst(byteArray: ByteArray): Hit<V>? {
    var state = 0
    for (index in byteArray.indices) {
        state = getState(state, byteArray[index])

        val hits = output[state] ?: continue
        val position = index + 1
        val hitIndex = hits[0]
        return Hit(position - l[hitIndex], position, v[hitIndex])
    }
    return null
}
|
||||
|
||||
/**
 * Picks the value at [index] in the value array.
 *
 * For efficiency this method does NOT validate the parameter; an out-of-range index
 * fails with the array's own bounds exception.
 *
 * @param index The index
 *
 * @return The value
 */
operator fun get(index: Int): V = v[index]
|
||||
|
||||
/**
 * Computes the state reached from [currentState] on [character], following failure links.
 *
 * The direct transition is tried first; while it yields -1, the state is replaced by its
 * entry in the fail table and the transition is retried.
 */
private fun getState(currentState: Int,
                     character: Byte): Int {

    var state = currentState
    while (true) {
        val next = transitionWithRoot(state, character)
        if (next != -1) {
            return next
        }
        // The transition failed: fall back along the failure link and try again.
        state = fail[state]
    }
}
|
||||
|
||||
/**
 * Appends one hit to [collectedEmits] for every entry in the output table of [currentState].
 *
 * Each hit is created as `Hit(position - l[hit], position, v[hit])`. Nothing is added when
 * the state has no output entry.
 */
private fun storeEmits(position: Int,
                       currentState: Int,
                       collectedEmits: MutableList<Hit<V>>) {
    output[currentState]?.forEach { hit ->
        collectedEmits.add(Hit(position - l[hit], position, v[hit]))
    }
}
|
||||
|
||||
/**
 * Transition of a state on a character.
 *
 * Looks at slot `current + c.code + 1`: if its check entry equals [current], the slot's
 * base entry is returned, otherwise -1.
 * NOTE(review): this takes a Char although the class works on bytes, and nothing visible
 * in this class calls it.
 */
private fun transition(current: Int,
                       c: Char): Int {
    val slot = current + c.code + 1
    return if (current == check[slot]) base[slot] else -1
}
|
||||
|
||||
/**
 * Transition of a state; if the state is the root and the transition fails, the root is returned.
 *
 * With `b = base[nodePos]`, the target slot is `b + c + 1`. The transition succeeds when the
 * slot's check entry equals `b`, in which case the slot index is returned. Otherwise the
 * result is 0 for the root (`nodePos == 0`) and -1 for any other state.
 * NOTE(review): Byte is signed, so for negative bytes the slot index can fall below `b`
 * (or below 0); confirm how the builder encodes bytes >= 0x80.
 */
private fun transitionWithRoot(nodePos: Int,
                               c: Byte): Int {
    val offset = base[nodePos]
    val slot = offset + c + 1

    if (offset == check[slot]) {
        return slot
    }
    return if (nodePos == 0) 0 else -1
}
|
||||
|
||||
/**
 * Matches exactly by a key, over the whole array and starting from the root.
 *
 * @param byteArray the key
 *
 * @return the index of the key (you can use it as a perfect hash function), or -1 when
 *         the key is not found
 */
fun exactMatchSearch(byteArray: ByteArray): Int = exactMatchSearch(byteArray, 0, 0, 0)
|
||||
|
||||
/**
 * Matches exactly by a key.
 *
 * @param byteArray the key
 * @param pos index of the first byte to consume
 * @param len exclusive end index of the bytes to consume (the loop runs `pos until len`);
 *            values <= 0 mean "up to the end of the array"
 * @param nodePos node to start from; values <= 0 mean the root (0)
 *
 * @return the index of the key, or -1 when a transition fails or the final node does not
 *         hold a value
 */
fun exactMatchSearch(byteArray: ByteArray,
                     pos: Int,
                     len: Int,
                     nodePos: Int): Int {

    val end = if (len <= 0) byteArray.size else len
    val startNode = if (nodePos <= 0) 0 else nodePos

    var offset = base[startNode]

    for (i in pos until end) {
        val slot = offset + byteArray[i] + 1
        if (offset != check[slot]) {
            return -1
        }
        offset = base[slot]
    }

    // A stored key ends in a slot whose check entry equals its own index and whose base is
    // negative; that base holds the value index encoded as -(index) - 1.
    val encoded = base[offset]
    return if (offset == check[offset] && encoded < 0) -encoded - 1 else -1
}
|
||||
|
||||
// /**
|
||||
// * Just for debug when I wrote it
|
||||
// */
|
||||
// public void debug()
|
||||
// {
|
||||
// System.out.println("base:");
|
||||
// for (int i = 0; i < base.length; i++)
|
||||
// {
|
||||
// if (base[i] < 0)
|
||||
// {
|
||||
// System.out.println(i + " : " + -base[i]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// System.out.println("output:");
|
||||
// for (int i = 0; i < output.length; i++)
|
||||
// {
|
||||
// if (output[i] != null)
|
||||
// {
|
||||
// System.out.println(i + " : " + Arrays.toString(output[i]));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// System.out.println("fail:");
|
||||
// for (int i = 0; i < fail.length; i++)
|
||||
// {
|
||||
// if (fail[i] != 0)
|
||||
// {
|
||||
// System.out.println(i + " : " + fail[i]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// System.out.println(this);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String toString()
|
||||
// {
|
||||
// String infoIndex = "i = ";
|
||||
// String infoChar = "char = ";
|
||||
// String infoBase = "base = ";
|
||||
// String infoCheck = "check= ";
|
||||
// for (int i = 0; i < Math.min(base.length, 200); ++i)
|
||||
// {
|
||||
// if (base[i] != 0 || check[i] != 0)
|
||||
// {
|
||||
// infoChar += " " + (i == check[i] ? " ×" : (char) (i - check[i] - 1));
|
||||
// infoIndex += " " + String.format("%5d", i);
|
||||
// infoBase += " " + String.format("%5d", base[i]);
|
||||
// infoCheck += " " + String.format("%5d", check[i]);
|
||||
// }
|
||||
// }
|
||||
// return "DoubleArrayTrie:" +
|
||||
// "\n" + infoChar +
|
||||
// "\n" + infoIndex +
|
||||
// "\n" + infoBase +
|
||||
// "\n" + infoCheck + "\n" +
|
||||
//// "check=" + Arrays.toString(check) +
|
||||
//// ", base=" + Arrays.toString(base) +
|
||||
//// ", used=" + Arrays.toString(used) +
|
||||
// "size=" + size
|
||||
//// ", length=" + Arrays.toString(length) +
|
||||
//// ", value=" + Arrays.toString(value) +
|
||||
// ;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * A debug class that sequentially outputs variable names and variable values
|
||||
// */
|
||||
// private static class DebugArray
|
||||
// {
|
||||
// Map<String, String> nameValueMap = new LinkedHashMap<String, String>();
|
||||
//
|
||||
// public void add(String name, int value)
|
||||
// {
|
||||
// String valueInMap = nameValueMap.get(name);
|
||||
// if (valueInMap == null)
|
||||
// {
|
||||
// valueInMap = "";
|
||||
// }
|
||||
//
|
||||
// valueInMap += " " + String.format("%5d", value);
|
||||
//
|
||||
// nameValueMap.put(name, valueInMap);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String toString()
|
||||
// {
|
||||
// String text = "";
|
||||
// for (Map.Entry<String, String> entry : nameValueMap.entrySet())
|
||||
// {
|
||||
// String name = entry.getKey();
|
||||
// String value = entry.getValue();
|
||||
// text += String.format("%-5s", name) + "= " + value + '\n';
|
||||
// }
|
||||
//
|
||||
// return text;
|
||||
// }
|
||||
//
|
||||
// public void println()
|
||||
// {
|
||||
// System.out.print(this);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
}
|
|
@ -1,348 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.util.*
|
||||
|
||||
/**
|
||||
* A builder to build the AhoCorasickDoubleArrayTrie
|
||||
*/
|
||||
internal abstract class BaseCharBuilder<K, V> {
|
||||
/**
|
||||
* the root state of trie
|
||||
*/
|
||||
internal var rootState: StateChar? = StateChar()
|
||||
|
||||
/**
|
||||
* whether the position has been used
|
||||
*/
|
||||
private var used: BooleanArray? = null
|
||||
|
||||
/**
|
||||
* the allocSize of the dynamic array
|
||||
*/
|
||||
private var allocSize: Int = 0
|
||||
|
||||
/**
|
||||
* a parameter that controls the memory growth speed of the dynamic array
|
||||
*/
|
||||
private var progress: Int = 0
|
||||
|
||||
/**
|
||||
* the next position to check unused memory
|
||||
*/
|
||||
private var nextCheckPos: Int = 0
|
||||
|
||||
/**
|
||||
* the size of the key-pair sets
|
||||
*/
|
||||
private var keySize: Int = 0
|
||||
|
||||
|
||||
lateinit var output: Array<IntArray?>
|
||||
lateinit var fail: IntArray
|
||||
lateinit var base: IntArray
|
||||
lateinit var check: IntArray
|
||||
var size: Int = 0
|
||||
|
||||
/**
|
||||
* Build from a map
|
||||
*
|
||||
* @param map a map containing key-value pairs
|
||||
*/
|
||||
fun build(map: Map<K, V>) {
|
||||
val keySet = map.keys
|
||||
|
||||
// Construct a two-point trie tree
|
||||
addAllKeyword(keySet)
|
||||
|
||||
// Building a double array trie tree based on a two-point trie tree
|
||||
buildDoubleArrayTrie(keySet.size)
|
||||
used = null
|
||||
|
||||
// Build the failure table and merge the output table
|
||||
constructFailureStates()
|
||||
rootState = null
|
||||
loseWeight()
|
||||
}
|
||||
|
||||
/**
|
||||
* fetch siblings of a parent node
|
||||
*
|
||||
* @param parent parent node
|
||||
* @param siblings parent node's child nodes, i . e . the siblings
|
||||
*
|
||||
* @return the amount of the siblings
|
||||
*/
|
||||
private fun fetch(parent: StateChar,
|
||||
siblings: MutableList<Pair<Int, StateChar>>): Int {
|
||||
|
||||
if (parent.isAcceptable) {
|
||||
// This node is a child of the parent and has the output of the parent.
|
||||
val fakeNode = StateChar(-(parent.depth + 1))
|
||||
fakeNode.addEmit(parent.largestValueId!!)
|
||||
siblings.add(Pair(0, fakeNode))
|
||||
}
|
||||
|
||||
for ((key, value) in parent.getSuccess()) {
|
||||
siblings.add(Pair(key.code + 1, value))
|
||||
}
|
||||
|
||||
return siblings.size
|
||||
}
|
||||
|
||||
/**
|
||||
* add a keyword
|
||||
*
|
||||
* @param keyword a keyword
|
||||
* @param index the index of the keyword
|
||||
*/
|
||||
internal abstract fun addKeyword(keyword: K, index: Int)
|
||||
|
||||
/**
 * Adds every keyword of [keywordSet] via [addKeyword], numbering them 0, 1, 2, ... in the
 * iteration order of the collection.
 *
 * @param keywordSet the collection holding keywords
 */
private fun addAllKeyword(keywordSet: Collection<K>) {
    keywordSet.forEachIndexed { index, keyword -> addKeyword(keyword, index) }
}
|
||||
|
||||
/**
 * Builds the failure table and, along the way, the output table.
 *
 * The intermediate trie is walked breadth first: depth-1 states fail to the root, every deeper
 * state fails to the state reached by following its parent's failure chain until a state with the
 * same transition is found. Emits of the failure target are merged into the state before its
 * output entry is written.
 */
private fun constructFailureStates() {
    fail = IntArray(maxOf(size + 1, 2))
    // NOTE(review): the reason for seeding fail[1] with base[0] is not evident from this function.
    fail[1] = base[0]
    output = arrayOfNulls(size + 1)

    val pending = ArrayDeque<StateChar>()

    // Depth 1: the failure of every direct child of the root is the root itself.
    for (depthOne in this.rootState!!.states) {
        depthOne.setFailure(this.rootState!!, fail)
        pending.add(depthOne)
        constructOutput(depthOne)
    }

    // Depth > 1: breadth-first over the remaining states.
    while (pending.isNotEmpty()) {
        val parent = pending.remove()

        for (transition in parent.transitions) {
            val child = parent.nextState(transition)
            pending.add(child)

            // Walk the failure chain until a state that can take the same transition is found.
            var candidate = parent.failure()
            while (candidate!!.nextState(transition) == null) {
                candidate = candidate.failure()
            }

            val fallback = candidate.nextState(transition)
            child!!.setFailure(fallback!!, fail)
            child.addEmit(fallback.emit())
            constructOutput(child)
        }
    }
}
|
||||
|
||||
/**
 * Writes the emits of [targetState] into the output table at the state's index.
 *
 * States without emits are skipped, so their output entry is left untouched.
 */
private fun constructOutput(targetState: StateChar) {
    val emitted = targetState.emit()
    if (emitted.isEmpty()) return

    // Copy the emits, in iteration order, into a primitive array.
    val ids = emitted.iterator()
    this.output[targetState.index] = IntArray(emitted.size) { ids.next() }
}
|
||||
|
||||
/**
 * Lays the intermediate trie out into the `base`/`check` arrays.
 *
 * Resets the build counters, allocates the initial arrays (65536 * 32 slots) and, if the root has
 * any children, inserts them; [insert] then recurses through the rest of the trie.
 *
 * @param keySize number of keys being built, remembered for the growth heuristic in [insert]
 */
private fun buildDoubleArrayTrie(keySize: Int) {
    progress = 0
    this.keySize = keySize
    resize(65536 * 32)

    base[0] = 1
    nextCheckPos = 0

    val root = this.rootState!!
    val rootChildren = ArrayList<Pair<Int, StateChar>>(root.getSuccess().entries.size)

    // fetch() reports how many siblings it collected; nothing to insert for an empty trie.
    if (fetch(root, rootChildren) > 0) {
        insert(rootChildren)
    }
}
|
||||
|
||||
/**
 * Re-allocates the `base`, `check` and `used` arrays with [newSize] slots, carrying over the first
 * `allocSize` entries of the current arrays (when there are any).
 *
 * @param newSize the new number of slots
 *
 * @return [newSize]
 */
private fun resize(newSize: Int): Int {
    val grownBase = IntArray(newSize)
    val grownCheck = IntArray(newSize)
    val grownUsed = BooleanArray(newSize)

    // On the very first call nothing has been allocated yet, so there is nothing to copy.
    if (allocSize > 0) {
        base.copyInto(grownBase, 0, 0, allocSize)
        check.copyInto(grownCheck, 0, 0, allocSize)
        used!!.copyInto(grownUsed, 0, 0, allocSize)
    }

    base = grownBase
    check = grownCheck
    used = grownUsed
    allocSize = newSize

    return newSize
}
|
||||
|
||||
/**
 * Places [siblings] (the children of one trie node) into the double array and recurses into each
 * of them, depth first.
 *
 * A `begin` offset is searched such that `begin` itself has not been handed out before and
 * `check[begin + code]` is free for every sibling code. The slots are then claimed by writing
 * `begin` into `check`, and `base` of every sibling is set either to the `begin` of its own
 * children or, for a node without children, to `-(largest value id) - 1`.
 *
 * @param siblings the siblings being inserted; must not be empty
 *
 * @return the `begin` offset chosen for these siblings
 */
private fun insert(siblings: List<Pair<Int, StateChar>>): Int {
    val lowestCode = siblings[0].first
    val highestCode = siblings[siblings.size - 1].first

    var begin: Int
    var pos = maxOf(lowestCode + 1, nextCheckPos) - 1
    var occupiedSeen = 0
    var firstFreeSeen = false

    if (allocSize <= pos) {
        resize(pos + 1)
    }

    // Search for a position where every sibling lands on a free check slot.
    while (true) {
        pos++

        if (allocSize <= pos) {
            resize(pos + 1)
        }

        if (check[pos] != 0) {
            occupiedSeen++
            continue
        }
        if (!firstFreeSeen) {
            // Remember the first free slot as the starting point of the next search.
            nextCheckPos = pos
            firstFreeSeen = true
        }

        // Offset such that the first sibling lands exactly on pos.
        begin = pos - lowestCode
        if (allocSize <= begin + highestCode) {
            // progress + 1 keeps the divisor non-zero while no leaf has been written yet.
            val remainingRatio = 1.0 * keySize / (progress + 1)
            resize((allocSize * maxOf(1.05, remainingRatio)).toInt())
        }

        if (used!![begin]) {
            continue
        }

        val collides = (1 until siblings.size).any { check[begin + siblings[it].first] != 0 }
        if (!collides) {
            break
        }
    }

    // Simple heuristic: if at least 95% of the slots between nextCheckPos and pos are occupied,
    // start the next search directly at pos.
    if (1.0 * occupiedSeen / (pos - nextCheckPos + 1) >= 0.95) {
        nextCheckPos = pos
    }
    used!![begin] = true // in range, the arrays were grown above if needed

    size = maxOf(size, begin + highestCode + 1)

    // Claim all slots before recursing, so that nested inserts see them as occupied.
    for ((code, _) in siblings) {
        check[begin + code] = begin
    }

    for ((code, state) in siblings) {
        val children = ArrayList<Pair<Int, StateChar>>(state.getSuccess().entries.size + 1)
        val slot = begin + code

        if (fetch(state, children) == 0) {
            // No children at all: store the value id, encoded as a negative number.
            val valueId = state.largestValueId!!
            base[slot] = -valueId - 1
            progress++
        }
        else {
            // The recursive insert may replace the base array (resize), so it has to finish
            // before base is indexed.
            val childBegin = insert(children)
            base[slot] = childBegin
        }
        state.index = slot
    }

    return begin
}
|
||||
|
||||
/**
 * Re-allocates `base` and `check` so that both hold exactly `size + 65535` entries.
 *
 * NOTE(review): the 65535 extra entries presumably keep `base + char code + 1` lookups inside the
 * arrays - confirm against the lookup code.
 */
private fun loseWeight() {
    val finalLength = size + 65535
    base = base.copyOf(finalLength)
    check = check.copyOf(finalLength)
}
|
||||
}
|
|
@ -1,595 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
@file:Suppress("unused")
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.io.IOException
|
||||
import java.io.ObjectInputStream
|
||||
import java.io.ObjectOutputStream
|
||||
import java.io.Serializable
|
||||
import java.util.*
|
||||
|
||||
/**
|
||||
* An implementation of Aho Corasick algorithm based on Double Array Trie
|
||||
*
|
||||
* Will create a DoubleArray Trie from a Map or InputStream (if previously saved)
|
||||
*
|
||||
* @author hankcs, dorkbox
|
||||
*/
|
||||
abstract class BaseCharTrie<K, V>(map: Map<K, V>?, inputStream: ObjectInputStream?) : Serializable {
|
||||
|
||||
/**
|
||||
* check array of the Double Array Trie structure
|
||||
*/
|
||||
private val check: IntArray
|
||||
|
||||
/**
|
||||
* base array of the Double Array Trie structure
|
||||
*/
|
||||
private val base: IntArray
|
||||
|
||||
/**
|
||||
* fail table of the Aho Corasick automata
|
||||
*/
|
||||
private val fail: IntArray
|
||||
|
||||
/**
|
||||
* output table of the Aho Corasick automata
|
||||
*/
|
||||
private val output: Array<IntArray?>
|
||||
|
||||
/**
|
||||
* outer value array
|
||||
*/
|
||||
internal val v: Array<V>
|
||||
|
||||
/**
|
||||
* the length of every key
|
||||
*/
|
||||
internal val l: IntArray
|
||||
|
||||
/**
|
||||
* the size of base and check array
|
||||
*/
|
||||
private val checkSize: Int
|
||||
|
||||
init {
    if (map != null) {
        // Build from scratch. The value array and the length array must exist before the builder
        // runs: the builder's addKeyword implementation writes the key lengths into `l`.
        @Suppress("UNCHECKED_CAST")
        v = kotlin.jvm.internal.collectionToArray(map.values) as Array<V>
        l = IntArray(map.size)

        val tables = builder()
        tables.build(map)

        fail = tables.fail
        base = tables.base
        check = tables.check

        checkSize = tables.size
        output = tables.output
    }
    else if (inputStream != null) {
        // Restore a previously saved trie; the read order mirrors the write order of save().
        @Suppress("UNCHECKED_CAST")
        v = inputStream.readObject() as Array<V>
        l = inputStream.readObject() as IntArray

        fail = inputStream.readObject() as IntArray
        base = inputStream.readObject() as IntArray
        check = inputStream.readObject() as IntArray
        checkSize = inputStream.readObject() as Int

        @Suppress("UNCHECKED_CAST")
        output = inputStream.readObject() as Array<IntArray?>
    }
    else {
        throw NullPointerException("Map or InputStream must be specified!")
    }
}
|
||||
|
||||
/**
 * Creates the builder used when this trie is constructed from a map.
 *
 * It is invoked from the initializer block; the returned builder's `build(map)` is called once
 * and its `fail`, `base`, `check`, `size` and `output` results are copied into this trie.
 */
internal abstract fun builder(): BaseCharBuilder<K, V>
|
||||
|
||||
/**
 * Writes the trie to [out] as seven consecutive objects: the value array, the key lengths, the
 * fail/base/check tables, the check size and the output table - in exactly this order.
 */
@Throws(IOException::class)
fun save(out: ObjectOutputStream) {
    val parts = listOf<Any?>(v, l, fail, base, check, checkSize, output)
    for (part in parts) {
        out.writeObject(part)
    }
}
|
||||
|
||||
|
||||
/**
 * The number of entries in this trie (the length of the value array).
 */
val size: Int
    get() = v.size
|
||||
|
||||
/**
 * Returns the backing array `v` itself (no copy is made).
 *
 * NOTE(review): despite the property name, `v` is the *value* array of the trie (it is filled
 * from `map.values` when built from a map), not the keys - confirm that callers expect this.
 */
val keywords: Array<V>
    get() = v
|
||||
|
||||
/**
 * Checks whether any keyword occurs anywhere inside [text].
 *
 * The scan stops at the first hit instead of collecting every hit of the whole text first.
 *
 * @return true if there is a match or partial match. "fun.reddit.com" will partially match to "reddit.com"
 */
fun hasPartialMatch(text: String): Boolean {
    var state = 0
    for (character in text) {
        state = getState(state, character)

        // A state reports hits only if its output entry exists and is not empty.
        val hits = output[state]
        if (hits != null && hits.isNotEmpty()) {
            return true
        }
    }
    return false
}
|
||||
|
||||
/**
 * Finds every occurrence of a keyword inside [text]; this simply forwards to [parseText].
 *
 * @return one [Hit] per occurrence. Each hit states which part of the text matched: a hit that
 * covers the complete text runs from 0 (the start) to N (the length of the text).
 */
fun partialMatch(text: String): List<Hit<V>> = parseText(text)
|
||||
|
||||
/**
 * Runs the automaton over [text] and collects every hit.
 *
 * @return the hits in the order they were found; the reported end of a hit is the 1-based
 * position of the character that completed it
 */
fun parseText(text: CharSequence): List<Hit<V>> {
    val hits = LinkedList<Hit<V>>() // number of hits is not known up front
    var state = 0

    text.forEachIndexed { index, character ->
        state = getState(state, character)
        storeEmits(index + 1, state, hits)
    }

    return hits
}
|
||||
|
||||
/**
 * Runs the automaton over [text] and reports every hit to [processor].
 *
 * @param text The text
 * @param processor receives `(begin, end, value)` per hit, where `end` is the 1-based position
 * of the character that completed the hit and `begin` is `end` minus the key length
 */
fun parseText(text: CharSequence,
              processor: IHit<V>
) {
    var state = 0
    for (index in text.indices) {
        state = getState(state, text[index])

        val end = index + 1
        output[state]?.forEach { hit ->
            processor.hit(end - l[hit], end, v[hit])
        }
    }
}
|
||||
|
||||
/**
 * Runs the automaton over [text] and reports every hit to [processor] until the processor asks
 * to stop.
 *
 * Positions are reported exactly like in the other `parseText` overloads: `end` is the 1-based
 * position of the character that completed the hit and `begin` is `end` minus the key length.
 *
 * @param text The text
 * @param processor receives `(begin, end, value)` per hit; parsing stops as soon as it returns
 * `false`
 */
fun parseText(text: CharSequence,
              processor: IHitCancellable<V>
) {
    var position = 1
    var currentState = 0
    for (element in text) {
        currentState = getState(currentState, element)

        val hitArray = output[currentState]
        if (hitArray != null) {
            for (hit in hitArray) {
                val proceed = processor.hit(position - l[hit], position, v[hit])
                if (!proceed) {
                    return
                }
            }
        }

        // Advance only after the hits of this character were reported; incrementing up front
        // shifted begin and end by one compared to all other overloads.
        position++
    }
}
|
||||
|
||||
/**
 * Runs the automaton over the characters of [text] and reports every hit to [processor].
 *
 * @param text The text
 * @param processor receives `(begin, end, value)` per hit, where `end` is the 1-based position
 * of the character that completed the hit and `begin` is `end` minus the key length
 */
fun parseText(text: CharArray,
              processor: IHit<V>
) {
    var state = 0
    text.forEachIndexed { index, character ->
        state = getState(state, character)

        val hits = output[state]
        if (hits != null) {
            val end = index + 1
            for (hit in hits) {
                processor.hit(end - l[hit], end, v[hit])
            }
        }
    }
}
|
||||
|
||||
/**
 * Runs the automaton over the characters of [text] and reports every hit, including the index
 * of the matched entry, to [processor].
 *
 * @param text The text
 * @param processor receives `(begin, end, value, index)` per hit, where `end` is the 1-based
 * position of the character that completed the hit, `begin` is `end` minus the key length and
 * `index` is the position of the entry in the value array
 */
fun parseText(text: CharArray,
              processor: IHitFull<V>
) {
    var state = 0
    text.forEachIndexed { index, character ->
        state = getState(state, character)

        val hits = output[state]
        if (hits != null) {
            val end = index + 1
            for (hit in hits) {
                processor.hit(end - l[hit], end, v[hit], hit)
            }
        }
    }
}
|
||||
|
||||
/**
 * Checks whether [text] contains at least one keyword as a substring. Scanning stops at the
 * first state that has an output entry.
 *
 * @param text source text to check
 *
 * @return `true` if string contains at least one substring
 */
fun matches(text: String): Boolean {
    var state = 0
    return text.any { character ->
        state = getState(state, character)
        output[state] != null
    }
}
|
||||
|
||||
/**
 * Searches for the first hit in [text].
 *
 * The scan stops at the first state that has an output entry; of that entry only the first
 * element is reported.
 *
 * @param text source text to check
 *
 * @return first match or `null` if there are no matches
 */
fun findFirst(text: String): Hit<V>? {
    var state = 0
    for ((index, character) in text.withIndex()) {
        state = getState(state, character)

        val hits = output[state] ?: continue
        val firstHit = hits[0]
        val end = index + 1
        return Hit(end - l[firstHit], end, v[firstHit])
    }
    return null
}
|
||||
|
||||
/**
 * Returns the value stored at [index] of the value array.
 *
 * For efficiency the index is NOT validated here; an index outside the array fails with the
 * usual array bounds exception.
 *
 * @param index The index
 *
 * @return The value
 */
operator fun get(index: Int): V = v[index]
|
||||
|
||||
|
||||
/**
 * Computes the next state for [character], following the failure table when needed.
 *
 * Starting at [currentState], the transition is tried; while it reports `-1` the state is
 * replaced by its failure state and the transition is tried again.
 */
private fun getState(currentState: Int,
                     character: Char): Int {
    var from = currentState
    while (true) {
        val next = transitionWithRoot(from, character)
        if (next != -1) {
            return next
        }
        // No transition from here: fall back along the failure table and retry.
        from = fail[from]
    }
}
|
||||
|
||||
/**
 * Appends one [Hit] per output entry of [currentState] to [collectedEmits]; does nothing when
 * the state has no output.
 *
 * @param position end position reported for the hits; the begin is derived from the key length
 */
private fun storeEmits(position: Int,
                       currentState: Int,
                       collectedEmits: MutableList<Hit<V>>) {
    output[currentState]?.mapTo(collectedEmits) { hit ->
        Hit(position - l[hit], position, v[hit])
    }
}
|
||||
|
||||
/**
 * Plain transition: looks at slot `current + char code + 1` and returns its `base` entry when
 * the slot's `check` entry equals [current], or `-1` otherwise.
 */
private fun transition(current: Int,
                       c: Char): Int {
    val slot = current + c.code + 1
    return if (check[slot] == current) base[slot] else -1
}
|
||||
|
||||
/**
 * Transition of the state at [nodePos]: returns the child slot for [c] when it belongs to this
 * state. When there is no such child, the root (position 0) stays at the root and every other
 * state reports `-1`.
 */
private fun transitionWithRoot(nodePos: Int,
                               c: Char): Int {
    val childBase = base[nodePos]
    val slot = childBase + c.code + 1

    return when {
        check[slot] == childBase -> slot
        nodePos == 0 -> 0
        else -> -1
    }
}
|
||||
|
||||
/**
 * Exact lookup of a key given as a character array; the whole array is used, starting at the
 * root.
 *
 * @param keyChars the key (as a Character array)
 *
 * @return the index of the key (usable as a perfect hash), or a negative value if the key is
 * not present
 */
fun exactMatchSearch(keyChars: CharArray): Int = exactMatchSearch(keyChars, 0, 0, 0)
|
||||
|
||||
/**
 * Exact lookup of a key given as a string; the whole string is used, starting at the root.
 *
 * @param key the key
 *
 * @return the index of the key (usable as a perfect hash), or a negative value if the key is
 * not present
 */
fun exactMatchSearch(key: String): Int {
    val keyChars = key.toCharArray()
    return exactMatchSearch(keyChars, pos = 0, len = 0, nodePos = 0)
}
|
||||
|
||||
/**
 * Exact lookup of a key inside a character array.
 *
 * @param keyChars the char array of the key
 * @param pos the start index of char array
 * @param len the index (exclusive) at which the key ends in [keyChars]; values `<= 0` select
 * the full array length
 * @param nodePos the starting position of the node for searching; values `<= 0` select the root
 *
 * @return the value index of the key, `-1` if the key is not present
 */
internal fun exactMatchSearch(keyChars: CharArray,
                              pos: Int,
                              len: Int,
                              nodePos: Int): Int {
    val end = if (len <= 0) keyChars.size else len
    val start = nodePos.coerceAtLeast(0)

    // Follow the key character by character; stop as soon as a transition is missing.
    var current = base[start]
    for (i in pos until end) {
        val slot = current + keyChars[i].code + 1
        if (current != check[slot]) {
            return -1
        }
        current = base[slot]
    }

    // The key is present only if the reached state owns slot `current` and that slot stores a
    // negative (encoded) value index.
    val encoded = base[current]
    return if (current == check[current] && encoded < 0) -encoded - 1 else -1
}
|
||||
|
||||
// /**
|
||||
// * Just for debug when I wrote it
|
||||
// */
|
||||
// public void debug()
|
||||
// {
|
||||
// System.out.println("base:");
|
||||
// for (int i = 0; i < base.length; i++)
|
||||
// {
|
||||
// if (base[i] < 0)
|
||||
// {
|
||||
// System.out.println(i + " : " + -base[i]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// System.out.println("output:");
|
||||
// for (int i = 0; i < output.length; i++)
|
||||
// {
|
||||
// if (output[i] != null)
|
||||
// {
|
||||
// System.out.println(i + " : " + Arrays.toString(output[i]));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// System.out.println("fail:");
|
||||
// for (int i = 0; i < fail.length; i++)
|
||||
// {
|
||||
// if (fail[i] != 0)
|
||||
// {
|
||||
// System.out.println(i + " : " + fail[i]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// System.out.println(this);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String toString()
|
||||
// {
|
||||
// String infoIndex = "i = ";
|
||||
// String infoChar = "char = ";
|
||||
// String infoBase = "base = ";
|
||||
// String infoCheck = "check= ";
|
||||
// for (int i = 0; i < Math.min(base.length, 200); ++i)
|
||||
// {
|
||||
// if (base[i] != 0 || check[i] != 0)
|
||||
// {
|
||||
// infoChar += " " + (i == check[i] ? " ×" : (char) (i - check[i] - 1));
|
||||
// infoIndex += " " + String.format("%5d", i);
|
||||
// infoBase += " " + String.format("%5d", base[i]);
|
||||
// infoCheck += " " + String.format("%5d", check[i]);
|
||||
// }
|
||||
// }
|
||||
// return "DoubleArrayTrie:" +
|
||||
// "\n" + infoChar +
|
||||
// "\n" + infoIndex +
|
||||
// "\n" + infoBase +
|
||||
// "\n" + infoCheck + "\n" +
|
||||
//// "check=" + Arrays.toString(check) +
|
||||
//// ", base=" + Arrays.toString(base) +
|
||||
//// ", used=" + Arrays.toString(used) +
|
||||
// "size=" + size
|
||||
//// ", length=" + Arrays.toString(length) +
|
||||
//// ", value=" + Arrays.toString(value) +
|
||||
// ;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * A debug class that sequentially outputs variable names and variable values
|
||||
// */
|
||||
// private static class DebugArray
|
||||
// {
|
||||
// Map<String, String> nameValueMap = new LinkedHashMap<String, String>();
|
||||
//
|
||||
// public void add(String name, int value)
|
||||
// {
|
||||
// String valueInMap = nameValueMap.get(name);
|
||||
// if (valueInMap == null)
|
||||
// {
|
||||
// valueInMap = "";
|
||||
// }
|
||||
//
|
||||
// valueInMap += " " + String.format("%5d", value);
|
||||
//
|
||||
// nameValueMap.put(name, valueInMap);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String toString()
|
||||
// {
|
||||
// String text = "";
|
||||
// for (Map.Entry<String, String> entry : nameValueMap.entrySet())
|
||||
// {
|
||||
// String name = entry.getKey();
|
||||
// String value = entry.getValue();
|
||||
// text += String.format("%-5s", name) + "= " + value + '\n';
|
||||
// }
|
||||
//
|
||||
// return text;
|
||||
// }
|
||||
//
|
||||
// public void println()
|
||||
// {
|
||||
// System.out.print(this);
|
||||
// }
|
||||
// }
|
||||
|
||||
}
|
|
@ -1,97 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.io.ObjectInputStream
|
||||
|
||||
/**
 * Double array trie whose keys are byte arrays.
 *
 * Exactly as the base class requires, either [map] (build from scratch) or [inputStream]
 * (restore a saved trie) is passed on to it.
 */
class DoubleArrayByteArrayTrie<V>(map: Map<ByteArray, V>? = null, inputStream: ObjectInputStream? = null):
    BaseByteTrie<ByteArray, V>(map, inputStream) {

    override fun builder(): BaseByteBuilder<ByteArray, V> = object: BaseByteBuilder<ByteArray, V>() {
        /**
         * Adds a keyword: walks/extends the trie byte by byte from the root, marks the final
         * state with [index] and records the keyword length for that index.
         *
         * @param keyword a keyword
         * @param index the index of the keyword
         */
        override fun addKeyword(keyword: ByteArray, index: Int) {
            var node = this.rootState
            for (byte in keyword) {
                node = node!!.addState(byte)
            }

            node!!.addEmit(index)
            this@DoubleArrayByteArrayTrie.l[index] = keyword.size
        }
    }

    /**
     * Looks a value up by its ByteArray key, just like a map.get() call.
     *
     * @param key The key
     *
     * @return the value of the key, or `null` when the key is not part of the trie
     */
    operator fun get(key: ByteArray): V? {
        return exactMatchSearch(key).let { index -> if (index >= 0) v[index] else null }
    }

    /**
     * Replaces the value of an existing key.
     *
     * @param key the key
     * @param value the value
     *
     * @return `true` when the key exists and its value was replaced, `false` when there is no
     * such key
     */
    operator fun set(key: ByteArray,
                     value: V): Boolean {
        val index = exactMatchSearch(key)
        if (index < 0) {
            return false
        }

        v[index] = value
        return true
    }
}
|
|
@ -1,95 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.io.ObjectInputStream
|
||||
|
||||
/**
 * Double array trie whose keys are strings.
 *
 * Exactly as the base class requires, either [map] (build from scratch) or [inputStream]
 * (restore a saved trie) is passed on to it.
 */
class DoubleArrayStringTrie<V>(map: Map<String, V>? = null,
                               inputStream: ObjectInputStream? = null): BaseCharTrie<String, V>(map, inputStream) {

    override fun builder(): BaseCharBuilder<String, V> = object: BaseCharBuilder<String, V>() {
        /**
         * Adds a keyword: walks/extends the trie character by character from the root, marks
         * the final state with [index] and records the keyword length for that index.
         *
         * @param keyword a keyword
         * @param index the index of the keyword
         */
        override fun addKeyword(keyword: String, index: Int) {
            var node = this.rootState
            for (character in keyword.toCharArray()) {
                node = node!!.addState(character)
            }

            node!!.addEmit(index)
            this@DoubleArrayStringTrie.l[index] = keyword.length
        }
    }

    /**
     * Looks a value up by its String key, just like a map.get() call.
     *
     * @param key The key
     *
     * @return the value of the key, or `null` when the key is not part of the trie
     */
    operator fun get(key: String): V? {
        return exactMatchSearch(key).let { index -> if (index >= 0) v[index] else null }
    }

    /**
     * Replaces the value of an existing key.
     *
     * @param key the key
     * @param value the value
     *
     * @return `true` when the key exists and its value was replaced, `false` when there is no
     * such key
     */
    operator fun set(key: String,
                     value: V): Boolean {
        val index = exactMatchSearch(key)
        if (index < 0) {
            return false
        }

        v[index] = value
        return true
    }
}
|
|
@ -1,70 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.util.*
|
||||
|
||||
/**
 * Creates a Finite State Machine for very fast string matching.
 *
 * This is a wrapper for DoubleArrayTrie, since that class is awkward to use
 */
object FiniteStateMachine {
    /**
     * Lexicographic order for byte arrays (bytes compared as unsigned values, a shorter array
     * sorts before a longer one it is a prefix of).
     *
     * Byte arrays do not implement [Comparable], so a [TreeMap] keyed by them needs an explicit
     * comparator; without one the first insertion fails with a ClassCastException.
     */
    private val BYTE_ARRAY_ORDER = Comparator<ByteArray> { left, right -> compareBytes(left, right) }

    /** Compares two byte arrays element by element, see [BYTE_ARRAY_ORDER]. */
    private fun compareBytes(left: ByteArray, right: ByteArray): Int {
        for (i in 0 until minOf(left.size, right.size)) {
            val difference = (left[i].toInt() and 0xFF) - (right[i].toInt() and 0xFF)
            if (difference != 0) {
                return difference
            }
        }
        return left.size - right.size
    }

    /**
     * Builds a trie from string keys and their values.
     */
    fun <V> build(map: Map<String, V>): DoubleArrayStringTrie<V> {
        return DoubleArrayStringTrie(map)
    }

    /**
     * Builds a trie from byte-array keys and their values.
     */
    fun <V> build(map: Map<ByteArray, V>): DoubleArrayByteArrayTrie<V> {
        return DoubleArrayByteArrayTrie(map)
    }

    /**
     * Builds a trie that contains the given strings; every key is mapped to `true`. Keys are
     * sorted and de-duplicated first.
     */
    fun build(strings: List<String>): DoubleArrayStringTrie<Boolean> {
        return build(strings.associateWithTo(TreeMap<String, Boolean>()) { true })
    }

    /**
     * Builds a trie that contains the given byte arrays; every key is mapped to `true`. Keys are
     * sorted lexicographically and arrays with equal content are de-duplicated first.
     */
    fun build(strings: List<ByteArray>): DoubleArrayByteArrayTrie<Boolean> {
        return build(strings.associateWithTo(TreeMap<ByteArray, Boolean>(BYTE_ARRAY_ORDER)) { true })
    }

    /**
     * Builds a trie that contains the given strings; every key is mapped to `true`. Keys are
     * sorted and de-duplicated first.
     */
    fun build(vararg strings: String): DoubleArrayStringTrie<Boolean> {
        return build(strings.associateWithTo(TreeMap<String, Boolean>()) { true })
    }

    /**
     * Builds a trie that contains the given byte arrays; every key is mapped to `true`. Keys are
     * sorted lexicographically and arrays with equal content are de-duplicated first.
     */
    fun build(vararg strings: ByteArray): DoubleArrayByteArrayTrie<Boolean> {
        return build(strings.associateWithTo(TreeMap<ByteArray, Boolean>(BYTE_ARRAY_ORDER)) { true })
    }
}
|
|
@ -1,61 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
/**
 * A result output: one keyword hit.
 *
 * @param V the value type
 */
class Hit<V> internal constructor(
    /**
     * the beginning index, inclusive.
     */
    val begin: Int,
    /**
     * the ending index, exclusive.
     */
    val end: Int,
    /**
     * the value assigned to the keyword
     */
    val value: V) {

    /**
     * Renders this hit as `[begin:end]=value`.
     *
     * A string template is used instead of `String.format`, because `%d` formatting follows the default locale and
     * may produce localized (non-ASCII) digits.
     */
    override fun toString(): String {
        return "[$begin:$end]=$value"
    }
}
|
|
@ -1,54 +0,0 @@
|
|||
|
||||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
/**
 * Handler that is invoked when a keyword is hit.
 *
 * @param V the type of the value assigned to a keyword
 */
interface IHit<V> {
    /**
     * Reports a keyword hit. The keyword itself can be recovered with something like `text.substring(begin, end)`.
     *
     * @param begin index where the hit starts, inclusive
     * @param end index where the hit stops, exclusive
     * @param value the value that was assigned to the keyword
     */
    fun hit(begin: Int, end: Int, value: V)
}
|
|
@ -1,56 +0,0 @@
|
|||
|
||||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
/**
 * Handler that is invoked when a keyword is hit and that can cancel the search process.
 *
 * @param V the type of the value assigned to a keyword
 */
interface IHitCancellable<V> {
    /**
     * Reports a keyword hit. The keyword itself can be recovered with something like `text.substring(begin, end)`.
     *
     * @param begin index where the hit starts, inclusive
     * @param end index where the hit stops, exclusive
     * @param value the value that was assigned to the keyword
     *
     * @return `true` to continue the search, `false` to stop it
     */
    fun hit(begin: Int, end: Int, value: V): Boolean
}
|
|
@ -1,56 +0,0 @@
|
|||
|
||||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
/**
 * Handler that is invoked when a keyword is hit, with more detail than [IHit]-style callbacks: the index of the
 * value is reported as well.
 *
 * @param V the type of the value assigned to a keyword
 */
interface IHitFull<V> {
    /**
     * Reports a keyword hit. The keyword itself can be recovered with something like `text.substring(begin, end)`.
     *
     * @param begin index where the hit starts, inclusive
     * @param end index where the hit stops, exclusive
     * @param value the value that was assigned to the keyword
     * @param index the index of the value assigned to the keyword; the integer can be used as a perfect hash value
     */
    fun hit(begin: Int, end: Int, value: V, index: Int)
}
|
|
@ -1,209 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.util.*
|
||||
|
||||
/**
 * A state of the automaton whose transitions are keyed by [Byte].
 *
 * A state has the following functions:
 *
 *  * success; successfully transferred to another state
 *  * failure; if you cannot jump along the string, jump to a shallow node
 *  * emits; hit a pattern string
 *
 * The root node is slightly different. The root node has no failure function. Its "failure" refers to moving to the
 * next state according to the string path. Other nodes have a failure state.
 *
 * @author Robert Bor
 */
class StateByte(
    /**
     * Depth of this state; it equals the length of the pattern string that leads here.
     */
    val depth: Int = 0) {

    /**
     * The state to jump to when there is no matching transition.
     */
    private var failure: StateByte? = null

    /**
     * Keyword ids recorded for this state, kept in descending order. Created lazily on the first [addEmit].
     */
    private var emits: MutableSet<Int>? = null

    /**
     * The goto table (transfer function): the next state for each possible next byte.
     */
    private val success = TreeMap<Byte, StateByte>()

    /**
     * Corresponding subscript in the double array.
     */
    var index: Int = 0

    /**
     * The largest recorded keyword id, or `null` when nothing has been recorded.
     */
    val largestValueId: Int?
        get() = emits?.firstOrNull() // descending order, so the first element is the largest

    /**
     * Whether this is a terminating state: a non-root state for which emits have been recorded.
     */
    val isAcceptable: Boolean
        get() = emits != null && depth > 0

    /**
     * The states directly reachable from this state.
     */
    val states: Collection<StateByte>
        get() = success.values

    /**
     * The bytes for which this state has a transition.
     */
    val transitions: Collection<Byte>
        get() = success.keys

    /**
     * Records a matching pattern string (by id) for this state.
     */
    fun addEmit(keyword: Int) {
        val recorded = emits ?: TreeSet<Int>(Collections.reverseOrder<Int>()).also { emits = it }
        recorded.add(keyword)
    }

    /**
     * Records several matching pattern strings (by id) for this state.
     */
    fun addEmit(emits: Collection<Int>) {
        emits.forEach { keyword -> addEmit(keyword) }
    }

    /**
     * Returns the keyword ids recorded for this state; empty when nothing has been recorded.
     */
    fun emit(): Collection<Int> = emits ?: emptyList()

    /**
     * Returns the failure state, or `null` when none has been set.
     */
    fun failure(): StateByte? = failure

    /**
     * Sets the failure state and stores its index in [fail] at this state's own [index].
     */
    fun setFailure(failState: StateByte, fail: IntArray) {
        failure = failState
        fail[index] = failState.index
    }

    /**
     * Moves to the next state.
     *
     * @param character the byte to transfer by
     * @param ignoreRootState when `false`, a root state (depth 0) without a matching transition returns itself
     *
     * @return the transfer result, or `null` when there is none
     */
    private fun nextState(character: Byte, ignoreRootState: Boolean): StateByte? {
        val target = success[character]
        if (target != null) {
            return target
        }
        return if (depth == 0 && !ignoreRootState) this else null
    }

    /**
     * Transfers by [character]; a failed transfer on the root state returns the root state itself, on any other
     * state it returns `null`.
     */
    fun nextState(character: Byte): StateByte? = nextState(character, false)

    /**
     * Transfers by [character]; a failed transfer returns `null` on every state, the root state included.
     */
    fun nextStateIgnoreRootState(character: Byte): StateByte? = nextState(character, true)

    /**
     * Returns the state reached by [character], creating it (one level deeper) when it does not exist yet.
     */
    fun addState(character: Byte): StateByte {
        return success.getOrPut(character) { StateByte(depth + 1) }
    }

    override fun toString(): String {
        val failureId = failure?.index?.toString() ?: "-1"
        return "State{depth=$depth, ID=$index, emits=$emits, success=${success.keys}, failureID=$failureId, failure=$failure}"
    }

    /**
     * Returns the goto table.
     */
    fun getSuccess(): Map<Byte, StateByte> = success
}
|
|
@ -1,209 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* AhoCorasickDoubleArrayTrie Project
|
||||
* https://github.com/hankcs/AhoCorasickDoubleArrayTrie
|
||||
*
|
||||
* Copyright 2008-2018 hankcs <me@hankcs.com>
|
||||
* You may modify and redistribute as long as this attribution remains.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import java.util.*
|
||||
|
||||
/**
 * A state of the automaton whose transitions are keyed by [Char].
 *
 * A state has the following functions:
 *
 *  * success; successfully transferred to another state
 *  * failure; if you cannot jump along the string, jump to a shallow node
 *  * emits; hit a pattern string
 *
 * The root node is slightly different. The root node has no failure function. Its "failure" refers to moving to the
 * next state according to the string path. Other nodes have a failure state.
 *
 * @author Robert Bor
 */
class StateChar(
    /**
     * Depth of this state; it equals the length of the pattern string that leads here.
     */
    val depth: Int = 0) {

    /**
     * The state to jump to when there is no matching transition.
     */
    private var failure: StateChar? = null

    /**
     * Keyword ids recorded for this state, kept in descending order. Created lazily on the first [addEmit].
     */
    private var emits: MutableSet<Int>? = null

    /**
     * The goto table (transfer function): the next state for each possible next character.
     */
    private val success = TreeMap<Char, StateChar>()

    /**
     * Corresponding subscript in the double array.
     */
    var index: Int = 0

    /**
     * The largest recorded keyword id, or `null` when nothing has been recorded.
     */
    val largestValueId: Int?
        get() = emits?.firstOrNull() // descending order, so the first element is the largest

    /**
     * Whether this is a terminating state: a non-root state for which emits have been recorded.
     */
    val isAcceptable: Boolean
        get() = emits != null && depth > 0

    /**
     * The states directly reachable from this state.
     */
    val states: Collection<StateChar>
        get() = success.values

    /**
     * The characters for which this state has a transition.
     */
    val transitions: Collection<Char>
        get() = success.keys

    /**
     * Records a matching pattern string (by id) for this state.
     */
    fun addEmit(keyword: Int) {
        val recorded = emits ?: TreeSet<Int>(Collections.reverseOrder<Int>()).also { emits = it }
        recorded.add(keyword)
    }

    /**
     * Records several matching pattern strings (by id) for this state.
     */
    fun addEmit(emits: Collection<Int>) {
        emits.forEach { keyword -> addEmit(keyword) }
    }

    /**
     * Returns the keyword ids recorded for this state; empty when nothing has been recorded.
     */
    fun emit(): Collection<Int> = emits ?: emptyList()

    /**
     * Returns the failure state, or `null` when none has been set.
     */
    fun failure(): StateChar? = failure

    /**
     * Sets the failure state and stores its index in [fail] at this state's own [index].
     */
    fun setFailure(failState: StateChar, fail: IntArray) {
        failure = failState
        fail[index] = failState.index
    }

    /**
     * Moves to the next state.
     *
     * @param character the character to transfer by
     * @param ignoreRootState when `false`, a root state (depth 0) without a matching transition returns itself
     *
     * @return the transfer result, or `null` when there is none
     */
    private fun nextState(character: Char, ignoreRootState: Boolean): StateChar? {
        val target = success[character]
        if (target != null) {
            return target
        }
        return if (depth == 0 && !ignoreRootState) this else null
    }

    /**
     * Transfers by [character]; a failed transfer on the root state returns the root state itself, on any other
     * state it returns `null`.
     */
    fun nextState(character: Char): StateChar? = nextState(character, false)

    /**
     * Transfers by [character]; a failed transfer returns `null` on every state, the root state included.
     */
    fun nextStateIgnoreRootState(character: Char): StateChar? = nextState(character, true)

    /**
     * Returns the state reached by [character], creating it (one level deeper) when it does not exist yet.
     */
    fun addState(character: Char): StateChar {
        return success.getOrPut(character) { StateChar(depth + 1) }
    }

    override fun toString(): String {
        val failureId = failure?.index?.toString() ?: "-1"
        return "State{depth=$depth, ID=$index, emits=$emits, success=${success.keys}, failureID=$failureId, failure=$failure}"
    }

    /**
     * Returns the goto table.
     */
    fun getSuccess(): Map<Char, StateChar> = success
}
|
|
@ -1,6 +1,5 @@
|
|||
module dorkbox.collections {
|
||||
exports dorkbox.collections;
|
||||
exports dorkbox.collections.ahoCorasick;
|
||||
|
||||
requires transitive dorkbox.updates;
|
||||
|
||||
|
|
|
@ -1,245 +0,0 @@
|
|||
/*
|
||||
* Copyright 2023 dorkbox, llc
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package dorkbox.collections.ahoCorasick
|
||||
|
||||
import org.junit.Test
|
||||
import java.util.*
|
||||
|
||||
/**
 * Smoke tests for [FiniteStateMachine]: every factory overload is used to build a trie, which is then searched with
 * a few sample texts. The results are printed, not asserted.
 */
class TestTrie {
    /** The keywords every test builds its trie from. */
    private val keywords = arrayOf("khanacademy.com", "cnn.com", "google.com", "fun.reddit.com", "reddit.com")

    /** The texts that are searched, in order. */
    private val searchTexts = listOf("reddit.google.com", "reddit.com", "fun.reddit.com")

    /**
     * Content-based ordering for byte arrays (element by element, shorter array first on a shared prefix).
     *
     * `ByteArray` is not `Comparable`, so a `TreeMap` keyed by byte arrays throws a `ClassCastException` on its
     * first insertion unless it is given a comparator.
     */
    private val byteArrayOrder = Comparator<ByteArray> { left, right ->
        val shared = minOf(left.size, right.size)
        for (i in 0 until shared) {
            val diff = left[i].compareTo(right[i])
            if (diff != 0) {
                return@Comparator diff
            }
        }
        left.size.compareTo(right.size)
    }

    @Test
    fun trieFromStringMap() {
        val map = TreeMap<String, String>()
        for (key in keywords) {
            map[key] = key
        }

        searchAndPrint(FiniteStateMachine.build(map))
    }

    @Test
    fun trieFromByteArrayMap() {
        // the comparator is required: natural ordering does not exist for ByteArray keys
        val map = TreeMap<ByteArray, String>(byteArrayOrder)
        for (key in keywordBytes()) {
            map[key] = String(key)
        }

        searchAndPrint(FiniteStateMachine.build(map))
    }

    @Test
    fun trieFromStringList() {
        val keys = Arrays.asList(*keywords)
        searchAndPrint(FiniteStateMachine.build(keys))
    }

    @Test
    fun trieFromByteArrayList() {
        val keys = Arrays.asList(*keywordBytes())
        searchAndPrint(FiniteStateMachine.build(keys))
    }

    @Test
    fun trieFromStringVarArg() {
        searchAndPrint(FiniteStateMachine.build(*keywords))
    }

    @Test
    fun trieFromByteArrayVarArg() {
        searchAndPrint(FiniteStateMachine.build(*keywordBytes()))
    }

    @Test
    fun fmsOutput() {
        val fsm = FiniteStateMachine.build(*keywords)

        println("Keywords Orig: " + Arrays.toString(keywords))
        println("Keywords FSM : " + Arrays.toString(fsm.keywords))
    }

    /** Returns the keywords encoded as byte arrays (platform default charset, as `toByteArray()` does). */
    private fun keywordBytes(): Array<ByteArray> {
        return Array(keywords.size) { keywords[it].toByteArray() }
    }

    /** Searches every sample text in the string trie and prints the partial matches and the match result. */
    private fun <V> searchAndPrint(fsm: DoubleArrayStringTrie<V>) {
        searchTexts.forEachIndexed { position, text ->
            if (position > 0) {
                println() // blank line between two searches
            }

            println("Searching : $text")
            println(fsm.partialMatch(text))
            println("Found: " + fsm.matches(text))
        }
    }

    /** Searches every sample text in the byte-array trie and prints the partial matches and the match result. */
    private fun <V> searchAndPrint(fsm: DoubleArrayByteArrayTrie<V>) {
        searchTexts.forEachIndexed { position, text ->
            if (position > 0) {
                println() // blank line between two searches
            }

            println("Searching : $text")
            val result = fsm.partialMatch(text.toByteArray())
            result.forEach { hit ->
                println(hit.toString())
            }
            println("Found: " + fsm.matches(text.toByteArray()))
        }
    }
}
|
Loading…
Reference in New Issue