BackwardsTreeScanner.java
/*
* BackwardsTreeScanner.java
*
* Created on February 20, 2002, 1:19 PM
*/
package emissary.util.search;
import java.io.PrintStream;
import javax.annotation.Nullable;
/**
* This class implements a tree state machine scanner that searches text backwards starting from the end. A list of
* strings is provided as the keywords to be searched. This class is useful for a relatively small set of keywords.
* Larger keyword lists can be used if you have memory!
*
* @author ce
* @version 1.0
*/
@SuppressWarnings("AvoidObjectArrays")
public class BackwardsTreeScanner {
// The internal structure of the offset-keyword id array
// contained in the hit vectors
/** Root node of tree state diagram. Always start a search from here! */
private State root = new State((byte) 0);
/**
* Optional empty constructor
*/
public BackwardsTreeScanner() {
}
/**
* Constructor that delegates learning keywords to resetKeywords method.
*
* @param keywordStrings - array of keywords to learn
* @throws Exception - thrown if problem encountered
*/
public BackwardsTreeScanner(String[] keywordStrings) throws Exception {
resetKeywords(keywordStrings);
}
/**
* Resets keywords and internal State learns them. This method destroys previous state.
*
* @param keywordStrings - String of keywords to learn
* @throws Exception - if problem encountered while learning
*/
public synchronized void resetKeywords(String[] keywordStrings) throws Exception {
// make byte arrays
// Original list of keywords stored in byte array form.
byte[][] keywords = new byte[keywordStrings.length][];
root = new State((byte) 0); // reset state
// and learn them
for (int i = 0; i < keywords.length; i++) {
keywords[i] = keywordStrings[i].getBytes();
root.learn(keywordStrings[i].getBytes(), i);
}
// root.print(System.out);
}
/**
* This scans the byte array backwards from the offset. Each hit is added to the result vector. We stop when all
* possibilities are found
*/
public synchronized int scan(byte[] data, int offset, HitList result) throws Exception {
if (result == null) {
throw new Exception("Null result vector in 3rd parameter of scan()");
}
// reset the state machine
State state = root;
int curPos = offset;
while (state != null && curPos >= 0) {
// Save any matches for this state.
// get the next character
byte ch = data[curPos];
// move to the next state. Really complicated, right?
if (ch < 0) {
state = state.nextStates[256 + (int) ch];
} else {
state = state.nextStates[ch];
}
// move the the previous character
if (state != null && state.matches != null) {
for (int i = 0; i < state.matches.length; i++) {
int id = state.matches[i];
Hit tmp = new Hit(curPos, id);
result.add(tmp);
}
}
curPos--;
}
return curPos;
}
/*
* This class implements a state machine that can learn character sequences.
*/
public class State {
// Each state has 256 transitions leaving it to new states based
// on a single ascii character. If there is no next state for a
// character, then the next state will be null.
public State[] nextStates = new State[256];
// Each state can be visited by a single character. This is it!
public byte gotHereBy;
// A list of keyword ids that are matched at this state.
@Nullable
public int[] matches = null;
// constructor
public State(byte gotHereBy) {
this.gotHereBy = gotHereBy;
}
public void learn(byte[] word, int id) throws Exception {
learn(word, word.length - 1, id);
}
/**
* Walk through the keyword backwards. Adding states to the root (or current state) when they don't exists. At the end,
* record the keyword id in the ending state.
*
* Warning this is recursive, but that is OK for small keywords.
*/
public void learn(byte[] word, int wordLoc, int id) throws Exception {
if (word == null) {
throw new Exception("null keyword in BackwardsTreeScanner.learn()");
}
if (wordLoc >= word.length) {
throw new Exception("char pos > word length:" + wordLoc + ">" + word.length);
}
if (wordLoc < 0) {
// we are finished because this is the first character,
// so save the id in this state. We want the matches to be
// in an array so this is a little harder than a vector thing.
if (matches == null) {
matches = new int[0];
}
int[] newMatches = new int[matches.length + 1];
System.arraycopy(matches, 0, newMatches, 0, matches.length);
matches = newMatches;
matches[matches.length - 1] = id;
} else {
// Get the next character in the word
byte nextChar = word[wordLoc];
// See if the state already exists
State nextState = nextStates[nextChar];
if (nextState == null) {
// Make a new state because it isn't there yet.
nextState = nextStates[nextChar] = new State(nextChar);
}
// Learn the rest of the keyword in the new state.
nextState.learn(word, wordLoc - 1, id);
}
}
public void print(PrintStream out) {
print(out, "root:");
}
// Make a pretty picture.
public void print(PrintStream out, String prefix) {
if (gotHereBy < ' ' || gotHereBy > '~') {
out.println(prefix + "-> " + "(byte)" + gotHereBy);
} else {
out.println(prefix + "-> " + (char) gotHereBy);
}
if (matches != null) {
out.print(prefix + "ids [");
for (int match : matches) {
out.print(" " + match);
}
out.println(" ]");
}
for (State nextState : nextStates) {
if (nextState != null) {
nextState.print(out, prefix + " ");
}
}
}
}
}