FastBoyerMoore.java

package emissary.util.search;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import javax.annotation.Nullable;

@SuppressWarnings("AvoidObjectArrays")
public class FastBoyerMoore {
    private static final Logger logger = LoggerFactory.getLogger(FastBoyerMoore.class);
    public byte[][] keywords;
    int minKeywordLength;
    int[] lookup = new int[259];
    transient BackwardsTreeScanner scanner;
    @Nullable
    byte[] data = null;
    BackwardsTreeScanner.State root;

    // copy constructor

    private FastBoyerMoore() {}

    public FastBoyerMoore(final FastBoyerMoore original) {
        this();
        this.keywords = original.keywords;
        this.minKeywordLength = original.minKeywordLength;
        this.lookup = original.lookup;
        this.root = original.root;
        this.scanner = new BackwardsTreeScanner(original.scanner);
    }

    public FastBoyerMoore(final String[] keywordStrings) throws Exception {
        this.keywords = new byte[keywordStrings.length][];
        this.minKeywordLength = Integer.MAX_VALUE;
        for (int i = 0; i < this.keywords.length; i++) {
            this.keywords[i] = keywordStrings[i].getBytes();
            this.minKeywordLength = Math.min(this.minKeywordLength, this.keywords[i].length);
        }
        for (int i = 0; i < this.lookup.length; i++) {
            this.lookup[i] = this.minKeywordLength;
        }
        // each keyword
        for (int i = 0; i < this.keywords.length; i++) {
            final byte[] kw = this.keywords[i];
            // each keyword character
            for (int j = 0; j < kw.length - 1; j++) {
                this.lookup[kw[j]] = Math.min(this.lookup[kw[j]], kw.length - j - 1);
            }
        }
        this.data = null;
        this.scanner = new BackwardsTreeScanner(keywordStrings);
        this.root = this.scanner.getRoot();
    }

    public FastBoyerMoore(final String[][] keywordStrings) throws Exception {
        this.minKeywordLength = Integer.MAX_VALUE;
        for (int i = 0; i < keywordStrings.length; i++) {
            for (int j = 0; j < keywordStrings[i].length; j++) {
                this.minKeywordLength = Math.min(this.minKeywordLength, keywordStrings[i][j].length());
            }
        }
        for (int i = 0; i < this.lookup.length; i++) {
            this.lookup[i] = this.minKeywordLength;
        }
        // each keyword
        for (int i = 0; i < keywordStrings.length; i++) {
            for (int j = 0; j < keywordStrings[i].length; j++) {
                final byte[] kw = keywordStrings[i][j].getBytes();
                // each keyword character
                for (int k = 0; k < kw.length - 1; k++) {
                    this.lookup[kw[k]] = Math.min(this.lookup[kw[k]], kw.length - k - 1);
                }
            }
        }
        this.data = null;
        this.scanner = new BackwardsTreeScanner(keywordStrings);
        this.root = this.scanner.getRoot();
    }

    public void setData(final byte[] dataArg) {
        this.scanner.setData(dataArg);
        this.data = dataArg;
    }

    public void scan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
        this.data = dataArg;
        this.scanner.setData(dataArg);
        scan(start, end, result);
    }

    public void scan(final int start, final int end, final Collection<int[]> result) {
        final int actualEnd = Math.min(end, this.data.length);
        int pos = start;
        while (pos < actualEnd) {
            final int ch = this.data[pos] & 0x7f;
            final int jump = this.lookup[ch];
            BackwardsTreeScanner.State state = this.root.nextStates[ch];
            int curPos = pos - 1;
            while ((state != null) && (curPos >= 0)) {
                if (state.matches != null) {
                    for (int i = 0; i < state.matches.length; i++) {
                        final int id = state.matches[i];
                        final int[] tmp = new int[3];
                        tmp[0] = curPos + 1;
                        tmp[1] = id;
                        tmp[2] = pos - curPos;
                        result.add(tmp);
                    }
                }
                final int ch2 = this.data[curPos] & 0x7f;
                state = state.nextStates[ch2];
                curPos--;
            }
            if ((state != null) && (curPos == -1)) {
                for (int i = 0; i < state.matches.length; i++) {
                    final int id = state.matches[i];
                    final int[] tmp = new int[3];
                    tmp[0] = curPos + 1;
                    tmp[1] = id;
                    tmp[2] = pos - curPos;
                    result.add(tmp);
                }
            }
            pos += jump;
        }
    }

    public int staticSingleScan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
        final int actualEnd = Math.min(end, dataArg.length);
        boolean found = false;
        int pos = start;
        while ((pos < actualEnd) && !found) {
            final int ch = dataArg[pos] & 0x7f;
            final int jump = this.lookup[ch];
            BackwardsTreeScanner.State state = this.root.nextStates[ch];
            int curPos = pos - 1;
            while ((state != null) && (curPos >= 0)) {
                if (state.matches != null) {
                    for (int i = 0; i < state.matches.length; i++) {
                        final int id = state.matches[i];
                        final int[] tmp = new int[3];
                        tmp[0] = curPos + 1;
                        tmp[1] = id;
                        tmp[2] = pos - curPos;
                        result.add(tmp);
                        found = true;
                    }
                }
                final int ch2 = dataArg[curPos] & 0x7f;
                state = state.nextStates[ch2];
                curPos--;
            }
            if ((state != null) && (state.matches != null) && (curPos == -1)) {
                for (int i = 0; i < state.matches.length; i++) {
                    final int id = state.matches[i];
                    final int[] tmp = new int[3];
                    tmp[0] = curPos + 1;
                    tmp[1] = id;
                    tmp[2] = pos - curPos;
                    result.add(tmp);
                    found = true;
                }
            }
            pos += jump;
        }
        return pos;
    }

    public static final int ID = 1;
    public static final int LOC = 0;
    public static final int LENGTH = 2;

    public static void main(final String[] args) {
        try {
            // a list of interesting keywords. */
            final String[][] keys = {{"\nABCD"}, // 0,1,2,3,4
                    {"\nABC"}, // 5 6 7 8
                    {"\nAB"}, // 9 10 11
                    {"\n//xyz//"}, // 12 13 14 15 16 17 18 19
                    {"\nxxxxx"}, // 20 21 22 23 24 25
                    {"\nabcdefghi"}, // 26 27 28 29 30 31 32 33 34 35
                    {"\nabcde", "\nabcdefg"}}; // 36 37 38 39 41 41 42 43 44 45
            // A string for holding the test data to be searched
            final StringBuilder dataString = new StringBuilder();
            // The thing we are testing
            final FastBoyerMoore scanner = new FastBoyerMoore(keys);
            // Make up an interesting string. The second half should never match.
            for (int i = 0; i < keys.length; i++) {
                for (int j = 0; j < keys[i].length; j++) {
                    dataString.append(keys[i][j]);
                }
            }
            // for (int i = 0;i<keys.length;i++)dataString +=keys[i].toString().substring(0,2);
            // A byte array version of the data.
            final byte[] dataBytes = dataString.toString().getBytes();

            // A vector for holding the results.
            final List<int[]> result = new ArrayList<>();
            scanner.setData(dataBytes);
            scanner.scan(0, dataBytes.length, result);
            for (int i = 0; i < result.size(); i++) {
                final int[] tmp = result.get(i);
                logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
            }
        } catch (Exception e) {
            logger.error("Exception in test", e);
        }
    }

    /**
     * This class implements a tree state machine scanner that searches text backwards starting from the end. A list of
     * strings is provided as the keywords to be searched. This class is usefull for a relatively small set of keywords.
     * Larger keyword lists can be used if you have memory!
     *
     * @author ce
     * @version 1.0
     */
    public static class BackwardsTreeScanner {
        /** Original list of keywords storred in byte array form. */
        // byte[][] keywords;
        /** Root node of tree state diagram. Always start a search from here! */
        State root = new State((byte) 0);
        @Nullable
        byte[] data = null;

        public BackwardsTreeScanner(final BackwardsTreeScanner o) {
            this.root = o.root;
        }

        public BackwardsTreeScanner(final String[][] keywordStrings) throws Exception {
            for (int i = 0; i < keywordStrings.length; i++) {
                for (int j = 0; j < keywordStrings[i].length; j++) {
                    this.root.learn(keywordStrings[i][j].getBytes(), i);
                }
            }
            // this.root.print(System.out);
        }

        public BackwardsTreeScanner(final String[] keywordStrings) throws Exception {
            // make byte arrays
            final byte[][] keywords = new byte[keywordStrings.length][];
            // and learn them
            for (int i = 0; i < keywords.length; i++) {
                keywords[i] = keywordStrings[i].getBytes();
                this.root.learn(keywordStrings[i].getBytes(), i);
            }
            // this.root.print(System.out);
        }

        public static void main(final String[] args) {
            try {
                // a list of interesting keywords. */
                final String[][] keys = {{"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi"}, {"\nabcde", "\nabcdefg"}};

                // The thing we are testing
                final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
                // A string for holding the test data to be searched
                final StringBuilder dataString = new StringBuilder();
                // Make up an interesting string. The second half should never match.
                for (int i = 0; i < keys.length; i++) {
                    for (int j = 0; j < keys[i].length; j++) {
                        dataString.append(keys[i][j]);
                    }
                }
                // for (int i = 0;i<keys.length;i++)dataString +=keys[i].toString().substring(0,2);
                // A byte array version of the data.
                final byte[] dataBytes = dataString.toString().getBytes();

                // A vector for holding the results.
                final List<int[]> hits = new ArrayList<>();
                /*
                 * loop through the data from beginint to end calling scan at each position. This shows how to use scan(), but in
                 * general this should be used more effediently (with a boyer more algorithm or something.
                 */
                for (int pos = 1; pos < dataBytes.length; pos++) {
                    scanner.scan(dataBytes, pos, hits);
                    for (int i = 0; i < hits.size(); i++) {
                        final int[] tmp = hits.get(i);
                        logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
                    }
                    hits.clear();
                }
            } catch (Exception e) {
                logger.error("Exception in test", e);
            }
        }

        public static void main2(final String[] args) {
            try {
                // a list of interesting keywords. */
                final String[] keys = {"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi", "\nabcde", "\nabcdefg"};
                // A string for holding the test data to be searched
                final StringBuilder dataString = new StringBuilder();
                // The thing we are testing
                final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
                // Make up an interesting string. The second half should never match.
                for (int i = 0; i < keys.length; i++) {
                    dataString.append(keys[i]);
                }
                // for (int i = 0;i<keys.length;i++)dataString +=keys[i].toString().substring(0,2);
                // A byte array version of the data.
                final byte[] dataBytes = dataString.toString().getBytes();

                // A vector for holding the results.
                final List<int[]> hits = new ArrayList<>();
                /*
                 * loop through the data from beginint to end calling scan at each position. This shows how to use scan(), but in
                 * general this should be used more effediently (with a boyer more algorithm or something.
                 */
                for (int pos = 1; pos < dataBytes.length; pos++) {
                    scanner.scan(dataBytes, pos, hits);
                    for (int i = 0; i < hits.size(); i++) {
                        final int[] tmp = hits.get(i);
                        logger.info("Hit At: {} id: {}", tmp[0], tmp[1]);
                    }
                    hits.clear();
                }
            } catch (Exception e) {
                logger.error("Exception in test", e);
            }
        }

        public synchronized State getRoot() {
            return this.root;
        }

        public synchronized void setData(final byte[] dataArg) {
            this.data = dataArg;
        }

        /**
         * This scans the byte array backwards from the offset. Each hit is added to the result vector. We stop when all
         * posibilities are found
         */
        public synchronized int scan(final byte[] dataArg, final int offset, final Collection<int[]> result) {
            this.data = dataArg;
            return scan(offset, result);
        }

        public synchronized int scan(final int curPosArg, final Collection<int[]> result) {
            if (!(curPosArg < this.data.length)) {
                return curPosArg;
            }
            State state = this.root;
            int length = 0;
            int curPos = curPosArg;
            while ((state != null) && (curPos >= 0)) {
                if (curPos > this.data.length || curPos < 0) {
                    return curPos;
                }
                int ch = this.data[curPos];
                if (ch < 0) {
                    ch += Byte.MAX_VALUE - Byte.MIN_VALUE;
                }
                state = state.nextStates[ch];
                length++;
                if (state != null && state.matches != null) {
                    for (int i = 0; i < state.matches.length; i++) {
                        final int id = state.matches[i];
                        final int[] tmp = new int[3];
                        tmp[0] = curPos;
                        tmp[1] = id;
                        tmp[2] = length;
                        result.add(tmp);
                    }
                }
                curPos--;
            }
            return curPos;
        }

        /*
         * This class implements a state machine that can learn character sequences.
         */
        public class State {

            // Each state has 256 transitions leaving it to new states based
            // on a single ascii character. If there is no next state for a
            // character, then the next state will be null.
            public State[] nextStates = new State[256];

            // Each state can be visited by a single character. This is it!
            public int gotHereBy;

            // A list of keyword ids that are matched at this state.
            @Nullable
            public int[] matches = null;

            // constructor
            public State(final int gotHereBy) {
                this.gotHereBy = gotHereBy;
            }

            public void learn(final byte[] word, final int id) throws Exception {
                learn(word, word.length - 1, id);
            }

            /**
             * Walk throught he keyword backwards. Adding states to the root (or current state) when they don't exists. At the end,
             * record the keyowrd id in the ending state.
             *
             * Warning this is recursive, but thats OK for small keywords.
             */
            public void learn(final byte[] word, final int wordLoc, final int id) throws Exception {
                if (word == null) {
                    throw new Exception("null keyword in BackwardsTreeScanner.learn()");
                }
                if (wordLoc >= word.length) {
                    throw new Exception("char pos > word length:" + wordLoc + ">" + word.length);
                }
                if (wordLoc < 0) {
                    // we are finished because this is the first character,
                    // so save the id in this state. We want the matches to be
                    // in an array so this is a little harder than a vector thing.
                    if (this.matches == null) {
                        this.matches = new int[0];
                    }
                    final int[] newMatches = new int[this.matches.length + 1];
                    System.arraycopy(this.matches, 0, newMatches, 0, this.matches.length);
                    this.matches = newMatches;
                    this.matches[this.matches.length - 1] = id;
                } else {
                    // Get the next character in the word
                    int nextChar = word[wordLoc];
                    if (nextChar < 0) {
                        nextChar += Byte.MAX_VALUE - Byte.MIN_VALUE;
                    }

                    // See if the state already exists
                    State nextState = this.nextStates[nextChar];
                    if (nextState == null) {
                        // Make a new state because it isn't there yet.
                        nextState = this.nextStates[nextChar] = new State(nextChar);
                    }
                    // Learn the rest of the keyword in the new state.
                    nextState.learn(word, wordLoc - 1, id);
                }
            }

            public void print(final PrintStream out) {
                print(out, "root:");
            }

            // Make a pretty picture.
            public void print(final PrintStream out, final String prefix) {
                if ((this.gotHereBy < ' ') || (this.gotHereBy > '~')) {
                    out.println(prefix + "-> " + "(byte)" + this.gotHereBy);
                } else {
                    out.println(prefix + "-> " + (char) this.gotHereBy);
                }
                if (this.matches != null) {
                    out.print(prefix + "ids [");
                    for (int i = 0; i < this.matches.length; i++) {
                        out.print(" " + this.matches[i]);
                    }
                    out.println(" ]");
                }
                for (int i = 0; i < this.nextStates.length; i++) {
                    if (this.nextStates[i] != null) {
                        this.nextStates[i].print(out, prefix + "  ");
                    }
                }
            }
        }
    }
}