View Javadoc
1   package emissary.util.search;
2   
3   import org.slf4j.Logger;
4   import org.slf4j.LoggerFactory;
5   
6   import java.io.PrintStream;
7   import java.util.ArrayList;
8   import java.util.Collection;
9   import java.util.List;
10  import javax.annotation.Nullable;
11  
12  @SuppressWarnings("AvoidObjectArrays")
13  public class FastBoyerMoore {
14      private static final Logger logger = LoggerFactory.getLogger(FastBoyerMoore.class);
15      public byte[][] keywords;
16      int minKeywordLength;
17      int[] lookup = new int[259];
18      transient BackwardsTreeScanner scanner;
19      @Nullable
20      byte[] data = null;
21      BackwardsTreeScanner.State root;
22  
23      // copy constructor
24  
25      private FastBoyerMoore() {}
26  
27      public FastBoyerMoore(final FastBoyerMoore original) {
28          this();
29          this.keywords = original.keywords;
30          this.minKeywordLength = original.minKeywordLength;
31          this.lookup = original.lookup;
32          this.root = original.root;
33          this.scanner = new BackwardsTreeScanner(original.scanner);
34      }
35  
36      public FastBoyerMoore(final String[] keywordStrings) throws Exception {
37          this.keywords = new byte[keywordStrings.length][];
38          this.minKeywordLength = Integer.MAX_VALUE;
39          for (int i = 0; i < this.keywords.length; i++) {
40              this.keywords[i] = keywordStrings[i].getBytes();
41              this.minKeywordLength = Math.min(this.minKeywordLength, this.keywords[i].length);
42          }
43          for (int i = 0; i < this.lookup.length; i++) {
44              this.lookup[i] = this.minKeywordLength;
45          }
46          // each keyword
47          for (int i = 0; i < this.keywords.length; i++) {
48              final byte[] kw = this.keywords[i];
49              // each keyword character
50              for (int j = 0; j < kw.length - 1; j++) {
51                  this.lookup[kw[j]] = Math.min(this.lookup[kw[j]], kw.length - j - 1);
52              }
53          }
54          this.data = null;
55          this.scanner = new BackwardsTreeScanner(keywordStrings);
56          this.root = this.scanner.getRoot();
57      }
58  
59      public FastBoyerMoore(final String[][] keywordStrings) throws Exception {
60          this.minKeywordLength = Integer.MAX_VALUE;
61          for (int i = 0; i < keywordStrings.length; i++) {
62              for (int j = 0; j < keywordStrings[i].length; j++) {
63                  this.minKeywordLength = Math.min(this.minKeywordLength, keywordStrings[i][j].length());
64              }
65          }
66          for (int i = 0; i < this.lookup.length; i++) {
67              this.lookup[i] = this.minKeywordLength;
68          }
69          // each keyword
70          for (int i = 0; i < keywordStrings.length; i++) {
71              for (int j = 0; j < keywordStrings[i].length; j++) {
72                  final byte[] kw = keywordStrings[i][j].getBytes();
73                  // each keyword character
74                  for (int k = 0; k < kw.length - 1; k++) {
75                      this.lookup[kw[k]] = Math.min(this.lookup[kw[k]], kw.length - k - 1);
76                  }
77              }
78          }
79          this.data = null;
80          this.scanner = new BackwardsTreeScanner(keywordStrings);
81          this.root = this.scanner.getRoot();
82      }
83  
84      public void setData(final byte[] dataArg) {
85          this.scanner.setData(dataArg);
86          this.data = dataArg;
87      }
88  
89      public void scan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
90          this.data = dataArg;
91          this.scanner.setData(dataArg);
92          scan(start, end, result);
93      }
94  
95      public void scan(final int start, final int end, final Collection<int[]> result) {
96          final int actualEnd = Math.min(end, this.data.length);
97          int pos = start;
98          while (pos < actualEnd) {
99              final int ch = this.data[pos] & 0x7f;
100             final int jump = this.lookup[ch];
101             BackwardsTreeScanner.State state = this.root.nextStates[ch];
102             int curPos = pos - 1;
103             while ((state != null) && (curPos >= 0)) {
104                 if (state.matches != null) {
105                     for (int i = 0; i < state.matches.length; i++) {
106                         final int id = state.matches[i];
107                         final int[] tmp = new int[3];
108                         tmp[0] = curPos + 1;
109                         tmp[1] = id;
110                         tmp[2] = pos - curPos;
111                         result.add(tmp);
112                     }
113                 }
114                 final int ch2 = this.data[curPos] & 0x7f;
115                 state = state.nextStates[ch2];
116                 curPos--;
117             }
118             if ((state != null) && (curPos == -1)) {
119                 for (int i = 0; i < state.matches.length; i++) {
120                     final int id = state.matches[i];
121                     final int[] tmp = new int[3];
122                     tmp[0] = curPos + 1;
123                     tmp[1] = id;
124                     tmp[2] = pos - curPos;
125                     result.add(tmp);
126                 }
127             }
128             pos += jump;
129         }
130     }
131 
132     public int staticSingleScan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
133         final int actualEnd = Math.min(end, dataArg.length);
134         boolean found = false;
135         int pos = start;
136         while ((pos < actualEnd) && !found) {
137             final int ch = dataArg[pos] & 0x7f;
138             final int jump = this.lookup[ch];
139             BackwardsTreeScanner.State state = this.root.nextStates[ch];
140             int curPos = pos - 1;
141             while ((state != null) && (curPos >= 0)) {
142                 if (state.matches != null) {
143                     for (int i = 0; i < state.matches.length; i++) {
144                         final int id = state.matches[i];
145                         final int[] tmp = new int[3];
146                         tmp[0] = curPos + 1;
147                         tmp[1] = id;
148                         tmp[2] = pos - curPos;
149                         result.add(tmp);
150                         found = true;
151                     }
152                 }
153                 final int ch2 = dataArg[curPos] & 0x7f;
154                 state = state.nextStates[ch2];
155                 curPos--;
156             }
157             if ((state != null) && (state.matches != null) && (curPos == -1)) {
158                 for (int i = 0; i < state.matches.length; i++) {
159                     final int id = state.matches[i];
160                     final int[] tmp = new int[3];
161                     tmp[0] = curPos + 1;
162                     tmp[1] = id;
163                     tmp[2] = pos - curPos;
164                     result.add(tmp);
165                     found = true;
166                 }
167             }
168             pos += jump;
169         }
170         return pos;
171     }
172 
173     public static final int ID = 1;
174     public static final int LOC = 0;
175     public static final int LENGTH = 2;
176 
177     public static void main(final String[] args) {
178         try {
179             // a list of interesting keywords. */
180             final String[][] keys = {{"\nABCD"}, // 0,1,2,3,4
181                     {"\nABC"}, // 5 6 7 8
182                     {"\nAB"}, // 9 10 11
183                     {"\n//xyz//"}, // 12 13 14 15 16 17 18 19
184                     {"\nxxxxx"}, // 20 21 22 23 24 25
185                     {"\nabcdefghi"}, // 26 27 28 29 30 31 32 33 34 35
186                     {"\nabcde", "\nabcdefg"}}; // 36 37 38 39 41 41 42 43 44 45
187             // A string for holding the test data to be searched
188             final StringBuilder dataString = new StringBuilder();
189             // The thing we are testing
190             final FastBoyerMoore scanner = new FastBoyerMoore(keys);
191             // Make up an interesting string. The second half should never match.
192             for (int i = 0; i < keys.length; i++) {
193                 for (int j = 0; j < keys[i].length; j++) {
194                     dataString.append(keys[i][j]);
195                 }
196             }
197             // for (int i = 0;i<keys.length;i++)dataString +=keys[i].toString().substring(0,2);
198             // A byte array version of the data.
199             final byte[] dataBytes = dataString.toString().getBytes();
200 
201             // A vector for holding the results.
202             final List<int[]> result = new ArrayList<>();
203             scanner.setData(dataBytes);
204             scanner.scan(0, dataBytes.length, result);
205             for (int i = 0; i < result.size(); i++) {
206                 final int[] tmp = result.get(i);
207                 logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
208             }
209         } catch (Exception e) {
210             logger.error("Exception in test", e);
211         }
212     }
213 
214     /**
215      * This class implements a tree state machine scanner that searches text backwards starting from the end. A list of
216      * strings is provided as the keywords to be searched. This class is usefull for a relatively small set of keywords.
217      * Larger keyword lists can be used if you have memory!
218      *
219      * @author ce
220      * @version 1.0
221      */
222     public static class BackwardsTreeScanner {
223         /** Original list of keywords storred in byte array form. */
224         // byte[][] keywords;
225         /** Root node of tree state diagram. Always start a search from here! */
226         State root = new State((byte) 0);
227         @Nullable
228         byte[] data = null;
229 
230         public BackwardsTreeScanner(final BackwardsTreeScanner o) {
231             this.root = o.root;
232         }
233 
234         public BackwardsTreeScanner(final String[][] keywordStrings) throws Exception {
235             for (int i = 0; i < keywordStrings.length; i++) {
236                 for (int j = 0; j < keywordStrings[i].length; j++) {
237                     this.root.learn(keywordStrings[i][j].getBytes(), i);
238                 }
239             }
240             // this.root.print(System.out);
241         }
242 
243         public BackwardsTreeScanner(final String[] keywordStrings) throws Exception {
244             // make byte arrays
245             final byte[][] keywords = new byte[keywordStrings.length][];
246             // and learn them
247             for (int i = 0; i < keywords.length; i++) {
248                 keywords[i] = keywordStrings[i].getBytes();
249                 this.root.learn(keywordStrings[i].getBytes(), i);
250             }
251             // this.root.print(System.out);
252         }
253 
254         public static void main(final String[] args) {
255             try {
256                 // a list of interesting keywords. */
257                 final String[][] keys = {{"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi"}, {"\nabcde", "\nabcdefg"}};
258 
259                 // The thing we are testing
260                 final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
261                 // A string for holding the test data to be searched
262                 final StringBuilder dataString = new StringBuilder();
263                 // Make up an interesting string. The second half should never match.
264                 for (int i = 0; i < keys.length; i++) {
265                     for (int j = 0; j < keys[i].length; j++) {
266                         dataString.append(keys[i][j]);
267                     }
268                 }
269                 // for (int i = 0;i<keys.length;i++)dataString +=keys[i].toString().substring(0,2);
270                 // A byte array version of the data.
271                 final byte[] dataBytes = dataString.toString().getBytes();
272 
273                 // A vector for holding the results.
274                 final List<int[]> hits = new ArrayList<>();
275                 /*
276                  * loop through the data from beginint to end calling scan at each position. This shows how to use scan(), but in
277                  * general this should be used more effediently (with a boyer more algorithm or something.
278                  */
279                 for (int pos = 1; pos < dataBytes.length; pos++) {
280                     scanner.scan(dataBytes, pos, hits);
281                     for (int i = 0; i < hits.size(); i++) {
282                         final int[] tmp = hits.get(i);
283                         logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
284                     }
285                     hits.clear();
286                 }
287             } catch (Exception e) {
288                 logger.error("Exception in test", e);
289             }
290         }
291 
292         public static void main2(final String[] args) {
293             try {
294                 // a list of interesting keywords. */
295                 final String[] keys = {"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi", "\nabcde", "\nabcdefg"};
296                 // A string for holding the test data to be searched
297                 final StringBuilder dataString = new StringBuilder();
298                 // The thing we are testing
299                 final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
300                 // Make up an interesting string. The second half should never match.
301                 for (int i = 0; i < keys.length; i++) {
302                     dataString.append(keys[i]);
303                 }
304                 // for (int i = 0;i<keys.length;i++)dataString +=keys[i].toString().substring(0,2);
305                 // A byte array version of the data.
306                 final byte[] dataBytes = dataString.toString().getBytes();
307 
308                 // A vector for holding the results.
309                 final List<int[]> hits = new ArrayList<>();
310                 /*
311                  * loop through the data from beginint to end calling scan at each position. This shows how to use scan(), but in
312                  * general this should be used more effediently (with a boyer more algorithm or something.
313                  */
314                 for (int pos = 1; pos < dataBytes.length; pos++) {
315                     scanner.scan(dataBytes, pos, hits);
316                     for (int i = 0; i < hits.size(); i++) {
317                         final int[] tmp = hits.get(i);
318                         logger.info("Hit At: {} id: {}", tmp[0], tmp[1]);
319                     }
320                     hits.clear();
321                 }
322             } catch (Exception e) {
323                 logger.error("Exception in test", e);
324             }
325         }
326 
327         public synchronized State getRoot() {
328             return this.root;
329         }
330 
331         public synchronized void setData(final byte[] dataArg) {
332             this.data = dataArg;
333         }
334 
335         /**
336          * This scans the byte array backwards from the offset. Each hit is added to the result vector. We stop when all
337          * posibilities are found
338          */
339         public synchronized int scan(final byte[] dataArg, final int offset, final Collection<int[]> result) {
340             this.data = dataArg;
341             return scan(offset, result);
342         }
343 
344         public synchronized int scan(final int curPosArg, final Collection<int[]> result) {
345             if (!(curPosArg < this.data.length)) {
346                 return curPosArg;
347             }
348             State state = this.root;
349             int length = 0;
350             int curPos = curPosArg;
351             while ((state != null) && (curPos >= 0)) {
352                 if (curPos > this.data.length || curPos < 0) {
353                     return curPos;
354                 }
355                 int ch = this.data[curPos];
356                 if (ch < 0) {
357                     ch += Byte.MAX_VALUE - Byte.MIN_VALUE;
358                 }
359                 state = state.nextStates[ch];
360                 length++;
361                 if (state != null && state.matches != null) {
362                     for (int i = 0; i < state.matches.length; i++) {
363                         final int id = state.matches[i];
364                         final int[] tmp = new int[3];
365                         tmp[0] = curPos;
366                         tmp[1] = id;
367                         tmp[2] = length;
368                         result.add(tmp);
369                     }
370                 }
371                 curPos--;
372             }
373             return curPos;
374         }
375 
376         /*
377          * This class implements a state machine that can learn character sequences.
378          */
379         public class State {
380 
381             // Each state has 256 transitions leaving it to new states based
382             // on a single ascii character. If there is no next state for a
383             // character, then the next state will be null.
384             public State[] nextStates = new State[256];
385 
386             // Each state can be visited by a single character. This is it!
387             public int gotHereBy;
388 
389             // A list of keyword ids that are matched at this state.
390             @Nullable
391             public int[] matches = null;
392 
393             // constructor
394             public State(final int gotHereBy) {
395                 this.gotHereBy = gotHereBy;
396             }
397 
398             public void learn(final byte[] word, final int id) throws Exception {
399                 learn(word, word.length - 1, id);
400             }
401 
402             /**
403              * Walk throught he keyword backwards. Adding states to the root (or current state) when they don't exists. At the end,
404              * record the keyowrd id in the ending state.
405              *
406              * Warning this is recursive, but thats OK for small keywords.
407              */
408             public void learn(final byte[] word, final int wordLoc, final int id) throws Exception {
409                 if (word == null) {
410                     throw new Exception("null keyword in BackwardsTreeScanner.learn()");
411                 }
412                 if (wordLoc >= word.length) {
413                     throw new Exception("char pos > word length:" + wordLoc + ">" + word.length);
414                 }
415                 if (wordLoc < 0) {
416                     // we are finished because this is the first character,
417                     // so save the id in this state. We want the matches to be
418                     // in an array so this is a little harder than a vector thing.
419                     if (this.matches == null) {
420                         this.matches = new int[0];
421                     }
422                     final int[] newMatches = new int[this.matches.length + 1];
423                     System.arraycopy(this.matches, 0, newMatches, 0, this.matches.length);
424                     this.matches = newMatches;
425                     this.matches[this.matches.length - 1] = id;
426                 } else {
427                     // Get the next character in the word
428                     int nextChar = word[wordLoc];
429                     if (nextChar < 0) {
430                         nextChar += Byte.MAX_VALUE - Byte.MIN_VALUE;
431                     }
432 
433                     // See if the state already exists
434                     State nextState = this.nextStates[nextChar];
435                     if (nextState == null) {
436                         // Make a new state because it isn't there yet.
437                         nextState = this.nextStates[nextChar] = new State(nextChar);
438                     }
439                     // Learn the rest of the keyword in the new state.
440                     nextState.learn(word, wordLoc - 1, id);
441                 }
442             }
443 
444             public void print(final PrintStream out) {
445                 print(out, "root:");
446             }
447 
448             // Make a pretty picture.
449             public void print(final PrintStream out, final String prefix) {
450                 if ((this.gotHereBy < ' ') || (this.gotHereBy > '~')) {
451                     out.println(prefix + "-> " + "(byte)" + this.gotHereBy);
452                 } else {
453                     out.println(prefix + "-> " + (char) this.gotHereBy);
454                 }
455                 if (this.matches != null) {
456                     out.print(prefix + "ids [");
457                     for (int i = 0; i < this.matches.length; i++) {
458                         out.print(" " + this.matches[i]);
459                     }
460                     out.println(" ]");
461                 }
462                 for (int i = 0; i < this.nextStates.length; i++) {
463                     if (this.nextStates[i] != null) {
464                         this.nextStates[i].print(out, prefix + "  ");
465                     }
466                 }
467             }
468         }
469     }
470 }