1   package emissary.util.search;
2   
3   import jakarta.annotation.Nullable;
4   import org.slf4j.Logger;
5   import org.slf4j.LoggerFactory;
6   
7   import java.io.PrintStream;
8   import java.util.ArrayList;
9   import java.util.Collection;
10  import java.util.List;
11  
12  @SuppressWarnings("AvoidObjectArrays")
13  public class FastBoyerMoore {
14      private static final Logger logger = LoggerFactory.getLogger(FastBoyerMoore.class);
15      public byte[][] keywords;
16      int minKeywordLength;
17      int[] lookup = new int[259];
18      transient BackwardsTreeScanner scanner;
19      @Nullable
20      byte[] data = null;
21      BackwardsTreeScanner.State root;
22  
23      
24  
25      private FastBoyerMoore() {}
26  
27      public FastBoyerMoore(final FastBoyerMoore original) {
28          this();
29          this.keywords = original.keywords;
30          this.minKeywordLength = original.minKeywordLength;
31          this.lookup = original.lookup;
32          this.root = original.root;
33          this.scanner = new BackwardsTreeScanner(original.scanner);
34      }
35  
36      public FastBoyerMoore(final String[] keywordStrings) throws Exception {
37          this.keywords = new byte[keywordStrings.length][];
38          this.minKeywordLength = Integer.MAX_VALUE;
39          for (int i = 0; i < this.keywords.length; i++) {
40              this.keywords[i] = keywordStrings[i].getBytes();
41              this.minKeywordLength = Math.min(this.minKeywordLength, this.keywords[i].length);
42          }
43          for (int i = 0; i < this.lookup.length; i++) {
44              this.lookup[i] = this.minKeywordLength;
45          }
46          
47          for (int i = 0; i < this.keywords.length; i++) {
48              final byte[] kw = this.keywords[i];
49              
50              for (int j = 0; j < kw.length - 1; j++) {
51                  this.lookup[kw[j]] = Math.min(this.lookup[kw[j]], kw.length - j - 1);
52              }
53          }
54          this.data = null;
55          this.scanner = new BackwardsTreeScanner(keywordStrings);
56          this.root = this.scanner.getRoot();
57      }
58  
59      public FastBoyerMoore(final String[][] keywordStrings) throws Exception {
60          this.minKeywordLength = Integer.MAX_VALUE;
61          for (int i = 0; i < keywordStrings.length; i++) {
62              for (int j = 0; j < keywordStrings[i].length; j++) {
63                  this.minKeywordLength = Math.min(this.minKeywordLength, keywordStrings[i][j].length());
64              }
65          }
66          for (int i = 0; i < this.lookup.length; i++) {
67              this.lookup[i] = this.minKeywordLength;
68          }
69          
70          for (int i = 0; i < keywordStrings.length; i++) {
71              for (int j = 0; j < keywordStrings[i].length; j++) {
72                  final byte[] kw = keywordStrings[i][j].getBytes();
73                  
74                  for (int k = 0; k < kw.length - 1; k++) {
75                      this.lookup[kw[k]] = Math.min(this.lookup[kw[k]], kw.length - k - 1);
76                  }
77              }
78          }
79          this.data = null;
80          this.scanner = new BackwardsTreeScanner(keywordStrings);
81          this.root = this.scanner.getRoot();
82      }
83  
84      public void setData(final byte[] dataArg) {
85          this.scanner.setData(dataArg);
86          this.data = dataArg;
87      }
88  
89      public void scan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
90          this.data = dataArg;
91          this.scanner.setData(dataArg);
92          scan(start, end, result);
93      }
94  
95      public void scan(final int start, final int end, final Collection<int[]> result) {
96          final int actualEnd = Math.min(end, this.data.length);
97          int pos = start;
98          while (pos < actualEnd) {
99              final int ch = this.data[pos] & 0x7f;
100             final int jump = this.lookup[ch];
101             BackwardsTreeScanner.State state = this.root.nextStates[ch];
102             int curPos = pos - 1;
103             while ((state != null) && (curPos >= 0)) {
104                 if (state.matches != null) {
105                     for (int i = 0; i < state.matches.length; i++) {
106                         final int id = state.matches[i];
107                         final int[] tmp = new int[3];
108                         tmp[0] = curPos + 1;
109                         tmp[1] = id;
110                         tmp[2] = pos - curPos;
111                         result.add(tmp);
112                     }
113                 }
114                 final int ch2 = this.data[curPos] & 0x7f;
115                 state = state.nextStates[ch2];
116                 curPos--;
117             }
118             if ((state != null) && (curPos == -1)) {
119                 for (int i = 0; i < state.matches.length; i++) {
120                     final int id = state.matches[i];
121                     final int[] tmp = new int[3];
122                     tmp[0] = curPos + 1;
123                     tmp[1] = id;
124                     tmp[2] = pos - curPos;
125                     result.add(tmp);
126                 }
127             }
128             pos += jump;
129         }
130     }
131 
132     public int staticSingleScan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
133         final int actualEnd = Math.min(end, dataArg.length);
134         boolean found = false;
135         int pos = start;
136         while ((pos < actualEnd) && !found) {
137             final int ch = dataArg[pos] & 0x7f;
138             final int jump = this.lookup[ch];
139             BackwardsTreeScanner.State state = this.root.nextStates[ch];
140             int curPos = pos - 1;
141             while ((state != null) && (curPos >= 0)) {
142                 if (state.matches != null) {
143                     for (int i = 0; i < state.matches.length; i++) {
144                         final int id = state.matches[i];
145                         final int[] tmp = new int[3];
146                         tmp[0] = curPos + 1;
147                         tmp[1] = id;
148                         tmp[2] = pos - curPos;
149                         result.add(tmp);
150                         found = true;
151                     }
152                 }
153                 final int ch2 = dataArg[curPos] & 0x7f;
154                 state = state.nextStates[ch2];
155                 curPos--;
156             }
157             if ((state != null) && (state.matches != null) && (curPos == -1)) {
158                 for (int i = 0; i < state.matches.length; i++) {
159                     final int id = state.matches[i];
160                     final int[] tmp = new int[3];
161                     tmp[0] = curPos + 1;
162                     tmp[1] = id;
163                     tmp[2] = pos - curPos;
164                     result.add(tmp);
165                     found = true;
166                 }
167             }
168             pos += jump;
169         }
170         return pos;
171     }
172 
173     public static final int ID = 1;
174     public static final int LOC = 0;
175     public static final int LENGTH = 2;
176 
177     public static void main(final String[] args) {
178         try {
179             
180             final String[][] keys = {{"\nABCD"}, 
181                     {"\nABC"}, 
182                     {"\nAB"}, 
183                     {"\n//xyz//"}, // 12 13 14 15 16 17 18 19
184                     {"\nxxxxx"}, 
185                     {"\nabcdefghi"}, 
186                     {"\nabcde", "\nabcdefg"}}; 
187             
188             final StringBuilder dataString = new StringBuilder();
189             
190             final FastBoyerMoore scanner = new FastBoyerMoore(keys);
191             
192             for (int i = 0; i < keys.length; i++) {
193                 for (int j = 0; j < keys[i].length; j++) {
194                     dataString.append(keys[i][j]);
195                 }
196             }
197             
198             
199             final byte[] dataBytes = dataString.toString().getBytes();
200 
201             
202             final List<int[]> result = new ArrayList<>();
203             scanner.setData(dataBytes);
204             scanner.scan(0, dataBytes.length, result);
205             for (int i = 0; i < result.size(); i++) {
206                 final int[] tmp = result.get(i);
207                 logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
208             }
209         } catch (Exception e) {
210             logger.error("Exception in test", e);
211         }
212     }
213 
214     
215 
216 
217 
218 
219 
220 
221 
222     public static class BackwardsTreeScanner {
223         
224         
225         
226         State root = new State((byte) 0);
227         @Nullable
228         byte[] data = null;
229 
230         public BackwardsTreeScanner(final BackwardsTreeScanner o) {
231             this.root = o.root;
232         }
233 
234         public BackwardsTreeScanner(final String[][] keywordStrings) throws Exception {
235             for (int i = 0; i < keywordStrings.length; i++) {
236                 for (int j = 0; j < keywordStrings[i].length; j++) {
237                     this.root.learn(keywordStrings[i][j].getBytes(), i);
238                 }
239             }
240             
241         }
242 
243         public BackwardsTreeScanner(final String[] keywordStrings) throws Exception {
244             
245             final byte[][] keywords = new byte[keywordStrings.length][];
246             
247             for (int i = 0; i < keywords.length; i++) {
248                 keywords[i] = keywordStrings[i].getBytes();
249                 this.root.learn(keywordStrings[i].getBytes(), i);
250             }
251             
252         }
253 
254         public static void main(final String[] args) {
255             try {
256                 
257                 final String[][] keys = {{"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi"}, {"\nabcde", "\nabcdefg"}};
258 
259                 
260                 final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
261                 
262                 final StringBuilder dataString = new StringBuilder();
263                 
264                 for (int i = 0; i < keys.length; i++) {
265                     for (int j = 0; j < keys[i].length; j++) {
266                         dataString.append(keys[i][j]);
267                     }
268                 }
269                 
270                 
271                 final byte[] dataBytes = dataString.toString().getBytes();
272 
273                 
274                 final List<int[]> hits = new ArrayList<>();
275                 
276 
277 
278 
279                 for (int pos = 1; pos < dataBytes.length; pos++) {
280                     scanner.scan(dataBytes, pos, hits);
281                     for (int i = 0; i < hits.size(); i++) {
282                         final int[] tmp = hits.get(i);
283                         logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
284                     }
285                     hits.clear();
286                 }
287             } catch (Exception e) {
288                 logger.error("Exception in test", e);
289             }
290         }
291 
292         public static void main2(final String[] args) {
293             try {
294                 
295                 final String[] keys = {"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi", "\nabcde", "\nabcdefg"};
296                 
297                 final StringBuilder dataString = new StringBuilder();
298                 
299                 final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
300                 
301                 for (int i = 0; i < keys.length; i++) {
302                     dataString.append(keys[i]);
303                 }
304                 
305                 
306                 final byte[] dataBytes = dataString.toString().getBytes();
307 
308                 
309                 final List<int[]> hits = new ArrayList<>();
310                 
311 
312 
313 
314                 for (int pos = 1; pos < dataBytes.length; pos++) {
315                     scanner.scan(dataBytes, pos, hits);
316                     for (int i = 0; i < hits.size(); i++) {
317                         final int[] tmp = hits.get(i);
318                         logger.info("Hit At: {} id: {}", tmp[0], tmp[1]);
319                     }
320                     hits.clear();
321                 }
322             } catch (Exception e) {
323                 logger.error("Exception in test", e);
324             }
325         }
326 
327         public synchronized State getRoot() {
328             return this.root;
329         }
330 
331         public synchronized void setData(final byte[] dataArg) {
332             this.data = dataArg;
333         }
334 
335         
336 
337 
338 
339         public synchronized int scan(final byte[] dataArg, final int offset, final Collection<int[]> result) {
340             this.data = dataArg;
341             return scan(offset, result);
342         }
343 
344         public synchronized int scan(final int curPosArg, final Collection<int[]> result) {
345             if (!(curPosArg < this.data.length)) {
346                 return curPosArg;
347             }
348             State state = this.root;
349             int length = 0;
350             int curPos = curPosArg;
351             while ((state != null) && (curPos >= 0)) {
352                 if (curPos > this.data.length || curPos < 0) {
353                     return curPos;
354                 }
355                 int ch = this.data[curPos];
356                 if (ch < 0) {
357                     ch += Byte.MAX_VALUE - Byte.MIN_VALUE;
358                 }
359                 state = state.nextStates[ch];
360                 length++;
361                 if (state != null && state.matches != null) {
362                     for (int i = 0; i < state.matches.length; i++) {
363                         final int id = state.matches[i];
364                         final int[] tmp = new int[3];
365                         tmp[0] = curPos;
366                         tmp[1] = id;
367                         tmp[2] = length;
368                         result.add(tmp);
369                     }
370                 }
371                 curPos--;
372             }
373             return curPos;
374         }
375 
376         
377 
378 
379         public class State {
380 
381             
382             
383             
384             public State[] nextStates = new State[256];
385 
386             
387             public int gotHereBy;
388 
389             
390             @Nullable
391             public int[] matches = null;
392 
393             
394             public State(final int gotHereBy) {
395                 this.gotHereBy = gotHereBy;
396             }
397 
398             public void learn(final byte[] word, final int id) throws Exception {
399                 learn(word, word.length - 1, id);
400             }
401 
402             
403 
404 
405 
406 
407 
408             public void learn(final byte[] word, final int wordLoc, final int id) throws Exception {
409                 if (word == null) {
410                     throw new Exception("null keyword in BackwardsTreeScanner.learn()");
411                 }
412                 if (wordLoc >= word.length) {
413                     throw new Exception("char pos > word length:" + wordLoc + ">" + word.length);
414                 }
415                 if (wordLoc < 0) {
416                     
417                     
418                     
419                     if (this.matches == null) {
420                         this.matches = new int[0];
421                     }
422                     final int[] newMatches = new int[this.matches.length + 1];
423                     System.arraycopy(this.matches, 0, newMatches, 0, this.matches.length);
424                     this.matches = newMatches;
425                     this.matches[this.matches.length - 1] = id;
426                 } else {
427                     
428                     int nextChar = word[wordLoc];
429                     if (nextChar < 0) {
430                         nextChar += Byte.MAX_VALUE - Byte.MIN_VALUE;
431                     }
432 
433                     
434                     State nextState = this.nextStates[nextChar];
435                     if (nextState == null) {
436                         
437                         nextState = this.nextStates[nextChar] = new State(nextChar);
438                     }
439                     
440                     nextState.learn(word, wordLoc - 1, id);
441                 }
442             }
443 
444             public void print(final PrintStream out) {
445                 print(out, "root:");
446             }
447 
448             
449             public void print(final PrintStream out, final String prefix) {
450                 if ((this.gotHereBy < ' ') || (this.gotHereBy > '~')) {
451                     out.println(prefix + "-> " + "(byte)" + this.gotHereBy);
452                 } else {
453                     out.println(prefix + "-> " + (char) this.gotHereBy);
454                 }
455                 if (this.matches != null) {
456                     out.print(prefix + "ids [");
457                     for (int i = 0; i < this.matches.length; i++) {
458                         out.print(" " + this.matches[i]);
459                     }
460                     out.println(" ]");
461                 }
462                 for (int i = 0; i < this.nextStates.length; i++) {
463                     if (this.nextStates[i] != null) {
464                         this.nextStates[i].print(out, prefix + "  ");
465                     }
466                 }
467             }
468         }
469     }
470 }