1 package emissary.util.search;
2
3 import org.slf4j.Logger;
4 import org.slf4j.LoggerFactory;
5
6 import java.io.PrintStream;
7 import java.util.ArrayList;
8 import java.util.Collection;
9 import java.util.List;
10 import javax.annotation.Nullable;
11
12 @SuppressWarnings("AvoidObjectArrays")
13 public class FastBoyerMoore {
14 private static final Logger logger = LoggerFactory.getLogger(FastBoyerMoore.class);
15 public byte[][] keywords;
16 int minKeywordLength;
17 int[] lookup = new int[259];
18 transient BackwardsTreeScanner scanner;
19 @Nullable
20 byte[] data = null;
21 BackwardsTreeScanner.State root;
22
23
24
25 private FastBoyerMoore() {}
26
27 public FastBoyerMoore(final FastBoyerMoore original) {
28 this();
29 this.keywords = original.keywords;
30 this.minKeywordLength = original.minKeywordLength;
31 this.lookup = original.lookup;
32 this.root = original.root;
33 this.scanner = new BackwardsTreeScanner(original.scanner);
34 }
35
36 public FastBoyerMoore(final String[] keywordStrings) throws Exception {
37 this.keywords = new byte[keywordStrings.length][];
38 this.minKeywordLength = Integer.MAX_VALUE;
39 for (int i = 0; i < this.keywords.length; i++) {
40 this.keywords[i] = keywordStrings[i].getBytes();
41 this.minKeywordLength = Math.min(this.minKeywordLength, this.keywords[i].length);
42 }
43 for (int i = 0; i < this.lookup.length; i++) {
44 this.lookup[i] = this.minKeywordLength;
45 }
46
47 for (int i = 0; i < this.keywords.length; i++) {
48 final byte[] kw = this.keywords[i];
49
50 for (int j = 0; j < kw.length - 1; j++) {
51 this.lookup[kw[j]] = Math.min(this.lookup[kw[j]], kw.length - j - 1);
52 }
53 }
54 this.data = null;
55 this.scanner = new BackwardsTreeScanner(keywordStrings);
56 this.root = this.scanner.getRoot();
57 }
58
59 public FastBoyerMoore(final String[][] keywordStrings) throws Exception {
60 this.minKeywordLength = Integer.MAX_VALUE;
61 for (int i = 0; i < keywordStrings.length; i++) {
62 for (int j = 0; j < keywordStrings[i].length; j++) {
63 this.minKeywordLength = Math.min(this.minKeywordLength, keywordStrings[i][j].length());
64 }
65 }
66 for (int i = 0; i < this.lookup.length; i++) {
67 this.lookup[i] = this.minKeywordLength;
68 }
69
70 for (int i = 0; i < keywordStrings.length; i++) {
71 for (int j = 0; j < keywordStrings[i].length; j++) {
72 final byte[] kw = keywordStrings[i][j].getBytes();
73
74 for (int k = 0; k < kw.length - 1; k++) {
75 this.lookup[kw[k]] = Math.min(this.lookup[kw[k]], kw.length - k - 1);
76 }
77 }
78 }
79 this.data = null;
80 this.scanner = new BackwardsTreeScanner(keywordStrings);
81 this.root = this.scanner.getRoot();
82 }
83
84 public void setData(final byte[] dataArg) {
85 this.scanner.setData(dataArg);
86 this.data = dataArg;
87 }
88
89 public void scan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
90 this.data = dataArg;
91 this.scanner.setData(dataArg);
92 scan(start, end, result);
93 }
94
95 public void scan(final int start, final int end, final Collection<int[]> result) {
96 final int actualEnd = Math.min(end, this.data.length);
97 int pos = start;
98 while (pos < actualEnd) {
99 final int ch = this.data[pos] & 0x7f;
100 final int jump = this.lookup[ch];
101 BackwardsTreeScanner.State state = this.root.nextStates[ch];
102 int curPos = pos - 1;
103 while ((state != null) && (curPos >= 0)) {
104 if (state.matches != null) {
105 for (int i = 0; i < state.matches.length; i++) {
106 final int id = state.matches[i];
107 final int[] tmp = new int[3];
108 tmp[0] = curPos + 1;
109 tmp[1] = id;
110 tmp[2] = pos - curPos;
111 result.add(tmp);
112 }
113 }
114 final int ch2 = this.data[curPos] & 0x7f;
115 state = state.nextStates[ch2];
116 curPos--;
117 }
118 if ((state != null) && (curPos == -1)) {
119 for (int i = 0; i < state.matches.length; i++) {
120 final int id = state.matches[i];
121 final int[] tmp = new int[3];
122 tmp[0] = curPos + 1;
123 tmp[1] = id;
124 tmp[2] = pos - curPos;
125 result.add(tmp);
126 }
127 }
128 pos += jump;
129 }
130 }
131
132 public int staticSingleScan(final byte[] dataArg, final int start, final int end, final Collection<int[]> result) {
133 final int actualEnd = Math.min(end, dataArg.length);
134 boolean found = false;
135 int pos = start;
136 while ((pos < actualEnd) && !found) {
137 final int ch = dataArg[pos] & 0x7f;
138 final int jump = this.lookup[ch];
139 BackwardsTreeScanner.State state = this.root.nextStates[ch];
140 int curPos = pos - 1;
141 while ((state != null) && (curPos >= 0)) {
142 if (state.matches != null) {
143 for (int i = 0; i < state.matches.length; i++) {
144 final int id = state.matches[i];
145 final int[] tmp = new int[3];
146 tmp[0] = curPos + 1;
147 tmp[1] = id;
148 tmp[2] = pos - curPos;
149 result.add(tmp);
150 found = true;
151 }
152 }
153 final int ch2 = dataArg[curPos] & 0x7f;
154 state = state.nextStates[ch2];
155 curPos--;
156 }
157 if ((state != null) && (state.matches != null) && (curPos == -1)) {
158 for (int i = 0; i < state.matches.length; i++) {
159 final int id = state.matches[i];
160 final int[] tmp = new int[3];
161 tmp[0] = curPos + 1;
162 tmp[1] = id;
163 tmp[2] = pos - curPos;
164 result.add(tmp);
165 found = true;
166 }
167 }
168 pos += jump;
169 }
170 return pos;
171 }
172
173 public static final int ID = 1;
174 public static final int LOC = 0;
175 public static final int LENGTH = 2;
176
177 public static void main(final String[] args) {
178 try {
179
180 final String[][] keys = {{"\nABCD"},
181 {"\nABC"},
182 {"\nAB"},
183 {"\n//xyz//"}, // 12 13 14 15 16 17 18 19
184 {"\nxxxxx"},
185 {"\nabcdefghi"},
186 {"\nabcde", "\nabcdefg"}};
187
188 final StringBuilder dataString = new StringBuilder();
189
190 final FastBoyerMoore scanner = new FastBoyerMoore(keys);
191
192 for (int i = 0; i < keys.length; i++) {
193 for (int j = 0; j < keys[i].length; j++) {
194 dataString.append(keys[i][j]);
195 }
196 }
197
198
199 final byte[] dataBytes = dataString.toString().getBytes();
200
201
202 final List<int[]> result = new ArrayList<>();
203 scanner.setData(dataBytes);
204 scanner.scan(0, dataBytes.length, result);
205 for (int i = 0; i < result.size(); i++) {
206 final int[] tmp = result.get(i);
207 logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
208 }
209 } catch (Exception e) {
210 logger.error("Exception in test", e);
211 }
212 }
213
214
215
216
217
218
219
220
221
222 public static class BackwardsTreeScanner {
223
224
225
226 State root = new State((byte) 0);
227 @Nullable
228 byte[] data = null;
229
230 public BackwardsTreeScanner(final BackwardsTreeScanner o) {
231 this.root = o.root;
232 }
233
234 public BackwardsTreeScanner(final String[][] keywordStrings) throws Exception {
235 for (int i = 0; i < keywordStrings.length; i++) {
236 for (int j = 0; j < keywordStrings[i].length; j++) {
237 this.root.learn(keywordStrings[i][j].getBytes(), i);
238 }
239 }
240
241 }
242
243 public BackwardsTreeScanner(final String[] keywordStrings) throws Exception {
244
245 final byte[][] keywords = new byte[keywordStrings.length][];
246
247 for (int i = 0; i < keywords.length; i++) {
248 keywords[i] = keywordStrings[i].getBytes();
249 this.root.learn(keywordStrings[i].getBytes(), i);
250 }
251
252 }
253
254 public static void main(final String[] args) {
255 try {
256
257 final String[][] keys = {{"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi"}, {"\nabcde", "\nabcdefg"}};
258
259
260 final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
261
262 final StringBuilder dataString = new StringBuilder();
263
264 for (int i = 0; i < keys.length; i++) {
265 for (int j = 0; j < keys[i].length; j++) {
266 dataString.append(keys[i][j]);
267 }
268 }
269
270
271 final byte[] dataBytes = dataString.toString().getBytes();
272
273
274 final List<int[]> hits = new ArrayList<>();
275
276
277
278
279 for (int pos = 1; pos < dataBytes.length; pos++) {
280 scanner.scan(dataBytes, pos, hits);
281 for (int i = 0; i < hits.size(); i++) {
282 final int[] tmp = hits.get(i);
283 logger.info("Hit At: {} id: {} l: {}", tmp[0], tmp[1], tmp[2]);
284 }
285 hits.clear();
286 }
287 } catch (Exception e) {
288 logger.error("Exception in test", e);
289 }
290 }
291
292 public static void main2(final String[] args) {
293 try {
294
295 final String[] keys = {"\nABCD", "\nABC", "\nAB", "\n//xyz//", "\nxxxxx", "\nabcdefghi", "\nabcde", "\nabcdefg"};
296
297 final StringBuilder dataString = new StringBuilder();
298
299 final BackwardsTreeScanner scanner = new BackwardsTreeScanner(keys);
300
301 for (int i = 0; i < keys.length; i++) {
302 dataString.append(keys[i]);
303 }
304
305
306 final byte[] dataBytes = dataString.toString().getBytes();
307
308
309 final List<int[]> hits = new ArrayList<>();
310
311
312
313
314 for (int pos = 1; pos < dataBytes.length; pos++) {
315 scanner.scan(dataBytes, pos, hits);
316 for (int i = 0; i < hits.size(); i++) {
317 final int[] tmp = hits.get(i);
318 logger.info("Hit At: {} id: {}", tmp[0], tmp[1]);
319 }
320 hits.clear();
321 }
322 } catch (Exception e) {
323 logger.error("Exception in test", e);
324 }
325 }
326
327 public synchronized State getRoot() {
328 return this.root;
329 }
330
331 public synchronized void setData(final byte[] dataArg) {
332 this.data = dataArg;
333 }
334
335
336
337
338
339 public synchronized int scan(final byte[] dataArg, final int offset, final Collection<int[]> result) {
340 this.data = dataArg;
341 return scan(offset, result);
342 }
343
344 public synchronized int scan(final int curPosArg, final Collection<int[]> result) {
345 if (!(curPosArg < this.data.length)) {
346 return curPosArg;
347 }
348 State state = this.root;
349 int length = 0;
350 int curPos = curPosArg;
351 while ((state != null) && (curPos >= 0)) {
352 if (curPos > this.data.length || curPos < 0) {
353 return curPos;
354 }
355 int ch = this.data[curPos];
356 if (ch < 0) {
357 ch += Byte.MAX_VALUE - Byte.MIN_VALUE;
358 }
359 state = state.nextStates[ch];
360 length++;
361 if (state != null && state.matches != null) {
362 for (int i = 0; i < state.matches.length; i++) {
363 final int id = state.matches[i];
364 final int[] tmp = new int[3];
365 tmp[0] = curPos;
366 tmp[1] = id;
367 tmp[2] = length;
368 result.add(tmp);
369 }
370 }
371 curPos--;
372 }
373 return curPos;
374 }
375
376
377
378
379 public class State {
380
381
382
383
384 public State[] nextStates = new State[256];
385
386
387 public int gotHereBy;
388
389
390 @Nullable
391 public int[] matches = null;
392
393
394 public State(final int gotHereBy) {
395 this.gotHereBy = gotHereBy;
396 }
397
398 public void learn(final byte[] word, final int id) throws Exception {
399 learn(word, word.length - 1, id);
400 }
401
402
403
404
405
406
407
408 public void learn(final byte[] word, final int wordLoc, final int id) throws Exception {
409 if (word == null) {
410 throw new Exception("null keyword in BackwardsTreeScanner.learn()");
411 }
412 if (wordLoc >= word.length) {
413 throw new Exception("char pos > word length:" + wordLoc + ">" + word.length);
414 }
415 if (wordLoc < 0) {
416
417
418
419 if (this.matches == null) {
420 this.matches = new int[0];
421 }
422 final int[] newMatches = new int[this.matches.length + 1];
423 System.arraycopy(this.matches, 0, newMatches, 0, this.matches.length);
424 this.matches = newMatches;
425 this.matches[this.matches.length - 1] = id;
426 } else {
427
428 int nextChar = word[wordLoc];
429 if (nextChar < 0) {
430 nextChar += Byte.MAX_VALUE - Byte.MIN_VALUE;
431 }
432
433
434 State nextState = this.nextStates[nextChar];
435 if (nextState == null) {
436
437 nextState = this.nextStates[nextChar] = new State(nextChar);
438 }
439
440 nextState.learn(word, wordLoc - 1, id);
441 }
442 }
443
444 public void print(final PrintStream out) {
445 print(out, "root:");
446 }
447
448
449 public void print(final PrintStream out, final String prefix) {
450 if ((this.gotHereBy < ' ') || (this.gotHereBy > '~')) {
451 out.println(prefix + "-> " + "(byte)" + this.gotHereBy);
452 } else {
453 out.println(prefix + "-> " + (char) this.gotHereBy);
454 }
455 if (this.matches != null) {
456 out.print(prefix + "ids [");
457 for (int i = 0; i < this.matches.length; i++) {
458 out.print(" " + this.matches[i]);
459 }
460 out.println(" ]");
461 }
462 for (int i = 0; i < this.nextStates.length; i++) {
463 if (this.nextStates[i] != null) {
464 this.nextStates[i].print(out, prefix + " ");
465 }
466 }
467 }
468 }
469 }
470 }