1 package emissary.kff;
2
3 import emissary.core.channels.SeekableByteChannelFactory;
4
5 import jakarta.annotation.Nullable;
6 import org.apache.commons.io.IOUtils;
7 import org.slf4j.Logger;
8 import org.slf4j.LoggerFactory;
9
10 import java.io.File;
11 import java.io.IOException;
12 import java.io.InputStream;
13 import java.io.RandomAccessFile;
14 import java.nio.channels.Channels;
15 import java.nio.channels.SeekableByteChannel;
16 import java.nio.file.Files;
17 import java.nio.file.Paths;
18 import java.util.Arrays;
19
20
21
22
23
24
25
26
27 public final class Ssdeep {
28
29 private static final Logger logger = LoggerFactory.getLogger(Ssdeep.class);
30
31 private static final int SPAMSUM_LENGTH = 64;
32 private static final int MIN_BLOCKSIZE = 3;
33
34 @SuppressWarnings("PMD.UselessParentheses")
35 public static final int FUZZY_MAX_RESULT = SPAMSUM_LENGTH + (SPAMSUM_LENGTH / 2 + 20);
36
37
38 private static final int ROLLING_WINDOW_SIZE = 7;
39
40
41 private static final int BUFFER_SIZE = 8192;
42
43
44 private static final long HASH_INIT = 0x28021967;
45
46
47 private static final long HASH_PRIME = 0x01000193;
48
49
50 private static final long MASK32 = 0xffffffffL;
51
52
53
54
55
56 private static final byte[] b64Table = SpamSumSignature.getBytes("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
57
58
59
60
61
62
63
64 private static byte b64EncodeLowBits(final long v) {
65 return b64Table[((int) v) & 0x3f];
66 }
67
68 private static final class SsContext {
69
70
71 private final byte[] fuzzHash1 = new byte[SPAMSUM_LENGTH + 1];
72
73
74 private final byte[] fuzzHash2 = new byte[SPAMSUM_LENGTH / 2 + 1];
75
76
77 private int fuzzLen1;
78
79
80 private int fuzzLen2;
81
82 private long sumHash1;
83 private long sumHash2;
84 private long blockSize;
85
86
87
88
89
90
91
92
93 private static long estimateBlockSize(final long expectedInputLength) {
94 long blockSize = MIN_BLOCKSIZE;
95 while ((blockSize * SPAMSUM_LENGTH) < expectedInputLength) {
96 blockSize *= 2;
97 }
98 return blockSize;
99 }
100
101
102
103
104
105
106
107 public SsContext(@Nullable final File f) {
108 final long expectedInputLength = (f != null) ? f.length() : 0;
109 this.blockSize = estimateBlockSize(expectedInputLength);
110 }
111
112
113
114
115
116
117
118 public SsContext(@Nullable final byte[] data) {
119 final long expectedInputLength = (data != null) ? data.length : 0;
120 this.blockSize = estimateBlockSize(expectedInputLength);
121 }
122
123
124
125
126
127
128
129 public SsContext(final SeekableByteChannelFactory sbcf) {
130 long expectedInputLength = 0;
131
132 try (SeekableByteChannel sbc = sbcf.create()) {
133 expectedInputLength = sbc.size();
134 } catch (final IOException ignored) {
135
136 }
137
138 this.blockSize = estimateBlockSize(expectedInputLength);
139 }
140
141
142
143
144
145
146
147
148 private static long updateSumHash(final int b, final long h) {
149 return ((h * HASH_PRIME) ^ b) & MASK32;
150 }
151
152
153
154
155
156
157
158
159
160 private void applyBytes(final RollingState rollState, final byte[] buffer, final int start, final int end) {
161
162
163
164
165 for (int i = start; i < end; i++) {
166
167 final int nextByte = ((int) buffer[i]) & 0xff;
168
169
170 this.sumHash1 = updateSumHash(nextByte, this.sumHash1);
171 this.sumHash2 = updateSumHash(nextByte, this.sumHash2);
172 final long rollingHash = rollState.roll(nextByte);
173
174 if ((rollingHash % this.blockSize) == (this.blockSize - 1)) {
175
176
177
178 if (this.fuzzLen1 < (SPAMSUM_LENGTH - 1)) {
179
180
181
182
183
184
185
186 this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
187 this.sumHash1 = HASH_INIT;
188 }
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206 if ((rollingHash % (this.blockSize * 2)) == ((this.blockSize * 2) - 1)) {
207 if (this.fuzzLen2 < (SPAMSUM_LENGTH / 2 - 1)) {
208 this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
209 this.sumHash2 = HASH_INIT;
210 }
211 }
212 }
213 }
214 }
215
216
217
218
219
220
221 private void beginHashing() {
222 this.fuzzLen1 = 0;
223 this.fuzzLen2 = 0;
224 this.sumHash1 = HASH_INIT;
225 this.sumHash2 = HASH_INIT;
226 }
227
228
229
230
231
232
233
234
235
236 private static byte[] truncateArray(final byte[] input, final int maxLength) {
237 if (input.length == maxLength) {
238 return input;
239 } else {
240 return Arrays.copyOf(input, maxLength);
241 }
242 }
243
244
245
246
247
248
249
250
251 private SpamSumSignature finishHashing(final RollingState rollState) {
252 if (rollState.getHash() != 0) {
253 this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
254 this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
255 }
256
257 final byte[] finalHash1 = truncateArray(this.fuzzHash1, this.fuzzLen1);
258 final byte[] finalHash2 = truncateArray(this.fuzzHash2, this.fuzzLen2);
259 return new SpamSumSignature(this.blockSize, finalHash1, finalHash2);
260 }
261
262
263
264
265
266
267
268
269
270
271
272 public SpamSumSignature generateHash(@Nullable final byte[] data) {
273 beginHashing();
274 final RollingState rollState = new RollingState();
275
276 if (data != null) {
277 applyBytes(rollState, data, 0, data.length);
278 }
279
280 return finishHashing(rollState);
281 }
282
283 public SpamSumSignature generateHash(final SeekableByteChannelFactory sbcf) {
284 beginHashing();
285 final RollingState rollState = new RollingState();
286
287 try (InputStream is = Channels.newInputStream(sbcf.create())) {
288 final byte[] b = new byte[1024];
289
290 int bytesRead;
291 while ((bytesRead = is.read(b)) != -1) {
292 applyBytes(rollState, b, 0, bytesRead);
293 }
294 } catch (final IOException ignored) {
295
296 }
297
298 return finishHashing(rollState);
299 }
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314 public SpamSumSignature generateHash(final RandomAccessFile stream) throws IOException {
315 beginHashing();
316 final RollingState rollState = new RollingState();
317
318 final byte[] buffer = new byte[BUFFER_SIZE];
319 while (true) {
320 final int bytesRead = stream.read(buffer, 0, buffer.length);
321 if (bytesRead <= 0) {
322 break;
323 }
324 applyBytes(rollState, buffer, 0, bytesRead);
325 }
326
327 return finishHashing(rollState);
328 }
329 }
330
331
332
333
334
335 private static final class RollingState {
336
337
338 private final int[] window = new int[ROLLING_WINDOW_SIZE];
339
340
341 private int windowPosition;
342
343
344 private long h1;
345
346
347
348
349
350 private long h2;
351
352
353
354
355
356 private long h3;
357
358
359
360
361 public RollingState() {}
362
363
364
365
366
367
368 public long getHash() {
369 return (this.h1 + this.h2 + this.h3) & MASK32;
370 }
371
372
373
374
375
376
377
378 public long roll(final int b) {
379 this.h2 = (this.h2 - this.h1 + (ROLLING_WINDOW_SIZE * ((long) b))) & MASK32;
380 this.h1 = (this.h1 + b - this.window[this.windowPosition]) & MASK32;
381 this.window[this.windowPosition] = b;
382
383
384 if (this.windowPosition == (ROLLING_WINDOW_SIZE - 1)) {
385 this.windowPosition = 0;
386 } else {
387 this.windowPosition++;
388 }
389
390 this.h3 = ((this.h3 << 5) & MASK32) ^ b;
391
392 return (this.h1 + this.h2 + this.h3) & MASK32;
393 }
394 }
395
396 public Ssdeep() {}
397
398
399
400
401
402
403
404 public String fuzzyHash(final byte[] data) {
405 final SsContext ctx = new SsContext(data);
406 while (true) {
407 final SpamSumSignature signature = ctx.generateHash(data);
408
409
410
411 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
412 ctx.blockSize = ctx.blockSize / 2;
413 } else {
414 return signature.toString();
415 }
416 }
417 }
418
419 public String fuzzyHash(final SeekableByteChannelFactory sbcf) {
420 final SsContext ctx = new SsContext(sbcf);
421 while (true) {
422 final SpamSumSignature signature = ctx.generateHash(sbcf);
423
424
425
426 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
427 ctx.blockSize = ctx.blockSize / 2;
428 } else {
429 return signature.toString();
430 }
431 }
432 }
433
434
435
436
437
438
439
440
441 public String fuzzyHashFile(final File file) throws IOException {
442 try (RandomAccessFile stream = new RandomAccessFile(file, "r")) {
443 final SsContext ctx = new SsContext(file);
444 while (true) {
445 stream.seek(0);
446 final SpamSumSignature signature = ctx.generateHash(stream);
447
448
449
450 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
451 ctx.blockSize = ctx.blockSize / 2;
452 } else {
453 return signature.toString();
454 }
455 }
456 }
457 }
458
459
460
461
462
463
464
465
466 public String fuzzyHashFile(final String fileName) throws IOException {
467 return this.fuzzyHashFile(new File(fileName));
468 }
469
470
471
472
473
474
475
476
477
478
479
480
481
482 private static int indexOfSubSequence(final byte[] haystack, final byte[] needle, final int needleStart, final int length) {
483 final int lastCandidatePos = haystack.length - length;
484 final byte firstNeedleByte = needle[needleStart];
485 NEXT_CANDIDATE: for (int candidatePos = 0; candidatePos <= lastCandidatePos; candidatePos++) {
486 if (haystack[candidatePos] == firstNeedleByte) {
487
488
489
490 for (int needlePos = 1; needlePos < length; needlePos++) {
491 if (haystack[candidatePos + needlePos] != needle[needleStart + needlePos]) {
492 continue NEXT_CANDIDATE;
493 }
494 }
495
496
497 return candidatePos;
498 }
499 }
500 return -1;
501 }
502
503
504
505
506
507
508
509
510
511
512 private static boolean hasCommonSequence(final byte[] s1, final byte[] s2, final int length) {
513 if ((s1.length < length) || (s2.length < length)) {
514 return false;
515 }
516
517
518
519
520 final int lastS1Pos = s1.length - length;
521 for (int s1Pos = 0; s1Pos <= lastS1Pos; s1Pos++) {
522 final int s2Pos = indexOfSubSequence(s2, s1, s1Pos, length);
523 if (s2Pos != -1) {
524 return true;
525 }
526 }
527 return false;
528 }
529
530
531
532
533
534
535
536
537
538 private static byte[] eliminateLongSequences(final byte[] in) {
539 if (in.length < 4) {
540 return in;
541 }
542
543
544 byte prev = (in[0] != 0) ? 0 : (byte) 1;
545 int repeatCount = 0;
546
547
548
549 int inPos = 0;
550 while (true) {
551 if (inPos == in.length) {
552
553 return in;
554 } else if (in[inPos] == prev) {
555
556 repeatCount++;
557 if (repeatCount == 3) {
558 break;
559 }
560 } else {
561
562 prev = in[inPos];
563 repeatCount = 0;
564 }
565 inPos++;
566 }
567
568
569
570
571
572 final byte[] out = new byte[in.length - 1];
573 System.arraycopy(in, 0, out, 0, inPos);
574 int outPos = inPos;
575
576
577 while (++inPos < in.length) {
578 if (in[inPos] == prev) {
579 repeatCount++;
580 } else {
581 prev = in[inPos];
582 repeatCount = 0;
583 }
584 if (repeatCount < 3) {
585 out[outPos++] = in[inPos];
586 }
587 }
588
589 return (outPos == out.length) ? out : Arrays.copyOf(out, outPos);
590 }
591
592
593
594
595
596 private static long scoreStrings(final byte[] s1, final byte[] s2, final long blockSize) {
597 final int len1 = s1.length;
598 final int len2 = s2.length;
599
600 if ((len1 > SPAMSUM_LENGTH) || (len2 > SPAMSUM_LENGTH)) {
601
602 return 0;
603 }
604
605
606
607 if (!hasCommonSequence(s1, s2, ROLLING_WINDOW_SIZE)) {
608 return 0;
609 }
610
611
612
613
614 long score = EditDistance.calculateEditDistance(s1, len1, s2, len2);
615 if (logger.isDebugEnabled()) {
616 logger.debug("edit_dist: {}", score);
617 }
618
619
620
621
622
623
624 score = score * SPAMSUM_LENGTH / (len1 + len2);
625
626
627
628
629
630 score = 100 * score / 64;
631
632
633
634 if (score >= 100) {
635 return 0;
636 }
637
638
639
640 score = 100 - score;
641
642
643
644 if (score > (blockSize / MIN_BLOCKSIZE * Math.min(len1, len2))) {
645 score = blockSize / MIN_BLOCKSIZE * Math.min(len1, len2);
646 }
647 return score;
648 }
649
650
651
652
653
654
655
656
657
658 public int compare(@Nullable final SpamSumSignature signature1, @Nullable final SpamSumSignature signature2) {
659 if ((null == signature1) || (null == signature2)) {
660 return -1;
661 }
662 final long blockSize1 = signature1.getBlockSize();
663 final long blockSize2 = signature2.getBlockSize();
664
665
666
667
668
669
670 if ((blockSize1 != blockSize2) && (blockSize1 != (blockSize2 * 2)) && (blockSize2 != (blockSize1 * 2))) {
671 if (logger.isDebugEnabled()) {
672 logger.debug("block sizes too different: {} {}", blockSize1, blockSize2);
673 }
674 return 0;
675 }
676
677
678
679
680
681 final byte[] s1First = eliminateLongSequences(signature1.getHashPart1());
682 final byte[] s1Second = eliminateLongSequences(signature1.getHashPart2());
683 final byte[] s2First = eliminateLongSequences(signature2.getHashPart1());
684 final byte[] s2Second = eliminateLongSequences(signature2.getHashPart2());
685
686
687
688
689 final long score;
690 if (blockSize1 == blockSize2) {
691
692 final long score1 = scoreStrings(s1First, s2First, blockSize1);
693 final long score2 = scoreStrings(s1Second, s2Second, blockSize2);
694 score = Math.max(score1, score2);
695 } else if (blockSize1 == (blockSize2 * 2)) {
696
697 score = scoreStrings(s1First, s2Second, blockSize1);
698 } else {
699
700 score = scoreStrings(s1Second, s2First, blockSize2);
701 }
702
703 return (int) score;
704 }
705
706 @SuppressWarnings("SystemOut")
707 public static void main(final String[] args) throws Exception {
708 final Ssdeep ss = new Ssdeep();
709 for (final String f : args) {
710 try (InputStream is = Files.newInputStream(Paths.get(f))) {
711 final byte[] buffer = IOUtils.toByteArray(is);
712
713 System.out.println(ss.fuzzyHash(buffer) + ",\"" + f + "\"");
714 }
715 }
716 }
717 }