1 package emissary.kff;
2
3 import emissary.core.channels.SeekableByteChannelFactory;
4
5 import jakarta.annotation.Nullable;
6 import org.apache.commons.io.IOUtils;
7 import org.slf4j.Logger;
8 import org.slf4j.LoggerFactory;
9
10 import java.io.File;
11 import java.io.IOException;
12 import java.io.InputStream;
13 import java.io.RandomAccessFile;
14 import java.nio.channels.Channels;
15 import java.nio.channels.SeekableByteChannel;
16 import java.nio.file.Files;
17 import java.nio.file.Paths;
18 import java.util.Arrays;
19
20 /**
21 * A java port of the ssdeep code for "fuzzy hashing". http://ssdeep.sourceforge.net There are a number of ports out
22 * there that all look basically the same. This one is from
23 * https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
24 *
25 * A new ssdeep hash gets calculated and saved at each level of unwrapping.
26 */
27 public final class Ssdeep {
28
29 private static final Logger logger = LoggerFactory.getLogger(Ssdeep.class);
30
31 private static final int SPAMSUM_LENGTH = 64;
32 private static final int MIN_BLOCKSIZE = 3;
33
34 @SuppressWarnings("PMD.UselessParentheses")
35 public static final int FUZZY_MAX_RESULT = SPAMSUM_LENGTH + (SPAMSUM_LENGTH / 2 + 20);
36
37 /** The window size for the rolling hash. */
38 private static final int ROLLING_WINDOW_SIZE = 7;
39
40 /** The buffer size to use when reading data from a file. */
41 private static final int BUFFER_SIZE = 8192;
42
43 /** FNV hash initial value, 32-bit unsigned. */
44 private static final long HASH_INIT = 0x28021967;
45
46 /** FNV hash prime multiplier, 32-bit unsigned. */
47 private static final long HASH_PRIME = 0x01000193;
48
49 /** Used to mask long values to 32 bits unsigned. */
50 private static final long MASK32 = 0xffffffffL;
51
52 /**
53 * Base64 encoding table. Given a 5-bit value {@code n}, position {@code n} in the array is the code point (expressed as
54 * a byte) that should appear.
55 */
56 private static final byte[] b64Table = SpamSumSignature.getBytes("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
57
58 /**
59 * Get the base64 encoding of the low 6 bits of the given value.
60 *
61 * @param v The value to encode.
62 * @return The base64 encoding of the low 6 bits of {@code v}. The returned value is a code point expressed as a byte.
63 */
64 private static byte b64EncodeLowBits(final long v) {
65 return b64Table[((int) v) & 0x3f];
66 }
67
68 private static final class SsContext {
69
70 /** A buffer for the main hash output. */
71 private final byte[] fuzzHash1 = new byte[SPAMSUM_LENGTH + 1];
72
73 /** A buffer for the secondary hash output. */
74 private final byte[] fuzzHash2 = new byte[SPAMSUM_LENGTH / 2 + 1];
75
76 /** The count of output bytes currently stored in {@link #fuzzHash1}, initially zero. */
77 private int fuzzLen1;
78
79 /** The count of output bytes currently stored in {@link #fuzzHash2}, initially zero. */
80 private int fuzzLen2;
81
82 private long sumHash1; // Initially zero.
83 private long sumHash2; // Initially zero.
84 private long blockSize; // Initialized by constructor.
85
86 /**
87 * Estimate the block size to use.
88 *
89 * @param expectedInputLength The expected amount of data to be processed, in bytes. A 0 value can be used if the length
90 * is unknown, in which case a default block size is returned.
91 * @return The estimated block size to use.
92 */
93 private static long estimateBlockSize(final long expectedInputLength) {
94 long blockSize = MIN_BLOCKSIZE;
95 while ((blockSize * SPAMSUM_LENGTH) < expectedInputLength) {
96 blockSize *= 2;
97 }
98 return blockSize;
99 }
100
101 /**
102 * Construct a spam sum context to process a file.
103 *
104 * @param f The file that will be processed, if known. If non-{@code null}, the length of the file is used to guess the
105 * hash block size to use.
106 */
107 public SsContext(@Nullable final File f) {
108 final long expectedInputLength = (f != null) ? f.length() : 0;
109 this.blockSize = estimateBlockSize(expectedInputLength);
110 }
111
112 /**
113 * Construct a spam sum context to process a byte array.
114 *
115 * @param data The bytes that will be processed, if known. If non-{@code null}, the length of the array is used to guess
116 * the hash block size to use.
117 */
118 public SsContext(@Nullable final byte[] data) {
119 final long expectedInputLength = (data != null) ? data.length : 0;
120 this.blockSize = estimateBlockSize(expectedInputLength);
121 }
122
123 /**
124 * Construct a spam sum context to process a {@link SeekableByteChannel}
125 *
126 * @param sbcf The channel that will be processed, if known. If non-{@code null}, the length of the channel is used to
127 * guess the hash block size to use.
128 */
129 public SsContext(final SeekableByteChannelFactory sbcf) {
130 long expectedInputLength = 0;
131
132 try (SeekableByteChannel sbc = sbcf.create()) {
133 expectedInputLength = sbc.size();
134 } catch (final IOException ignored) {
135 // Ignore
136 }
137
138 this.blockSize = estimateBlockSize(expectedInputLength);
139 }
140
141 /**
142 * A simple non-rolling hash, based on the FNV hash
143 *
144 * @param b The next byte value, assumed to be in the range 0..255.
145 * @param h The existing hash value, 32-bit unsigned.
146 * @return The updated hash value, 32-bit unsigned.
147 */
148 private static long updateSumHash(final int b, final long h) {
149 return ((h * HASH_PRIME) ^ b) & MASK32;
150 }
151
152 /**
153 * Apply some bytes to a SpamSum context.
154 *
155 * @param rollState The rolling hash state to use.
156 * @param buffer A buffer containing the input bytes.
157 * @param start The starting offset in {@code buffer}, inclusive.
158 * @param end The ending offset in {@code buffer}, exclusive.
159 */
160 @SuppressWarnings("PMD.CollapsibleIfStatements")
161 private void applyBytes(final RollingState rollState, final byte[] buffer, final int start, final int end) {
162 // At each byte we update the rolling hash and the normal
163 // hash. When the rolling hash hits the reset value, we
164 // emit the normal hash as an element of the signature and
165 // reset both hashes.
166 for (int i = start; i < end; i++) {
167 // Get the next input byte and normalize to 0..255.
168 final int nextByte = ((int) buffer[i]) & 0xff;
169
170 // Apply the next byte to the hashes.
171 this.sumHash1 = updateSumHash(nextByte, this.sumHash1);
172 this.sumHash2 = updateSumHash(nextByte, this.sumHash2);
173 final long rollingHash = rollState.roll(nextByte);
174
175 if ((rollingHash % this.blockSize) == (this.blockSize - 1)) {
176 // We have hit a reset point. We now emit a hash
177 // which is based on all bytes in the input
178 // between the last reset point and this one.
179 if (this.fuzzLen1 < (SPAMSUM_LENGTH - 1)) {
180 // We can have a problem with the tail
181 // overflowing. The easiest way to cope with
182 // this is to only reset the second hash if we
183 // have room for more characters in our
184 // signature. This has the effect of combining
185 // the last few pieces of the message into a
186 // single piece
187 this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
188 this.sumHash1 = HASH_INIT;
189 }
190
191 // This produces a second signature with a block size
192 // of blockSize*2. By producing dual signatures in
193 // this way the effect of small changes in the message
194 // size near a block size boundary is greatly reduced.
195 //
196 // NOTE: we only have to check this when the main
197 // signature has hit a reset point, because
198 // mathematically:
199 //
200 // [ h === -1 (mod 2*bs) ] --implies--> [ h === -1 (mod bs) ]
201 //
202 // In other words, if this condition is true then the
203 // main signature condition must always also be true.
204 // Therefore this secondary signature condition can
205 // only potentially be true if the main signature
206 // condition (which we've already checked) is true.
207 if ((rollingHash % (this.blockSize * 2)) == ((this.blockSize * 2) - 1)) {
208 if (this.fuzzLen2 < (SPAMSUM_LENGTH / 2 - 1)) {
209 this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
210 this.sumHash2 = HASH_INIT;
211 }
212 }
213 }
214 }
215 }
216
217 /**
218 * Discard any existing hash state and prepare to compute a new hash. This should be followed by calls to
219 * {@link #applyBytes(RollingState, byte[], int, int)} to provide the data, and then
220 * {@link #finishHashing(RollingState)} to complete the computations.
221 */
222 private void beginHashing() {
223 this.fuzzLen1 = 0;
224 this.fuzzLen2 = 0;
225 this.sumHash1 = HASH_INIT;
226 this.sumHash2 = HASH_INIT;
227 }
228
229 /**
230 * Truncate an array if larger than the given length.
231 *
232 * @param input The input array.
233 * @param maxLength The desired maximum array length.
234 * @return If {@code input} is no larger than {@code maxLength}, this just returns {@code input}. Otherwise this returns
235 * a new array with the same content as {@code input} but with the length truncated to {@code maxLength}.
236 */
237 private static byte[] truncateArray(final byte[] input, final int maxLength) {
238 if (input.length == maxLength) {
239 return input;
240 } else {
241 return Arrays.copyOf(input, maxLength);
242 }
243 }
244
245 /**
246 * Finish hashing and generate the final signature. This should be done after all bytes have been applied with
247 * {@link #applyBytes(RollingState, byte[], int, int)}.
248 *
249 * @param rollState The rolling hash state used during hashing.
250 * @return The final signature.
251 */
252 private SpamSumSignature finishHashing(final RollingState rollState) {
253 if (rollState.getHash() != 0) {
254 this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
255 this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
256 }
257
258 final byte[] finalHash1 = truncateArray(this.fuzzHash1, this.fuzzLen1);
259 final byte[] finalHash2 = truncateArray(this.fuzzHash2, this.fuzzLen2);
260 return new SpamSumSignature(this.blockSize, finalHash1, finalHash2);
261 }
262
263 /**
264 * Generate the hash for some input.
265 *
266 * <p>
267 * The computations will use the current block size from the context, but any other existing hash state will be
268 * discarded.
269 *
270 * @param data The bytes to hash.
271 * @return The signature for the given data.
272 */
273 public SpamSumSignature generateHash(@Nullable final byte[] data) {
274 beginHashing();
275 final RollingState rollState = new RollingState();
276
277 if (data != null) {
278 applyBytes(rollState, data, 0, data.length);
279 }
280
281 return finishHashing(rollState);
282 }
283
284 public SpamSumSignature generateHash(final SeekableByteChannelFactory sbcf) {
285 beginHashing();
286 final RollingState rollState = new RollingState();
287
288 try (InputStream is = Channels.newInputStream(sbcf.create())) {
289 final byte[] b = new byte[1024];
290
291 int bytesRead;
292 while ((bytesRead = is.read(b)) != -1) {
293 applyBytes(rollState, b, 0, bytesRead);
294 }
295 } catch (final IOException ignored) {
296 // Ignore
297 }
298
299 return finishHashing(rollState);
300 }
301
302 /**
303 * Generate the hash for some input.
304 *
305 * <p>
306 * The computations will use the current block size from the context, but any other existing hash state will be
307 * discarded.
308 *
309 * @param stream A file containing the bytes to hash. Assumed non-{@code null}. The processing will start reading at the
310 * current file position and hash all of the data from there to the end of the file. The file position when this
311 * returns is unspecified. The file is not closed by this operation.
312 * @return The signature for the given stream content.
313 * @throws IOException If there is some I/O problem while reading the stream.
314 */
315 public SpamSumSignature generateHash(final RandomAccessFile stream) throws IOException {
316 beginHashing();
317 final RollingState rollState = new RollingState();
318
319 final byte[] buffer = new byte[BUFFER_SIZE];
320 while (true) {
321 final int bytesRead = stream.read(buffer, 0, buffer.length);
322 if (bytesRead <= 0) {
323 break; // No more input.
324 }
325 applyBytes(rollState, buffer, 0, bytesRead);
326 }
327
328 return finishHashing(rollState);
329 }
330 }
331
332 /**
333 * A rolling hash, based on the Adler checksum. By using a rolling hash we can perform auto resynchronisation after
334 * inserts/deletes.
335 */
336 private static final class RollingState {
337
338 /** Rolling window. Each value is in the range 0..255. Initially all 0. */
339 private final int[] window = new int[ROLLING_WINDOW_SIZE];
340
341 /** An index into {@link #window}. Initially 0. */
342 private int windowPosition;
343
344 /** The sum of the values in {@link #window}. Initially 0. */
345 private long h1;
346
347 /**
348 * The original documentation says this is the sum of the bytes times the index, but I'm not sure about that. 32-bit
349 * unsigned, initially 0.
350 */
351 private long h2;
352
353 /**
354 * A shift/xor based rolling hash, mostly needed to ensure that we can cope with large blocksize values. 32-bit
355 * unsigned, initially 0.
356 */
357 private long h3;
358
359 /**
360 * Construct a new rolling hash state.
361 */
362 public RollingState() {}
363
364 /**
365 * Get the current hash value.
366 *
367 * @return The current 32-bit unsigned hash value.
368 */
369 public long getHash() {
370 return (this.h1 + this.h2 + this.h3) & MASK32;
371 }
372
373 /**
374 * Update the rolling hash state with another input byte.
375 *
376 * @param b The byte value to apply. Assumed to be in the range 0..255.
377 * @return The state is updated and the resulting unsigned 32-bit hash value is returned.
378 */
379 public long roll(final int b) {
380 this.h2 = (this.h2 - this.h1 + (ROLLING_WINDOW_SIZE * ((long) b))) & MASK32;
381 this.h1 = (this.h1 + b - this.window[this.windowPosition]) & MASK32;
382 this.window[this.windowPosition] = b;
383
384 // Advance the window position, wrappping around at the end.
385 if (this.windowPosition == (ROLLING_WINDOW_SIZE - 1)) {
386 this.windowPosition = 0;
387 } else {
388 this.windowPosition++;
389 }
390
391 this.h3 = ((this.h3 << 5) & MASK32) ^ b;
392
393 return (this.h1 + this.h2 + this.h3) & MASK32;
394 }
395 }
396
397 public Ssdeep() {}
398
399 /**
400 * Calculate the SpamSum hash for a byte array.
401 *
402 * @param data The bytes to be hashed.
403 * @return The SpamSum signature for the bytes.
404 */
405 public String fuzzyHash(final byte[] data) {
406 final SsContext ctx = new SsContext(data);
407 while (true) {
408 final SpamSumSignature signature = ctx.generateHash(data);
409
410 // Our blocksize guess may have been way off, repeat with
411 // a smaller block size if necessary.
412 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
413 ctx.blockSize = ctx.blockSize / 2;
414 } else {
415 return signature.toString();
416 }
417 }
418 }
419
420 public String fuzzyHash(final SeekableByteChannelFactory sbcf) {
421 final SsContext ctx = new SsContext(sbcf);
422 while (true) {
423 final SpamSumSignature signature = ctx.generateHash(sbcf);
424
425 // Our blocksize guess may have been way off, repeat with
426 // a smaller block size if necessary.
427 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
428 ctx.blockSize = ctx.blockSize / 2;
429 } else {
430 return signature.toString();
431 }
432 }
433 }
434
435 /**
436 * Calculates the SpamSum hash for specified stream.
437 *
438 * @param file The input file to be hashed.
439 * @return The SpamSum signature for the file.
440 * @throws IOException If there is some I/O problem accessing the file.
441 */
442 public String fuzzyHashFile(final File file) throws IOException {
443 try (RandomAccessFile stream = new RandomAccessFile(file, "r")) {
444 final SsContext ctx = new SsContext(file);
445 while (true) {
446 stream.seek(0);
447 final SpamSumSignature signature = ctx.generateHash(stream);
448
449 // Our blocksize guess may have been way off, repeat with
450 // a smaller block size if necessary.
451 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
452 ctx.blockSize = ctx.blockSize / 2;
453 } else {
454 return signature.toString();
455 }
456 }
457 }
458 }
459
460 /**
461 * Calculates the SpamSum hash for specified file.
462 *
463 * @param fileName The path to the file to be hashed.
464 * @return The SpamSum signature for the file.
465 * @throws IOException If there is some I/O problem accessing the file.
466 */
467 public String fuzzyHashFile(final String fileName) throws IOException {
468 return this.fuzzyHashFile(new File(fileName));
469 }
470
471 /**
472 * Search an array for a subsequence of another array.
473 *
474 * @param haystack The array to search.
475 * @param needle The array containing the sequence to search for.
476 * @param needleStart The starting offset of the sequence to search for, inclusive. Assumed to be in range for
477 * {@code needle}.
478 * @param length The length of the sequence to search for. Assumed to be in range for {@code needle}. Assumed greater
479 * than zero.
480 * @return If the subsequence of {@code needle} is present in {@code haystack}, this returns the least offset where the
481 * subsequence occurs. Otherwise -1.
482 */
483 private static int indexOfSubSequence(final byte[] haystack, final byte[] needle, final int needleStart, final int length) {
484 final int lastCandidatePos = haystack.length - length;
485 final byte firstNeedleByte = needle[needleStart];
486 NEXT_CANDIDATE: for (int candidatePos = 0; candidatePos <= lastCandidatePos; candidatePos++) {
487 if (haystack[candidatePos] == firstNeedleByte) {
488 // The first needle byte matches at this candidate
489 // position, so look for the rest of the needle
490 // following it.
491 for (int needlePos = 1; needlePos < length; needlePos++) {
492 if (haystack[candidatePos + needlePos] != needle[needleStart + needlePos]) {
493 continue NEXT_CANDIDATE; // Needle mismatch.
494 }
495 }
496 // If we reach here, the entire needle subsequence
497 // matched in the haystack at the candidate position.
498 return candidatePos;
499 }
500 }
501 return -1;
502 }
503
504 /**
505 * Search two arrays for a common subsequence of the given length.
506 *
507 * @param s1 The first byte array for comparison.
508 * @param s2 The second byte array for comparison.
509 * @param length The substring length to look for. Assumed greater than zero.
510 * @return {@code true} iff {@code s1} and {@code s2} have at least one byte sequence of length {@code length} in common
511 * at arbitrary offsets.
512 */
513 private static boolean hasCommonSequence(final byte[] s1, final byte[] s2, final int length) {
514 if ((s1.length < length) || (s2.length < length)) {
515 return false; // The strings are not large enough.
516 }
517
518 // This is just a brute-force approach. We move a window of
519 // the specified length through s1 and check whether it exists
520 // anywhere in s2.
521 final int lastS1Pos = s1.length - length;
522 for (int s1Pos = 0; s1Pos <= lastS1Pos; s1Pos++) {
523 final int s2Pos = indexOfSubSequence(s2, s1, s1Pos, length);
524 if (s2Pos != -1) {
525 return true;
526 }
527 }
528 return false;
529 }
530
531 /**
532 * Truncate sequences of longer than 3 identical bytes. These sequences contain very little information so they tend to
533 * just bias the result unfairly.
534 *
535 * @param in The input bytes.
536 * @return An array containing the same content as {@code in}, except that any sequences of more than 3 identical bytes
537 * are truncated to 3 bytes. For example "aaabbbbcddddd" becomes "aaabbbcddd".
538 */
539 private static byte[] eliminateLongSequences(final byte[] in) {
540 if (in.length < 4) {
541 return in; // There is not enough input to require any change.
542 }
543
544 // We just need to initialize prev to something other than in[0].
545 byte prev = (in[0] != 0) ? 0 : (byte) 1;
546 int repeatCount = 0;
547
548 // Scan the input looking for the index of the first byte that
549 // will need to be removed.
550 int inPos = 0;
551 while (true) {
552 if (inPos == in.length) {
553 // We didn't find anything that needed to be removed.
554 return in;
555 } else if (in[inPos] == prev) {
556 // This is a repeat of the previous byte.
557 repeatCount++;
558 if (repeatCount == 3) {
559 break; // inPos needs to be removed.
560 }
561 } else {
562 // This is not a repeat of the previous byte.
563 prev = in[inPos];
564 repeatCount = 0;
565 }
566 inPos++;
567 }
568
569 // At this point inPos is the first index that needs to be
570 // removed, prev is set to its byte value, and repeatCount is
571 // set to 3. Start an output array and copy everything up to
572 // but not including inPos.
573 final byte[] out = new byte[in.length - 1];
574 System.arraycopy(in, 0, out, 0, inPos);
575 int outPos = inPos;
576
577 // Continue scanning and copying to output.
578 while (++inPos < in.length) {
579 if (in[inPos] == prev) {
580 repeatCount++;
581 } else {
582 prev = in[inPos];
583 repeatCount = 0;
584 }
585 if (repeatCount < 3) {
586 out[outPos++] = in[inPos];
587 }
588 }
589
590 return (outPos == out.length) ? out : Arrays.copyOf(out, outPos);
591 }
592
593 /**
594 * This is the low level string scoring algorithm. It takes two strings and scores them on a scale of 0-100 where 0 is a
595 * terrible match and 100 is a great match. The blockSize is used to cope with very small messages.
596 */
597 private static long scoreStrings(final byte[] s1, final byte[] s2, final long blockSize) {
598 final int len1 = s1.length;
599 final int len2 = s2.length;
600
601 if ((len1 > SPAMSUM_LENGTH) || (len2 > SPAMSUM_LENGTH)) {
602 // not a real spamsum signature?
603 return 0;
604 }
605
606 // The two strings must have a common substring of length
607 // ROLLING_WINDOW_SIZE to be candidates.
608 if (!hasCommonSequence(s1, s2, ROLLING_WINDOW_SIZE)) {
609 return 0;
610 }
611
612 // Compute the edit distance between the two strings. The edit
613 // distance gives us a pretty good idea of how closely related
614 // the two strings are.
615 long score = EditDistance.calculateEditDistance(s1, len1, s2, len2);
616 if (logger.isDebugEnabled()) {
617 logger.debug("edit_dist: {}", score);
618 }
619
620 // Scale the edit distance by the lengths of the two
621 // strings. This changes the score to be a measure of the
622 // proportion of the message that has changed rather than an
623 // absolute quantity. It also copes with the variability of
624 // the string lengths.
625 score = score * SPAMSUM_LENGTH / (len1 + len2);
626
627 // At this stage the score occurs roughly on a 0-64 scale,
628 // with 0 being a good match and 64 being a complete mismatch.
629
630 // Rescale to a 0-100 scale (friendlier to humans).
631 score = 100 * score / 64;
632
633 // It is possible to get a score above 100 here, but it is a
634 // really terrible match.
635 if (score >= 100) {
636 return 0;
637 }
638
639 // Now re-scale on a 0-100 scale with 0 being a poor match and
640 // 100 being a excellent match.
641 score = 100 - score;
642
643 // When the blocksize is small we don't want to exaggerate the
644 // match size.
645 if (score > (blockSize / MIN_BLOCKSIZE * Math.min(len1, len2))) {
646 score = blockSize / MIN_BLOCKSIZE * Math.min(len1, len2);
647 }
648 return score;
649 }
650
651 /**
652 * Given two spamsum signature return a value indicating the degree to which they match.
653 *
654 * @param signature1 The first signature.
655 * @param signature2 The second signature.
656 * @return The score for the two signatures. The value is in the range 0..100, where 0 is a terrible match and 100 is a
657 * great match.
658 */
659 public int compare(@Nullable final SpamSumSignature signature1, @Nullable final SpamSumSignature signature2) {
660 if ((null == signature1) || (null == signature2)) {
661 return -1;
662 }
663 final long blockSize1 = signature1.getBlockSize();
664 final long blockSize2 = signature2.getBlockSize();
665
666 // We require the block sizes to either be equal, or for one
667 // to be twice the other. If the blocksizes don't match then
668 // we are comparing apples to oranges. This isn't an 'error'
669 // per se. We could have two valid signatures, but they can't
670 // be compared.
671 if ((blockSize1 != blockSize2) && (blockSize1 != (blockSize2 * 2)) && (blockSize2 != (blockSize1 * 2))) {
672 if (logger.isDebugEnabled()) {
673 logger.debug("block sizes too different: {} {}", blockSize1, blockSize2);
674 }
675 return 0;
676 }
677
678 // There is very little information content is sequences of
679 // the same character like 'LLLLL'. Eliminate any sequences
680 // longer than 3. This is especially important when combined
681 // with the hasCommonSequence() test.
682 final byte[] s1First = eliminateLongSequences(signature1.getHashPart1());
683 final byte[] s1Second = eliminateLongSequences(signature1.getHashPart2());
684 final byte[] s2First = eliminateLongSequences(signature2.getHashPart1());
685 final byte[] s2Second = eliminateLongSequences(signature2.getHashPart2());
686
687 // Each signature has a string for two block sizes. We now
688 // choose how to combine the two block sizes. We checked above
689 // that they have at least one block size in common.
690 final long score;
691 if (blockSize1 == blockSize2) {
692 // The signature block sizes are equal.
693 final long score1 = scoreStrings(s1First, s2First, blockSize1);
694 final long score2 = scoreStrings(s1Second, s2Second, blockSize2);
695 score = Math.max(score1, score2);
696 } else if (blockSize1 == (blockSize2 * 2)) {
697 // The first signature has twice the block size of the second.
698 score = scoreStrings(s1First, s2Second, blockSize1);
699 } else {
700 // The second signature has twice the block size of the first.
701 score = scoreStrings(s1Second, s2First, blockSize2);
702 }
703
704 return (int) score;
705 }
706
707 @SuppressWarnings("SystemOut")
708 public static void main(final String[] args) throws Exception {
709 final Ssdeep ss = new Ssdeep();
710 for (final String f : args) {
711 try (InputStream is = Files.newInputStream(Paths.get(f))) {
712 final byte[] buffer = IOUtils.toByteArray(is);
713 // output format matches the original ssdeep program
714 System.out.println(ss.fuzzyHash(buffer) + ",\"" + f + "\"");
715 }
716 }
717 }
718 }