View Javadoc
1   package emissary.kff;
2   
3   import emissary.core.channels.SeekableByteChannelFactory;
4   
5   import jakarta.annotation.Nullable;
6   import org.apache.commons.io.IOUtils;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   
10  import java.io.File;
11  import java.io.IOException;
12  import java.io.InputStream;
13  import java.io.RandomAccessFile;
14  import java.nio.channels.Channels;
15  import java.nio.channels.SeekableByteChannel;
16  import java.nio.file.Files;
17  import java.nio.file.Paths;
18  import java.util.Arrays;
19  
20  /**
21   * A java port of the ssdeep code for "fuzzy hashing". http://ssdeep.sourceforge.net There are a number of ports out
22   * there that all look basically the same. This one is from
23   * https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
24   *
25   * A new ssdeep hash gets calculated and saved at each level of unwrapping.
26   */
27  public final class Ssdeep {
28  
29      private static final Logger logger = LoggerFactory.getLogger(Ssdeep.class);
30  
31      private static final int SPAMSUM_LENGTH = 64;
32      private static final int MIN_BLOCKSIZE = 3;
33  
34      @SuppressWarnings("PMD.UselessParentheses")
35      public static final int FUZZY_MAX_RESULT = SPAMSUM_LENGTH + (SPAMSUM_LENGTH / 2 + 20);
36  
37      /** The window size for the rolling hash. */
38      private static final int ROLLING_WINDOW_SIZE = 7;
39  
40      /** The buffer size to use when reading data from a file. */
41      private static final int BUFFER_SIZE = 8192;
42  
43      /** FNV hash initial value, 32-bit unsigned. */
44      private static final long HASH_INIT = 0x28021967;
45  
46      /** FNV hash prime multiplier, 32-bit unsigned. */
47      private static final long HASH_PRIME = 0x01000193;
48  
49      /** Used to mask long values to 32 bits unsigned. */
50      private static final long MASK32 = 0xffffffffL;
51  
52      /**
53       * Base64 encoding table. Given a 5-bit value {@code n}, position {@code n} in the array is the code point (expressed as
54       * a byte) that should appear.
55       */
56      private static final byte[] b64Table = SpamSumSignature.getBytes("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
57  
58      /**
59       * Get the base64 encoding of the low 6 bits of the given value.
60       *
61       * @param v The value to encode.
62       * @return The base64 encoding of the low 6 bits of {@code v}. The returned value is a code point expressed as a byte.
63       */
64      private static byte b64EncodeLowBits(final long v) {
65          return b64Table[((int) v) & 0x3f];
66      }
67  
68      private static final class SsContext {
69  
70          /** A buffer for the main hash output. */
71          private final byte[] fuzzHash1 = new byte[SPAMSUM_LENGTH + 1];
72  
73          /** A buffer for the secondary hash output. */
74          private final byte[] fuzzHash2 = new byte[SPAMSUM_LENGTH / 2 + 1];
75  
76          /** The count of output bytes currently stored in {@link #fuzzHash1}, initially zero. */
77          private int fuzzLen1;
78  
79          /** The count of output bytes currently stored in {@link #fuzzHash2}, initially zero. */
80          private int fuzzLen2;
81  
82          private long sumHash1; // Initially zero.
83          private long sumHash2; // Initially zero.
84          private long blockSize; // Initialized by constructor.
85  
86          /**
87           * Estimate the block size to use.
88           *
89           * @param expectedInputLength The expected amount of data to be processed, in bytes. A 0 value can be used if the length
90           *        is unknown, in which case a default block size is returned.
91           * @return The estimated block size to use.
92           */
93          private static long estimateBlockSize(final long expectedInputLength) {
94              long blockSize = MIN_BLOCKSIZE;
95              while ((blockSize * SPAMSUM_LENGTH) < expectedInputLength) {
96                  blockSize *= 2;
97              }
98              return blockSize;
99          }
100 
101         /**
102          * Construct a spam sum context to process a file.
103          *
104          * @param f The file that will be processed, if known. If non-{@code null}, the length of the file is used to guess the
105          *        hash block size to use.
106          */
107         public SsContext(@Nullable final File f) {
108             final long expectedInputLength = (f != null) ? f.length() : 0;
109             this.blockSize = estimateBlockSize(expectedInputLength);
110         }
111 
112         /**
113          * Construct a spam sum context to process a byte array.
114          *
115          * @param data The bytes that will be processed, if known. If non-{@code null}, the length of the array is used to guess
116          *        the hash block size to use.
117          */
118         public SsContext(@Nullable final byte[] data) {
119             final long expectedInputLength = (data != null) ? data.length : 0;
120             this.blockSize = estimateBlockSize(expectedInputLength);
121         }
122 
123         /**
124          * Construct a spam sum context to process a {@link SeekableByteChannel}
125          *
126          * @param sbcf The channel that will be processed, if known. If non-{@code null}, the length of the channel is used to
127          *        guess the hash block size to use.
128          */
129         public SsContext(final SeekableByteChannelFactory sbcf) {
130             long expectedInputLength = 0;
131 
132             try (SeekableByteChannel sbc = sbcf.create()) {
133                 expectedInputLength = sbc.size();
134             } catch (final IOException ignored) {
135                 // Ignore
136             }
137 
138             this.blockSize = estimateBlockSize(expectedInputLength);
139         }
140 
141         /**
142          * A simple non-rolling hash, based on the FNV hash
143          * 
144          * @param b The next byte value, assumed to be in the range 0..255.
145          * @param h The existing hash value, 32-bit unsigned.
146          * @return The updated hash value, 32-bit unsigned.
147          */
148         private static long updateSumHash(final int b, final long h) {
149             return ((h * HASH_PRIME) ^ b) & MASK32;
150         }
151 
152         /**
153          * Apply some bytes to a SpamSum context.
154          *
155          * @param rollState The rolling hash state to use.
156          * @param buffer A buffer containing the input bytes.
157          * @param start The starting offset in {@code buffer}, inclusive.
158          * @param end The ending offset in {@code buffer}, exclusive.
159          */
160         private void applyBytes(final RollingState rollState, final byte[] buffer, final int start, final int end) {
161             // At each byte we update the rolling hash and the normal
162             // hash. When the rolling hash hits the reset value, we
163             // emit the normal hash as an element of the signature and
164             // reset both hashes.
165             for (int i = start; i < end; i++) {
166                 // Get the next input byte and normalize to 0..255.
167                 final int nextByte = ((int) buffer[i]) & 0xff;
168 
169                 // Apply the next byte to the hashes.
170                 this.sumHash1 = updateSumHash(nextByte, this.sumHash1);
171                 this.sumHash2 = updateSumHash(nextByte, this.sumHash2);
172                 final long rollingHash = rollState.roll(nextByte);
173 
174                 if ((rollingHash % this.blockSize) == (this.blockSize - 1)) {
175                     // We have hit a reset point. We now emit a hash
176                     // which is based on all bytes in the input
177                     // between the last reset point and this one.
178                     if (this.fuzzLen1 < (SPAMSUM_LENGTH - 1)) {
179                         // We can have a problem with the tail
180                         // overflowing. The easiest way to cope with
181                         // this is to only reset the second hash if we
182                         // have room for more characters in our
183                         // signature. This has the effect of combining
184                         // the last few pieces of the message into a
185                         // single piece
186                         this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
187                         this.sumHash1 = HASH_INIT;
188                     }
189 
190                     // This produces a second signature with a block size
191                     // of blockSize*2. By producing dual signatures in
192                     // this way the effect of small changes in the message
193                     // size near a block size boundary is greatly reduced.
194                     //
195                     // NOTE: we only have to check this when the main
196                     // signature has hit a reset point, because
197                     // mathematically:
198                     //
199                     // [ h === -1 (mod 2*bs) ] --implies--> [ h === -1 (mod bs) ]
200                     //
201                     // In other words, if this condition is true then the
202                     // main signature condition must always also be true.
203                     // Therefore this secondary signature condition can
204                     // only potentially be true if the main signature
205                     // condition (which we've already checked) is true.
206                     if ((rollingHash % (this.blockSize * 2)) == ((this.blockSize * 2) - 1)) {
207                         if (this.fuzzLen2 < (SPAMSUM_LENGTH / 2 - 1)) {
208                             this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
209                             this.sumHash2 = HASH_INIT;
210                         }
211                     }
212                 }
213             }
214         }
215 
216         /**
217          * Discard any existing hash state and prepare to compute a new hash. This should be followed by calls to
218          * {@link #applyBytes(RollingState, byte[], int, int)} to provide the data, and then
219          * {@link #finishHashing(RollingState)} to complete the computations.
220          */
221         private void beginHashing() {
222             this.fuzzLen1 = 0;
223             this.fuzzLen2 = 0;
224             this.sumHash1 = HASH_INIT;
225             this.sumHash2 = HASH_INIT;
226         }
227 
228         /**
229          * Truncate an array if larger than the given length.
230          *
231          * @param input The input array.
232          * @param maxLength The desired maximum array length.
233          * @return If {@code input} is no larger than {@code maxLength}, this just returns {@code input}. Otherwise this returns
234          *         a new array with the same content as {@code input} but with the length truncated to {@code maxLength}.
235          */
236         private static byte[] truncateArray(final byte[] input, final int maxLength) {
237             if (input.length == maxLength) {
238                 return input;
239             } else {
240                 return Arrays.copyOf(input, maxLength);
241             }
242         }
243 
244         /**
245          * Finish hashing and generate the final signature. This should be done after all bytes have been applied with
246          * {@link #applyBytes(RollingState, byte[], int, int)}.
247          *
248          * @param rollState The rolling hash state used during hashing.
249          * @return The final signature.
250          */
251         private SpamSumSignature finishHashing(final RollingState rollState) {
252             if (rollState.getHash() != 0) {
253                 this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
254                 this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
255             }
256 
257             final byte[] finalHash1 = truncateArray(this.fuzzHash1, this.fuzzLen1);
258             final byte[] finalHash2 = truncateArray(this.fuzzHash2, this.fuzzLen2);
259             return new SpamSumSignature(this.blockSize, finalHash1, finalHash2);
260         }
261 
262         /**
263          * Generate the hash for some input.
264          *
265          * <p>
266          * The computations will use the current block size from the context, but any other existing hash state will be
267          * discarded.
268          *
269          * @param data The bytes to hash.
270          * @return The signature for the given data.
271          */
272         public SpamSumSignature generateHash(@Nullable final byte[] data) {
273             beginHashing();
274             final RollingState rollState = new RollingState();
275 
276             if (data != null) {
277                 applyBytes(rollState, data, 0, data.length);
278             }
279 
280             return finishHashing(rollState);
281         }
282 
283         public SpamSumSignature generateHash(final SeekableByteChannelFactory sbcf) {
284             beginHashing();
285             final RollingState rollState = new RollingState();
286 
287             try (InputStream is = Channels.newInputStream(sbcf.create())) {
288                 final byte[] b = new byte[1024];
289 
290                 int bytesRead;
291                 while ((bytesRead = is.read(b)) != -1) {
292                     applyBytes(rollState, b, 0, bytesRead);
293                 }
294             } catch (final IOException ignored) {
295                 // Ignore
296             }
297 
298             return finishHashing(rollState);
299         }
300 
301         /**
302          * Generate the hash for some input.
303          *
304          * <p>
305          * The computations will use the current block size from the context, but any other existing hash state will be
306          * discarded.
307          *
308          * @param stream A file containing the bytes to hash. Assumed non-{@code null}. The processing will start reading at the
309          *        current file position and hash all of the data from there to the end of the file. The file position when this
310          *        returns is unspecified. The file is not closed by this operation.
311          * @return The signature for the given stream content.
312          * @throws IOException If there is some I/O problem while reading the stream.
313          */
314         public SpamSumSignature generateHash(final RandomAccessFile stream) throws IOException {
315             beginHashing();
316             final RollingState rollState = new RollingState();
317 
318             final byte[] buffer = new byte[BUFFER_SIZE];
319             while (true) {
320                 final int bytesRead = stream.read(buffer, 0, buffer.length);
321                 if (bytesRead <= 0) {
322                     break; // No more input.
323                 }
324                 applyBytes(rollState, buffer, 0, bytesRead);
325             }
326 
327             return finishHashing(rollState);
328         }
329     }
330 
331     /**
332      * A rolling hash, based on the Adler checksum. By using a rolling hash we can perform auto resynchronisation after
333      * inserts/deletes.
334      */
335     private static final class RollingState {
336 
337         /** Rolling window. Each value is in the range 0..255. Initially all 0. */
338         private final int[] window = new int[ROLLING_WINDOW_SIZE];
339 
340         /** An index into {@link #window}. Initially 0. */
341         private int windowPosition;
342 
343         /** The sum of the values in {@link #window}. Initially 0. */
344         private long h1;
345 
346         /**
347          * The original documentation says this is the sum of the bytes times the index, but I'm not sure about that. 32-bit
348          * unsigned, initially 0.
349          */
350         private long h2;
351 
352         /**
353          * A shift/xor based rolling hash, mostly needed to ensure that we can cope with large blocksize values. 32-bit
354          * unsigned, initially 0.
355          */
356         private long h3;
357 
358         /**
359          * Construct a new rolling hash state.
360          */
361         public RollingState() {}
362 
363         /**
364          * Get the current hash value.
365          *
366          * @return The current 32-bit unsigned hash value.
367          */
368         public long getHash() {
369             return (this.h1 + this.h2 + this.h3) & MASK32;
370         }
371 
372         /**
373          * Update the rolling hash state with another input byte.
374          *
375          * @param b The byte value to apply. Assumed to be in the range 0..255.
376          * @return The state is updated and the resulting unsigned 32-bit hash value is returned.
377          */
378         public long roll(final int b) {
379             this.h2 = (this.h2 - this.h1 + (ROLLING_WINDOW_SIZE * ((long) b))) & MASK32;
380             this.h1 = (this.h1 + b - this.window[this.windowPosition]) & MASK32;
381             this.window[this.windowPosition] = b;
382 
383             // Advance the window position, wrappping around at the end.
384             if (this.windowPosition == (ROLLING_WINDOW_SIZE - 1)) {
385                 this.windowPosition = 0;
386             } else {
387                 this.windowPosition++;
388             }
389 
390             this.h3 = ((this.h3 << 5) & MASK32) ^ b;
391 
392             return (this.h1 + this.h2 + this.h3) & MASK32;
393         }
394     }
395 
396     public Ssdeep() {}
397 
398     /**
399      * Calculate the SpamSum hash for a byte array.
400      *
401      * @param data The bytes to be hashed.
402      * @return The SpamSum signature for the bytes.
403      */
404     public String fuzzyHash(final byte[] data) {
405         final SsContext ctx = new SsContext(data);
406         while (true) {
407             final SpamSumSignature signature = ctx.generateHash(data);
408 
409             // Our blocksize guess may have been way off, repeat with
410             // a smaller block size if necessary.
411             if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
412                 ctx.blockSize = ctx.blockSize / 2;
413             } else {
414                 return signature.toString();
415             }
416         }
417     }
418 
419     public String fuzzyHash(final SeekableByteChannelFactory sbcf) {
420         final SsContext ctx = new SsContext(sbcf);
421         while (true) {
422             final SpamSumSignature signature = ctx.generateHash(sbcf);
423 
424             // Our blocksize guess may have been way off, repeat with
425             // a smaller block size if necessary.
426             if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
427                 ctx.blockSize = ctx.blockSize / 2;
428             } else {
429                 return signature.toString();
430             }
431         }
432     }
433 
434     /**
435      * Calculates the SpamSum hash for specified stream.
436      * 
437      * @param file The input file to be hashed.
438      * @return The SpamSum signature for the file.
439      * @throws IOException If there is some I/O problem accessing the file.
440      */
441     public String fuzzyHashFile(final File file) throws IOException {
442         try (RandomAccessFile stream = new RandomAccessFile(file, "r")) {
443             final SsContext ctx = new SsContext(file);
444             while (true) {
445                 stream.seek(0);
446                 final SpamSumSignature signature = ctx.generateHash(stream);
447 
448                 // Our blocksize guess may have been way off, repeat with
449                 // a smaller block size if necessary.
450                 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
451                     ctx.blockSize = ctx.blockSize / 2;
452                 } else {
453                     return signature.toString();
454                 }
455             }
456         }
457     }
458 
459     /**
460      * Calculates the SpamSum hash for specified file.
461      *
462      * @param fileName The path to the file to be hashed.
463      * @return The SpamSum signature for the file.
464      * @throws IOException If there is some I/O problem accessing the file.
465      */
466     public String fuzzyHashFile(final String fileName) throws IOException {
467         return this.fuzzyHashFile(new File(fileName));
468     }
469 
470     /**
471      * Search an array for a subsequence of another array.
472      *
473      * @param haystack The array to search.
474      * @param needle The array containing the sequence to search for.
475      * @param needleStart The starting offset of the sequence to search for, inclusive. Assumed to be in range for
476      *        {@code needle}.
477      * @param length The length of the sequence to search for. Assumed to be in range for {@code needle}. Assumed greater
478      *        than zero.
479      * @return If the subsequence of {@code needle} is present in {@code haystack}, this returns the least offset where the
480      *         subsequence occurs. Otherwise -1.
481      */
482     private static int indexOfSubSequence(final byte[] haystack, final byte[] needle, final int needleStart, final int length) {
483         final int lastCandidatePos = haystack.length - length;
484         final byte firstNeedleByte = needle[needleStart];
485         NEXT_CANDIDATE: for (int candidatePos = 0; candidatePos <= lastCandidatePos; candidatePos++) {
486             if (haystack[candidatePos] == firstNeedleByte) {
487                 // The first needle byte matches at this candidate
488                 // position, so look for the rest of the needle
489                 // following it.
490                 for (int needlePos = 1; needlePos < length; needlePos++) {
491                     if (haystack[candidatePos + needlePos] != needle[needleStart + needlePos]) {
492                         continue NEXT_CANDIDATE; // Needle mismatch.
493                     }
494                 }
495                 // If we reach here, the entire needle subsequence
496                 // matched in the haystack at the candidate position.
497                 return candidatePos;
498             }
499         }
500         return -1;
501     }
502 
503     /**
504      * Search two arrays for a common subsequence of the given length.
505      *
506      * @param s1 The first byte array for comparison.
507      * @param s2 The second byte array for comparison.
508      * @param length The substring length to look for. Assumed greater than zero.
509      * @return {@code true} iff {@code s1} and {@code s2} have at least one byte sequence of length {@code length} in common
510      *         at arbitrary offsets.
511      */
512     private static boolean hasCommonSequence(final byte[] s1, final byte[] s2, final int length) {
513         if ((s1.length < length) || (s2.length < length)) {
514             return false; // The strings are not large enough.
515         }
516 
517         // This is just a brute-force approach. We move a window of
518         // the specified length through s1 and check whether it exists
519         // anywhere in s2.
520         final int lastS1Pos = s1.length - length;
521         for (int s1Pos = 0; s1Pos <= lastS1Pos; s1Pos++) {
522             final int s2Pos = indexOfSubSequence(s2, s1, s1Pos, length);
523             if (s2Pos != -1) {
524                 return true;
525             }
526         }
527         return false;
528     }
529 
530     /**
531      * Truncate sequences of longer than 3 identical bytes. These sequences contain very little information so they tend to
532      * just bias the result unfairly.
533      *
534      * @param in The input bytes.
535      * @return An array containing the same content as {@code in}, except that any sequences of more than 3 identical bytes
536      *         are truncated to 3 bytes. For example "aaabbbbcddddd" becomes "aaabbbcddd".
537      */
538     private static byte[] eliminateLongSequences(final byte[] in) {
539         if (in.length < 4) {
540             return in; // There is not enough input to require any change.
541         }
542 
543         // We just need to initialize prev to something other than in[0].
544         byte prev = (in[0] != 0) ? 0 : (byte) 1;
545         int repeatCount = 0;
546 
547         // Scan the input looking for the index of the first byte that
548         // will need to be removed.
549         int inPos = 0;
550         while (true) {
551             if (inPos == in.length) {
552                 // We didn't find anything that needed to be removed.
553                 return in;
554             } else if (in[inPos] == prev) {
555                 // This is a repeat of the previous byte.
556                 repeatCount++;
557                 if (repeatCount == 3) {
558                     break; // inPos needs to be removed.
559                 }
560             } else {
561                 // This is not a repeat of the previous byte.
562                 prev = in[inPos];
563                 repeatCount = 0;
564             }
565             inPos++;
566         }
567 
568         // At this point inPos is the first index that needs to be
569         // removed, prev is set to its byte value, and repeatCount is
570         // set to 3. Start an output array and copy everything up to
571         // but not including inPos.
572         final byte[] out = new byte[in.length - 1];
573         System.arraycopy(in, 0, out, 0, inPos);
574         int outPos = inPos;
575 
576         // Continue scanning and copying to output.
577         while (++inPos < in.length) {
578             if (in[inPos] == prev) {
579                 repeatCount++;
580             } else {
581                 prev = in[inPos];
582                 repeatCount = 0;
583             }
584             if (repeatCount < 3) {
585                 out[outPos++] = in[inPos];
586             }
587         }
588 
589         return (outPos == out.length) ? out : Arrays.copyOf(out, outPos);
590     }
591 
592     /**
593      * This is the low level string scoring algorithm. It takes two strings and scores them on a scale of 0-100 where 0 is a
594      * terrible match and 100 is a great match. The blockSize is used to cope with very small messages.
595      */
596     private static long scoreStrings(final byte[] s1, final byte[] s2, final long blockSize) {
597         final int len1 = s1.length;
598         final int len2 = s2.length;
599 
600         if ((len1 > SPAMSUM_LENGTH) || (len2 > SPAMSUM_LENGTH)) {
601             // not a real spamsum signature?
602             return 0;
603         }
604 
605         // The two strings must have a common substring of length
606         // ROLLING_WINDOW_SIZE to be candidates.
607         if (!hasCommonSequence(s1, s2, ROLLING_WINDOW_SIZE)) {
608             return 0;
609         }
610 
611         // Compute the edit distance between the two strings. The edit
612         // distance gives us a pretty good idea of how closely related
613         // the two strings are.
614         long score = EditDistance.calculateEditDistance(s1, len1, s2, len2);
615         if (logger.isDebugEnabled()) {
616             logger.debug("edit_dist: {}", score);
617         }
618 
619         // Scale the edit distance by the lengths of the two
620         // strings. This changes the score to be a measure of the
621         // proportion of the message that has changed rather than an
622         // absolute quantity. It also copes with the variability of
623         // the string lengths.
624         score = score * SPAMSUM_LENGTH / (len1 + len2);
625 
626         // At this stage the score occurs roughly on a 0-64 scale,
627         // with 0 being a good match and 64 being a complete mismatch.
628 
629         // Rescale to a 0-100 scale (friendlier to humans).
630         score = 100 * score / 64;
631 
632         // It is possible to get a score above 100 here, but it is a
633         // really terrible match.
634         if (score >= 100) {
635             return 0;
636         }
637 
638         // Now re-scale on a 0-100 scale with 0 being a poor match and
639         // 100 being a excellent match.
640         score = 100 - score;
641 
642         // When the blocksize is small we don't want to exaggerate the
643         // match size.
644         if (score > (blockSize / MIN_BLOCKSIZE * Math.min(len1, len2))) {
645             score = blockSize / MIN_BLOCKSIZE * Math.min(len1, len2);
646         }
647         return score;
648     }
649 
650     /**
651      * Given two spamsum signature return a value indicating the degree to which they match.
652      *
653      * @param signature1 The first signature.
654      * @param signature2 The second signature.
655      * @return The score for the two signatures. The value is in the range 0..100, where 0 is a terrible match and 100 is a
656      *         great match.
657      */
658     public int compare(@Nullable final SpamSumSignature signature1, @Nullable final SpamSumSignature signature2) {
659         if ((null == signature1) || (null == signature2)) {
660             return -1;
661         }
662         final long blockSize1 = signature1.getBlockSize();
663         final long blockSize2 = signature2.getBlockSize();
664 
665         // We require the block sizes to either be equal, or for one
666         // to be twice the other. If the blocksizes don't match then
667         // we are comparing apples to oranges. This isn't an 'error'
668         // per se. We could have two valid signatures, but they can't
669         // be compared.
670         if ((blockSize1 != blockSize2) && (blockSize1 != (blockSize2 * 2)) && (blockSize2 != (blockSize1 * 2))) {
671             if (logger.isDebugEnabled()) {
672                 logger.debug("block sizes too different: {} {}", blockSize1, blockSize2);
673             }
674             return 0;
675         }
676 
677         // There is very little information content is sequences of
678         // the same character like 'LLLLL'. Eliminate any sequences
679         // longer than 3. This is especially important when combined
680         // with the hasCommonSequence() test.
681         final byte[] s1First = eliminateLongSequences(signature1.getHashPart1());
682         final byte[] s1Second = eliminateLongSequences(signature1.getHashPart2());
683         final byte[] s2First = eliminateLongSequences(signature2.getHashPart1());
684         final byte[] s2Second = eliminateLongSequences(signature2.getHashPart2());
685 
686         // Each signature has a string for two block sizes. We now
687         // choose how to combine the two block sizes. We checked above
688         // that they have at least one block size in common.
689         final long score;
690         if (blockSize1 == blockSize2) {
691             // The signature block sizes are equal.
692             final long score1 = scoreStrings(s1First, s2First, blockSize1);
693             final long score2 = scoreStrings(s1Second, s2Second, blockSize2);
694             score = Math.max(score1, score2);
695         } else if (blockSize1 == (blockSize2 * 2)) {
696             // The first signature has twice the block size of the second.
697             score = scoreStrings(s1First, s2Second, blockSize1);
698         } else {
699             // The second signature has twice the block size of the first.
700             score = scoreStrings(s1Second, s2First, blockSize2);
701         }
702 
703         return (int) score;
704     }
705 
706     @SuppressWarnings("SystemOut")
707     public static void main(final String[] args) throws Exception {
708         final Ssdeep ss = new Ssdeep();
709         for (final String f : args) {
710             try (InputStream is = Files.newInputStream(Paths.get(f))) {
711                 final byte[] buffer = IOUtils.toByteArray(is);
712                 // output format matches the original ssdeep program
713                 System.out.println(ss.fuzzyHash(buffer) + ",\"" + f + "\"");
714             }
715         }
716     }
717 }