View Javadoc
1   package emissary.kff;
2   
3   import emissary.core.channels.SeekableByteChannelFactory;
4   
5   import jakarta.annotation.Nullable;
6   import org.apache.commons.io.IOUtils;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   
10  import java.io.File;
11  import java.io.IOException;
12  import java.io.InputStream;
13  import java.io.RandomAccessFile;
14  import java.nio.channels.Channels;
15  import java.nio.channels.SeekableByteChannel;
16  import java.nio.file.Files;
17  import java.nio.file.Paths;
18  import java.util.Arrays;
19  
20  /**
21   * A java port of the ssdeep code for "fuzzy hashing". http://ssdeep.sourceforge.net There are a number of ports out
22   * there that all look basically the same. This one is from
23   * https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
24   *
25   * A new ssdeep hash gets calculated and saved at each level of unwrapping.
26   */
27  public final class Ssdeep {
28  
29      private static final Logger logger = LoggerFactory.getLogger(Ssdeep.class);
30  
31      private static final int SPAMSUM_LENGTH = 64;
32      private static final int MIN_BLOCKSIZE = 3;
33  
34      @SuppressWarnings("PMD.UselessParentheses")
35      public static final int FUZZY_MAX_RESULT = SPAMSUM_LENGTH + (SPAMSUM_LENGTH / 2 + 20);
36  
37      /** The window size for the rolling hash. */
38      private static final int ROLLING_WINDOW_SIZE = 7;
39  
40      /** The buffer size to use when reading data from a file. */
41      private static final int BUFFER_SIZE = 8192;
42  
43      /** FNV hash initial value, 32-bit unsigned. */
44      private static final long HASH_INIT = 0x28021967;
45  
46      /** FNV hash prime multiplier, 32-bit unsigned. */
47      private static final long HASH_PRIME = 0x01000193;
48  
49      /** Used to mask long values to 32 bits unsigned. */
50      private static final long MASK32 = 0xffffffffL;
51  
52      /**
53       * Base64 encoding table. Given a 5-bit value {@code n}, position {@code n} in the array is the code point (expressed as
54       * a byte) that should appear.
55       */
56      private static final byte[] b64Table = SpamSumSignature.getBytes("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
57  
58      /**
59       * Get the base64 encoding of the low 6 bits of the given value.
60       *
61       * @param v The value to encode.
62       * @return The base64 encoding of the low 6 bits of {@code v}. The returned value is a code point expressed as a byte.
63       */
64      private static byte b64EncodeLowBits(final long v) {
65          return b64Table[((int) v) & 0x3f];
66      }
67  
68      private static final class SsContext {
69  
70          /** A buffer for the main hash output. */
71          private final byte[] fuzzHash1 = new byte[SPAMSUM_LENGTH + 1];
72  
73          /** A buffer for the secondary hash output. */
74          private final byte[] fuzzHash2 = new byte[SPAMSUM_LENGTH / 2 + 1];
75  
76          /** The count of output bytes currently stored in {@link #fuzzHash1}, initially zero. */
77          private int fuzzLen1;
78  
79          /** The count of output bytes currently stored in {@link #fuzzHash2}, initially zero. */
80          private int fuzzLen2;
81  
82          private long sumHash1; // Initially zero.
83          private long sumHash2; // Initially zero.
84          private long blockSize; // Initialized by constructor.
85  
86          /**
87           * Estimate the block size to use.
88           *
89           * @param expectedInputLength The expected amount of data to be processed, in bytes. A 0 value can be used if the length
90           *        is unknown, in which case a default block size is returned.
91           * @return The estimated block size to use.
92           */
93          private static long estimateBlockSize(final long expectedInputLength) {
94              long blockSize = MIN_BLOCKSIZE;
95              while ((blockSize * SPAMSUM_LENGTH) < expectedInputLength) {
96                  blockSize *= 2;
97              }
98              return blockSize;
99          }
100 
101         /**
102          * Construct a spam sum context to process a file.
103          *
104          * @param f The file that will be processed, if known. If non-{@code null}, the length of the file is used to guess the
105          *        hash block size to use.
106          */
107         public SsContext(@Nullable final File f) {
108             final long expectedInputLength = (f != null) ? f.length() : 0;
109             this.blockSize = estimateBlockSize(expectedInputLength);
110         }
111 
112         /**
113          * Construct a spam sum context to process a byte array.
114          *
115          * @param data The bytes that will be processed, if known. If non-{@code null}, the length of the array is used to guess
116          *        the hash block size to use.
117          */
118         public SsContext(@Nullable final byte[] data) {
119             final long expectedInputLength = (data != null) ? data.length : 0;
120             this.blockSize = estimateBlockSize(expectedInputLength);
121         }
122 
123         /**
124          * Construct a spam sum context to process a {@link SeekableByteChannel}
125          *
126          * @param sbcf The channel that will be processed, if known. If non-{@code null}, the length of the channel is used to
127          *        guess the hash block size to use.
128          */
129         public SsContext(final SeekableByteChannelFactory sbcf) {
130             long expectedInputLength = 0;
131 
132             try (SeekableByteChannel sbc = sbcf.create()) {
133                 expectedInputLength = sbc.size();
134             } catch (final IOException ignored) {
135                 // Ignore
136             }
137 
138             this.blockSize = estimateBlockSize(expectedInputLength);
139         }
140 
141         /**
142          * A simple non-rolling hash, based on the FNV hash
143          * 
144          * @param b The next byte value, assumed to be in the range 0..255.
145          * @param h The existing hash value, 32-bit unsigned.
146          * @return The updated hash value, 32-bit unsigned.
147          */
148         private static long updateSumHash(final int b, final long h) {
149             return ((h * HASH_PRIME) ^ b) & MASK32;
150         }
151 
152         /**
153          * Apply some bytes to a SpamSum context.
154          *
155          * @param rollState The rolling hash state to use.
156          * @param buffer A buffer containing the input bytes.
157          * @param start The starting offset in {@code buffer}, inclusive.
158          * @param end The ending offset in {@code buffer}, exclusive.
159          */
160         @SuppressWarnings("PMD.CollapsibleIfStatements")
161         private void applyBytes(final RollingState rollState, final byte[] buffer, final int start, final int end) {
162             // At each byte we update the rolling hash and the normal
163             // hash. When the rolling hash hits the reset value, we
164             // emit the normal hash as an element of the signature and
165             // reset both hashes.
166             for (int i = start; i < end; i++) {
167                 // Get the next input byte and normalize to 0..255.
168                 final int nextByte = ((int) buffer[i]) & 0xff;
169 
170                 // Apply the next byte to the hashes.
171                 this.sumHash1 = updateSumHash(nextByte, this.sumHash1);
172                 this.sumHash2 = updateSumHash(nextByte, this.sumHash2);
173                 final long rollingHash = rollState.roll(nextByte);
174 
175                 if ((rollingHash % this.blockSize) == (this.blockSize - 1)) {
176                     // We have hit a reset point. We now emit a hash
177                     // which is based on all bytes in the input
178                     // between the last reset point and this one.
179                     if (this.fuzzLen1 < (SPAMSUM_LENGTH - 1)) {
180                         // We can have a problem with the tail
181                         // overflowing. The easiest way to cope with
182                         // this is to only reset the second hash if we
183                         // have room for more characters in our
184                         // signature. This has the effect of combining
185                         // the last few pieces of the message into a
186                         // single piece
187                         this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
188                         this.sumHash1 = HASH_INIT;
189                     }
190 
191                     // This produces a second signature with a block size
192                     // of blockSize*2. By producing dual signatures in
193                     // this way the effect of small changes in the message
194                     // size near a block size boundary is greatly reduced.
195                     //
196                     // NOTE: we only have to check this when the main
197                     // signature has hit a reset point, because
198                     // mathematically:
199                     //
200                     // [ h === -1 (mod 2*bs) ] --implies--> [ h === -1 (mod bs) ]
201                     //
202                     // In other words, if this condition is true then the
203                     // main signature condition must always also be true.
204                     // Therefore this secondary signature condition can
205                     // only potentially be true if the main signature
206                     // condition (which we've already checked) is true.
207                     if ((rollingHash % (this.blockSize * 2)) == ((this.blockSize * 2) - 1)) {
208                         if (this.fuzzLen2 < (SPAMSUM_LENGTH / 2 - 1)) {
209                             this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
210                             this.sumHash2 = HASH_INIT;
211                         }
212                     }
213                 }
214             }
215         }
216 
217         /**
218          * Discard any existing hash state and prepare to compute a new hash. This should be followed by calls to
219          * {@link #applyBytes(RollingState, byte[], int, int)} to provide the data, and then
220          * {@link #finishHashing(RollingState)} to complete the computations.
221          */
222         private void beginHashing() {
223             this.fuzzLen1 = 0;
224             this.fuzzLen2 = 0;
225             this.sumHash1 = HASH_INIT;
226             this.sumHash2 = HASH_INIT;
227         }
228 
229         /**
230          * Truncate an array if larger than the given length.
231          *
232          * @param input The input array.
233          * @param maxLength The desired maximum array length.
234          * @return If {@code input} is no larger than {@code maxLength}, this just returns {@code input}. Otherwise this returns
235          *         a new array with the same content as {@code input} but with the length truncated to {@code maxLength}.
236          */
237         private static byte[] truncateArray(final byte[] input, final int maxLength) {
238             if (input.length == maxLength) {
239                 return input;
240             } else {
241                 return Arrays.copyOf(input, maxLength);
242             }
243         }
244 
245         /**
246          * Finish hashing and generate the final signature. This should be done after all bytes have been applied with
247          * {@link #applyBytes(RollingState, byte[], int, int)}.
248          *
249          * @param rollState The rolling hash state used during hashing.
250          * @return The final signature.
251          */
252         private SpamSumSignature finishHashing(final RollingState rollState) {
253             if (rollState.getHash() != 0) {
254                 this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
255                 this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
256             }
257 
258             final byte[] finalHash1 = truncateArray(this.fuzzHash1, this.fuzzLen1);
259             final byte[] finalHash2 = truncateArray(this.fuzzHash2, this.fuzzLen2);
260             return new SpamSumSignature(this.blockSize, finalHash1, finalHash2);
261         }
262 
263         /**
264          * Generate the hash for some input.
265          *
266          * <p>
267          * The computations will use the current block size from the context, but any other existing hash state will be
268          * discarded.
269          *
270          * @param data The bytes to hash.
271          * @return The signature for the given data.
272          */
273         public SpamSumSignature generateHash(@Nullable final byte[] data) {
274             beginHashing();
275             final RollingState rollState = new RollingState();
276 
277             if (data != null) {
278                 applyBytes(rollState, data, 0, data.length);
279             }
280 
281             return finishHashing(rollState);
282         }
283 
284         public SpamSumSignature generateHash(final SeekableByteChannelFactory sbcf) {
285             beginHashing();
286             final RollingState rollState = new RollingState();
287 
288             try (InputStream is = Channels.newInputStream(sbcf.create())) {
289                 final byte[] b = new byte[1024];
290 
291                 int bytesRead;
292                 while ((bytesRead = is.read(b)) != -1) {
293                     applyBytes(rollState, b, 0, bytesRead);
294                 }
295             } catch (final IOException ignored) {
296                 // Ignore
297             }
298 
299             return finishHashing(rollState);
300         }
301 
302         /**
303          * Generate the hash for some input.
304          *
305          * <p>
306          * The computations will use the current block size from the context, but any other existing hash state will be
307          * discarded.
308          *
309          * @param stream A file containing the bytes to hash. Assumed non-{@code null}. The processing will start reading at the
310          *        current file position and hash all of the data from there to the end of the file. The file position when this
311          *        returns is unspecified. The file is not closed by this operation.
312          * @return The signature for the given stream content.
313          * @throws IOException If there is some I/O problem while reading the stream.
314          */
315         public SpamSumSignature generateHash(final RandomAccessFile stream) throws IOException {
316             beginHashing();
317             final RollingState rollState = new RollingState();
318 
319             final byte[] buffer = new byte[BUFFER_SIZE];
320             while (true) {
321                 final int bytesRead = stream.read(buffer, 0, buffer.length);
322                 if (bytesRead <= 0) {
323                     break; // No more input.
324                 }
325                 applyBytes(rollState, buffer, 0, bytesRead);
326             }
327 
328             return finishHashing(rollState);
329         }
330     }
331 
332     /**
333      * A rolling hash, based on the Adler checksum. By using a rolling hash we can perform auto resynchronisation after
334      * inserts/deletes.
335      */
336     private static final class RollingState {
337 
338         /** Rolling window. Each value is in the range 0..255. Initially all 0. */
339         private final int[] window = new int[ROLLING_WINDOW_SIZE];
340 
341         /** An index into {@link #window}. Initially 0. */
342         private int windowPosition;
343 
344         /** The sum of the values in {@link #window}. Initially 0. */
345         private long h1;
346 
347         /**
348          * The original documentation says this is the sum of the bytes times the index, but I'm not sure about that. 32-bit
349          * unsigned, initially 0.
350          */
351         private long h2;
352 
353         /**
354          * A shift/xor based rolling hash, mostly needed to ensure that we can cope with large blocksize values. 32-bit
355          * unsigned, initially 0.
356          */
357         private long h3;
358 
359         /**
360          * Construct a new rolling hash state.
361          */
362         public RollingState() {}
363 
364         /**
365          * Get the current hash value.
366          *
367          * @return The current 32-bit unsigned hash value.
368          */
369         public long getHash() {
370             return (this.h1 + this.h2 + this.h3) & MASK32;
371         }
372 
373         /**
374          * Update the rolling hash state with another input byte.
375          *
376          * @param b The byte value to apply. Assumed to be in the range 0..255.
377          * @return The state is updated and the resulting unsigned 32-bit hash value is returned.
378          */
379         public long roll(final int b) {
380             this.h2 = (this.h2 - this.h1 + (ROLLING_WINDOW_SIZE * ((long) b))) & MASK32;
381             this.h1 = (this.h1 + b - this.window[this.windowPosition]) & MASK32;
382             this.window[this.windowPosition] = b;
383 
384             // Advance the window position, wrappping around at the end.
385             if (this.windowPosition == (ROLLING_WINDOW_SIZE - 1)) {
386                 this.windowPosition = 0;
387             } else {
388                 this.windowPosition++;
389             }
390 
391             this.h3 = ((this.h3 << 5) & MASK32) ^ b;
392 
393             return (this.h1 + this.h2 + this.h3) & MASK32;
394         }
395     }
396 
397     public Ssdeep() {}
398 
399     /**
400      * Calculate the SpamSum hash for a byte array.
401      *
402      * @param data The bytes to be hashed.
403      * @return The SpamSum signature for the bytes.
404      */
405     public String fuzzyHash(final byte[] data) {
406         final SsContext ctx = new SsContext(data);
407         while (true) {
408             final SpamSumSignature signature = ctx.generateHash(data);
409 
410             // Our blocksize guess may have been way off, repeat with
411             // a smaller block size if necessary.
412             if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
413                 ctx.blockSize = ctx.blockSize / 2;
414             } else {
415                 return signature.toString();
416             }
417         }
418     }
419 
420     public String fuzzyHash(final SeekableByteChannelFactory sbcf) {
421         final SsContext ctx = new SsContext(sbcf);
422         while (true) {
423             final SpamSumSignature signature = ctx.generateHash(sbcf);
424 
425             // Our blocksize guess may have been way off, repeat with
426             // a smaller block size if necessary.
427             if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
428                 ctx.blockSize = ctx.blockSize / 2;
429             } else {
430                 return signature.toString();
431             }
432         }
433     }
434 
435     /**
436      * Calculates the SpamSum hash for specified stream.
437      * 
438      * @param file The input file to be hashed.
439      * @return The SpamSum signature for the file.
440      * @throws IOException If there is some I/O problem accessing the file.
441      */
442     public String fuzzyHashFile(final File file) throws IOException {
443         try (RandomAccessFile stream = new RandomAccessFile(file, "r")) {
444             final SsContext ctx = new SsContext(file);
445             while (true) {
446                 stream.seek(0);
447                 final SpamSumSignature signature = ctx.generateHash(stream);
448 
449                 // Our blocksize guess may have been way off, repeat with
450                 // a smaller block size if necessary.
451                 if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
452                     ctx.blockSize = ctx.blockSize / 2;
453                 } else {
454                     return signature.toString();
455                 }
456             }
457         }
458     }
459 
460     /**
461      * Calculates the SpamSum hash for specified file.
462      *
463      * @param fileName The path to the file to be hashed.
464      * @return The SpamSum signature for the file.
465      * @throws IOException If there is some I/O problem accessing the file.
466      */
467     public String fuzzyHashFile(final String fileName) throws IOException {
468         return this.fuzzyHashFile(new File(fileName));
469     }
470 
471     /**
472      * Search an array for a subsequence of another array.
473      *
474      * @param haystack The array to search.
475      * @param needle The array containing the sequence to search for.
476      * @param needleStart The starting offset of the sequence to search for, inclusive. Assumed to be in range for
477      *        {@code needle}.
478      * @param length The length of the sequence to search for. Assumed to be in range for {@code needle}. Assumed greater
479      *        than zero.
480      * @return If the subsequence of {@code needle} is present in {@code haystack}, this returns the least offset where the
481      *         subsequence occurs. Otherwise -1.
482      */
483     private static int indexOfSubSequence(final byte[] haystack, final byte[] needle, final int needleStart, final int length) {
484         final int lastCandidatePos = haystack.length - length;
485         final byte firstNeedleByte = needle[needleStart];
486         NEXT_CANDIDATE: for (int candidatePos = 0; candidatePos <= lastCandidatePos; candidatePos++) {
487             if (haystack[candidatePos] == firstNeedleByte) {
488                 // The first needle byte matches at this candidate
489                 // position, so look for the rest of the needle
490                 // following it.
491                 for (int needlePos = 1; needlePos < length; needlePos++) {
492                     if (haystack[candidatePos + needlePos] != needle[needleStart + needlePos]) {
493                         continue NEXT_CANDIDATE; // Needle mismatch.
494                     }
495                 }
496                 // If we reach here, the entire needle subsequence
497                 // matched in the haystack at the candidate position.
498                 return candidatePos;
499             }
500         }
501         return -1;
502     }
503 
504     /**
505      * Search two arrays for a common subsequence of the given length.
506      *
507      * @param s1 The first byte array for comparison.
508      * @param s2 The second byte array for comparison.
509      * @param length The substring length to look for. Assumed greater than zero.
510      * @return {@code true} iff {@code s1} and {@code s2} have at least one byte sequence of length {@code length} in common
511      *         at arbitrary offsets.
512      */
513     private static boolean hasCommonSequence(final byte[] s1, final byte[] s2, final int length) {
514         if ((s1.length < length) || (s2.length < length)) {
515             return false; // The strings are not large enough.
516         }
517 
518         // This is just a brute-force approach. We move a window of
519         // the specified length through s1 and check whether it exists
520         // anywhere in s2.
521         final int lastS1Pos = s1.length - length;
522         for (int s1Pos = 0; s1Pos <= lastS1Pos; s1Pos++) {
523             final int s2Pos = indexOfSubSequence(s2, s1, s1Pos, length);
524             if (s2Pos != -1) {
525                 return true;
526             }
527         }
528         return false;
529     }
530 
531     /**
532      * Truncate sequences of longer than 3 identical bytes. These sequences contain very little information so they tend to
533      * just bias the result unfairly.
534      *
535      * @param in The input bytes.
536      * @return An array containing the same content as {@code in}, except that any sequences of more than 3 identical bytes
537      *         are truncated to 3 bytes. For example "aaabbbbcddddd" becomes "aaabbbcddd".
538      */
539     private static byte[] eliminateLongSequences(final byte[] in) {
540         if (in.length < 4) {
541             return in; // There is not enough input to require any change.
542         }
543 
544         // We just need to initialize prev to something other than in[0].
545         byte prev = (in[0] != 0) ? 0 : (byte) 1;
546         int repeatCount = 0;
547 
548         // Scan the input looking for the index of the first byte that
549         // will need to be removed.
550         int inPos = 0;
551         while (true) {
552             if (inPos == in.length) {
553                 // We didn't find anything that needed to be removed.
554                 return in;
555             } else if (in[inPos] == prev) {
556                 // This is a repeat of the previous byte.
557                 repeatCount++;
558                 if (repeatCount == 3) {
559                     break; // inPos needs to be removed.
560                 }
561             } else {
562                 // This is not a repeat of the previous byte.
563                 prev = in[inPos];
564                 repeatCount = 0;
565             }
566             inPos++;
567         }
568 
569         // At this point inPos is the first index that needs to be
570         // removed, prev is set to its byte value, and repeatCount is
571         // set to 3. Start an output array and copy everything up to
572         // but not including inPos.
573         final byte[] out = new byte[in.length - 1];
574         System.arraycopy(in, 0, out, 0, inPos);
575         int outPos = inPos;
576 
577         // Continue scanning and copying to output.
578         while (++inPos < in.length) {
579             if (in[inPos] == prev) {
580                 repeatCount++;
581             } else {
582                 prev = in[inPos];
583                 repeatCount = 0;
584             }
585             if (repeatCount < 3) {
586                 out[outPos++] = in[inPos];
587             }
588         }
589 
590         return (outPos == out.length) ? out : Arrays.copyOf(out, outPos);
591     }
592 
593     /**
594      * This is the low level string scoring algorithm. It takes two strings and scores them on a scale of 0-100 where 0 is a
595      * terrible match and 100 is a great match. The blockSize is used to cope with very small messages.
596      */
597     private static long scoreStrings(final byte[] s1, final byte[] s2, final long blockSize) {
598         final int len1 = s1.length;
599         final int len2 = s2.length;
600 
601         if ((len1 > SPAMSUM_LENGTH) || (len2 > SPAMSUM_LENGTH)) {
602             // not a real spamsum signature?
603             return 0;
604         }
605 
606         // The two strings must have a common substring of length
607         // ROLLING_WINDOW_SIZE to be candidates.
608         if (!hasCommonSequence(s1, s2, ROLLING_WINDOW_SIZE)) {
609             return 0;
610         }
611 
612         // Compute the edit distance between the two strings. The edit
613         // distance gives us a pretty good idea of how closely related
614         // the two strings are.
615         long score = EditDistance.calculateEditDistance(s1, len1, s2, len2);
616         if (logger.isDebugEnabled()) {
617             logger.debug("edit_dist: {}", score);
618         }
619 
620         // Scale the edit distance by the lengths of the two
621         // strings. This changes the score to be a measure of the
622         // proportion of the message that has changed rather than an
623         // absolute quantity. It also copes with the variability of
624         // the string lengths.
625         score = score * SPAMSUM_LENGTH / (len1 + len2);
626 
627         // At this stage the score occurs roughly on a 0-64 scale,
628         // with 0 being a good match and 64 being a complete mismatch.
629 
630         // Rescale to a 0-100 scale (friendlier to humans).
631         score = 100 * score / 64;
632 
633         // It is possible to get a score above 100 here, but it is a
634         // really terrible match.
635         if (score >= 100) {
636             return 0;
637         }
638 
639         // Now re-scale on a 0-100 scale with 0 being a poor match and
640         // 100 being a excellent match.
641         score = 100 - score;
642 
643         // When the blocksize is small we don't want to exaggerate the
644         // match size.
645         if (score > (blockSize / MIN_BLOCKSIZE * Math.min(len1, len2))) {
646             score = blockSize / MIN_BLOCKSIZE * Math.min(len1, len2);
647         }
648         return score;
649     }
650 
651     /**
652      * Given two spamsum signature return a value indicating the degree to which they match.
653      *
654      * @param signature1 The first signature.
655      * @param signature2 The second signature.
656      * @return The score for the two signatures. The value is in the range 0..100, where 0 is a terrible match and 100 is a
657      *         great match.
658      */
659     public int compare(@Nullable final SpamSumSignature signature1, @Nullable final SpamSumSignature signature2) {
660         if ((null == signature1) || (null == signature2)) {
661             return -1;
662         }
663         final long blockSize1 = signature1.getBlockSize();
664         final long blockSize2 = signature2.getBlockSize();
665 
666         // We require the block sizes to either be equal, or for one
667         // to be twice the other. If the blocksizes don't match then
668         // we are comparing apples to oranges. This isn't an 'error'
669         // per se. We could have two valid signatures, but they can't
670         // be compared.
671         if ((blockSize1 != blockSize2) && (blockSize1 != (blockSize2 * 2)) && (blockSize2 != (blockSize1 * 2))) {
672             if (logger.isDebugEnabled()) {
673                 logger.debug("block sizes too different: {} {}", blockSize1, blockSize2);
674             }
675             return 0;
676         }
677 
678         // There is very little information content is sequences of
679         // the same character like 'LLLLL'. Eliminate any sequences
680         // longer than 3. This is especially important when combined
681         // with the hasCommonSequence() test.
682         final byte[] s1First = eliminateLongSequences(signature1.getHashPart1());
683         final byte[] s1Second = eliminateLongSequences(signature1.getHashPart2());
684         final byte[] s2First = eliminateLongSequences(signature2.getHashPart1());
685         final byte[] s2Second = eliminateLongSequences(signature2.getHashPart2());
686 
687         // Each signature has a string for two block sizes. We now
688         // choose how to combine the two block sizes. We checked above
689         // that they have at least one block size in common.
690         final long score;
691         if (blockSize1 == blockSize2) {
692             // The signature block sizes are equal.
693             final long score1 = scoreStrings(s1First, s2First, blockSize1);
694             final long score2 = scoreStrings(s1Second, s2Second, blockSize2);
695             score = Math.max(score1, score2);
696         } else if (blockSize1 == (blockSize2 * 2)) {
697             // The first signature has twice the block size of the second.
698             score = scoreStrings(s1First, s2Second, blockSize1);
699         } else {
700             // The second signature has twice the block size of the first.
701             score = scoreStrings(s1Second, s2First, blockSize2);
702         }
703 
704         return (int) score;
705     }
706 
707     @SuppressWarnings("SystemOut")
708     public static void main(final String[] args) throws Exception {
709         final Ssdeep ss = new Ssdeep();
710         for (final String f : args) {
711             try (InputStream is = Files.newInputStream(Paths.get(f))) {
712                 final byte[] buffer = IOUtils.toByteArray(is);
713                 // output format matches the original ssdeep program
714                 System.out.println(ss.fuzzyHash(buffer) + ",\"" + f + "\"");
715             }
716         }
717     }
718 }