package emissary.kff;
import emissary.core.channels.SeekableByteChannelFactory;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import javax.annotation.Nullable;
/**
* A Java port of the ssdeep code for "fuzzy hashing" (http://ssdeep.sourceforge.net). There are a number of ports out
* there that all look basically the same. This one is from
* https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
*
* A new ssdeep hash gets calculated and saved at each level of unwrapping.
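*
* <p>
* A minimal usage sketch ({@code dataA} and {@code dataB} are hypothetical byte arrays, and the
* {@code SpamSumSignature(String)} parsing constructor is assumed to exist; this file does not show that class's full
* API):
*
* <pre>{@code
* Ssdeep ss = new Ssdeep();
* String sig1 = ss.fuzzyHash(dataA);
* String sig2 = ss.fuzzyHash(dataB);
* // Scores run 0..100; higher means more similar.
* int score = ss.compare(new SpamSumSignature(sig1), new SpamSumSignature(sig2));
* }</pre>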
*/
public final class Ssdeep {
private static final Logger logger = LoggerFactory.getLogger(Ssdeep.class);
private static final int SPAMSUM_LENGTH = 64;
private static final int MIN_BLOCKSIZE = 3;
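/**
* The maximum length of a formatted signature: both hash parts plus headroom, presumably for the block-size digits and
* separators (the value mirrors the original C implementation).
*/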
public static final int FUZZY_MAX_RESULT = (SPAMSUM_LENGTH + (SPAMSUM_LENGTH / 2 + 20));
/** The window size for the rolling hash. */
private static final int ROLLING_WINDOW_SIZE = 7;
/** The buffer size to use when reading data from a file or channel. */
private static final int BUFFER_SIZE = 8192;
/** Initial value for the FNV-style sum hash (ssdeep's HASH_INIT; not the standard FNV offset basis), 32-bit unsigned. */
private static final long HASH_INIT = 0x28021967;
/** FNV hash prime multiplier, 32-bit unsigned. */
private static final long HASH_PRIME = 0x01000193;
/** Used to mask long values to 32 bits unsigned. */
private static final long MASK32 = 0xffffffffL;
/**
* Base64 encoding table. Given a 6-bit value {@code n}, position {@code n} in the array is the code point (expressed as
* a byte) that should appear.
*/
private static final byte[] b64Table = SpamSumSignature.getBytes("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
/**
* Get the base64 encoding of the low 6 bits of the given value.
*
* @param v The value to encode.
* @return The base64 encoding of the low 6 bits of {@code v}. The returned value is a code point expressed as a byte.
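* For example, {@code b64EncodeLowBits(0)} yields {@code 'A'} and {@code b64EncodeLowBits(26)} yields {@code 'a'}.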
*/
private static byte b64EncodeLowBits(final long v) {
return b64Table[((int) v) & 0x3f];
}
private static final class SsContext {
/** A buffer for the main hash output. */
private final byte[] fuzzHash1 = new byte[SPAMSUM_LENGTH + 1];
/** A buffer for the secondary hash output. */
private final byte[] fuzzHash2 = new byte[SPAMSUM_LENGTH / 2 + 1];
/** The count of output bytes currently stored in {@link #fuzzHash1}, initially zero. */
private int fuzzLen1;
/** The count of output bytes currently stored in {@link #fuzzHash2}, initially zero. */
private int fuzzLen2;
private long sumHash1; // Initially zero.
private long sumHash2; // Initially zero.
private long blockSize; // Initialized by constructor.
/**
* Estimate the block size to use.
*
* @param expectedInputLength The expected amount of data to be processed, in bytes. A 0 value can be used if the length
* is unknown, in which case a default block size is returned.
* @return The estimated block size to use.
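* For example, for a 1 MiB (1,048,576-byte) input this returns the smallest value of the form 3 * 2^k such that
* blockSize * SPAMSUM_LENGTH >= 1,048,576, namely 3 * 2^13 = 24,576.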
*/
private static long estimateBlockSize(final long expectedInputLength) {
long blockSize = MIN_BLOCKSIZE;
while ((blockSize * SPAMSUM_LENGTH) < expectedInputLength) {
blockSize *= 2;
}
return blockSize;
}
/**
* Construct a spam sum context to process a file.
*
* @param f The file that will be processed, if known. If non-{@code null}, the length of the file is used to guess the
* hash block size to use.
*/
public SsContext(@Nullable final File f) {
final long expectedInputLength = (f != null) ? f.length() : 0;
this.blockSize = estimateBlockSize(expectedInputLength);
}
/**
* Construct a spam sum context to process a byte array.
*
* @param data The bytes that will be processed, if known. If non-{@code null}, the length of the array is used to guess
* the hash block size to use.
*/
public SsContext(@Nullable final byte[] data) {
final long expectedInputLength = (data != null) ? data.length : 0;
this.blockSize = estimateBlockSize(expectedInputLength);
}
/**
* Construct a spam sum context to process the content of a {@link SeekableByteChannel}.
*
* @param sbcf The factory for the channel that will be processed. The length of the channel is used to guess the hash
* block size to use; if the length cannot be determined, a default block size is used.
*/
public SsContext(final SeekableByteChannelFactory sbcf) {
long expectedInputLength = 0;
try (SeekableByteChannel sbc = sbcf.create()) {
expectedInputLength = sbc.size();
} catch (final IOException ioe) {
// Fall back to the default block size if the channel length cannot be determined.
}
this.blockSize = estimateBlockSize(expectedInputLength);
}
/**
* A simple non-rolling hash, based on the FNV hash
*
* @param b The next byte value, assumed to be in the range 0..255.
* @param h The existing hash value, 32-bit unsigned.
* @return The updated hash value, 32-bit unsigned.
*/
private static long updateSumHash(final int b, final long h) {
return ((h * HASH_PRIME) ^ b) & MASK32;
}
/**
* Apply some bytes to a SpamSum context.
*
* @param rollState The rolling hash state to use.
* @param buffer A buffer containing the input bytes.
* @param start The starting offset in {@code buffer}, inclusive.
* @param end The ending offset in {@code buffer}, exclusive.
*/
private void applyBytes(final RollingState rollState, final byte[] buffer, final int start, final int end) {
// At each byte we update the rolling hash and the normal
// hash. When the rolling hash hits the reset value, we
// emit the normal hash as an element of the signature and
// reset both hashes.
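// For example, with blockSize 3 a rolling hash value of 11 is a
// reset point, since 11 % 3 == 2 == blockSize - 1.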
for (int i = start; i < end; i++) {
// Get the next input byte and normalize to 0..255.
final int nextByte = ((int) buffer[i]) & 0xff;
// Apply the next byte to the hashes.
this.sumHash1 = updateSumHash(nextByte, this.sumHash1);
this.sumHash2 = updateSumHash(nextByte, this.sumHash2);
final long rollingHash = rollState.roll(nextByte);
if ((rollingHash % this.blockSize) == (this.blockSize - 1)) {
// We have hit a reset point. We now emit a hash
// which is based on all bytes in the input
// between the last reset point and this one.
if (this.fuzzLen1 < (SPAMSUM_LENGTH - 1)) {
// We can have a problem with the tail
// overflowing. The easiest way to cope with
// this is to only reset the second hash if we
// have room for more characters in our
// signature. This has the effect of combining
// the last few pieces of the message into a
// single piece
this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
this.sumHash1 = HASH_INIT;
}
// This produces a second signature with a block size
// of blockSize*2. By producing dual signatures in
// this way the effect of small changes in the message
// size near a block size boundary is greatly reduced.
//
// NOTE: we only have to check this when the main
// signature has hit a reset point, because
// mathematically:
//
// [ h === -1 (mod 2*bs) ] --implies--> [ h === -1 (mod bs) ]
//
// In other words, if this condition is true then the
// main signature condition must always also be true.
// Therefore this secondary signature condition can
// only potentially be true if the main signature
// condition (which we've already checked) is true.
if ((rollingHash % (this.blockSize * 2)) == ((this.blockSize * 2) - 1)) {
if (this.fuzzLen2 < (SPAMSUM_LENGTH / 2 - 1)) {
this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
this.sumHash2 = HASH_INIT;
}
}
}
}
}
/**
* Discard any existing hash state and prepare to compute a new hash. This should be followed by calls to
* {@link #applyBytes(RollingState, byte[], int, int)} to provide the data, and then
* {@link #finishHashing(RollingState)} to complete the computations.
*/
private void beginHashing() {
this.fuzzLen1 = 0;
this.fuzzLen2 = 0;
this.sumHash1 = HASH_INIT;
this.sumHash2 = HASH_INIT;
}
/**
* Truncate an array if larger than the given length.
*
* @param input The input array.
* @param maxLength The desired maximum array length.
* @return If {@code input} is no larger than {@code maxLength}, this just returns {@code input}. Otherwise this returns
* a new array with the same content as {@code input} but with the length truncated to {@code maxLength}.
*/
private static byte[] truncateArray(final byte[] input, final int maxLength) {
if (input.length <= maxLength) {
return input;
} else {
return Arrays.copyOf(input, maxLength);
}
}
/**
* Finish hashing and generate the final signature. This should be done after all bytes have been applied with
* {@link #applyBytes(RollingState, byte[], int, int)}.
*
* @param rollState The rolling hash state used during hashing.
* @return The final signature.
*/
private SpamSumSignature finishHashing(final RollingState rollState) {
if (rollState.getHash() != 0) {
this.fuzzHash1[this.fuzzLen1++] = b64EncodeLowBits(this.sumHash1);
this.fuzzHash2[this.fuzzLen2++] = b64EncodeLowBits(this.sumHash2);
}
final byte[] finalHash1 = truncateArray(this.fuzzHash1, this.fuzzLen1);
final byte[] finalHash2 = truncateArray(this.fuzzHash2, this.fuzzLen2);
return new SpamSumSignature(this.blockSize, finalHash1, finalHash2);
}
/**
* Generate the hash for some input.
*
* <p>
* The computations will use the current block size from the context, but any other existing hash state will be
* discarded.
*
* @param data The bytes to hash.
* @return The signature for the given data.
*/
public SpamSumSignature generateHash(@Nullable final byte[] data) {
beginHashing();
final RollingState rollState = new RollingState();
if (data != null) {
applyBytes(rollState, data, 0, data.length);
}
return finishHashing(rollState);
}
/**
* Generate the hash for the content of a channel.
*
* <p>
* The computations will use the current block size from the context, but any other existing hash state will be
* discarded.
*
* @param sbcf The factory for the channel containing the bytes to hash.
* @return The signature for the channel content. If an I/O error occurs while reading, the signature reflects only the
* bytes read before the error.
*/
public SpamSumSignature generateHash(final SeekableByteChannelFactory sbcf) {
beginHashing();
final RollingState rollState = new RollingState();
try (InputStream is = Channels.newInputStream(sbcf.create())) {
final byte[] b = new byte[BUFFER_SIZE];
int bytesRead;
while ((bytesRead = is.read(b)) != -1) {
applyBytes(rollState, b, 0, bytesRead);
}
} catch (final IOException e) {
logger.warn("Error reading from channel while hashing; returning signature of partial data", e);
}
return finishHashing(rollState);
}
/**
* Generate the hash for some input.
*
* <p>
* The computations will use the current block size from the context, but any other existing hash state will be
* discarded.
*
* @param stream A file containing the bytes to hash. Assumed non-{@code null}. The processing will start reading at the
* current file position and hash all of the data from there to the end of the file. The file position when this
* returns is unspecified. The file is not closed by this operation.
* @return The signature for the given stream content.
* @throws IOException If there is some I/O problem while reading the stream.
*/
public SpamSumSignature generateHash(final RandomAccessFile stream) throws IOException {
beginHashing();
final RollingState rollState = new RollingState();
final byte[] buffer = new byte[BUFFER_SIZE];
while (true) {
final int bytesRead = stream.read(buffer, 0, buffer.length);
if (bytesRead <= 0) {
break; // No more input.
}
applyBytes(rollState, buffer, 0, bytesRead);
}
return finishHashing(rollState);
}
}
/**
* A rolling hash, based on the Adler checksum. By using a rolling hash we can perform auto resynchronisation after
* inserts/deletes.
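*
* <p>
* The combined hash depends only on the last {@code ROLLING_WINDOW_SIZE} (7) bytes seen: {@code h1} and {@code h2}
* subtract the contribution of the byte leaving the window explicitly, and in {@code h3} each byte is shifted left 5
* bits per roll, so after 7 rolls its bits sit at position 35 and above and are removed by the 32-bit mask.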
*/
private static final class RollingState {
/** Rolling window. Each value is in the range 0..255. Initially all 0. */
private final int[] window = new int[ROLLING_WINDOW_SIZE];
/** An index into {@link #window}. Initially 0. */
private int windowPosition;
/** The sum of the values in {@link #window}. Initially 0. */
private long h1;
/**
* The sum of the window bytes weighted by recency: a byte contributes {@code ROLLING_WINDOW_SIZE} times its value when
* it enters the window and loses one multiple of its value on each subsequent roll (via the {@code h1} subtraction), so
* the newest byte carries the most weight. 32-bit unsigned, initially 0.
*/
private long h2;
/**
* A shift/xor based rolling hash, mostly needed to ensure that we can cope with large blocksize values. 32-bit
* unsigned, initially 0.
*/
private long h3;
/**
* Construct a new rolling hash state.
*/
public RollingState() {}
/**
* Get the current hash value.
*
* @return The current 32-bit unsigned hash value.
*/
public long getHash() {
return (this.h1 + this.h2 + this.h3) & MASK32;
}
/**
* Update the rolling hash state with another input byte.
*
* @param b The byte value to apply. Assumed to be in the range 0..255.
* @return The state is updated and the resulting unsigned 32-bit hash value is returned.
*/
public long roll(final int b) {
this.h2 = (this.h2 - this.h1 + (ROLLING_WINDOW_SIZE * ((long) b))) & MASK32;
this.h1 = (this.h1 + b - this.window[this.windowPosition]) & MASK32;
this.window[this.windowPosition] = b;
// Advance the window position, wrapping around at the end.
if (this.windowPosition == (ROLLING_WINDOW_SIZE - 1)) {
this.windowPosition = 0;
} else {
this.windowPosition++;
}
this.h3 = ((this.h3 << 5) & MASK32) ^ b;
return (this.h1 + this.h2 + this.h3) & MASK32;
}
}
public Ssdeep() {}
/**
* Calculate the SpamSum hash for a byte array.
*
* @param data The bytes to be hashed.
* @return The SpamSum signature for the bytes.
*/
public String fuzzyHash(final byte[] data) {
final SsContext ctx = new SsContext(data);
while (true) {
final SpamSumSignature signature = ctx.generateHash(data);
// Our blocksize guess may have been way off, repeat with
// a smaller block size if necessary.
if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
ctx.blockSize = ctx.blockSize / 2;
} else {
return signature.toString();
}
}
}
/**
* Calculate the SpamSum hash for the content of a channel.
*
* @param sbcf The factory for the channel containing the bytes to be hashed.
* @return The SpamSum signature for the channel content.
*/
public String fuzzyHash(final SeekableByteChannelFactory sbcf) {
final SsContext ctx = new SsContext(sbcf);
while (true) {
final SpamSumSignature signature = ctx.generateHash(sbcf);
// Our blocksize guess may have been way off, repeat with
// a smaller block size if necessary.
if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
ctx.blockSize = ctx.blockSize / 2;
} else {
return signature.toString();
}
}
}
/**
* Calculate the SpamSum hash for the specified file.
*
* @param file The input file to be hashed.
* @return The SpamSum signature for the file.
* @throws IOException If there is some I/O problem accessing the file.
*/
public String fuzzyHashFile(final File file) throws IOException {
try (RandomAccessFile stream = new RandomAccessFile(file, "r")) {
final SsContext ctx = new SsContext(file);
while (true) {
stream.seek(0);
final SpamSumSignature signature = ctx.generateHash(stream);
// Our blocksize guess may have been way off, repeat with
// a smaller block size if necessary.
if ((ctx.blockSize > MIN_BLOCKSIZE) && (ctx.fuzzLen1 < (SPAMSUM_LENGTH / 2))) {
ctx.blockSize = ctx.blockSize / 2;
} else {
return signature.toString();
}
}
}
}
/**
* Calculate the SpamSum hash for the file at the specified path.
*
* @param fileName The path to the file to be hashed.
* @return The SpamSum signature for the file.
* @throws IOException If there is some I/O problem accessing the file.
*/
public String fuzzyHashFile(final String fileName) throws IOException {
return this.fuzzyHashFile(new File(fileName));
}
/**
* Search an array for a subsequence of another array.
*
* @param haystack The array to search.
* @param needle The array containing the sequence to search for.
* @param needleStart The starting offset of the sequence to search for, inclusive. Assumed to be in range for
* {@code needle}.
* @param length The length of the sequence to search for. Assumed to be in range for {@code needle}. Assumed greater
* than zero.
* @return If the subsequence of {@code needle} is present in {@code haystack}, this returns the least offset where the
* subsequence occurs. Otherwise -1.
*/
private static int indexOfSubSequence(final byte[] haystack, final byte[] needle, final int needleStart, final int length) {
final int lastCandidatePos = haystack.length - length;
final byte firstNeedleByte = needle[needleStart];
NEXT_CANDIDATE: for (int candidatePos = 0; candidatePos <= lastCandidatePos; candidatePos++) {
if (haystack[candidatePos] == firstNeedleByte) {
// The first needle byte matches at this candidate
// position, so look for the rest of the needle
// following it.
for (int needlePos = 1; needlePos < length; needlePos++) {
if (haystack[candidatePos + needlePos] != needle[needleStart + needlePos]) {
continue NEXT_CANDIDATE; // Needle mismatch.
}
}
// If we reach here, the entire needle subsequence
// matched in the haystack at the candidate position.
return candidatePos;
}
}
return -1;
}
/**
* Search two arrays for a common subsequence of the given length.
*
* @param s1 The first byte array for comparison.
* @param s2 The second byte array for comparison.
* @param length The substring length to look for. Assumed greater than zero.
* @return {@code true} iff {@code s1} and {@code s2} have at least one byte sequence of length {@code length} in common
* at arbitrary offsets.
*/
private static boolean hasCommonSequence(final byte[] s1, final byte[] s2, final int length) {
if ((s1.length < length) || (s2.length < length)) {
return false; // The strings are not large enough.
}
// This is just a brute-force approach. We move a window of
// the specified length through s1 and check whether it exists
// anywhere in s2.
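// Worst case this costs O(s1.length * s2.length * length) byte
// comparisons, which is acceptable for signature strings capped at
// SPAMSUM_LENGTH bytes.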
final int lastS1Pos = s1.length - length;
for (int s1Pos = 0; s1Pos <= lastS1Pos; s1Pos++) {
final int s2Pos = indexOfSubSequence(s2, s1, s1Pos, length);
if (s2Pos != -1) {
return true;
}
}
return false;
}
/**
* Truncate sequences of more than 3 identical bytes. These sequences contain very little information, so they tend to
* just bias the result unfairly.
*
* @param in The input bytes.
* @return An array containing the same content as {@code in}, except that any sequences of more than 3 identical bytes
* are truncated to 3 bytes. For example "aaabbbbcddddd" becomes "aaabbbcddd".
*/
private static byte[] eliminateLongSequences(final byte[] in) {
if (in.length < 4) {
return in; // There is not enough input to require any change.
}
// We just need to initialize prev to something other than in[0].
byte prev = (in[0] != 0) ? 0 : (byte) 1;
int repeatCount = 0;
// Scan the input looking for the index of the first byte that
// will need to be removed.
int inPos = 0;
while (true) {
if (inPos == in.length) {
// We didn't find anything that needed to be removed.
return in;
} else if (in[inPos] == prev) {
// This is a repeat of the previous byte.
repeatCount++;
if (repeatCount == 3) {
break; // inPos needs to be removed.
}
} else {
// This is not a repeat of the previous byte.
prev = in[inPos];
repeatCount = 0;
}
inPos++;
}
// At this point inPos is the first index that needs to be
// removed, prev is set to its byte value, and repeatCount is
// set to 3. Start an output array and copy everything up to
// but not including inPos.
final byte[] out = new byte[in.length - 1];
System.arraycopy(in, 0, out, 0, inPos);
int outPos = inPos;
// Continue scanning and copying to output.
while (++inPos < in.length) {
if (in[inPos] == prev) {
repeatCount++;
} else {
prev = in[inPos];
repeatCount = 0;
}
if (repeatCount < 3) {
out[outPos++] = in[inPos];
}
}
return (outPos == out.length) ? out : Arrays.copyOf(out, outPos);
}
/**
* This is the low-level string scoring algorithm. It takes two strings and scores them on a scale of 0-100, where 0 is
* a terrible match and 100 is a great match. The block size is used to cope with very small messages.
*
* @param s1 The first string to compare.
* @param s2 The second string to compare.
* @param blockSize The block size the signatures were generated with.
* @return The match score, in the range 0..100.
*/
private static long scoreStrings(final byte[] s1, final byte[] s2, final long blockSize) {
final int len1 = s1.length;
final int len2 = s2.length;
if ((len1 > SPAMSUM_LENGTH) || (len2 > SPAMSUM_LENGTH)) {
// not a real spamsum signature?
return 0;
}
// The two strings must have a common substring of length
// ROLLING_WINDOW_SIZE to be candidates.
if (!hasCommonSequence(s1, s2, ROLLING_WINDOW_SIZE)) {
return 0;
}
// Compute the edit distance between the two strings. The edit
// distance gives us a pretty good idea of how closely related
// the two strings are.
long score = EditDistance.calculateEditDistance(s1, len1, s2, len2);
if (logger.isDebugEnabled()) {
logger.debug("edit_dist: {}", score);
}
// Scale the edit distance by the lengths of the two
// strings. This changes the score to be a measure of the
// proportion of the message that has changed rather than an
// absolute quantity. It also copes with the variability of
// the string lengths.
score = (score * SPAMSUM_LENGTH) / (len1 + len2);
// At this stage the score occurs roughly on a 0-64 scale,
// with 0 being a good match and 64 being a complete mismatch.
// Rescale to a 0-100 scale (friendlier to humans).
score = (100 * score) / 64;
// It is possible to get a score above 100 here, but it is a
// really terrible match.
if (score >= 100) {
return 0;
}
// Now re-scale on a 0-100 scale with 0 being a poor match and
// 100 being an excellent match.
score = 100 - score;
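// Worked example: an edit distance of 12 between two 64-byte strings
// scales to (12 * 64) / 128 = 6, rescales to (100 * 6) / 64 = 9, and
// gives a final score of 100 - 9 = 91.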
// When the blocksize is small we don't want to exaggerate the
// match size.
if (score > (blockSize / MIN_BLOCKSIZE * Math.min(len1, len2))) {
score = blockSize / MIN_BLOCKSIZE * Math.min(len1, len2);
}
return score;
}
/**
* Given two spamsum signatures, return a value indicating the degree to which they match.
*
* @param signature1 The first signature.
* @param signature2 The second signature.
* @return The score for the two signatures. The value is in the range 0..100, where 0 is a terrible match and 100 is a
* great match, or -1 if either signature is {@code null}.
*/
public int compare(@Nullable final SpamSumSignature signature1, @Nullable final SpamSumSignature signature2) {
if ((null == signature1) || (null == signature2)) {
return -1;
}
final long blockSize1 = signature1.getBlockSize();
final long blockSize2 = signature2.getBlockSize();
// We require the block sizes to either be equal, or for one
// to be twice the other. If the blocksizes don't match then
// we are comparing apples to oranges. This isn't an 'error'
// per se. We could have two valid signatures, but they can't
// be compared.
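// For example, block sizes 96 and 192 are comparable, while 96 and
// 384 are not.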
if ((blockSize1 != blockSize2) && (blockSize1 != (blockSize2 * 2)) && (blockSize2 != (blockSize1 * 2))) {
if (logger.isDebugEnabled()) {
logger.debug("block sizes too different: {} {}", blockSize1, blockSize2);
}
return 0;
}
// There is very little information content in sequences of
// the same character like 'LLLLL'. Eliminate any sequences
// longer than 3. This is especially important when combined
// with the hasCommonSequence() test.
final byte[] s1First = eliminateLongSequences(signature1.getHashPart1());
final byte[] s1Second = eliminateLongSequences(signature1.getHashPart2());
final byte[] s2First = eliminateLongSequences(signature2.getHashPart1());
final byte[] s2Second = eliminateLongSequences(signature2.getHashPart2());
// Each signature has a string for two block sizes. We now
// choose how to combine the two block sizes. We checked above
// that they have at least one block size in common.
final long score;
if (blockSize1 == blockSize2) {
// The signature block sizes are equal.
final long score1 = scoreStrings(s1First, s2First, blockSize1);
final long score2 = scoreStrings(s1Second, s2Second, blockSize2);
score = Math.max(score1, score2);
} else if (blockSize1 == (blockSize2 * 2)) {
// The first signature has twice the block size of the second.
score = scoreStrings(s1First, s2Second, blockSize1);
} else {
// The second signature has twice the block size of the first.
score = scoreStrings(s1Second, s2First, blockSize2);
}
return (int) score;
}
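/**
* Command-line entry point: computes and prints the fuzzy hash of each file named on the command line, one per line,
* in the same {@code hash,"filename"} output format as the original ssdeep program.
*/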
@SuppressWarnings("SystemOut")
public static void main(final String[] args) throws Exception {
final Ssdeep ss = new Ssdeep();
for (final String f : args) {
try (InputStream is = Files.newInputStream(Paths.get(f))) {
final byte[] buffer = IOUtils.toByteArray(is);
// output format matches the original ssdeep program
System.out.println(ss.fuzzyHash(buffer) + ",\"" + f + "\"");
}
}
}
}