EditDistance.java

package emissary.kff;

import javax.annotation.Nullable;

/**
 * A java port of the ssdeep code for "fuzzy hashing". http://ssdeep.sourceforge.net There are a number of ports out
 * there that all look basically the same. This one is from
 * https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
 * 
 * A new ssdeep hash gets calculated and saved at each level of unwrapping.
 */
@SuppressWarnings("NonFinalStaticField")
public class EditDistance {
    /*
     * This edit distance code is taken from trn3.6. A few minor modifications have been made by Andrew Tridgell
     * <tridge@samba.org> for use in spamsum.
     */


    /***************************************************************************/


    /*
     * The authors make no claims as to the fitness or correctness of this software for any use whatsoever, and it is
     * provided as is. Any use of this software is at the user's own risk.
     */

    /*
     * edit_dist -- returns the minimum edit distance between two strings
     * 
     * Program by: Mark Maimone CMU Computer Science 13 Nov 89 Last Modified: 28 Jan 90
     * 
     * If the input strings have length n and m, the algorithm runs in time O(nm) and space O(min(m,n)).
     * 
     * HISTORY 13 Nov 89 (mwm) Created edit_dist() and set_costs().
     * 
     * 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD computations will work even when THRESHOLD is not a
     * multiple of sizeof(int).
     * 
     * 17 May 93 (mwm) Improved performance when used with trn's newsgroup processing; assume all costs are 1, and you can
     * terminate when a threshold is exceeded.
     */

    private static final int MIN_DIST = 100;

    /*
     * Use a less-general version of the routine, one that's better for trn. All change costs are 1, and it's okay to
     * terminate if the edit distance is known to exceed MIN_DIST
     */

    private static final int THRESHOLD = 4000; /*
                                                * worry about allocating more memory only when this # of bytes is exceeded
                                                */

    private static final int STRLENTHRESHOLD = (THRESHOLD / (Integer.SIZE / 8) - 3) / 2;

    // #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

    // #define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap)
    private static void swapInt(int[] x, int[] y) {
        int iswap = x[0];
        x[0] = y[0];
        y[0] = iswap;
    }

    // #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
    private static void swapChar(/* ref */byte[][] x, /* ref */byte[][] y) {
        byte[] cswap = x[0];
        x[0] = y[0];
        y[0] = cswap;
    }

    // #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz :
    // _my))
    private static int min3(int x, int y, int z) {
        int mx = x;
        int my = y;
        int mz = z;
        return (mx < my ? (mx < mz ? mx : mz) : (mz < my) ? mz : my);
    }

    // #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
    private static int min2(int x, int y) {
        int mx = x;
        int my = y;
        return (mx < my ? mx : my);
    }

    static int insertCost = 1;
    static int deleteCost = 1;

    // dynamic programming counters
    static int row;
    static int col;
    static int index = 0;
    static int radix; // radix for modular indexing
    static int low;
    static int[] buffer; /*
                          * pointer to storage for one row of the d.p. array
                          */

    static int[] store = new int[THRESHOLD / (Integer.SIZE / 8)];
    /*
     * a small amount of static storage, to be used when the input strings are small enough
     */

    /* Handle trivial cases when one string is empty */

    static int ins = 1;
    static int del = 1;
    static int ch = 3;
    static int swapCost = 5;

    static int fromLen;
    static int toLen;

    private static int ar(int x, int y, int index) {
        return ((x == 0) ? y * del : ((y == 0) ? x * ins : buffer[mod(index)]));
    }

    private static int nw(int x, int y) {
        return ar(x, y, index + fromLen + 2);
    }

    private static int n(int x, int y) {
        return ar(x, y, index + fromLen + 3);
    }

    private static int w(int x, int y) {
        return ar(x, y, index + radix - 1);
    }

    private static int nnww(int x, int y) {
        return ar(x, y, index + 1);
    }

    private static int mod(int x) {
        return (x % radix);
    }

    /*
     * returns the edit distance between two strings, or -1 on failure
     */
    public static int calculateEditDistance(@Nullable byte[] from, int fromLen, @Nullable byte[] to, int toLen) {
        EditDistance.fromLen = fromLen;
        EditDistance.toLen = toLen;

        if (from == null) {
            if (to == null) {
                return 0;
            } else {
                return EditDistance.toLen * insertCost;
            }
        } else if (to == null) {
            return EditDistance.fromLen * deleteCost;
        }

        /* Initialize registers */

        radix = 2 * EditDistance.fromLen + 3;

        /* Make from short enough to fit in the static storage, if it's at all possible */

        if (EditDistance.fromLen > EditDistance.toLen && EditDistance.fromLen > STRLENTHRESHOLD) {
            int[] x = new int[1];
            int[] y = new int[1];
            x[0] = EditDistance.fromLen;
            y[0] = EditDistance.toLen;
            swapInt(x, y);
            byte[][] xx = new byte[1][];
            byte[][] yy = new byte[1][];
            xx[0] = from;
            yy[0] = to;
            swapChar(xx, yy);
        } // if from_len > to_len

        /* Allocate the array storage (from the heap if necessary) */

        if (EditDistance.fromLen <= STRLENTHRESHOLD) {
            buffer = store;
        } else {
            buffer = new int[radix];
        }

        /*
         * Here's where the fun begins. We will find the minimum edit distance using dynamic programming. We only need to store
         * two rows of the matrix at a time, since we always progress down the matrix. For example, given the strings "one" and
         * "two", and insert, delete and change costs equal to 1:
         * 
         * _ o n e _ 0 1 2 3 t 1 1 2 3 w 2 2 2 3 o 3 2 3 3
         * 
         * The dynamic programming recursion is defined as follows:
         * 
         * ar(x,0) := x * insert_cost ar(0,y) := y * delete_cost ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 :
         * change), a(x - 1, y) + insert_cost, a(x, y - 1) + delete_cost, a(x - 2, y - 2) + (from[x] == to[y-1] && from[x-1] ==
         * to[y] ? swap_cost : infinity))
         * 
         * Since this only looks at most two rows and three columns back, we need only store the values for the two preceeding
         * rows. In this implementation, we do not explicitly store the zero column, so only 2 * from_len + 2 words are needed.
         * However, in the implementation of the swap_cost check, the current matrix value is used as a buffer; we can't
         * overwrite the earlier value until the swap_cost check has been performed. So we use 2 * from_len + 3 elements in the
         * buffer.
         */

        // /#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins :
        // \ buffer[mod(index)]))
        // /#define NW(x,y) ar(x, y, index + from_len + 2)
        // /#define N(x,y) ar(x, y, index + from_len + 3)
        // /#define W(x,y) ar(x, y, index + radix - 1)
        // /#define NNWW(x,y) ar(x, y, index + 1)
        // /#define mod(x) ((x) % radix)


        buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));

        low = buffer[mod(index + radix - 1)];
        for (col = 1; col < EditDistance.fromLen; col++) {
            buffer[index] = min3(col * del + ((from[col] == to[0]) ? 0 : ch), (col + 1) * del + ins, buffer[index - 1] + del);
            if (buffer[index] < low) {
                low = buffer[index];
            }
            index++;
        }

        /* Now handle the rest of the matrix */
        for (row = 1; row < EditDistance.toLen; row++) {
            for (col = 0; col < EditDistance.fromLen; col++) {
                buffer[index] = min3(nw(row, col) + ((from[col] == to[row]) ? 0 : ch), n(row, col + 1) + ins, w(row + 1, col) + del);

                if (from[col] == to[row - 1] && col > 0 && from[col - 1] == to[row]) {
                    buffer[index] = min2(buffer[index], nnww(row - 1, col - 1) + swapCost);
                }

                if (buffer[index] < low || col == 0) {
                    low = buffer[index];
                }
                index = mod(index + 1);
            }
            if (low > MIN_DIST) {
                break;
            }
        }

        row = buffer[mod(index + radix - 1)];

        return row;
    } // edit_distn

    /** This class is not meant to be instantiated. */
    private EditDistance() {}
}