View Javadoc
1   package emissary.kff;
2   
3   import jakarta.annotation.Nullable;
4   
5   /**
6    * A java port of the ssdeep code for "fuzzy hashing". http://ssdeep.sourceforge.net There are a number of ports out
7    * there that all look basically the same. This one is from
8    * https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
9    * 
10   * A new ssdeep hash gets calculated and saved at each level of unwrapping.
11   */
12  @SuppressWarnings("NonFinalStaticField")
13  public class EditDistance {
14      /*
15       * This edit distance code is taken from trn3.6. A few minor modifications have been made by Andrew Tridgell
16       * <tridge@samba.org> for use in spamsum.
17       */
18  
19  
20      /***************************************************************************/
21  
22  
23      /*
24       * The authors make no claims as to the fitness or correctness of this software for any use whatsoever, and it is
25       * provided as is. Any use of this software is at the user's own risk.
26       */
27  
28      /*
29       * edit_dist -- returns the minimum edit distance between two strings
30       * 
31       * Program by: Mark Maimone CMU Computer Science 13 Nov 89 Last Modified: 28 Jan 90
32       * 
33       * If the input strings have length n and m, the algorithm runs in time O(nm) and space O(min(m,n)).
34       * 
35       * HISTORY 13 Nov 89 (mwm) Created edit_dist() and set_costs().
36       * 
37       * 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD computations will work even when THRESHOLD is not a
38       * multiple of sizeof(int).
39       * 
40       * 17 May 93 (mwm) Improved performance when used with trn's newsgroup processing; assume all costs are 1, and you can
41       * terminate when a threshold is exceeded.
42       */
43  
44      private static final int MIN_DIST = 100;
45  
46      /*
47       * Use a less-general version of the routine, one that's better for trn. All change costs are 1, and it's okay to
48       * terminate if the edit distance is known to exceed MIN_DIST
49       */
50  
51      private static final int THRESHOLD = 4000; /*
52                                                  * worry about allocating more memory only when this # of bytes is exceeded
53                                                  */
54  
55      private static final int STRLENTHRESHOLD = (THRESHOLD / (Integer.SIZE / 8) - 3) / 2;
56  
57      // #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
58  
59      // #define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap)
60      private static void swapInt(int[] x, int[] y) {
61          int iswap = x[0];
62          x[0] = y[0];
63          y[0] = iswap;
64      }
65  
66      // #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
67      private static void swapChar(/* ref */byte[][] x, /* ref */byte[][] y) {
68          byte[] cswap = x[0];
69          x[0] = y[0];
70          y[0] = cswap;
71      }
72  
73      // #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz :
74      // _my))
75      private static int min3(int x, int y, int z) {
76          int mx = x;
77          int my = y;
78          int mz = z;
79          return mx < my ? (mx < mz ? mx : mz) : (mz < my) ? mz : my;
80      }
81  
82      // #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
83      private static int min2(int x, int y) {
84          int mx = x;
85          int my = y;
86          return mx < my ? mx : my;
87      }
88  
89      static int insertCost = 1;
90      static int deleteCost = 1;
91  
92      // dynamic programming counters
93      static int row;
94      static int col;
95      static int index = 0;
96      static int radix; // radix for modular indexing
97      static int low;
98      static int[] buffer; /*
99                            * pointer to storage for one row of the d.p. array
100                           */
101 
102     static int[] store = new int[THRESHOLD / (Integer.SIZE / 8)];
103     /*
104      * a small amount of static storage, to be used when the input strings are small enough
105      */
106 
107     /* Handle trivial cases when one string is empty */
108 
109     static int ins = 1;
110     static int del = 1;
111     static int ch = 3;
112     static int swapCost = 5;
113 
114     static int fromLen;
115     static int toLen;
116 
117     private static int ar(int x, int y, int index) {
118         return (x == 0) ? y * del : (y == 0) ? x * ins : buffer[mod(index)];
119     }
120 
121     private static int nw(int x, int y) {
122         return ar(x, y, index + fromLen + 2);
123     }
124 
125     private static int n(int x, int y) {
126         return ar(x, y, index + fromLen + 3);
127     }
128 
129     private static int w(int x, int y) {
130         return ar(x, y, index + radix - 1);
131     }
132 
133     private static int nnww(int x, int y) {
134         return ar(x, y, index + 1);
135     }
136 
137     private static int mod(int x) {
138         return x % radix;
139     }
140 
141     /*
142      * returns the edit distance between two strings, or -1 on failure
143      */
144     public static int calculateEditDistance(@Nullable byte[] from, int fromLen, @Nullable byte[] to, int toLen) {
145         EditDistance.fromLen = fromLen;
146         EditDistance.toLen = toLen;
147 
148         if (from == null) {
149             if (to == null) {
150                 return 0;
151             } else {
152                 return EditDistance.toLen * insertCost;
153             }
154         } else if (to == null) {
155             return EditDistance.fromLen * deleteCost;
156         }
157 
158         /* Initialize registers */
159 
160         radix = 2 * EditDistance.fromLen + 3;
161 
162         /* Make from short enough to fit in the static storage, if it's at all possible */
163 
164         if (EditDistance.fromLen > EditDistance.toLen && EditDistance.fromLen > STRLENTHRESHOLD) {
165             int[] x = new int[1];
166             int[] y = new int[1];
167             x[0] = EditDistance.fromLen;
168             y[0] = EditDistance.toLen;
169             swapInt(x, y);
170             byte[][] xx = new byte[1][];
171             byte[][] yy = new byte[1][];
172             xx[0] = from;
173             yy[0] = to;
174             swapChar(xx, yy);
175         } // if from_len > to_len
176 
177         /* Allocate the array storage (from the heap if necessary) */
178 
179         if (EditDistance.fromLen <= STRLENTHRESHOLD) {
180             buffer = store;
181         } else {
182             buffer = new int[radix];
183         }
184 
185         /*
186          * Here's where the fun begins. We will find the minimum edit distance using dynamic programming. We only need to store
187          * two rows of the matrix at a time, since we always progress down the matrix. For example, given the strings "one" and
188          * "two", and insert, delete and change costs equal to 1:
189          * 
190          * _ o n e _ 0 1 2 3 t 1 1 2 3 w 2 2 2 3 o 3 2 3 3
191          * 
192          * The dynamic programming recursion is defined as follows:
193          * 
194          * ar(x,0) := x * insert_cost ar(0,y) := y * delete_cost ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 :
195          * change), a(x - 1, y) + insert_cost, a(x, y - 1) + delete_cost, a(x - 2, y - 2) + (from[x] == to[y-1] && from[x-1] ==
196          * to[y] ? swap_cost : infinity))
197          * 
198          * Since this only looks at most two rows and three columns back, we need only store the values for the two preceeding
199          * rows. In this implementation, we do not explicitly store the zero column, so only 2 * from_len + 2 words are needed.
200          * However, in the implementation of the swap_cost check, the current matrix value is used as a buffer; we can't
201          * overwrite the earlier value until the swap_cost check has been performed. So we use 2 * from_len + 3 elements in the
202          * buffer.
203          */
204 
205         // /#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins :
206         // \ buffer[mod(index)]))
207         // /#define NW(x,y) ar(x, y, index + from_len + 2)
208         // /#define N(x,y) ar(x, y, index + from_len + 3)
209         // /#define W(x,y) ar(x, y, index + radix - 1)
210         // /#define NNWW(x,y) ar(x, y, index + 1)
211         // /#define mod(x) ((x) % radix)
212 
213 
214         buffer[index++] = min2(ins + del, from[0] == to[0] ? 0 : ch);
215 
216         low = buffer[mod(index + radix - 1)];
217         for (col = 1; col < EditDistance.fromLen; col++) {
218             buffer[index] = min3(col * del + ((from[col] == to[0]) ? 0 : ch), (col + 1) * del + ins, buffer[index - 1] + del);
219             if (buffer[index] < low) {
220                 low = buffer[index];
221             }
222             index++;
223         }
224 
225         /* Now handle the rest of the matrix */
226         for (row = 1; row < EditDistance.toLen; row++) {
227             for (col = 0; col < EditDistance.fromLen; col++) {
228                 buffer[index] = min3(nw(row, col) + ((from[col] == to[row]) ? 0 : ch), n(row, col + 1) + ins, w(row + 1, col) + del);
229 
230                 if (from[col] == to[row - 1] && col > 0 && from[col - 1] == to[row]) {
231                     buffer[index] = min2(buffer[index], nnww(row - 1, col - 1) + swapCost);
232                 }
233 
234                 if (buffer[index] < low || col == 0) {
235                     low = buffer[index];
236                 }
237                 index = mod(index + 1);
238             }
239             if (low > MIN_DIST) {
240                 break;
241             }
242         }
243 
244         row = buffer[mod(index + radix - 1)];
245 
246         return row;
247     } // edit_distn
248 
249     /** This class is not meant to be instantiated. */
250     private EditDistance() {}
251 }