1 package emissary.kff;
2
3 import jakarta.annotation.Nullable;
4
5 /**
6 * A java port of the ssdeep code for "fuzzy hashing". http://ssdeep.sourceforge.net There are a number of ports out
7 * there that all look basically the same. This one is from
8 * https://opensourceprojects.eu/p/triplecheck/code/23/tree/tool/src/ssdeep/
9 *
10 * A new ssdeep hash gets calculated and saved at each level of unwrapping.
11 */
12 @SuppressWarnings("NonFinalStaticField")
13 public class EditDistance {
14 /*
15 * This edit distance code is taken from trn3.6. A few minor modifications have been made by Andrew Tridgell
16 * <tridge@samba.org> for use in spamsum.
17 */
18
19
20 /***************************************************************************/
21
22
23 /*
24 * The authors make no claims as to the fitness or correctness of this software for any use whatsoever, and it is
25 * provided as is. Any use of this software is at the user's own risk.
26 */
27
28 /*
29 * edit_dist -- returns the minimum edit distance between two strings
30 *
31 * Program by: Mark Maimone CMU Computer Science 13 Nov 89 Last Modified: 28 Jan 90
32 *
33 * If the input strings have length n and m, the algorithm runs in time O(nm) and space O(min(m,n)).
34 *
35 * HISTORY 13 Nov 89 (mwm) Created edit_dist() and set_costs().
36 *
37 * 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD computations will work even when THRESHOLD is not a
38 * multiple of sizeof(int).
39 *
40 * 17 May 93 (mwm) Improved performance when used with trn's newsgroup processing; assume all costs are 1, and you can
41 * terminate when a threshold is exceeded.
42 */
43
44 private static final int MIN_DIST = 100;
45
46 /*
47 * Use a less-general version of the routine, one that's better for trn. All change costs are 1, and it's okay to
48 * terminate if the edit distance is known to exceed MIN_DIST
49 */
50
51 private static final int THRESHOLD = 4000; /*
52 * worry about allocating more memory only when this # of bytes is exceeded
53 */
54
55 private static final int STRLENTHRESHOLD = (THRESHOLD / (Integer.SIZE / 8) - 3) / 2;
56
57 // #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
58
59 // #define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap)
60 private static void swapInt(int[] x, int[] y) {
61 int iswap = x[0];
62 x[0] = y[0];
63 y[0] = iswap;
64 }
65
66 // #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
67 private static void swapChar(/* ref */byte[][] x, /* ref */byte[][] y) {
68 byte[] cswap = x[0];
69 x[0] = y[0];
70 y[0] = cswap;
71 }
72
73 // #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz :
74 // _my))
75 private static int min3(int x, int y, int z) {
76 int mx = x;
77 int my = y;
78 int mz = z;
79 return mx < my ? (mx < mz ? mx : mz) : (mz < my) ? mz : my;
80 }
81
82 // #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
83 private static int min2(int x, int y) {
84 int mx = x;
85 int my = y;
86 return mx < my ? mx : my;
87 }
88
89 static int insertCost = 1;
90 static int deleteCost = 1;
91
92 // dynamic programming counters
93 static int row;
94 static int col;
95 static int index = 0;
96 static int radix; // radix for modular indexing
97 static int low;
98 static int[] buffer; /*
99 * pointer to storage for one row of the d.p. array
100 */
101
102 static int[] store = new int[THRESHOLD / (Integer.SIZE / 8)];
103 /*
104 * a small amount of static storage, to be used when the input strings are small enough
105 */
106
107 /* Handle trivial cases when one string is empty */
108
109 static int ins = 1;
110 static int del = 1;
111 static int ch = 3;
112 static int swapCost = 5;
113
114 static int fromLen;
115 static int toLen;
116
117 private static int ar(int x, int y, int index) {
118 return (x == 0) ? y * del : (y == 0) ? x * ins : buffer[mod(index)];
119 }
120
121 private static int nw(int x, int y) {
122 return ar(x, y, index + fromLen + 2);
123 }
124
125 private static int n(int x, int y) {
126 return ar(x, y, index + fromLen + 3);
127 }
128
129 private static int w(int x, int y) {
130 return ar(x, y, index + radix - 1);
131 }
132
133 private static int nnww(int x, int y) {
134 return ar(x, y, index + 1);
135 }
136
137 private static int mod(int x) {
138 return x % radix;
139 }
140
141 /*
142 * returns the edit distance between two strings, or -1 on failure
143 */
144 public static int calculateEditDistance(@Nullable byte[] from, int fromLen, @Nullable byte[] to, int toLen) {
145 EditDistance.fromLen = fromLen;
146 EditDistance.toLen = toLen;
147
148 if (from == null) {
149 if (to == null) {
150 return 0;
151 } else {
152 return EditDistance.toLen * insertCost;
153 }
154 } else if (to == null) {
155 return EditDistance.fromLen * deleteCost;
156 }
157
158 /* Initialize registers */
159
160 radix = 2 * EditDistance.fromLen + 3;
161
162 /* Make from short enough to fit in the static storage, if it's at all possible */
163
164 if (EditDistance.fromLen > EditDistance.toLen && EditDistance.fromLen > STRLENTHRESHOLD) {
165 int[] x = new int[1];
166 int[] y = new int[1];
167 x[0] = EditDistance.fromLen;
168 y[0] = EditDistance.toLen;
169 swapInt(x, y);
170 byte[][] xx = new byte[1][];
171 byte[][] yy = new byte[1][];
172 xx[0] = from;
173 yy[0] = to;
174 swapChar(xx, yy);
175 } // if from_len > to_len
176
177 /* Allocate the array storage (from the heap if necessary) */
178
179 if (EditDistance.fromLen <= STRLENTHRESHOLD) {
180 buffer = store;
181 } else {
182 buffer = new int[radix];
183 }
184
185 /*
186 * Here's where the fun begins. We will find the minimum edit distance using dynamic programming. We only need to store
187 * two rows of the matrix at a time, since we always progress down the matrix. For example, given the strings "one" and
188 * "two", and insert, delete and change costs equal to 1:
189 *
190 * _ o n e _ 0 1 2 3 t 1 1 2 3 w 2 2 2 3 o 3 2 3 3
191 *
192 * The dynamic programming recursion is defined as follows:
193 *
194 * ar(x,0) := x * insert_cost ar(0,y) := y * delete_cost ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 :
195 * change), a(x - 1, y) + insert_cost, a(x, y - 1) + delete_cost, a(x - 2, y - 2) + (from[x] == to[y-1] && from[x-1] ==
196 * to[y] ? swap_cost : infinity))
197 *
198 * Since this only looks at most two rows and three columns back, we need only store the values for the two preceeding
199 * rows. In this implementation, we do not explicitly store the zero column, so only 2 * from_len + 2 words are needed.
200 * However, in the implementation of the swap_cost check, the current matrix value is used as a buffer; we can't
201 * overwrite the earlier value until the swap_cost check has been performed. So we use 2 * from_len + 3 elements in the
202 * buffer.
203 */
204
205 // /#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins :
206 // \ buffer[mod(index)]))
207 // /#define NW(x,y) ar(x, y, index + from_len + 2)
208 // /#define N(x,y) ar(x, y, index + from_len + 3)
209 // /#define W(x,y) ar(x, y, index + radix - 1)
210 // /#define NNWW(x,y) ar(x, y, index + 1)
211 // /#define mod(x) ((x) % radix)
212
213
214 buffer[index++] = min2(ins + del, from[0] == to[0] ? 0 : ch);
215
216 low = buffer[mod(index + radix - 1)];
217 for (col = 1; col < EditDistance.fromLen; col++) {
218 buffer[index] = min3(col * del + ((from[col] == to[0]) ? 0 : ch), (col + 1) * del + ins, buffer[index - 1] + del);
219 if (buffer[index] < low) {
220 low = buffer[index];
221 }
222 index++;
223 }
224
225 /* Now handle the rest of the matrix */
226 for (row = 1; row < EditDistance.toLen; row++) {
227 for (col = 0; col < EditDistance.fromLen; col++) {
228 buffer[index] = min3(nw(row, col) + ((from[col] == to[row]) ? 0 : ch), n(row, col + 1) + ins, w(row + 1, col) + del);
229
230 if (from[col] == to[row - 1] && col > 0 && from[col - 1] == to[row]) {
231 buffer[index] = min2(buffer[index], nnww(row - 1, col - 1) + swapCost);
232 }
233
234 if (buffer[index] < low || col == 0) {
235 low = buffer[index];
236 }
237 index = mod(index + 1);
238 }
239 if (low > MIN_DIST) {
240 break;
241 }
242 }
243
244 row = buffer[mod(index + radix - 1)];
245
246 return row;
247 } // edit_distn
248
249 /** This class is not meant to be instantiated. */
250 private EditDistance() {}
251 }