1 package emissary.transform.decode;
2
import emissary.util.ByteUtil;
import emissary.util.CharacterCounterSet;
import emissary.util.HtmlEntityMap;
import emissary.util.shell.Executrix;

import jakarta.annotation.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
16
17 @Deprecated
18 @SuppressWarnings("all")
19 public class HtmlEscape {
20
21
22 private static final Logger logger = LoggerFactory.getLogger(HtmlEscape.class);
23
24 private static final int LONGEST_ENTITY_NAME = 33;
25
26
27
28
29 private final static HtmlEntityMap HTML_ENTITY_MAP = new HtmlEntityMap();
30
31
32
33
34 private final static Pattern HESC_PATTERN = Pattern.compile("&#([xX]?)(\\p{XDigit}{2,5});");
35
36
37
38
39
40
41
42 public static byte[] unescapeHtml(byte[] data) {
43 return unescapeHtml(data, null);
44 }
45
46
47
48
49
50
51
52
53
54
55
56 public static byte[] unescapeHtml(@Nullable byte[] data, @Nullable CharacterCounterSet counters) {
57
58 ByteArrayOutputStream baos = null;
59 byte[] returnBytes = null;
60 if (data == null || data.length == 0) {
61 return new byte[0];
62 }
63
64 try {
65 baos = new ByteArrayOutputStream();
66 for (int i = 0; i < data.length; i++) {
67
68 if (data[i] == '&' && i + 3 < data.length && data[i + 1] == '#') {
69 int j = i + 2;
70 boolean isHex = false;
71
72
73 if (data[j] == 'X' || data[j] == 'x') {
74 j++;
75 isHex = true;
76 }
77
78 int startPos = j;
79
80
81 while (j < data.length && ByteUtil.isHexadecimal(data[j]) && j < startPos + 5) {
82 j++;
83 }
84
85 if (j < data.length && data[j] == ';') {
86
87 char[] c = unescapeHtmlChar(new String(data, startPos, j - startPos), isHex);
88 if (c != null) {
89
90 String s = new String(c);
91 baos.write(s.getBytes());
92 if (counters != null) {
93 counters.count(s);
94 }
95 i = j;
96 } else {
97
98 baos.write(data[i]);
99 }
100 } else {
101 baos.write(data[i]);
102 }
103 } else {
104 baos.write(data[i]);
105 }
106 }
107 returnBytes = baos.toByteArray();
108 baos.close();
109 } catch (IOException e) {
110 logger.debug("Cannot decode HTML bytes", e);
111 }
112 return returnBytes;
113 }
114
115
116
117
118
119
120
121 public static String unescapeHtml(String s) {
122 return unescapeHtml(s, null);
123 }
124
125
126
127
128
129
130
131
132 public static String unescapeHtml(@Nullable String s, @Nullable CharacterCounterSet counters) {
133 if (s == null || s.length() == 0) {
134 return "";
135 }
136
137 StringBuffer sb = new StringBuffer(s.length());
138 Matcher m = HESC_PATTERN.matcher(s);
139
140
141 while (m.find()) {
142
143 String hexModifier = m.group(1);
144 boolean isHex = hexModifier != null && hexModifier.length() > 0;
145 String encodedChar = m.group(2);
146 char[] c = unescapeHtmlChar(encodedChar, isHex);
147 if (c != null) {
148
149 m.appendReplacement(sb, "");
150 sb.append(c);
151 if (counters != null) {
152 counters.count(new String(c));
153 }
154 } else {
155
156 m.appendReplacement(sb, "$0");
157 }
158 }
159
160
161 m.appendTail(sb);
162
163
164 return sb.toString();
165 }
166
167
168
169
170
171
172
173
174
175 @Nullable
176 public static char[] unescapeHtmlChar(String s, boolean isHex) {
177 int num = -1;
178 try {
179 num = Integer.parseInt(s, isHex ? 16 : 10);
180 } catch (NumberFormatException ex) {
181 logger.debug("Failed to parse char", ex);
182 return null;
183 }
184
185 try {
186 return Character.toChars(num);
187 } catch (Exception ex) {
188 logger.debug("Cannot toChars: not a valid Unicode code point", ex);
189 return null;
190 }
191 }
192
193
194
195
196
197
198 public static String unescapeEntities(String s) {
199 return unescapeEntities(s, null);
200 }
201
202
203
204
205
206
207
208
209 public static String unescapeEntities(String s, @Nullable CharacterCounterSet counters) {
210 int slen = s.length();
211 StringBuilder sb = new StringBuilder(s.length());
212
213 for (int i = 0; i < slen; i++) {
214 char c = s.charAt(i);
215 if (c != '&') {
216 sb.append(c);
217 } else {
218 int spos = i;
219 int j = spos + 1;
220 while (j < slen && j < (spos + LONGEST_ENTITY_NAME) && s.charAt(j) != ';' && s.charAt(j) != ' ') {
221 j++;
222 }
223
224 if (j < slen && j == spos + 1)
225 {
226 spos++;
227 j = spos + 1;
228 while (j < slen && j < (spos + LONGEST_ENTITY_NAME) && s.charAt(j) != ';' && s.charAt(j) != ' ') {
229 j++;
230 }
231 }
232
233 if (j < slen + 1) {
234 String ent = null;
235 if (j < slen) {
236 if (s.charAt(j) == ';' || s.charAt(j) == ' ') {
237 ent = s.substring(spos + 1, j);
238 }
239 } else {
240
241 ent = s.substring(spos + 1);
242 }
243
244 if (ent != null) {
245 String val = getValueForHtmlEntity(ent);
246 if (val != null) {
247 sb.append(val);
248 if (counters != null) {
249 counters.count(val);
250 }
251 if (j >= slen) {
252 i = slen;
253 } else {
254 i = s.charAt(j) == ' ' ? (j - 1) : j;
255 }
256 } else {
257 sb.append(c);
258 }
259 } else {
260 sb.append(c);
261 }
262 } else {
263 sb.append(c);
264 }
265 }
266 }
267 return sb.toString();
268 }
269
270
271
272
273
274 public static byte[] unescapeEntities(byte[] s) {
275 return unescapeEntities(s, null);
276 }
277
278
279
280
281
282 public static byte[] unescapeEntities(byte[] s, @Nullable CharacterCounterSet counters) {
283 ByteArrayOutputStream baos = new ByteArrayOutputStream();
284 int slen = s.length;
285
286 for (int i = 0; i < slen; i++) {
287 if (i + 4 < slen && s[i] == '&') {
288 int spos = i;
289 int epos = spos + 1;
290 while (epos < slen && epos < spos + LONGEST_ENTITY_NAME && s[epos] != ';' && s[epos] != ' ') {
291 epos++;
292 }
293
294 if (epos == spos + 1)
295 {
296 spos++;
297 epos = spos + 1;
298 while (epos < slen && epos < spos + LONGEST_ENTITY_NAME && s[epos] != ';' && s[epos] != ' ') {
299 epos++;
300 }
301 }
302
303 String val = HTML_ENTITY_MAP.getValueForHtmlEntity(new String(s, spos + 1, epos - (spos + 1)));
304 if (val != null) {
305 try {
306 baos.write(val.getBytes());
307 if (counters != null) {
308 counters.count(val);
309 }
310
311
312 if (epos < slen) {
313 i = s[epos] == ' ' ? (epos - 1) : epos;
314 } else {
315 i = slen;
316 }
317 } catch (IOException iox) {
318 logger.debug("Error writing unescaped bytes", iox);
319 baos.write(s[i]);
320 }
321 } else {
322 baos.write(s[i]);
323 }
324 } else {
325 baos.write(s[i]);
326 }
327 }
328 return baos.toByteArray();
329 }
330
331 @Nullable
332 private static String getValueForHtmlEntity(String entity) {
333 String s = HTML_ENTITY_MAP.getValueForHtmlEntity(entity);
334 if (s != null) {
335 return s;
336 }
337 return null;
338 }
339
340
341 private HtmlEscape() {}
342
343 @SuppressWarnings("SystemOut")
344 public static void main(String[] args) {
345 boolean useString = false;
346 int i = 0;
347 if (args.length > 0 && args[i].equals("-s")) {
348 System.out.println("Switching to string mode");
349 useString = true;
350 i++;
351 }
352
353 for (; i < args.length; i++) {
354 byte[] content = Executrix.readDataFromFile(args[i]);
355 if (content == null) {
356 System.out.println(args[i] + ": Unreadable");
357 continue;
358 }
359
360 System.out.println(args[i]);
361 if (useString) {
362 String escaped = HtmlEscape.unescapeHtml(new String(content));
363 escaped = HtmlEscape.unescapeEntities(escaped);
364 System.out.println(escaped);
365 } else {
366 byte[] escaped = HtmlEscape.unescapeHtml(content);
367 escaped = HtmlEscape.unescapeEntities(escaped);
368 System.out.write(escaped, 0, escaped.length);
369 System.out.println();
370 }
371 }
372 }
373 }