View Javadoc
1   package emissary.transform.decode;
2   
3   import emissary.util.ByteUtil;
4   import emissary.util.CharacterCounterSet;
5   import emissary.util.HtmlEntityMap;
6   import emissary.util.shell.Executrix;
7   
8   import jakarta.annotation.Nullable;
9   import org.slf4j.Logger;
10  import org.slf4j.LoggerFactory;
11  
12  import java.io.ByteArrayOutputStream;
13  import java.io.IOException;
14  import java.util.regex.Matcher;
15  import java.util.regex.Pattern;
16  
17  @Deprecated
18  @SuppressWarnings("all")
19  public class HtmlEscape {
20  
21      /* our logger */
22      private static final Logger logger = LoggerFactory.getLogger(HtmlEscape.class);
23  
24      private static final int LONGEST_ENTITY_NAME = 33; // &CounterClockwiseContourIntegral;
25  
26      /**
27       * Html Entity mapper
28       */
29      private final static HtmlEntityMap HTML_ENTITY_MAP = new HtmlEntityMap();
30  
31      /**
32       * Pattern for HTML escaped char finding in strings
33       */
34      private final static Pattern HESC_PATTERN = Pattern.compile("&#([xX]?)(\\p{XDigit}{2,5});");
35  
36      /**
37       * Unescape some HTML data without counting what was done
38       *
39       * @param data the array of bytes containing HTML escaped characters
40       * @return modified byte array
41       */
42      public static byte[] unescapeHtml(byte[] data) {
43          return unescapeHtml(data, null);
44      }
45  
46      /**
47       * Unescape some HTML data, turning <code>&amp;#xxxx;</code> into UNICODE characters Because this operation inserts java
48       * Character objects into the byte array, it probably only makes sense to send in data that already matches the platform
49       * encoding (i.e. UTF-8 for normal usage). Otherwise the result will be a mixed up mess of multiple character sets that
50       * cannot possibly be understood or displayed properly.
51       *
52       * @param data the array of bytes containing HTML escaped characters
53       * @param counters to measure what is changed
54       * @return modified byte array
55       */
56      public static byte[] unescapeHtml(@Nullable byte[] data, @Nullable CharacterCounterSet counters) {
57  
58          ByteArrayOutputStream baos = null;
59          byte[] returnBytes = null;
60          if (data == null || data.length == 0) {
61              return new byte[0];
62          }
63  
64          try {
65              baos = new ByteArrayOutputStream();
66              for (int i = 0; i < data.length; i++) {
67                  // Grab one encoded character
68                  if (data[i] == '&' && i + 3 < data.length && data[i + 1] == '#') {
69                      int j = i + 2;
70                      boolean isHex = false;
71  
72                      // Determine if &#xnnnn; or &#nnnn;
73                      if (data[j] == 'X' || data[j] == 'x') {
74                          j++;
75                          isHex = true;
76                      }
77  
78                      int startPos = j;
79  
80                      // Jump to end of digits, find a semi-colon
81                      while (j < data.length && ByteUtil.isHexadecimal(data[j]) && j < startPos + 5) {
82                          j++;
83                      }
84  
85                      if (j < data.length && data[j] == ';') {
86                          // Try to convert it
87                          char[] c = unescapeHtmlChar(new String(data, startPos, j - startPos), isHex);
88                          if (c != null) {
89                              // write a codepoint
90                              String s = new String(c);
91                              baos.write(s.getBytes());
92                              if (counters != null) {
93                                  counters.count(s);
94                              }
95                              i = j;
96                          } else {
97                              // Do no harm if the conversion fails
98                              baos.write(data[i]);
99                          }
100                     } else {
101                         baos.write(data[i]);
102                     }
103                 } else {
104                     baos.write(data[i]);
105                 }
106             }
107             returnBytes = baos.toByteArray();
108             baos.close();
109         } catch (IOException e) {
110             logger.debug("Cannot decode HTML bytes", e);
111         }
112         return returnBytes;
113     }
114 
115     /**
116      * Unescape some HTML data without counting what was changed
117      *
118      * @param s the string of characters possibly containing escaped HTML
119      * @return the new String without escaped HTML
120      */
121     public static String unescapeHtml(String s) {
122         return unescapeHtml(s, null);
123     }
124 
125     /**
126      * Unescape some HTML data, turning <code>$#xxxx;</code> into UNICODE characters
127      *
128      * @param s the string of characters possibly containing escaped HTML
129      * @param counters to measure what is changed
130      * @return the new String without escaped HTML
131      */
132     public static String unescapeHtml(@Nullable String s, @Nullable CharacterCounterSet counters) {
133         if (s == null || s.length() == 0) {
134             return "";
135         }
136 
137         StringBuffer sb = new StringBuffer(s.length());
138         Matcher m = HESC_PATTERN.matcher(s);
139 
140         // Match each occurrence
141         while (m.find()) {
142             // Grab digits from first match group
143             String hexModifier = m.group(1);
144             boolean isHex = hexModifier != null && hexModifier.length() > 0;
145             String encodedChar = m.group(2);
146             char[] c = unescapeHtmlChar(encodedChar, isHex);
147             if (c != null) {
148                 // Append non-matching portion plus decoded char
149                 m.appendReplacement(sb, "");
150                 sb.append(c);
151                 if (counters != null) {
152                     counters.count(new String(c));
153                 }
154             } else {
155                 // It failed, append non-match plus original match
156                 m.appendReplacement(sb, "$0");
157             }
158         }
159 
160         // Append terminal portion
161         m.appendTail(sb);
162 
163         // Return the new string
164         return sb.toString();
165     }
166 
167 
168     /**
169      * Unescape one HTML character or null if we cannot convert
170      *
171      * @param s the four digit number from the HTML encoding
172      * @param isHex true if the digits are in hex
173      * @return the Unicode codepoint in a[] char or null
174      */
175     @Nullable
176     public static char[] unescapeHtmlChar(String s, boolean isHex) {
177         int num = -1;
178         try {
179             num = Integer.parseInt(s, isHex ? 16 : 10);
180         } catch (NumberFormatException ex) {
181             logger.debug("Failed to parse char", ex);
182             return null;
183         }
184         // Turn the number into a Unicode codepoint
185         try {
186             return Character.toChars(num);
187         } catch (Exception ex) {
188             logger.debug("Cannot toChars: not a valid Unicode code point", ex);
189             return null;
190         }
191     }
192 
193     /**
194      * Unescape HTML entities without counting what was changed
195      *
196      * @param s the string to find entities in
197      */
198     public static String unescapeEntities(String s) {
199         return unescapeEntities(s, null);
200     }
201 
202     /**
203      * Unescape HTML entities like &amp;nbsp; into normal characters Also handle broken entities like &amp;;nbsp; and
204      * &amp;nbsp (extra semi-colon and missing semi-colon respectively)
205      *
206      * @param s the string to find entities in
207      * @param counters to measure what was changed
208      */
209     public static String unescapeEntities(String s, @Nullable CharacterCounterSet counters) {
210         int slen = s.length();
211         StringBuilder sb = new StringBuilder(s.length());
212 
213         for (int i = 0; i < slen; i++) {
214             char c = s.charAt(i);
215             if (c != '&') {
216                 sb.append(c);
217             } else {
218                 int spos = i;
219                 int j = spos + 1;
220                 while (j < slen && j < (spos + LONGEST_ENTITY_NAME) && s.charAt(j) != ';' && s.charAt(j) != ' ') {
221                     j++;
222                 }
223 
224                 if (j < slen && j == spos + 1) // broken case with extra semicolon
225                 {
226                     spos++;
227                     j = spos + 1;
228                     while (j < slen && j < (spos + LONGEST_ENTITY_NAME) && s.charAt(j) != ';' && s.charAt(j) != ' ') {
229                         j++;
230                     }
231                 }
232 
233                 if (j < slen + 1) {
234                     String ent = null;
235                     if (j < slen) {
236                         if (s.charAt(j) == ';' || s.charAt(j) == ' ') {
237                             ent = s.substring(spos + 1, j);
238                         }
239                     } else {
240                         // all the rest
241                         ent = s.substring(spos + 1);
242                     }
243 
244                     if (ent != null) {
245                         String val = getValueForHtmlEntity(ent);
246                         if (val != null) {
247                             sb.append(val);
248                             if (counters != null) {
249                                 counters.count(val);
250                             }
251                             if (j >= slen) {
252                                 i = slen; // all done
253                             } else {
254                                 i = s.charAt(j) == ' ' ? (j - 1) : j;
255                             }
256                         } else {
257                             sb.append(c);
258                         }
259                     } else {
260                         sb.append(c);
261                     }
262                 } else {
263                     sb.append(c);
264                 }
265             }
266         }
267         return sb.toString();
268     }
269 
270 
271     /**
272      * Unescape HTML entities without measuring what was changed
273      */
274     public static byte[] unescapeEntities(byte[] s) {
275         return unescapeEntities(s, null);
276     }
277 
278     /**
279      * Unescape HTML Entities like &amp;nbsp; into normal characters Also handle broken entities like &amp;;nbsp; and
280      * &amp;nbsp (extra semi-colon and missing semi-colon respectively)
281      */
282     public static byte[] unescapeEntities(byte[] s, @Nullable CharacterCounterSet counters) {
283         ByteArrayOutputStream baos = new ByteArrayOutputStream();
284         int slen = s.length;
285 
286         for (int i = 0; i < slen; i++) {
287             if (i + 4 < slen && s[i] == '&') {
288                 int spos = i;
289                 int epos = spos + 1;
290                 while (epos < slen && epos < spos + LONGEST_ENTITY_NAME && s[epos] != ';' && s[epos] != ' ') {
291                     epos++;
292                 }
293 
294                 if (epos == spos + 1) // broken case with extra semi-colon
295                 {
296                     spos++;
297                     epos = spos + 1;
298                     while (epos < slen && epos < spos + LONGEST_ENTITY_NAME && s[epos] != ';' && s[epos] != ' ') {
299                         epos++;
300                     }
301                 }
302 
303                 String val = HTML_ENTITY_MAP.getValueForHtmlEntity(new String(s, spos + 1, epos - (spos + 1)));
304                 if (val != null) {
305                     try {
306                         baos.write(val.getBytes());
307                         if (counters != null) {
308                             counters.count(val);
309                         }
310                         // if we used the space as a terminator, keep the
311                         // space in the output, even though we consumed it
312                         if (epos < slen) {
313                             i = s[epos] == ' ' ? (epos - 1) : epos;
314                         } else {
315                             i = slen;
316                         }
317                     } catch (IOException iox) {
318                         logger.debug("Error writing unescaped bytes", iox);
319                         baos.write(s[i]);
320                     }
321                 } else {
322                     baos.write(s[i]);
323                 }
324             } else {
325                 baos.write(s[i]);
326             }
327         }
328         return baos.toByteArray();
329     }
330 
331     @Nullable
332     private static String getValueForHtmlEntity(String entity) {
333         String s = HTML_ENTITY_MAP.getValueForHtmlEntity(entity);
334         if (s != null) {
335             return s;
336         }
337         return null;
338     }
339 
340     /** This class is not meant to be instantiated. */
341     private HtmlEscape() {}
342 
343     @SuppressWarnings("SystemOut")
344     public static void main(String[] args) {
345         boolean useString = false;
346         int i = 0;
347         if (args.length > 0 && args[i].equals("-s")) {
348             System.out.println("Switching to string mode");
349             useString = true;
350             i++;
351         }
352 
353         for (; i < args.length; i++) {
354             byte[] content = Executrix.readDataFromFile(args[i]);
355             if (content == null) {
356                 System.out.println(args[i] + ": Unreadable");
357                 continue;
358             }
359 
360             System.out.println(args[i]);
361             if (useString) {
362                 String escaped = HtmlEscape.unescapeHtml(new String(content));
363                 escaped = HtmlEscape.unescapeEntities(escaped);
364                 System.out.println(escaped);
365             } else {
366                 byte[] escaped = HtmlEscape.unescapeHtml(content);
367                 escaped = HtmlEscape.unescapeEntities(escaped);
368                 System.out.write(escaped, 0, escaped.length);
369                 System.out.println();
370             }
371         }
372     }
373 }