HtmlEscape.java
package emissary.transform.decode;
import emissary.util.ByteUtil;
import emissary.util.CharacterCounterSet;
import emissary.util.HtmlEntityMap;
import emissary.util.shell.Executrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
@Deprecated
@SuppressWarnings("all")
public class HtmlEscape {

    /** Class logger. */
    private static final Logger logger = LoggerFactory.getLogger(HtmlEscape.class);

    /**
     * Bound on how far past an ampersand the entity scanners look for a terminator. 33 is the length of
     * {@code &CounterClockwiseContourIntegral;} (U+2233) counting the leading {@code &} and the trailing {@code ;}, so
     * a scanned name is at most 32 characters long.
     */
    private static final int LONGEST_ENTITY_NAME = 33;

    /** Maximum number of digits the byte[] numeric-reference decoder scans before it expects a semi-colon. */
    private static final int MAX_NUMERIC_DIGITS = 5;

    /** Html Entity mapper. */
    private static final HtmlEntityMap HTML_ENTITY_MAP = new HtmlEntityMap();

    /** Pattern for finding numeric character references ({@code &#nnnn;} or {@code &#xhhhh;}) in strings. */
    private static final Pattern HESC_PATTERN = Pattern.compile("&#([xX]?)(\\p{XDigit}{2,5});");

    /**
     * Unescape numeric character references in some HTML data without counting what was done.
     *
     * @param data the array of bytes containing HTML escaped characters
     * @return modified byte array, empty (never null) when {@code data} is null or empty
     */
    public static byte[] unescapeHtml(byte[] data) {
        return unescapeHtml(data, null);
    }

    /**
     * Unescape numeric character references in some HTML data, turning {@code &#nnnn;} and {@code &#xhhhh;} into the
     * characters they name. The decoded characters are written using the platform default charset, so it probably only
     * makes sense to send in data that already matches the platform encoding (i.e. UTF-8 for normal usage). Otherwise
     * the result will be a mix of character sets that cannot be understood or displayed properly.
     * <p>
     * At most five digits are scanned, so a decimal reference above 99999 is left untouched. Anything that cannot be
     * decoded is copied through unchanged.
     *
     * @param data the array of bytes containing HTML escaped characters
     * @param counters to measure what is changed, may be null
     * @return modified byte array, empty (never null) when {@code data} is null or empty
     */
    public static byte[] unescapeHtml(@Nullable byte[] data, @Nullable CharacterCounterSet counters) {
        if (data == null || data.length == 0) {
            return new byte[0];
        }

        ByteArrayOutputStream baos = new ByteArrayOutputStream(data.length);
        for (int i = 0; i < data.length; i++) {
            // A candidate needs at least "&#" plus two more bytes after the ampersand
            if (data[i] == '&' && i + 3 < data.length && data[i + 1] == '#') {
                int j = i + 2;

                // Determine if &#xnnnn; or &#nnnn;
                boolean isHex = data[j] == 'X' || data[j] == 'x';
                if (isHex) {
                    j++;
                }

                // Jump to end of digits; hex digits are accepted here even in decimal mode, the
                // parse below rejects them and the reference is then copied through untouched
                int startPos = j;
                while (j < data.length && j < startPos + MAX_NUMERIC_DIGITS && ByteUtil.isHexadecimal(data[j])) {
                    j++;
                }

                if (j < data.length && data[j] == ';') {
                    char[] c = unescapeHtmlChar(new String(data, startPos, j - startPos), isHex);
                    if (c != null) {
                        // Write the code point in the platform charset (see method documentation)
                        String s = new String(c);
                        byte[] decoded = s.getBytes();
                        // write(byte[], int, int) declares no IOException, unlike write(byte[])
                        baos.write(decoded, 0, decoded.length);
                        if (counters != null) {
                            counters.count(s);
                        }
                        // Resume after the semi-colon
                        i = j;
                        continue;
                    }
                }
            }

            // Not a reference, or do no harm if the conversion failed
            baos.write(data[i]);
        }
        return baos.toByteArray();
    }

    /**
     * Unescape numeric character references in some HTML data without counting what was changed.
     *
     * @param s the string of characters possibly containing escaped HTML
     * @return the new String without escaped HTML, empty when {@code s} is null or empty
     */
    public static String unescapeHtml(String s) {
        return unescapeHtml(s, null);
    }

    /**
     * Unescape numeric character references in some HTML data, turning {@code &#nnnn;} and {@code &#xhhhh;} into the
     * characters they name. Only references with two to five digits are recognized; references that do not convert
     * are left in place.
     *
     * @param s the string of characters possibly containing escaped HTML
     * @param counters to measure what is changed, may be null
     * @return the new String without escaped HTML, empty when {@code s} is null or empty
     */
    public static String unescapeHtml(@Nullable String s, @Nullable CharacterCounterSet counters) {
        if (s == null || s.length() == 0) {
            return "";
        }

        StringBuilder sb = new StringBuilder(s.length());
        Matcher m = HESC_PATTERN.matcher(s);
        int copiedTo = 0;

        while (m.find()) {
            // Group 1 is the optional x/X marker, group 2 the digits
            boolean isHex = m.group(1).length() > 0;
            char[] c = unescapeHtmlChar(m.group(2), isHex);

            // Text between the previous match and this one is copied verbatim
            sb.append(s, copiedTo, m.start());
            if (c != null) {
                sb.append(c);
                if (counters != null) {
                    counters.count(new String(c));
                }
            } else {
                // It failed, keep the original reference
                sb.append(s, m.start(), m.end());
            }
            copiedTo = m.end();
        }

        // Append terminal portion
        sb.append(s, copiedTo, s.length());
        return sb.toString();
    }

    /**
     * Unescape one HTML numeric character reference.
     *
     * @param s the digits from the HTML encoding, without the {@code &#}, the {@code x} or the semi-colon
     * @param isHex true if the digits are in hex
     * @return the Unicode code point as a char[] (two chars for a supplementary code point), or null if the digits do
     *         not parse or do not name a valid code point
     */
    @Nullable
    public static char[] unescapeHtmlChar(String s, boolean isHex) {
        final int num;
        try {
            num = Integer.parseInt(s, isHex ? 16 : 10);
        } catch (NumberFormatException ex) {
            logger.debug("Failed to parse char", ex);
            return null;
        }

        // Turn the number into a Unicode codepoint
        try {
            return Character.toChars(num);
        } catch (IllegalArgumentException ex) {
            logger.debug("Cannot toChars: not a valid Unicode code point", ex);
            return null;
        }
    }

    /**
     * Unescape HTML entities without counting what was changed.
     *
     * @param s the string to find entities in
     * @return the string with recognized entities replaced, empty when {@code s} is null or empty
     */
    public static String unescapeEntities(String s) {
        return unescapeEntities(s, null);
    }

    /**
     * Unescape HTML entities like {@code &nbsp;} into normal characters. Also handles broken entities like
     * {@code &;nbsp;} and {@code &nbsp} (extra semi-colon and missing semi-colon respectively). A name is only looked
     * up when it is terminated by a semi-colon, a space or the end of the string; a terminating space is kept in the
     * output. Unrecognized names are left in place.
     *
     * @param s the string to find entities in
     * @param counters to measure what was changed, may be null
     * @return the string with recognized entities replaced, empty when {@code s} is null or empty
     */
    public static String unescapeEntities(@Nullable String s, @Nullable CharacterCounterSet counters) {
        if (s == null || s.length() == 0) {
            return "";
        }

        final int slen = s.length();
        StringBuilder sb = new StringBuilder(slen);

        for (int i = 0; i < slen; i++) {
            char c = s.charAt(i);
            if (c != '&') {
                sb.append(c);
                continue;
            }

            int nameStart = i + 1;
            int nameEnd = scanEntityName(s, nameStart);
            if (nameEnd < slen && nameEnd == nameStart) {
                // broken case with extra semi-colon: skip the stray terminator and rescan
                nameStart++;
                nameEnd = scanEntityName(s, nameStart);
            }

            // Only a properly terminated name (or one running to the end of input) is a candidate
            boolean terminated = nameEnd >= slen || s.charAt(nameEnd) == ';' || s.charAt(nameEnd) == ' ';
            String val = terminated ? getValueForHtmlEntity(s.substring(nameStart, nameEnd)) : null;
            if (val == null) {
                // Not an entity we know, keep the ampersand and carry on with the next char
                sb.append(c);
                continue;
            }

            sb.append(val);
            if (counters != null) {
                counters.count(val);
            }
            // If a space was the terminator step back one so the space is still copied to the output
            i = (nameEnd < slen && s.charAt(nameEnd) == ' ') ? (nameEnd - 1) : nameEnd;
        }
        return sb.toString();
    }

    /**
     * Unescape HTML entities without measuring what was changed.
     *
     * @param s the bytes to find entities in
     * @return the bytes with recognized entities replaced, empty when {@code s} is null or empty
     */
    public static byte[] unescapeEntities(byte[] s) {
        return unescapeEntities(s, null);
    }

    /**
     * Unescape HTML entities like {@code &nbsp;} into normal characters. Also handles broken entities like
     * {@code &;nbsp;} and {@code &nbsp} (extra semi-colon and missing semi-colon respectively). Entity names are read
     * and replacement values are written using the platform default charset. A space that terminated an entity is
     * kept in the output; unrecognized names are left in place.
     *
     * @param s the bytes to find entities in
     * @param counters to measure what was changed, may be null
     * @return the bytes with recognized entities replaced, empty when {@code s} is null or empty
     */
    public static byte[] unescapeEntities(@Nullable byte[] s, @Nullable CharacterCounterSet counters) {
        if (s == null || s.length == 0) {
            return new byte[0];
        }

        final int slen = s.length;
        ByteArrayOutputStream baos = new ByteArrayOutputStream(slen);

        for (int i = 0; i < slen; i++) {
            // An ampersand that is the very last byte cannot start an entity. Any longer tail is
            // examined so that short entities such as "&lt;" at the end of the data are decoded.
            if (s[i] != '&' || i + 1 >= slen) {
                baos.write(s[i]);
                continue;
            }

            int nameStart = i + 1;
            int nameEnd = scanEntityName(s, nameStart);
            if (nameEnd == nameStart) {
                // broken case with extra semi-colon: skip the stray terminator and rescan
                nameStart++;
                nameEnd = scanEntityName(s, nameStart);
            }

            String val = getValueForHtmlEntity(new String(s, nameStart, nameEnd - nameStart));
            if (val == null) {
                baos.write(s[i]);
                continue;
            }

            byte[] decoded = val.getBytes();
            // write(byte[], int, int) declares no IOException, unlike write(byte[])
            baos.write(decoded, 0, decoded.length);
            if (counters != null) {
                counters.count(val);
            }
            // if we used the space as a terminator, keep the space in the output
            i = (nameEnd < slen && s[nameEnd] == ' ') ? (nameEnd - 1) : nameEnd;
        }
        return baos.toByteArray();
    }

    /**
     * Find the end of a candidate entity name in a string.
     *
     * @param s the text being scanned
     * @param from index of the first character of the name
     * @return index of the first semi-colon or space at or after {@code from}, or the scan limit (32 characters past
     *         {@code from}, or the end of the text) if neither is found first
     */
    private static int scanEntityName(String s, int from) {
        int limit = Math.min(s.length(), from + LONGEST_ENTITY_NAME - 1);
        int pos = from;
        while (pos < limit && s.charAt(pos) != ';' && s.charAt(pos) != ' ') {
            pos++;
        }
        return pos;
    }

    /**
     * Find the end of a candidate entity name in a byte array.
     *
     * @param s the bytes being scanned
     * @param from index of the first byte of the name
     * @return index of the first semi-colon or space at or after {@code from}, or the scan limit (32 bytes past
     *         {@code from}, or the end of the data) if neither is found first
     */
    private static int scanEntityName(byte[] s, int from) {
        int limit = Math.min(s.length, from + LONGEST_ENTITY_NAME - 1);
        int pos = from;
        while (pos < limit && s[pos] != ';' && s[pos] != ' ') {
            pos++;
        }
        return pos;
    }

    /**
     * Look up the replacement text for an entity name.
     *
     * @param entity the entity name without the ampersand or semi-colon
     * @return whatever the entity map returns for the name; callers treat null as "not an entity"
     */
    @Nullable
    private static String getValueForHtmlEntity(String entity) {
        return HTML_ENTITY_MAP.getValueForHtmlEntity(entity);
    }

    /** This class is not meant to be instantiated. */
    private HtmlEscape() {}

    /**
     * Command line driver: unescapes each named file and prints the result to standard output. A leading {@code -s}
     * argument switches from byte[] processing to String processing.
     *
     * @param args optional {@code -s} followed by the names of the files to process
     */
    @SuppressWarnings("SystemOut")
    public static void main(String[] args) {
        int i = 0;
        boolean useString = args.length > 0 && "-s".equals(args[0]);
        if (useString) {
            System.out.println("Switching to string mode");
            i++;
        }

        for (; i < args.length; i++) {
            byte[] content = Executrix.readDataFromFile(args[i]);
            if (content == null) {
                System.out.println(args[i] + ": Unreadable");
                continue;
            }

            System.out.println(args[i]);

            if (useString) {
                String escaped = HtmlEscape.unescapeHtml(new String(content));
                escaped = HtmlEscape.unescapeEntities(escaped);
                System.out.println(escaped);
            } else {
                byte[] escaped = HtmlEscape.unescapeHtml(content);
                escaped = HtmlEscape.unescapeEntities(escaped);
                System.out.write(escaped, 0, escaped.length);
                System.out.println();
            }
        }
    }
}