// HtmlEscapePlace.java

/***********************************************************
 * This place transforms HTML escapes of the form &#xxxx;
 * into normal Unicode characters (UTF-8 encoded)
 **/

package emissary.transform;

import emissary.core.Form;
import emissary.core.IBaseDataObject;
import emissary.place.ServiceProviderPlace;
import emissary.transform.decode.HtmlEscape;
import emissary.util.CharacterCounterSet;
import emissary.util.DataUtil;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import javax.annotation.Nullable;

import static emissary.core.Form.HTML;
import static emissary.core.Form.PREFIXES_LANG;
import static emissary.core.Form.SUFFIXES_HTMLESC;
import static emissary.core.Form.TEXT;
import static emissary.core.constants.Configurations.OUTPUT_FORM;
import static emissary.core.constants.Parameters.DOCUMENT_TITLE;
import static emissary.core.constants.Parameters.SUFFIXES_HTML_ESCAPE;
import static emissary.core.constants.Parameters.SUMMARY;

/**
 * Place that runs {@link HtmlEscape#unescapeHtml} followed by {@link HtmlEscape#unescapeEntities} over the primary
 * data, any alternate views whose name starts with {@code TEXT}, and the summary and document title parameters of a
 * payload. It also strips the HTML-escape suffix from the font encoding and from the first matching language form.
 */
@Deprecated
public class HtmlEscapePlace extends ServiceProviderPlace {

    /** Parameter recording the absolute difference in byte length between the original and decoded data. */
    private static final String VARIANCE_PARAMETER = "HTML_Entity_Decode_Variance";

    /** Marker that must be present in a string parameter before it is decoded. */
    private static final String NUMERIC_ESCAPE_PREFIX = "&#";

    /**
     * Form to set on successfully decoded payloads. Read from the {@code OUTPUT_FORM} configuration entry; when
     * {@code null} the current form is left untouched.
     */
    @Nullable
    private String outputForm = null;

    /**
     * The remote constructor
     *
     * @param cfgInfo configuration location passed to the superclass
     * @param dir directory passed to the superclass
     * @param placeLoc place location passed to the superclass
     * @throws IOException if the superclass cannot be set up
     */
    public HtmlEscapePlace(String cfgInfo, String dir, String placeLoc) throws IOException {
        super(cfgInfo, dir, placeLoc);
        configurePlace();
    }

    /**
     * The test constructor
     *
     * @param cfgInfo configuration location passed to the superclass
     * @throws IOException if the superclass cannot be set up
     */
    public HtmlEscapePlace(String cfgInfo) throws IOException {
        super(cfgInfo, "TestHtmlEscapePlace.example.com:8001");
        configurePlace();
    }

    /**
     * Create with the default configuration
     *
     * @throws IOException if the superclass cannot be set up
     */
    public HtmlEscapePlace() throws IOException {
        super();
        configurePlace();
    }

    /**
     * Take care of special place configuration: reads the optional output form and touches {@link HtmlEscape} once so
     * its static state is loaded before the first payload arrives.
     */
    protected void configurePlace() {
        outputForm = configG.findStringEntry(OUTPUT_FORM, null);

        // Force statics to load
        HtmlEscape.unescapeHtml(new byte[0]);
    }

    /**
     * Decode the primary data of the payload in place, then decode its TEXT alternate views, summary and title, and
     * clean the HTML-escape suffix from the encoding and current forms.
     * <p>
     * If decoding the primary data yields nothing, {@link Form#ERROR} is pushed onto the payload; the remaining steps
     * still run.
     *
     * @param d the payload to process; an empty payload is ignored
     */
    @Override
    public void process(IBaseDataObject d) {
        if (DataUtil.isEmpty(d)) {
            logger.debug("empty data");
            return;
        }
        String incomingForm = d.currentForm();
        logger.debug("Just got a payload with form {}", incomingForm);

        CharacterCounterSet counters = new CharacterCounterSet();
        byte[] newData = HtmlEscape.unescapeHtml(d.data(), counters);

        if (ArrayUtils.isNotEmpty(newData)) {
            newData = HtmlEscape.unescapeEntities(newData, counters);
            if (outputForm != null) {
                d.setCurrentForm(outputForm);
            }
            // Track how much change in size there was (measured before the data is replaced)
            int variance = Math.abs(d.dataLength() - newData.length);
            d.setParameter(VARIANCE_PARAMETER, Integer.toString(variance));
            d.setData(newData);
            d.setFileTypeIfEmpty(HTML);

            for (String key : counters.getKeys()) {
                d.putParameter(key + SUFFIXES_HTML_ESCAPE, Integer.toString(counters.get(key)));
            }
        } else {
            logger.warn("error doing HtmlEscape, unable to decode");
            d.pushCurrentForm(Form.ERROR);
        }

        unescapeAltViews(d);
        unescapeSummary(d);
        unescapeDocTitle(d);
        processEncoding(d);
        processCurrentForms(d);
        nukeMyProxies(d);
    }

    /**
     * Decode every non-empty alternate view whose name starts with {@code TEXT}, replacing the view under the same
     * name. A view is left as-is when either decoding step produces an empty result.
     *
     * @param d the payload whose alternate views are decoded
     */
    protected void unescapeAltViews(IBaseDataObject d) {
        for (String viewName : d.getAlternateViewNames()) {
            if (!viewName.startsWith(TEXT)) {
                continue;
            }
            byte[] textView = d.getAlternateView(viewName);
            if (ArrayUtils.isEmpty(textView)) {
                continue;
            }
            byte[] decoded = HtmlEscape.unescapeHtml(textView);
            if (ArrayUtils.isEmpty(decoded)) {
                continue;
            }
            decoded = HtmlEscape.unescapeEntities(decoded);
            if (ArrayUtils.isNotEmpty(decoded)) {
                d.addAlternateView(viewName, decoded);
            }
        }
    }

    /**
     * Decode the {@code SUMMARY} parameter if it contains {@code &#}.
     *
     * @param d the payload whose summary is decoded
     */
    protected void unescapeSummary(IBaseDataObject d) {
        unescapeStringParameter(d, SUMMARY, "Working on summary ");
    }

    /**
     * Decode the {@code DOCUMENT_TITLE} parameter if it contains {@code &#}.
     *
     * @param d the payload whose title is decoded
     */
    protected void unescapeDocTitle(IBaseDataObject d) {
        unescapeStringParameter(d, DOCUMENT_TITLE, "Working on title ");
        logger.debug("Retrieved new title ");
    }

    /**
     * Decode a single string parameter in place. The parameter is only rewritten when it contains {@code &#} and the
     * decoded value is not blank.
     *
     * @param d the payload holding the parameter
     * @param paramName name of the parameter to decode
     * @param debugMessage message logged at debug level before decoding starts
     */
    private void unescapeStringParameter(IBaseDataObject d, String paramName, String debugMessage) {
        String value = d.getStringParameter(paramName);
        if (!StringUtils.contains(value, NUMERIC_ESCAPE_PREFIX)) {
            return;
        }
        logger.debug(debugMessage);

        // Encode explicitly as UTF-8: makeString decodes as UTF-8, so the platform default charset must not be used
        byte[] decoded = HtmlEscape.unescapeHtml(value.getBytes(StandardCharsets.UTF_8));
        if (ArrayUtils.isEmpty(decoded)) {
            // Nothing usable came back; also keeps a null result away from makeString
            return;
        }

        String s = makeString(decoded);
        if (StringUtils.isNotBlank(s)) {
            // Decode entities before deleting so a failure here cannot drop the existing value
            s = HtmlEscape.unescapeEntities(s);
            d.deleteParameter(paramName);
            d.putParameter(paramName, s);
        }
    }

    /**
     * Remove the first occurrence of the HTML-escape suffix from the font encoding, if present.
     *
     * @param d the payload whose font encoding is cleaned
     */
    protected void processEncoding(IBaseDataObject d) {
        String enc = d.getFontEncoding();
        if (StringUtils.contains(enc, SUFFIXES_HTMLESC)) {
            // Literal replacement, matching the literal contains() check above
            d.setFontEncoding(StringUtils.replaceOnce(enc, SUFFIXES_HTMLESC, ""));
        }
    }

    /**
     * Find the first current form containing both the language prefix and the HTML-escape suffix and replace it, at
     * the same position, with the form minus the first occurrence of that suffix. Only the first match is handled.
     *
     * @param d the payload whose current forms are cleaned
     */
    protected void processCurrentForms(IBaseDataObject d) {
        for (String cf : d.getAllCurrentForms()) {
            if (cf.contains(PREFIXES_LANG) && cf.contains(SUFFIXES_HTMLESC)) {
                // Remember the old position so the cleaned form goes back in the same slot
                int pos = d.searchCurrentForm(cf);
                d.deleteCurrentForm(cf);
                // Literal replacement, matching the literal contains() check above
                d.addCurrentFormAt(pos, StringUtils.replaceOnce(cf, SUFFIXES_HTMLESC, ""));
                break;
            }
        }
    }

    /**
     * Decode a byte array as a UTF-8 string.
     *
     * @param s the bytes to decode
     * @return the decoded string
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String makeString(byte[] s) {
        return new String(s, StandardCharsets.UTF_8);
    }

}