Entropy.java

package emissary.util;

public class Entropy {

    /**
     * Determine if the data is text or binary using an entropy based function.
     */
    public static boolean checkText(final byte[] data) {
        return checkText(data, data.length);
    }

    public static boolean checkText(final byte[] data, int length) {
        int bytes = 0;
        double entropy = 0;
        double relativeFreq;
        int size = Math.min(length, data.length);
        int[] histogramArray = new int[256];


        // ******************************************************************
        // ****************** scan the document ******************
        // ******************************************************************
        for (int curPos = 0; curPos < size; ++curPos) {
            ++histogramArray[data[curPos] & 0xff];
            ++bytes;
        }

        // ***********************************************************************
        // ***** Use relative freqs. to estimate the entropy of the file. ******
        // ***********************************************************************

        for (int i = 0; i < 256; ++i) {
            if (histogramArray[i] != 0 && bytes > 0) {
                relativeFreq = (double) histogramArray[i] / (double) bytes;
                entropy = entropy + (relativeFreq * Math.log(1 / relativeFreq));
            }
        }

        entropy = entropy / Math.log(2);

        return entropy < 6.0;
    }

    /** This class is not meant to be instantiated. */
    private Entropy() {}
}