CharsetUtil.java

package emissary.util;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import javax.annotation.Nullable;

/**
 * A collection of utilities for dealing with different character sets in Java. Mainly with the aim of getting to UTF-8.
 * The j* routines generally take Java CharSet names while the non j* routines take derived charset names.
 *
 * This class contains an interpretation in Java of the GPL method isUTF8, available in C from
 * http://billposer.org/Software/unidesc.html and the copied routine is called LegalUTF8P in Get_UTF32_From_UTF8i.c
 *
 * Copyright (C) 2003-2006 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA or go to the web page:
 * http://www.gnu.org/licenses/gpl.txt.
 *
 * =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 *
 * This class contains the Apache Licensed isUnicodeString which is from Jakarta POI http://jakarta.apache.org/poi
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
 * to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
public class CharsetUtil {
    /**
     * Number of continuation bytes implied by each possible lead byte value (index is the unsigned byte). Values
     * 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3, 0xF8-0xFB to 4 and 0xFC-0xFF to 5. The last
     * two groups come from the historical 5 and 6 byte forms and are always rejected by {@link #isUtf8(byte[], int, int)}.
     */
    private static final int[] trailingBytesForUtF8 = buildTrailingBytesTable();

    @SuppressWarnings("unused")
    private static final long[] offsetsFromUtF8 = {0x00000000L, 0x00003080L, 0x000e2080L, 0x03c82080L, 0xfa082080L, 0x82082080L};

    // Our logger
    private static final Logger logger = LoggerFactory.getLogger(CharsetUtil.class);

    /**
     * Build the 256 entry lead-byte table used by the UTF-8 validator.
     *
     * @return table mapping an unsigned lead byte to the number of trailing bytes it announces
     */
    private static int[] buildTrailingBytesTable() {
        final int[] table = new int[256];
        // entries below 0xc0 keep the default value of 0
        for (int b = 0xc0; b < table.length; b++) {
            if (b < 0xe0) {
                table[b] = 1;
            } else if (b < 0xf0) {
                table[b] = 2;
            } else if (b < 0xf8) {
                table[b] = 3;
            } else if (b < 0xfc) {
                table[b] = 4;
            } else {
                table[b] = 5;
            }
        }
        return table;
    }

    /**
     * Decode a range of the input bytes into chars, translating the <em>derived</em> charset name to a Java charset
     * name first when a mapping exists.
     *
     * @param byteArray the input bytes
     * @param charSet <em>derived</em> charSet of the input array, used as-is when no Java mapping is found
     * @param start index into input array to start copying
     * @param end index into input array to stop copying (exclusive), or -1 for the end of the array
     * @return the decoded chars, or null when byteArray is null
     * @throws ArrayIndexOutOfBoundsException if start is outside the array or the range is empty
     */
    @Nullable
    public static char[] getUtfCharArray(final byte[] byteArray, final String charSet, final int start, final int end) {
        String actualCharSet = charSet;
        if (actualCharSet != null) {
            final String jcs = JavaCharSet.get(actualCharSet);
            if (jcs != null) {
                actualCharSet = jcs;
            }
        }
        return jGetUtfCharArray(byteArray, actualCharSet, start, end);
    }

    /**
     * Decode a range of the input bytes into chars using a Java charset name. When the charset is null or not
     * supported each byte is widened to the char with the same unsigned value instead.
     *
     * @param byteArray the input bytes
     * @param charSet <em>JAVA</em> charSet of the input array
     * @param start byte index into input array to start copying
     * @param end byte index into input array to stop copying (exclusive); -1 or a value past the array means the end
     * @return the decoded chars, or null when byteArray is null
     * @throws ArrayIndexOutOfBoundsException if start is outside the array or the range is empty
     */
    @Nullable
    public static char[] jGetUtfCharArray(final byte[] byteArray, @Nullable final String charSet, final int start, final int end) {
        if (byteArray == null) {
            return null;
        }

        // Check start value
        if (start > byteArray.length || start < 0) {
            throw new ArrayIndexOutOfBoundsException("start : " + start + ", actual length " + byteArray.length);
        }

        // Check end value
        final int actualEnd = ((end == -1) || (end > byteArray.length)) ? byteArray.length : end;
        if (actualEnd <= start) {
            throw new ArrayIndexOutOfBoundsException("start : " + start + ", end : " + actualEnd + ", actual length " + byteArray.length);
        }

        if (charSet != null) {
            try {
                return new String(byteArray, start, actualEnd - start, charSet).toCharArray();
            } catch (UnsupportedEncodingException uee) {
                logger.warn("Unable to convert to {}", charSet);
                // fall through to the byte-widening conversion below
            }
        }

        // No usable charset: convert from byte array to char array one byte at a time
        return byteToCharArray(byteArray, start, actualEnd);
    }

    /**
     * Re-decode a string whose chars each hold one raw byte value: the string is turned back into bytes with
     * ISO-8859-1 and those bytes are decoded with the requested charset.
     *
     * @param s the input string
     * @param charSet the JAVA charset to decode with
     * @return the re-decoded string, or the input string unchanged when the charset is not supported
     */
    public static String getUtfString(final String s, final String charSet) {
        try {
            return new String(s.getBytes(StandardCharsets.ISO_8859_1), charSet);
        } catch (UnsupportedEncodingException uue) {
            logger.warn("Unable to convert to {}", charSet);
        }
        return s;
    }

    /**
     * Get a string in the specified encoding
     *
     * @param data input bytes
     * @param charSet the JAVA charset
     * @return the decoded string or null if the charset is not supported
     */
    @Nullable
    public static String getUtfString(final byte[] data, final String charSet) {
        try {
            return new String(data, charSet);
        } catch (UnsupportedEncodingException uue) {
            logger.warn("Unable to convert to {}", charSet);
        }
        return null;
    }

    /**
     * Convert bytes to chars using platform default encoding
     *
     * @param bArray the input data
     * @return the decoded chars
     */
    public static char[] byteToCharArray(final byte[] bArray) {
        return new String(bArray).toCharArray();
    }

    /**
     * Convert a range of bytes to chars by widening each byte to the char with the same unsigned value (0-255). No
     * charset decoding is involved.
     *
     * @param bArray the input data
     * @param start byte index into input to start copy
     * @param end byte index into input to end copy (exclusive) or -1 for end; values past the array are clamped
     * @return a new array holding one char per byte of the range
     */
    private static char[] byteToCharArray(final byte[] bArray, final int start, final int end) {
        final int stop = (end == -1 || end > bArray.length) ? bArray.length : end;
        final char[] cArray = new char[stop - start];

        // the destination is indexed from 0 while the source is indexed from start
        for (int i = 0; i < cArray.length; i++) {
            cArray[i] = (char) (0xFF & bArray[start + i]);
        }
        return cArray;
    }

    /**
     * test for ascii ness. Only chars in the range 1-127 are accepted, so a string containing NUL is not ascii.
     *
     * @param s string to test
     * @return true if string is ascii, false if it is not or if it is null
     */
    public static boolean isAscii(@Nullable final String s) {
        if (s == null) {
            return false;
        }
        final int len = s.length();
        for (int i = 0; i < len; i++) {
            final char c = s.charAt(i);
            if (c <= 0 || c > 127) {
                return false;
            }
        }
        return true;
    }

    /**
     * Do the bytes behind this string represent valid utf8? The string is encoded with UTF-8 and the resulting bytes
     * are validated.
     *
     * @param s string to test
     * @return true if string is utf8
     */
    public static boolean isUtf8(final String s) {
        return isUtf8(s.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * do these bytes represent a valid utf8 string?
     *
     * @param data the bytes to check
     * @return true if valid utf8
     */
    public static boolean isUtf8(final byte[] data) {
        return isUtf8(data, 0, data.length);
    }

    /**
     * Check for valid utf8 data. Borrowed from the unidesc package (GPL) by Bill Poser, converted from C to Java. The check
     * runs from offs to dlen-1 and never looks at bytes outside that range: a multi-byte sequence that is cut off by
     * dlen makes the range invalid. Overlong forms, encoded surrogates (lead 0xED followed by 0xA0-0xBF), values above
     * U+10FFFF and the obsolete 5 and 6 byte forms are all rejected.
     *
     * @param data the bytes to check for validity
     * @param offs beginning offset to check
     * @param dlen ending offset of the range (exclusive)
     * @return true if valid utf8 (an empty range is valid); false if invalid or if the range lies outside the array
     */
    public static boolean isUtf8(final byte[] data, final int offs, final int dlen) {
        if (offs >= dlen) {
            // nothing to check
            return true;
        }
        if (offs < 0 || dlen > data.length) {
            return false;
        }

        int pos = offs;
        while (pos < dlen) {
            final int lead = data[pos] & 0xff;
            final int len = trailingBytesForUtF8[lead] + 1;

            // len > dlen - pos means the sequence is truncated by the end of the range
            if (len > 4 || len > dlen - pos || !isLegalUtf8Sequence(data, pos, lead, len)) {
                return false;
            }
            pos += len;
        }
        return true;
    }

    /**
     * Validate one UTF-8 sequence of 1 to 4 bytes that is known to fit inside the array.
     *
     * @param data the bytes holding the sequence
     * @param pos index of the lead byte
     * @param lead unsigned value of the lead byte
     * @param len total sequence length announced by the lead byte (1-4)
     * @return true if the sequence is well formed
     */
    private static boolean isLegalUtf8Sequence(final byte[] data, final int pos, final int lead, final int len) {
        if (len == 1) {
            // single byte: 0x80-0xbf are continuation bytes and cannot start a sequence
            return lead < 0x80;
        }

        // 0xc0 and 0xc1 can only start overlong forms, anything above 0xf4 is past U+10FFFF
        if (lead < 0xc2 || lead > 0xf4) {
            return false;
        }

        // The allowed range of the second byte is narrowed for a few lead bytes
        int low = 0x80;
        int high = 0xbf;
        switch (lead) {
            case 0xe0:
                low = 0xa0; // below this is an overlong 3 byte form
                break;
            case 0xed:
                high = 0x9f; // above this is a UTF-16 surrogate
                break;
            case 0xf0:
                low = 0x90; // below this is an overlong 4 byte form
                break;
            case 0xf4:
                high = 0x8f; // above this is past U+10FFFF
                break;
            default:
                break;
        }

        final int second = data[pos + 1] & 0xff;
        if (second < low || second > high) {
            return false;
        }

        // any remaining bytes must be plain continuation bytes
        for (int i = pos + 2; i < pos + len; i++) {
            final int b = data[i] & 0xff;
            if (b < 0x80 || b > 0xbf) {
                return false;
            }
        }
        return true;
    }

    /**
     * See if string has multibyte chars (No longer based on org.apache.poi.util.StringUtil) It would be a bad idea to call
     * this with a very large string because the whole string is encoded to a byte array. The byte count is taken from
     * the UTF-8 encoding so the answer does not depend on the platform default charset.
     *
     * @param value string to test
     * @return true if string has at least one multibyte char, false otherwise or when value is null
     */
    public static boolean hasMultibyte(@Nullable final String value) {
        if (value == null) {
            return false;
        }
        final int cpc = value.codePointCount(0, value.length());
        final int bc = value.getBytes(StandardCharsets.UTF_8).length;
        return cpc != bc;
    }

    /** This class is not meant to be instantiated. */
    private CharsetUtil() {}
}