ByteUtil.java

package emissary.util;

import jakarta.annotation.Nullable;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;

/**
 * Common place for the logic to glue byte arrays back together. This is error-prone and shouldn't be thought about any
 * more than necessary
 */
public class ByteUtil {
    public static final byte ASCII_0 = '0';
    public static final byte ASCII_9 = '9';
    public static final byte ASCII_A_LC = 'a';
    public static final byte ASCII_B_LC = 'b';
    public static final byte ASCII_F_LC = 'f';
    public static final byte ASCII_Z_LC = 'z';
    public static final byte ASCII_A_UC = 'A';
    public static final byte ASCII_F_UC = 'F';
    public static final byte ASCII_Z_UC = 'Z';
    public static final byte ASCII_SLASH = '/';
    public static final byte ASCII_ESC = 0x1b;
    public static final byte ASCII_SP = 0x20;
    public static final byte ASCII_DEL = 0x7f;
    public static final String HEX = "0123456789abcdefABCDEF";

    /**
     * Check if byte is hexadecimal
     *
     * @param b a byte
     * @return true if b is a hexadecimal
     */
    public static boolean isHexadecimal(byte b) {
        return (b >= ASCII_A_UC && b <= ASCII_F_UC) || (b >= ASCII_A_LC && b <= ASCII_F_LC) || isDigit(b);
    }

    /**
     * Check if all bytes in array are hexadecimal
     *
     * @param array a byte array
     * @return true if all bytes in array are hexadecimal
     */
    public static boolean isHexadecimal(byte[] array) {
        for (byte b : array) {
            if (!isHexadecimal(b)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if character is hexadecimal
     *
     * @param c a char
     * @return true if c is a hexadecimal
     */
    public static boolean isHexadecimal(char c) {
        return HEX.indexOf(c) > -1;
    }

    /**
     * Check if all bytes are alphabetical
     *
     * @param array a byte array
     * @return true if all bytes in array are alpha
     */
    public static boolean isAlpha(byte[] array) {
        for (byte b : array) {
            if (!isAlpha(b)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if byte is alphabetical
     *
     * @param b a byte
     * @return true if b is alphabetical
     */
    public static boolean isAlpha(byte b) {
        return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z');
    }

    /**
     * Check if byte is alphanumeric
     *
     * @param b a byte
     * @return true if b is alphanumeric
     */
    public static boolean isAlNum(byte b) {
        return isAlpha(b) || isDigit(b);
    }

    /**
     * Check if byte is a digit
     *
     * @param b a byte
     * @return true if b is a digit
     */
    public static boolean isDigit(byte b) {
        // check ascii value of b for digit-ness
        return b >= '0' && b <= '9';
    }

    /**
     * Check if all bytes are digits
     *
     * @param array a byte array
     * @return true if all bytes in array are digits
     */
    public static boolean isDigit(byte[] array) {
        for (byte b : array) {
            if (!isDigit(b)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if byte at position in array is a control or blank space byte
     *
     * @param b a byte array
     * @param pos a position in the byte array
     * @return true if byte at pos in array b is a control or blank space byte
     */
    public static boolean isControlOrBlankSpace(byte[] b, int pos) {
        if (b[pos] == ASCII_DEL || b[pos] <= ASCII_SP) {
            return true;
        }
        if (b[pos] == ASCII_B_LC && pos > 0 && b[pos - 1] == ASCII_ESC) {
            return true;
        }

        // Check if the current pos is the first byte in a UTF-8 C1
        // control character (U+0080..U+009f).
        final int curr = b[pos] & 0xff;
        final int next = (pos < (b.length - 1)) ? (b[pos + 1] & 0xff) : -1;
        if ((curr == 0xc2) && (next >= 0x80) && (next <= 0x9f)) {
            return true;
        }

        // Check if the current pos is the second byte in a UTF-8 C1
        // control character (U+0080..U+009f).
        final int prev = (pos > 0) ? (b[pos - 1] & 0xff) : -1;
        return (prev == 0xc2) && (curr >= 0x80) && (curr <= 0x9f);
    }

    /**
     * Glue two byte arrays together into one
     * 
     * @param a the first byte array
     * @param b the second byte array
     * @return the whole
     */
    public static byte[] glue(@Nullable byte[] a, @Nullable byte[] b) {
        if (a == null) {
            return b;
        }
        if (b == null) {
            return a;
        }
        return glue(a, 0, a.length - 1, b, 0, b.length - 1);
    }

    /**
     * Glue three byte arrays together into one
     * 
     * @param a the first byte array
     * @param b the second byte array
     * @param c the third byte array
     * @return the whole
     */
    public static byte[] glue(@Nullable byte[] a, @Nullable byte[] b, @Nullable byte[] c) {
        if (a == null) {
            return glue(b, c);
        }
        if (b == null) {
            return glue(a, c);
        }
        if (c == null) {
            return glue(a, b);
        }
        return glue(a, 0, a.length - 1, b, 0, b.length - 1, c, 0, c.length - 1);
    }

    /**
     * Glue two byte arrays together into one
     * 
     * @param a the first byte array
     * @param astart starting position in a
     * @param aend ending position in a
     * @param b the second byte array
     * @param bstart starting position in b
     * @param bend ending position in b
     * @return the whole
     */
    @SuppressWarnings("InconsistentOverloads")
    public static byte[] glue(byte[] a, int astart, int aend, byte[] b, int bstart, int bend) {
        int alen = aend - astart + 1;
        int blen = bend - bstart + 1;

        byte[] rslt = new byte[alen + blen];
        System.arraycopy(a, astart, rslt, 0, alen);
        System.arraycopy(b, bstart, rslt, alen, blen);
        return rslt;
    }

    /**
     * Glue three byte arrays together into one
     * 
     * @param a the first byte array
     * @param astart starting position in a
     * @param aend ending position in a
     * @param b the second byte array
     * @param bstart starting position in b
     * @param bend ending position in b
     * @param c the third byte array
     * @param cstart starting position in c
     * @param cend ending position in c
     * @return the whole
     */
    @SuppressWarnings("InconsistentOverloads")
    public static byte[] glue(byte[] a, int astart, int aend, byte[] b, int bstart, int bend, byte[] c, int cstart, int cend) {
        int alen = aend - astart + 1;
        int blen = bend - bstart + 1;
        int clen = cend - cstart + 1;

        byte[] rslt = new byte[alen + blen + clen];
        System.arraycopy(a, astart, rslt, 0, alen);
        System.arraycopy(b, bstart, rslt, alen, blen);
        System.arraycopy(c, cstart, rslt, alen + blen, clen);
        return rslt;
    }

    /**
     * Split a byte array at the specified position
     * 
     * @param a the byte array
     * @param pos the split position (a[pos] goes to the second part)
     */
    public static List<byte[]> split(@Nullable byte[] a, int pos) {
        List<byte[]> list = new ArrayList<>();
        if (a != null && pos > 0 && pos <= a.length) {
            byte[] part1 = new byte[pos];
            byte[] part2 = new byte[a.length - pos];
            System.arraycopy(a, 0, part1, 0, pos);
            System.arraycopy(a, pos, part2, 0, part2.length);
            list.add(part1);
            list.add(part2);
        } else {
            // Just give back the original
            list.add(a);
        }
        return list;
    }

    /**
     * Given a byte-array and a start offset, return a string of the bytes between the start position and a carriage return
     * byte. In essence, this is grabbing a line of input where the byte array is composed of several lines of input.
     * 
     * @param data The byte array of input data.
     * @param pos The initial start offset.
     * @return A string created from the bytes found from the start offset to the carriage return byte.
     */
    public static String grabLine(byte[] data, int pos) {
        String ret = null;
        int eolnPos = -1;
        for (int i = pos; i < data.length; i++) {
            if (data[i] == '\n') {
                eolnPos = i;
                break;
            }
        }
        if (eolnPos != -1) {
            // String up to the found \n pos
            ret = new String(data, pos, eolnPos - pos + 1);
        } else {
            // String to end of buffer
            ret = new String(data, pos, data.length - pos);
        }
        return ret;
    }

    /**
     * Scans a byte array looking for non-printable values.
     * 
     * @param bytes the bytes to be scanned.
     * @return whether or not there were non-printable values.
     */
    public static boolean hasNonPrintableValues(final byte[] bytes) {
        boolean badCharacters = false;

        for (byte aByte : bytes) {
            if (aByte < 9 || (aByte > 13 && aByte < 32)) {
                badCharacters = true;
                break;
            }
        }

        return badCharacters;
    }

    /**
     * Creates a hex string of a sha256 hash for a byte[].
     * 
     * @param bytes to be hashed
     * @return the hex string of a sha256 hash of the bytes.
     */
    @Nullable
    public static String sha256Bytes(final byte[] bytes) {
        try {
            final MessageDigest md = MessageDigest.getInstance("SHA-256");
            final byte[] hash = md.digest(bytes);

            final StringBuilder hexString = new StringBuilder(2 * hash.length);
            for (byte b : hash) {
                final String hex = Integer.toHexString(0xff & b);

                if (hex.length() == 1) {
                    hexString.append('0');
                }
                hexString.append(hex);
            }
            return hexString.toString();
        } catch (NoSuchAlgorithmException e) {
            return null;
        }
    }

    /**
     * Check if the bytes contains a non-indexable characters
     *
     * @param utf8Bytes the bytes to be scanned
     * @return whether there were non-indexable characters
     */
    public static boolean containsNonIndexableBytes(final byte[] utf8Bytes) {
        // Wrap the byte array in a ByteArrayInputStream
        final InputStream inputStream = new ByteArrayInputStream(utf8Bytes);
        return containsNonIndexableBytes(inputStream);
    }

    /**
     * Check if the input stream contains a non-indexable characters
     *
     * @param inputStream the input stream to be scanned
     * @return whether there were non-indexable characters
     */
    public static boolean containsNonIndexableBytes(final InputStream inputStream) {
        // Create an InputStreamReader to read the bytes as characters
        try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
            int codePoint;
            // Use the read() method of the InputStreamReader to read code points. The read() method automatically handles
            // surrogate pairs, returning a single code point even for characters represented by multiple code units.
            while ((codePoint = reader.read()) != -1) {
                // Check if the code point is indexable
                if (isNotIndexable(codePoint)) {
                    return true;
                }
            }
            return false;
        } catch (IOException e) {
            return true;
        }
    }

    /**
     * Check if the code point is a control character or surrogate pair
     * <a href="https://en.wikipedia.org/wiki/Unicode_block">Unicode Block</a>
     * <a href="https://www.unicode.org/charts/PDF/U0000.pdf">U0000</a>
     * <a href="https://www.unicode.org/charts/PDF/U2000.pdf">U2000</a>
     * <a href="https://www.unicode.org/charts/PDF/U3000.pdf">U3000</a>
     * <a href="https://www.unicode.org/charts/PDF/UFE70.pdf">UFE70</a>
     * <a href="https://www.unicode.org/charts/PDF/UFFF0.pdf">UFFF0</a>
     *
     * @param codepoint numerical value that maps to a specific character to check
     * @return if code-point is a valid text character
     */
    private static boolean isNotIndexable(final int codepoint) {
        return ('\u0000' <= codepoint && codepoint <= '\u0008')
                || ('\u000E' <= codepoint && codepoint <= '\u001F')
                || ('\u007F' <= codepoint && codepoint <= '\u009F')
                || ('\u2000' <= codepoint && codepoint <= '\u200F')
                || ('\u2028' <= codepoint && codepoint <= '\u202F')
                || ('\u205F' <= codepoint && codepoint <= '\u206F')
                || codepoint == '\u3000'
                || codepoint == '\uFEFF'
                || codepoint == '\uFFFD';
    }

    /** This class is not meant to be instantiated. */
    private ByteUtil() {}
}