FlexibleDateTimeParser.java

package emissary.util;

import emissary.config.ConfigEntry;
import emissary.config.ConfigUtil;
import emissary.config.Configurator;

import jakarta.annotation.Nullable;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException;
import java.time.temporal.TemporalAccessor;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.stream.Collectors.toList;

/**
 * Attempt to parse a date in an unknown format. This will loop through a set of configured formats and convert it into
 * a {@link ZonedDateTime}.
 * <p>
 * Other parsing libs:
 * <p>
 * Natty - It handled a good chunk of the formats but not all.
 */
public final class FlexibleDateTimeParser {

    /* Logger */
    private static final Logger logger = LoggerFactory.getLogger(FlexibleDateTimeParser.class);

    /* Configuration Variables */
    private static final String CFG_FORMAT_MAIN = "FORMAT_DATETIME_MAIN";
    private static final String CFG_FORMAT_EXTRA = "FORMAT_DATETIME_EXTRA";
    private static final String CFG_TIMEZONE = "TIMEZONE";
    private static final String CFG_REMOVE_REGEX = "REMOVE_REGEX";
    private static final String CFG_EXTRA_TEXT_REMOVE_REGEX = "EXTRA_TEXT_REMOVE_REGEX";
    private static final String DEFAULT_TIMEZONE = "GMT";
    private static final String SPACE = " ";
    private static final String EMPTY = "";

    /* Remove all tabs and extra spaces */
    private static final Pattern REPLACE = Pattern.compile("\t+|[ ]+", Pattern.DOTALL);

    /*
     * Remove other junk -- anything in an html tag, all parenthesis and quotes, and any non-word characters at the
     * beginning or end
     */
    private static final Pattern remove;

    /*
     * This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time
     * zone offset
     */
    private static final Pattern extraTextRemove;

    /* timezone - config var: TIMEZONE */
    private static final ZoneId timezone;

    /* date time formats - vars: FORMAT_DATETIME_MAIN */
    private static final List<DateTimeFormatter> dateFormatsMain;

    /* Extra date time formats - list to try if our main list has failed - vars: FORMAT_DATETIME_EXTRA */
    private static final List<DateTimeFormatter> dateFormatsExtra;

    /* init */
    static {
        try {
            // fire up the configurator
            Configurator configurator = ConfigUtil.getConfigInfo(FlexibleDateTimeParser.class);
            timezone = setupTimezone(configurator.findStringEntry(CFG_TIMEZONE, DEFAULT_TIMEZONE));

            List<ConfigEntry> configEntriesMain = configurator.findStringMatchEntries(CFG_FORMAT_MAIN);
            dateFormatsMain = setupDateFormats(configEntriesMain, getConfigFormats(configEntriesMain));

            List<ConfigEntry> configEntriesExtra = configurator.findStringMatchEntries(CFG_FORMAT_EXTRA);
            dateFormatsExtra = setupDateFormats(configEntriesExtra, getConfigFormats(configEntriesExtra));

            String removeRegex = configurator.findStringEntry(CFG_REMOVE_REGEX, "<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+");
            remove = Pattern.compile(removeRegex, Pattern.DOTALL);

            // last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric timezone offset
            String extraTextRemoveRegex = configurator.findStringEntry(CFG_EXTRA_TEXT_REMOVE_REGEX, "((\\+|-)\\d{4}).*$");
            extraTextRemove = Pattern.compile(extraTextRemoveRegex);
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not configure parser!!", e);
        }
    }

    /**
     * Get the default timezone id for the application
     *
     * @return the configured immutable and thread-safe zone id
     */
    public static ZoneId getTimezone() {
        return timezone;
    }

    /**
     * Attempts to parse a string date using pre-configured patterns. Default not trying the extensive date/time format list
     *
     * @param dateString the string to parse
     * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
     */
    public static ZonedDateTime parse(final String dateString) {
        return parse(dateString, false);
    }

    /**
     * Attempts to parse a string date using pre-configured patterns
     *
     * @param dateString the string to parse
     * @param tryExtensiveParsing True if we want to try out complete list of date/time formats False if we only want to
     *        attempt the most common date/time formats
     * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
     */
    public static ZonedDateTime parse(final String dateString, boolean tryExtensiveParsing) {
        ZonedDateTime zdt = parseToZonedDateTime(dateString, tryExtensiveParsing);

        if (zdt != null || !tryExtensiveParsing) {
            return zdt;
        } else {
            // if that all failed and we want to attempt extensive parsing, attempt the last ditch efforts we can try
            return lastDitchParsingEffort(dateString);
        }
    }

    /**
     * Attempts to parse a string date
     *
     * @param dateString the string to parse
     * @param format the date/time formats to use
     * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
     */
    public static ZonedDateTime parse(final String dateString, final DateTimeFormatter format) {
        return parse(dateString, Collections.singletonList(format));
    }

    /**
     * Attempts to parse a string date
     *
     * @param dateString the string to parse
     * @param formats the date/time formats to use
     * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
     */
    @Nullable
    public static ZonedDateTime parse(final String dateString, final List<DateTimeFormatter> formats) {
        String cleanedDateString = cleanDateString(dateString);

        if (StringUtils.isBlank(cleanedDateString) || CollectionUtils.isEmpty(formats)) {
            return null;
        }

        for (DateTimeFormatter formatter : formats) {
            if (formatter == null) {
                continue;
            }

            try {
                // try for a zoned date (has timezone), local date time (no time zone), or just a local date (no time)
                TemporalAccessor accessor =
                        formatter.parseBest(cleanedDateString, ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from, LocalDate::from);
                if (accessor instanceof ZonedDateTime) {
                    return (ZonedDateTime) accessor; // return the date time w/ timezone
                } else if (accessor instanceof OffsetDateTime) {
                    return ((OffsetDateTime) accessor).atZoneSameInstant(timezone);
                } else if (accessor instanceof LocalDateTime) {
                    return ((LocalDateTime) accessor).atZone(timezone); // set the timezone
                } else if (accessor instanceof LocalDate) {
                    return ((LocalDate) accessor).atStartOfDay(timezone); // add zeroed out time
                }

            } catch (NullPointerException | IllegalArgumentException | DateTimeParseException e) {
                // Ignore b/c failures are expected -> set to trace otherwise will be noisy
                logger.trace("Error parsing date {} with format {}", dateString, formatter);
            }
        }
        return null;
    }

    /* Private Methods */

    /**
     * If all our formats failed to parse a date string, give it one last try to parse it. Look for a numeric offset (e.g.
     * +0000) and remove all text afterward. This should cover another set of cases where there is random text appended to
     * the end of the string, as well as removing invalid non-numeric time zone offsets while still picking up the numeric
     * offset Assumption - that tryExtensiveParsing is true - we should only get to this point if we want to try our best to
     * parse
     *
     * @param date The date string to parse
     * @return the ZonedDateTime object if removing text at the end was successful, or null otherwise
     */
    @Nullable
    static ZonedDateTime lastDitchParsingEffort(final String date) {

        // Attempt to remove all text after the numeric offset and try again - this should give us a valid date string
        // to work with
        Matcher matcher = extraTextRemove.matcher(date);
        if (matcher.find()) {
            String secondChanceDate = matcher.replaceAll(matcher.group(1));
            // if we removed text, attempt to parse again to see if we are more successful this time
            return parseToZonedDateTime(secondChanceDate, true);
        }
        return null;
    }

    /**
     * Created to help against code duplication. Calls parse with the standard set of date formats, and then if that fails,
     * attempt the extra set of date formats if tryExtensiveParsing is set to true.
     *
     * @param dateString The string we are attempting to parse
     * @param tryExtensiveParsing Whether to use the extensive set of date formats
     * @return The ZonedDateTime object if our parsing was successful, or null if not
     */
    private static ZonedDateTime parseToZonedDateTime(final String dateString, boolean tryExtensiveParsing) {
        ZonedDateTime zdt = parse(dateString, dateFormatsMain);

        // if we got a successful parse or we don't want to attempt "extensive parsing", return here
        if (!tryExtensiveParsing || zdt != null) {
            return zdt;
        }
        zdt = parse(dateString, dateFormatsExtra);
        return zdt;
    }

    /**
     * Get the timezone to use for parsing (needed for DateTimes that do not have timezone information)
     *
     * @param configTimezone timezone string ["GMT" or "UTC" or "+0000" or "+00:00" ...]
     * @return timezone
     */
    private static ZoneId setupTimezone(final String configTimezone) {
        try {
            if (StringUtils.isNotBlank(configTimezone)) {
                // parse the timezone from the config
                return ZoneId.of(configTimezone);
            }
        } catch (DateTimeException e) {
            logger.error("Error parsing timezone {}, using default {}", configTimezone, timezone, e);
        }

        return ZoneId.of(DEFAULT_TIMEZONE);
    }

    /**
     * Get the overrides for the default date formats
     *
     * @param configEntries the list of main override formats from the config file
     * @param dateTimeFormats the list of datetime formats
     * @return a list of {@link DateTimeFormatter}s
     */
    private static List<DateTimeFormatter> setupDateFormats(final List<ConfigEntry> configEntries, final List<DateTimeFormatter> dateTimeFormats) {
        List<DateTimeFormatter> dateFormats;
        if (CollectionUtils.isNotEmpty(dateTimeFormats)) {
            dateFormats = Collections.unmodifiableList(dateTimeFormats);
            logger.debug("Created successfully. Created {} of {} formats from config", dateFormats.size(), configEntries.size());
            return dateFormats;
        } else {
            logger.error("Could not create with configured variables");
            throw new IllegalArgumentException("No date/time formats configured!!");
        }
    }

    /**
     * Loop through the date formats from the config file and create DateTimeFormatter objects
     *
     * @param configEntries the list of override formats from the config file
     * @return a list of {@link DateTimeFormatter}s
     */
    @Nullable
    private static List<DateTimeFormatter> getConfigFormats(final List<ConfigEntry> configEntries) {
        if (CollectionUtils.isEmpty(configEntries)) {
            return null;
        }
        return configEntries.stream().map(FlexibleDateTimeParser::getFormatter).filter(Objects::nonNull).collect(toList());
    }

    /**
     * Create the DateTimeFormatter object
     *
     * @param entry format from the config file
     * @return {@link DateTimeFormatter} if the pattern is valid, null otherwise
     */
    @Nullable
    private static DateTimeFormatter getFormatter(ConfigEntry entry) {
        try {
            return new DateTimeFormatterBuilder().parseCaseInsensitive().appendPattern(entry.getValue()).toFormatter();
        } catch (IllegalArgumentException e) {
            // log the bad one and move on because there could be other possible patterns
            logger.error("Error parsing pattern [{}]: {}", entry.getValue(), e.getLocalizedMessage());
        }
        return null;
    }

    /**
     * Clean up the date string for processing (remove extra spaces, tabs, html, ...)
     *
     * @param date the date string to clean
     * @return the scrubbed date string
     */
    private static String cleanDateString(final String date) {
        if (StringUtils.isBlank(date)) {
            return date;
        }

        // date strings over 100 characters are more than likely invalid
        String cleanedDateString = StringUtils.substring(date, 0, 100);
        cleanedDateString = REPLACE.matcher(cleanedDateString).replaceAll(SPACE);
        cleanedDateString = remove.matcher(cleanedDateString).replaceAll(EMPTY);

        return StringUtils.trimToNull(cleanedDateString);
    }

    /**
     * This class is not meant to be instantiated
     */
    private FlexibleDateTimeParser() {}

}