FlexibleDateTimeParser.java

  1. package emissary.util;

  2. import emissary.config.ConfigEntry;
  3. import emissary.config.ConfigUtil;
  4. import emissary.config.Configurator;

  5. import org.apache.commons.collections4.CollectionUtils;
  6. import org.apache.commons.lang3.StringUtils;
  7. import org.slf4j.Logger;
  8. import org.slf4j.LoggerFactory;

  9. import java.io.IOException;
  10. import java.time.DateTimeException;
  11. import java.time.LocalDate;
  12. import java.time.LocalDateTime;
  13. import java.time.OffsetDateTime;
  14. import java.time.ZoneId;
  15. import java.time.ZonedDateTime;
  16. import java.time.format.DateTimeFormatter;
  17. import java.time.format.DateTimeFormatterBuilder;
  18. import java.time.format.DateTimeParseException;
  19. import java.time.temporal.TemporalAccessor;
  20. import java.util.Collections;
  21. import java.util.List;
  22. import java.util.Objects;
  23. import java.util.regex.Matcher;
  24. import java.util.regex.Pattern;
  25. import javax.annotation.Nullable;

  26. import static java.util.stream.Collectors.toList;

  27. /**
  28.  * Attempt to parse a date in an unknown format. This will loop through a set of configured formats and convert it into
  29.  * a {@link ZonedDateTime}.
  30.  * <p>
  31.  * Other parsing libs:
  32.  * <p>
  33.  * Natty - It handled a good chunk of the formats but not all.
  34.  */
  35. public final class FlexibleDateTimeParser {

  36.     /* Logger */
  37.     private static final Logger logger = LoggerFactory.getLogger(FlexibleDateTimeParser.class);

  38.     /* Configuration Variables */
  39.     private static final String CFG_FORMAT_MAIN = "FORMAT_DATETIME_MAIN";
  40.     private static final String CFG_FORMAT_EXTRA = "FORMAT_DATETIME_EXTRA";
  41.     private static final String CFG_TIMEZONE = "TIMEZONE";
  42.     private static final String CFG_REMOVE_REGEX = "REMOVE_REGEX";
  43.     private static final String CFG_EXTRA_TEXT_REMOVE_REGEX = "EXTRA_TEXT_REMOVE_REGEX";
  44.     private static final String DEFAULT_TIMEZONE = "GMT";
  45.     private static final String SPACE = " ";
  46.     private static final String EMPTY = "";

  47.     /* Remove all tabs and extra spaces */
  48.     private static final Pattern REPLACE = Pattern.compile("\t+|[ ]+", Pattern.DOTALL);

  49.     /*
  50.      * Remove other junk -- anything in an html tag, all parenthesis and quotes, and any non-word characters at the
  51.      * beginning or end
  52.      */
  53.     private static final Pattern remove;

  54.     /*
  55.      * This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time
  56.      * zone offset
  57.      */
  58.     private static final Pattern extraTextRemove;

  59.     /* timezone - config var: TIMEZONE */
  60.     private static final ZoneId timezone;

  61.     /* date time formats - vars: FORMAT_DATETIME_MAIN */
  62.     private static final List<DateTimeFormatter> dateFormatsMain;

  63.     /* Extra date time formats - list to try if our main list has failed - vars: FORMAT_DATETIME_EXTRA */
  64.     private static final List<DateTimeFormatter> dateFormatsExtra;

  65.     /* init */
  66.     static {
  67.         try {
  68.             // fire up the configurator
  69.             Configurator configurator = ConfigUtil.getConfigInfo(FlexibleDateTimeParser.class);
  70.             timezone = setupTimezone(configurator.findStringEntry(CFG_TIMEZONE, DEFAULT_TIMEZONE));

  71.             List<ConfigEntry> configEntriesMain = configurator.findStringMatchEntries(CFG_FORMAT_MAIN);
  72.             dateFormatsMain = setupDateFormats(configEntriesMain, getConfigFormats(configEntriesMain));

  73.             List<ConfigEntry> configEntriesExtra = configurator.findStringMatchEntries(CFG_FORMAT_EXTRA);
  74.             dateFormatsExtra = setupDateFormats(configEntriesExtra, getConfigFormats(configEntriesExtra));

  75.             String removeRegex = configurator.findStringEntry(CFG_REMOVE_REGEX, "<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+");
  76.             remove = Pattern.compile(removeRegex, Pattern.DOTALL);

  77.             // last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric timezone offset
  78.             String extraTextRemoveRegex = configurator.findStringEntry(CFG_EXTRA_TEXT_REMOVE_REGEX, "((\\+|-)\\d{4}).*$");
  79.             extraTextRemove = Pattern.compile(extraTextRemoveRegex);
  80.         } catch (IOException e) {
  81.             throw new IllegalArgumentException("Could not configure parser!!", e);
  82.         }
  83.     }

  84.     /**
  85.      * Get the default timezone id for the application
  86.      *
  87.      * @return the configured immutable and thread-safe zone id
  88.      */
  89.     public static ZoneId getTimezone() {
  90.         return timezone;
  91.     }

  92.     /**
  93.      * Attempts to parse a string date using pre-configured patterns. Default not trying the extensive date/time format list
  94.      *
  95.      * @param dateString the string to parse
  96.      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
  97.      */
  98.     public static ZonedDateTime parse(final String dateString) {
  99.         return parse(dateString, false);
  100.     }

  101.     /**
  102.      * Attempts to parse a string date using pre-configured patterns
  103.      *
  104.      * @param dateString the string to parse
  105.      * @param tryExtensiveParsing True if we want to try out complete list of date/time formats False if we only want to
  106.      *        attempt the most common date/time formats
  107.      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
  108.      */
  109.     public static ZonedDateTime parse(final String dateString, boolean tryExtensiveParsing) {
  110.         ZonedDateTime zdt = parseToZonedDateTime(dateString, tryExtensiveParsing);

  111.         if (zdt != null || !tryExtensiveParsing) {
  112.             return zdt;
  113.         } else {
  114.             // if that all failed and we want to attempt extensive parsing, attempt the last ditch efforts we can try
  115.             return lastDitchParsingEffort(dateString);
  116.         }
  117.     }

  118.     /**
  119.      * Attempts to parse a string date
  120.      *
  121.      * @param dateString the string to parse
  122.      * @param format the date/time formats to use
  123.      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
  124.      */
  125.     public static ZonedDateTime parse(final String dateString, final DateTimeFormatter format) {
  126.         return parse(dateString, Collections.singletonList(format));
  127.     }

  128.     /**
  129.      * Attempts to parse a string date
  130.      *
  131.      * @param dateString the string to parse
  132.      * @param formats the date/time formats to use
  133.      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
  134.      */
  135.     @Nullable
  136.     public static ZonedDateTime parse(final String dateString, final List<DateTimeFormatter> formats) {
  137.         String cleanedDateString = cleanDateString(dateString);

  138.         if (StringUtils.isBlank(cleanedDateString) || CollectionUtils.isEmpty(formats)) {
  139.             return null;
  140.         }

  141.         for (DateTimeFormatter formatter : formats) {
  142.             if (formatter == null) {
  143.                 continue;
  144.             }

  145.             try {
  146.                 // try for a zoned date (has timezone), local date time (no time zone), or just a local date (no time)
  147.                 TemporalAccessor accessor =
  148.                         formatter.parseBest(cleanedDateString, ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from, LocalDate::from);
  149.                 if (accessor instanceof ZonedDateTime) {
  150.                     return (ZonedDateTime) accessor; // return the date time w/ timezone
  151.                 } else if (accessor instanceof OffsetDateTime) {
  152.                     return ((OffsetDateTime) accessor).atZoneSameInstant(timezone);
  153.                 } else if (accessor instanceof LocalDateTime) {
  154.                     return ((LocalDateTime) accessor).atZone(timezone); // set the timezone
  155.                 } else if (accessor instanceof LocalDate) {
  156.                     return ((LocalDate) accessor).atStartOfDay(timezone); // add zeroed out time
  157.                 }

  158.             } catch (NullPointerException | IllegalArgumentException | DateTimeParseException e) {
  159.                 // Ignore b/c failures are expected -> set to trace otherwise will be noisy
  160.                 logger.trace("Error parsing date {} with format {}", dateString, formatter);
  161.             }
  162.         }
  163.         return null;
  164.     }

  165.     /* Private Methods */

  166.     /**
  167.      * If all our formats failed to parse a date string, give it one last try to parse it. Look for a numeric offset (e.g.
  168.      * +0000) and remove all text afterward. This should cover another set of cases where there is random text appended to
  169.      * the end of the string, as well as removing invalid non-numeric time zone offsets while still picking up the numeric
  170.      * offset Assumption - that tryExtensiveParsing is true - we should only get to this point if we want to try our best to
  171.      * parse
  172.      *
  173.      * @param date The date string to parse
  174.      * @return the ZonedDateTime object if removing text at the end was successful, or null otherwise
  175.      */
  176.     @Nullable
  177.     static ZonedDateTime lastDitchParsingEffort(final String date) {

  178.         // Attempt to remove all text after the numeric offset and try again - this should give us a valid date string
  179.         // to work with
  180.         Matcher matcher = extraTextRemove.matcher(date);
  181.         if (matcher.find()) {
  182.             String secondChanceDate = matcher.replaceAll(matcher.group(1));
  183.             // if we removed text, attempt to parse again to see if we are more successful this time
  184.             return parseToZonedDateTime(secondChanceDate, true);
  185.         }
  186.         return null;
  187.     }

  188.     /**
  189.      * Created to help against code duplication. Calls parse with the standard set of date formats, and then if that fails,
  190.      * attempt the extra set of date formats if tryExtensiveParsing is set to true.
  191.      *
  192.      * @param dateString The string we are attempting to parse
  193.      * @param tryExtensiveParsing Whether to use the extensive set of date formats
  194.      * @return The ZonedDateTime object if our parsing was successful, or null if not
  195.      */
  196.     private static ZonedDateTime parseToZonedDateTime(final String dateString, boolean tryExtensiveParsing) {
  197.         ZonedDateTime zdt = parse(dateString, dateFormatsMain);

  198.         // if we got a successful parse or we don't want to attempt "extensive parsing", return here
  199.         if (!tryExtensiveParsing || zdt != null) {
  200.             return zdt;
  201.         }
  202.         zdt = parse(dateString, dateFormatsExtra);
  203.         return zdt;
  204.     }

  205.     /**
  206.      * Get the timezone to use for parsing (needed for DateTimes that do not have timezone information)
  207.      *
  208.      * @param configTimezone timezone string ["GMT" or "UTC" or "+0000" or "+00:00" ...]
  209.      * @return timezone
  210.      */
  211.     private static ZoneId setupTimezone(final String configTimezone) {
  212.         try {
  213.             if (StringUtils.isNotBlank(configTimezone)) {
  214.                 // parse the timezone from the config
  215.                 return ZoneId.of(configTimezone);
  216.             }
  217.         } catch (DateTimeException e) {
  218.             logger.error("Error parsing timezone {}, using default {}", configTimezone, timezone, e);
  219.         }

  220.         return ZoneId.of(DEFAULT_TIMEZONE);
  221.     }

  222.     /**
  223.      * Get the overrides for the default date formats
  224.      *
  225.      * @param configEntries the list of main override formats from the config file
  226.      * @param dateTimeFormats the list of datetime formats
  227.      * @return a list of {@link DateTimeFormatter}s
  228.      */
  229.     private static List<DateTimeFormatter> setupDateFormats(final List<ConfigEntry> configEntries, final List<DateTimeFormatter> dateTimeFormats) {
  230.         List<DateTimeFormatter> dateFormats;
  231.         if (CollectionUtils.isNotEmpty(dateTimeFormats)) {
  232.             dateFormats = Collections.unmodifiableList(dateTimeFormats);
  233.             logger.debug("Created successfully. Created {} of {} formats from config", dateFormats.size(), configEntries.size());
  234.             return dateFormats;
  235.         } else {
  236.             logger.error("Could not create with configured variables");
  237.             throw new IllegalArgumentException("No date/time formats configured!!");
  238.         }
  239.     }

  240.     /**
  241.      * Loop through the date formats from the config file and create DateTimeFormatter objects
  242.      *
  243.      * @param configEntries the list of override formats from the config file
  244.      * @return a list of {@link DateTimeFormatter}s
  245.      */
  246.     @Nullable
  247.     private static List<DateTimeFormatter> getConfigFormats(final List<ConfigEntry> configEntries) {
  248.         if (CollectionUtils.isEmpty(configEntries)) {
  249.             return null;
  250.         }
  251.         return configEntries.stream().map(FlexibleDateTimeParser::getFormatter).filter(Objects::nonNull).collect(toList());
  252.     }

  253.     /**
  254.      * Create the DateTimeFormatter object
  255.      *
  256.      * @param entry format from the config file
  257.      * @return {@link DateTimeFormatter} if the pattern is valid, null otherwise
  258.      */
  259.     @Nullable
  260.     private static DateTimeFormatter getFormatter(ConfigEntry entry) {
  261.         try {
  262.             return new DateTimeFormatterBuilder().parseCaseInsensitive().appendPattern(entry.getValue()).toFormatter();
  263.         } catch (IllegalArgumentException e) {
  264.             // log the bad one and move on because there could be other possible patterns
  265.             logger.error("Error parsing pattern [{}]: {}", entry.getValue(), e.getLocalizedMessage());
  266.         }
  267.         return null;
  268.     }

  269.     /**
  270.      * Clean up the date string for processing (remove extra spaces, tabs, html, ...)
  271.      *
  272.      * @param date the date string to clean
  273.      * @return the scrubbed date string
  274.      */
  275.     private static String cleanDateString(final String date) {
  276.         if (StringUtils.isBlank(date)) {
  277.             return date;
  278.         }

  279.         // date strings over 100 characters are more than likely invalid
  280.         String cleanedDateString = StringUtils.substring(date, 0, 100);
  281.         cleanedDateString = REPLACE.matcher(cleanedDateString).replaceAll(SPACE);
  282.         cleanedDateString = remove.matcher(cleanedDateString).replaceAll(EMPTY);

  283.         return StringUtils.trimToNull(cleanedDateString);
  284.     }

  285.     /**
  286.      * This class is not meant to be instantiated
  287.      */
  288.     private FlexibleDateTimeParser() {}

  289. }