View Javadoc
1   package emissary.util;
2   
3   import emissary.config.ConfigEntry;
4   import emissary.config.ConfigUtil;
5   import emissary.config.Configurator;
6   
7   import jakarta.annotation.Nullable;
8   import org.apache.commons.collections4.CollectionUtils;
9   import org.apache.commons.lang3.StringUtils;
10  import org.slf4j.Logger;
11  import org.slf4j.LoggerFactory;
12  
13  import java.io.IOException;
14  import java.time.DateTimeException;
15  import java.time.LocalDate;
16  import java.time.LocalDateTime;
17  import java.time.OffsetDateTime;
18  import java.time.ZoneId;
19  import java.time.ZonedDateTime;
20  import java.time.format.DateTimeFormatter;
21  import java.time.format.DateTimeFormatterBuilder;
22  import java.time.format.DateTimeParseException;
23  import java.time.temporal.TemporalAccessor;
24  import java.util.Collections;
25  import java.util.List;
26  import java.util.Objects;
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
30  import static java.util.stream.Collectors.toList;
31  
32  /**
33   * Attempt to parse a date in an unknown format. This will loop through a set of configured formats and convert it into
34   * a {@link ZonedDateTime}.
35   * <p>
36   * Other parsing libs:
37   * <p>
38   * Natty - It handled a good chunk of the formats but not all.
39   */
40  public final class FlexibleDateTimeParser {
41  
42      /* Logger */
43      private static final Logger logger = LoggerFactory.getLogger(FlexibleDateTimeParser.class);
44  
45      /* Configuration Variables */
46      private static final String CFG_FORMAT_MAIN = "FORMAT_DATETIME_MAIN";
47      private static final String CFG_FORMAT_EXTRA = "FORMAT_DATETIME_EXTRA";
48      private static final String CFG_TIMEZONE = "TIMEZONE";
49      private static final String CFG_REMOVE_REGEX = "REMOVE_REGEX";
50      private static final String CFG_EXTRA_TEXT_REMOVE_REGEX = "EXTRA_TEXT_REMOVE_REGEX";
51      private static final String DEFAULT_TIMEZONE = "GMT";
52      private static final String SPACE = " ";
53      private static final String EMPTY = "";
54  
55      /* Remove all tabs and extra spaces */
56      private static final Pattern REPLACE = Pattern.compile("\t+|[ ]+", Pattern.DOTALL);
57  
58      /*
59       * Remove other junk -- anything in an html tag, all parenthesis and quotes, and any non-word characters at the
60       * beginning or end
61       */
62      private static final Pattern remove;
63  
64      /*
65       * This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time
66       * zone offset
67       */
68      private static final Pattern extraTextRemove;
69  
70      /* timezone - config var: TIMEZONE */
71      private static final ZoneId timezone;
72  
73      /* date time formats - vars: FORMAT_DATETIME_MAIN */
74      private static final List<DateTimeFormatter> dateFormatsMain;
75  
76      /* Extra date time formats - list to try if our main list has failed - vars: FORMAT_DATETIME_EXTRA */
77      private static final List<DateTimeFormatter> dateFormatsExtra;
78  
79      /* init */
80      static {
81          try {
82              // fire up the configurator
83              Configurator configurator = ConfigUtil.getConfigInfo(FlexibleDateTimeParser.class);
84              timezone = setupTimezone(configurator.findStringEntry(CFG_TIMEZONE, DEFAULT_TIMEZONE));
85  
86              List<ConfigEntry> configEntriesMain = configurator.findStringMatchEntries(CFG_FORMAT_MAIN);
87              dateFormatsMain = setupDateFormats(configEntriesMain, getConfigFormats(configEntriesMain));
88  
89              List<ConfigEntry> configEntriesExtra = configurator.findStringMatchEntries(CFG_FORMAT_EXTRA);
90              dateFormatsExtra = setupDateFormats(configEntriesExtra, getConfigFormats(configEntriesExtra));
91  
92              String removeRegex = configurator.findStringEntry(CFG_REMOVE_REGEX, "<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+");
93              remove = Pattern.compile(removeRegex, Pattern.DOTALL);
94  
95              // last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric timezone offset
96              String extraTextRemoveRegex = configurator.findStringEntry(CFG_EXTRA_TEXT_REMOVE_REGEX, "((\\+|-)\\d{4}).*$");
97              extraTextRemove = Pattern.compile(extraTextRemoveRegex);
98          } catch (IOException e) {
99              throw new IllegalArgumentException("Could not configure parser!!", e);
100         }
101     }
102 
103     /**
104      * Get the default timezone id for the application
105      *
106      * @return the configured immutable and thread-safe zone id
107      */
108     public static ZoneId getTimezone() {
109         return timezone;
110     }
111 
112     /**
113      * Attempts to parse a string date using pre-configured patterns. Default not trying the extensive date/time format list
114      *
115      * @param dateString the string to parse
116      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
117      */
118     public static ZonedDateTime parse(final String dateString) {
119         return parse(dateString, false);
120     }
121 
122     /**
123      * Attempts to parse a string date using pre-configured patterns
124      *
125      * @param dateString the string to parse
126      * @param tryExtensiveParsing True if we want to try out complete list of date/time formats False if we only want to
127      *        attempt the most common date/time formats
128      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
129      */
130     public static ZonedDateTime parse(final String dateString, boolean tryExtensiveParsing) {
131         ZonedDateTime zdt = parseToZonedDateTime(dateString, tryExtensiveParsing);
132 
133         if (zdt != null || !tryExtensiveParsing) {
134             return zdt;
135         } else {
136             // if that all failed and we want to attempt extensive parsing, attempt the last ditch efforts we can try
137             return lastDitchParsingEffort(dateString);
138         }
139     }
140 
141     /**
142      * Attempts to parse a string date
143      *
144      * @param dateString the string to parse
145      * @param format the date/time formats to use
146      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
147      */
148     public static ZonedDateTime parse(final String dateString, final DateTimeFormatter format) {
149         return parse(dateString, Collections.singletonList(format));
150     }
151 
152     /**
153      * Attempts to parse a string date
154      *
155      * @param dateString the string to parse
156      * @param formats the date/time formats to use
157      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
158      */
159     @Nullable
160     public static ZonedDateTime parse(final String dateString, final List<DateTimeFormatter> formats) {
161         String cleanedDateString = cleanDateString(dateString);
162 
163         if (StringUtils.isBlank(cleanedDateString) || CollectionUtils.isEmpty(formats)) {
164             return null;
165         }
166 
167         for (DateTimeFormatter formatter : formats) {
168             if (formatter == null) {
169                 continue;
170             }
171 
172             try {
173                 // try for a zoned date (has timezone), local date time (no time zone), or just a local date (no time)
174                 TemporalAccessor accessor =
175                         formatter.parseBest(cleanedDateString, ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from, LocalDate::from);
176                 if (accessor instanceof ZonedDateTime) {
177                     return (ZonedDateTime) accessor; // return the date time w/ timezone
178                 } else if (accessor instanceof OffsetDateTime) {
179                     return ((OffsetDateTime) accessor).atZoneSameInstant(timezone);
180                 } else if (accessor instanceof LocalDateTime) {
181                     return ((LocalDateTime) accessor).atZone(timezone); // set the timezone
182                 } else if (accessor instanceof LocalDate) {
183                     return ((LocalDate) accessor).atStartOfDay(timezone); // add zeroed out time
184                 }
185 
186             } catch (NullPointerException | IllegalArgumentException | DateTimeParseException e) {
187                 // Ignore b/c failures are expected -> set to trace otherwise will be noisy
188                 logger.trace("Error parsing date {} with format {}", dateString, formatter);
189             }
190         }
191         return null;
192     }
193 
194     /* Private Methods */
195 
196     /**
197      * If all our formats failed to parse a date string, give it one last try to parse it. Look for a numeric offset (e.g.
198      * +0000) and remove all text afterward. This should cover another set of cases where there is random text appended to
199      * the end of the string, as well as removing invalid non-numeric time zone offsets while still picking up the numeric
200      * offset Assumption - that tryExtensiveParsing is true - we should only get to this point if we want to try our best to
201      * parse
202      *
203      * @param date The date string to parse
204      * @return the ZonedDateTime object if removing text at the end was successful, or null otherwise
205      */
206     @Nullable
207     static ZonedDateTime lastDitchParsingEffort(final String date) {
208 
209         // Attempt to remove all text after the numeric offset and try again - this should give us a valid date string
210         // to work with
211         Matcher matcher = extraTextRemove.matcher(date);
212         if (matcher.find()) {
213             String secondChanceDate = matcher.replaceAll(matcher.group(1));
214             // if we removed text, attempt to parse again to see if we are more successful this time
215             return parseToZonedDateTime(secondChanceDate, true);
216         }
217         return null;
218     }
219 
220     /**
221      * Created to help against code duplication. Calls parse with the standard set of date formats, and then if that fails,
222      * attempt the extra set of date formats if tryExtensiveParsing is set to true.
223      *
224      * @param dateString The string we are attempting to parse
225      * @param tryExtensiveParsing Whether to use the extensive set of date formats
226      * @return The ZonedDateTime object if our parsing was successful, or null if not
227      */
228     private static ZonedDateTime parseToZonedDateTime(final String dateString, boolean tryExtensiveParsing) {
229         ZonedDateTime zdt = parse(dateString, dateFormatsMain);
230 
231         // if we got a successful parse or we don't want to attempt "extensive parsing", return here
232         if (!tryExtensiveParsing || zdt != null) {
233             return zdt;
234         }
235         zdt = parse(dateString, dateFormatsExtra);
236         return zdt;
237     }
238 
239     /**
240      * Get the timezone to use for parsing (needed for DateTimes that do not have timezone information)
241      *
242      * @param configTimezone timezone string ["GMT" or "UTC" or "+0000" or "+00:00" ...]
243      * @return timezone
244      */
245     private static ZoneId setupTimezone(final String configTimezone) {
246         try {
247             if (StringUtils.isNotBlank(configTimezone)) {
248                 // parse the timezone from the config
249                 return ZoneId.of(configTimezone);
250             }
251         } catch (DateTimeException e) {
252             logger.error("Error parsing timezone {}, using default {}", configTimezone, timezone, e);
253         }
254 
255         return ZoneId.of(DEFAULT_TIMEZONE);
256     }
257 
258     /**
259      * Get the overrides for the default date formats
260      *
261      * @param configEntries the list of main override formats from the config file
262      * @param dateTimeFormats the list of datetime formats
263      * @return a list of {@link DateTimeFormatter}s
264      */
265     private static List<DateTimeFormatter> setupDateFormats(final List<ConfigEntry> configEntries, final List<DateTimeFormatter> dateTimeFormats) {
266         List<DateTimeFormatter> dateFormats;
267         if (CollectionUtils.isNotEmpty(dateTimeFormats)) {
268             dateFormats = Collections.unmodifiableList(dateTimeFormats);
269             logger.debug("Created successfully. Created {} of {} formats from config", dateFormats.size(), configEntries.size());
270             return dateFormats;
271         } else {
272             logger.error("Could not create with configured variables");
273             throw new IllegalArgumentException("No date/time formats configured!!");
274         }
275     }
276 
277     /**
278      * Loop through the date formats from the config file and create DateTimeFormatter objects
279      *
280      * @param configEntries the list of override formats from the config file
281      * @return a list of {@link DateTimeFormatter}s
282      */
283     @Nullable
284     private static List<DateTimeFormatter> getConfigFormats(final List<ConfigEntry> configEntries) {
285         if (CollectionUtils.isEmpty(configEntries)) {
286             return null;
287         }
288         return configEntries.stream().map(FlexibleDateTimeParser::getFormatter).filter(Objects::nonNull).collect(toList());
289     }
290 
291     /**
292      * Create the DateTimeFormatter object
293      *
294      * @param entry format from the config file
295      * @return {@link DateTimeFormatter} if the pattern is valid, null otherwise
296      */
297     @Nullable
298     private static DateTimeFormatter getFormatter(ConfigEntry entry) {
299         try {
300             return new DateTimeFormatterBuilder().parseCaseInsensitive().appendPattern(entry.getValue()).toFormatter();
301         } catch (IllegalArgumentException e) {
302             // log the bad one and move on because there could be other possible patterns
303             logger.error("Error parsing pattern [{}]: {}", entry.getValue(), e.getLocalizedMessage());
304         }
305         return null;
306     }
307 
308     /**
309      * Clean up the date string for processing (remove extra spaces, tabs, html, ...)
310      *
311      * @param date the date string to clean
312      * @return the scrubbed date string
313      */
314     private static String cleanDateString(final String date) {
315         if (StringUtils.isBlank(date)) {
316             return date;
317         }
318 
319         // date strings over 100 characters are more than likely invalid
320         String cleanedDateString = StringUtils.substring(date, 0, 100);
321         cleanedDateString = REPLACE.matcher(cleanedDateString).replaceAll(SPACE);
322         cleanedDateString = remove.matcher(cleanedDateString).replaceAll(EMPTY);
323 
324         return StringUtils.trimToNull(cleanedDateString);
325     }
326 
327     /**
328      * This class is not meant to be instantiated
329      */
330     private FlexibleDateTimeParser() {}
331 
332 }