View Javadoc
1   package emissary.util;
2   
3   import emissary.config.ConfigEntry;
4   import emissary.config.ConfigUtil;
5   import emissary.config.Configurator;
6   
7   import jakarta.annotation.Nullable;
8   import org.apache.commons.collections4.CollectionUtils;
9   import org.apache.commons.lang3.StringUtils;
10  import org.slf4j.Logger;
11  import org.slf4j.LoggerFactory;
12  
13  import java.io.IOException;
14  import java.time.DateTimeException;
15  import java.time.LocalDate;
16  import java.time.LocalDateTime;
17  import java.time.OffsetDateTime;
18  import java.time.ZoneId;
19  import java.time.ZonedDateTime;
20  import java.time.format.DateTimeFormatter;
21  import java.time.format.DateTimeFormatterBuilder;
22  import java.time.format.DateTimeParseException;
23  import java.time.temporal.TemporalAccessor;
24  import java.util.Collections;
25  import java.util.List;
26  import java.util.Locale;
27  import java.util.Objects;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import static java.util.stream.Collectors.toList;
32  
33  /**
34   * Attempt to parse a date in an unknown format. This will loop through a set of configured formats and convert it into
35   * a {@link ZonedDateTime}.
36   * <p>
37   * Other parsing libs:
38   * <p>
39   * Natty - It handled a good chunk of the formats but not all.
40   */
41  public final class FlexibleDateTimeParser {
42  
43      /* Logger */
44      private static final Logger logger = LoggerFactory.getLogger(FlexibleDateTimeParser.class);
45  
46      /* Configuration Variables */
47      private static final String CFG_FORMAT_MAIN = "FORMAT_DATETIME_MAIN";
48      private static final String CFG_FORMAT_EXTRA = "FORMAT_DATETIME_EXTRA";
49      private static final String CFG_TIMEZONE = "TIMEZONE";
50      private static final String CFG_REMOVE_REGEX = "REMOVE_REGEX";
51      private static final String CFG_EXTRA_TEXT_REMOVE_REGEX = "EXTRA_TEXT_REMOVE_REGEX";
52      private static final String DEFAULT_TIMEZONE = "GMT";
53      private static final String SPACE = " ";
54      private static final String EMPTY = "";
55  
56      /* Remove all tabs and extra spaces */
57      private static final Pattern REPLACE = Pattern.compile("\t+|[ ]+", Pattern.DOTALL);
58  
59      /*
60       * Remove other junk -- anything in an html tag, all parenthesis and quotes, and any non-word characters at the
61       * beginning or end
62       */
63      private static final Pattern remove;
64  
65      /*
66       * This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time
67       * zone offset
68       */
69      private static final Pattern extraTextRemove;
70  
71      /* timezone - config var: TIMEZONE */
72      private static final ZoneId timezone;
73  
74      /* date time formats - vars: FORMAT_DATETIME_MAIN */
75      private static final List<DateTimeFormatter> dateFormatsMain;
76  
77      /* Extra date time formats - list to try if our main list has failed - vars: FORMAT_DATETIME_EXTRA */
78      private static final List<DateTimeFormatter> dateFormatsExtra;
79  
80      /* init */
81      static {
82          try {
83              // fire up the configurator
84              Configurator configurator = ConfigUtil.getConfigInfo(FlexibleDateTimeParser.class);
85              timezone = setupTimezone(configurator.findStringEntry(CFG_TIMEZONE, DEFAULT_TIMEZONE));
86  
87              List<ConfigEntry> configEntriesMain = configurator.findStringMatchEntries(CFG_FORMAT_MAIN);
88              dateFormatsMain = setupDateFormats(configEntriesMain, getConfigFormats(configEntriesMain));
89  
90              List<ConfigEntry> configEntriesExtra = configurator.findStringMatchEntries(CFG_FORMAT_EXTRA);
91              dateFormatsExtra = setupDateFormats(configEntriesExtra, getConfigFormats(configEntriesExtra));
92  
93              String removeRegex = configurator.findStringEntry(CFG_REMOVE_REGEX, "<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+");
94              remove = Pattern.compile(removeRegex, Pattern.DOTALL);
95  
96              // last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric timezone offset
97              String extraTextRemoveRegex = configurator.findStringEntry(CFG_EXTRA_TEXT_REMOVE_REGEX, "((\\+|-)\\d{4}).*$");
98              extraTextRemove = Pattern.compile(extraTextRemoveRegex);
99          } catch (IOException e) {
100             throw new IllegalArgumentException("Could not configure parser!!", e);
101         }
102     }
103 
104     /**
105      * Get the default timezone id for the application
106      *
107      * @return the configured immutable and thread-safe zone id
108      */
109     public static ZoneId getTimezone() {
110         return timezone;
111     }
112 
113     /**
114      * Attempts to parse a string date using pre-configured patterns. Default not trying the extensive date/time format list
115      *
116      * @param dateString the string to parse
117      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
118      */
119     public static ZonedDateTime parse(final String dateString) {
120         return parse(dateString, false);
121     }
122 
123     /**
124      * Attempts to parse a string date using pre-configured patterns
125      *
126      * @param dateString the string to parse
127      * @param tryExtensiveParsing True if we want to try out complete list of date/time formats False if we only want to
128      *        attempt the most common date/time formats
129      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
130      */
131     public static ZonedDateTime parse(final String dateString, boolean tryExtensiveParsing) {
132         ZonedDateTime zdt = parseToZonedDateTime(dateString, tryExtensiveParsing);
133 
134         if (zdt != null || !tryExtensiveParsing) {
135             return zdt;
136         } else {
137             // if that all failed and we want to attempt extensive parsing, attempt the last ditch efforts we can try
138             return lastDitchParsingEffort(dateString);
139         }
140     }
141 
142     /**
143      * Attempts to parse a string date
144      *
145      * @param dateString the string to parse
146      * @param format the date/time formats to use
147      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
148      */
149     public static ZonedDateTime parse(final String dateString, final DateTimeFormatter format) {
150         return parse(dateString, Collections.singletonList(format));
151     }
152 
153     /**
154      * Attempts to parse a string date
155      *
156      * @param dateString the string to parse
157      * @param formats the date/time formats to use
158      * @return the parsed immutable and thread-safe zoned-date, or null if it failed to parse
159      */
160     @Nullable
161     public static ZonedDateTime parse(final String dateString, final List<DateTimeFormatter> formats) {
162         String cleanedDateString = cleanDateString(dateString);
163 
164         if (StringUtils.isBlank(cleanedDateString) || CollectionUtils.isEmpty(formats)) {
165             return null;
166         }
167 
168         for (DateTimeFormatter formatter : formats) {
169             if (formatter == null) {
170                 continue;
171             }
172 
173             try {
174                 // try for a zoned date (has timezone), local date time (no time zone), or just a local date (no time)
175                 TemporalAccessor accessor =
176                         formatter.parseBest(cleanedDateString, ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from, LocalDate::from);
177                 if (accessor instanceof ZonedDateTime) {
178                     return (ZonedDateTime) accessor; // return the date time w/ timezone
179                 } else if (accessor instanceof OffsetDateTime) {
180                     return ((OffsetDateTime) accessor).atZoneSameInstant(timezone);
181                 } else if (accessor instanceof LocalDateTime) {
182                     return ((LocalDateTime) accessor).atZone(timezone); // set the timezone
183                 } else if (accessor instanceof LocalDate) {
184                     return ((LocalDate) accessor).atStartOfDay(timezone); // add zeroed out time
185                 }
186 
187             } catch (NullPointerException | IllegalArgumentException | DateTimeParseException e) {
188                 // Ignore b/c failures are expected -> set to trace otherwise will be noisy
189                 logger.trace("Error parsing date {} with format {}", dateString, formatter);
190             }
191         }
192         return null;
193     }
194 
195     /* Private Methods */
196 
197     /**
198      * If all our formats failed to parse a date string, give it one last try to parse it. Look for a numeric offset (e.g.
199      * +0000) and remove all text afterward. This should cover another set of cases where there is random text appended to
200      * the end of the string, as well as removing invalid non-numeric time zone offsets while still picking up the numeric
201      * offset Assumption - that tryExtensiveParsing is true - we should only get to this point if we want to try our best to
202      * parse
203      *
204      * @param date The date string to parse
205      * @return the ZonedDateTime object if removing text at the end was successful, or null otherwise
206      */
207     @Nullable
208     static ZonedDateTime lastDitchParsingEffort(final String date) {
209 
210         // Attempt to remove all text after the numeric offset and try again - this should give us a valid date string
211         // to work with
212         Matcher matcher = extraTextRemove.matcher(date);
213         if (matcher.find()) {
214             String secondChanceDate = matcher.replaceAll(matcher.group(1));
215             // if we removed text, attempt to parse again to see if we are more successful this time
216             return parseToZonedDateTime(secondChanceDate, true);
217         }
218         return null;
219     }
220 
221     /**
222      * Created to help against code duplication. Calls parse with the standard set of date formats, and then if that fails,
223      * attempt the extra set of date formats if tryExtensiveParsing is set to true.
224      *
225      * @param dateString The string we are attempting to parse
226      * @param tryExtensiveParsing Whether to use the extensive set of date formats
227      * @return The ZonedDateTime object if our parsing was successful, or null if not
228      */
229     private static ZonedDateTime parseToZonedDateTime(final String dateString, boolean tryExtensiveParsing) {
230         ZonedDateTime zdt = parse(dateString, dateFormatsMain);
231 
232         // if we got a successful parse or we don't want to attempt "extensive parsing", return here
233         if (!tryExtensiveParsing || zdt != null) {
234             return zdt;
235         }
236         zdt = parse(dateString, dateFormatsExtra);
237         return zdt;
238     }
239 
240     /**
241      * Get the timezone to use for parsing (needed for DateTimes that do not have timezone information)
242      *
243      * @param configTimezone timezone string ["GMT" or "UTC" or "+0000" or "+00:00" ...]
244      * @return timezone
245      */
246     private static ZoneId setupTimezone(final String configTimezone) {
247         try {
248             if (StringUtils.isNotBlank(configTimezone)) {
249                 // parse the timezone from the config
250                 return ZoneId.of(configTimezone);
251             }
252         } catch (DateTimeException e) {
253             logger.error("Error parsing timezone {}, using default {}", configTimezone, timezone, e);
254         }
255 
256         return ZoneId.of(DEFAULT_TIMEZONE);
257     }
258 
259     /**
260      * Get the overrides for the default date formats
261      *
262      * @param configEntries the list of main override formats from the config file
263      * @param dateTimeFormats the list of datetime formats
264      * @return a list of {@link DateTimeFormatter}s
265      */
266     private static List<DateTimeFormatter> setupDateFormats(final List<ConfigEntry> configEntries, final List<DateTimeFormatter> dateTimeFormats) {
267         List<DateTimeFormatter> dateFormats;
268         if (CollectionUtils.isNotEmpty(dateTimeFormats)) {
269             dateFormats = Collections.unmodifiableList(dateTimeFormats);
270             logger.debug("Created successfully. Created {} of {} formats from config", dateFormats.size(), configEntries.size());
271             return dateFormats;
272         } else {
273             logger.error("Could not create with configured variables");
274             throw new IllegalArgumentException("No date/time formats configured!!");
275         }
276     }
277 
278     /**
279      * Loop through the date formats from the config file and create DateTimeFormatter objects
280      *
281      * @param configEntries the list of override formats from the config file
282      * @return a list of {@link DateTimeFormatter}s
283      */
284     @Nullable
285     private static List<DateTimeFormatter> getConfigFormats(final List<ConfigEntry> configEntries) {
286         if (CollectionUtils.isEmpty(configEntries)) {
287             return null;
288         }
289         return configEntries.stream().map(FlexibleDateTimeParser::getFormatter).filter(Objects::nonNull).collect(toList());
290     }
291 
292     /**
293      * Create the DateTimeFormatter object
294      *
295      * @param entry format from the config file
296      * @return {@link DateTimeFormatter} if the pattern is valid, null otherwise
297      */
298     @Nullable
299     private static DateTimeFormatter getFormatter(ConfigEntry entry) {
300         try {
301             return new DateTimeFormatterBuilder().parseCaseInsensitive().appendPattern(entry.getValue())
302                     .toFormatter(Locale.getDefault());
303         } catch (IllegalArgumentException e) {
304             // log the bad one and move on because there could be other possible patterns
305             logger.error("Error parsing pattern [{}]: {}", entry.getValue(), e.getLocalizedMessage());
306         }
307         return null;
308     }
309 
310     /**
311      * Clean up the date string for processing (remove extra spaces, tabs, html, ...)
312      *
313      * @param date the date string to clean
314      * @return the scrubbed date string
315      */
316     private static String cleanDateString(final String date) {
317         if (StringUtils.isBlank(date)) {
318             return date;
319         }
320 
321         // date strings over 100 characters are more than likely invalid
322         String cleanedDateString = StringUtils.substring(date, 0, 100);
323         cleanedDateString = REPLACE.matcher(cleanedDateString).replaceAll(SPACE);
324         cleanedDateString = remove.matcher(cleanedDateString).replaceAll(EMPTY);
325 
326         return StringUtils.trimToNull(cleanedDateString);
327     }
328 
329     /**
330      * This class is not meant to be instantiated
331      */
332     private FlexibleDateTimeParser() {}
333 
334 }