View Javadoc
1   package emissary.util.search;
2   
3   import org.slf4j.Logger;
4   import org.slf4j.LoggerFactory;
5   
6   import java.io.UnsupportedEncodingException;
7   import java.nio.charset.Charset;
8   import java.util.Iterator;
9   import java.util.NoSuchElementException;
10  
11  /**
12   * The byte tokenizer class allows an application to break a byte buffer into tokens. This was modified from the
13   * java.util.StringTokenizer implementation. Note that all characters in the delimiter set are considered to be
14   * characters in the range 0 - 255. In other words the ISO8859-1 encoding is used to match the delimiters to the byte
15   * array.
16   */
17  public class ByteTokenizer implements Iterator<String> {
18      private int currentPosition;
19      private int newPosition;
20      private final int maxPosition;
21      private final byte[] data;
22      private String delimiters;
23      private final boolean retDelims;
24      private boolean delimsChanged;
25      private String encoding;
26  
27      private static final Logger logger = LoggerFactory.getLogger(ByteTokenizer.class);
28  
29      /**
30       * maxDelimChar stores the value of the delimiter character with the highest value. It is used to optimize the detection
31       * of delimiter characters.
32       */
33      private char maxDelimChar;
34  
35      /**
36       * Set maxDelimChar to the highest char in the delimiter set.
37       */
38      private void setMaxDelimChar() {
39          if (delimiters == null) {
40              maxDelimChar = 0;
41              return;
42          }
43  
44          char m = 0;
45          for (int i = 0; i < delimiters.length(); i++) {
46              char c = delimiters.charAt(i);
47              if (m < c) {
48                  m = c;
49              }
50          }
51          maxDelimChar = m;
52      }
53  
54      /**
55       * Constructs a byte tokenizer for the specified byte array. All characters in the <code>delim</code> argument are the
56       * delimiters for separating tokens. Characters must be in the range of 0 - 255.
57       * <p>
58       * If the <code>returnDelims</code> flag is <code>true</code>, then the delimiter characters are also returned as
59       * tokens. Each delimiter is returned as a byte[] or String of length one. If the flag is <code>false</code>, the
60       * delimiter characters are skipped and only serve as separators between tokens.
61       *
62       * @param bytes a byte array to be parsed.
63       * @param start the first byte in the array
64       * @param len the number of bytes to parse
65       * @param delim the delimiters.
66       * @param returnDelims flag indicating whether to return the delimiters as tokens.
67       */
68      public ByteTokenizer(byte[] bytes, int start, int len, String delim, boolean returnDelims) {
69          currentPosition = start;
70          newPosition = -1;
71          delimsChanged = false;
72          data = bytes;
73          maxPosition = start + len;
74          delimiters = delim;
75          retDelims = returnDelims;
76          setMaxDelimChar();
77      }
78  
79      /**
80       * Constructs a byte tokenizer for the specified byte array using the specified encoding.
81       *
82       * @param bytes a byte array to be parsed.
83       * @param start the first byte in the array
84       * @param len the number of bytes to parse
85       * @param delim the delimiters.
86       * @param returnDelims flag indicating whether to return the delimiters as tokens.
87       * @param encoding the encoding for which to return the bytes.
88       * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
89       */
90      public ByteTokenizer(byte[] bytes, int start, int len, String delim, boolean returnDelims, String encoding) throws UnsupportedEncodingException {
91          this(bytes, start, len, delim, returnDelims);
92          try {
93              Charset c = Charset.forName(encoding);
94              logger.debug("Loaded charset {}", c);
95          } catch (IllegalArgumentException ex) {
96              throw new UnsupportedEncodingException(ex.toString());
97          }
98          this.encoding = encoding;
99      }
100 
101     /**
102      * Constructs a byte tokenizer for the specified byte array. The characters in the <code>delim</code> argument are the
103      * delimiters for separating tokens. Delimiter characters themselves will not be treated as tokens.
104      *
105      * @param bytes a byte array to be parsed.
106      * @param start the first byte in the array
107      * @param len the number of bytes to parse
108      * @param delim the delimiters.
109      */
110     public ByteTokenizer(byte[] bytes, int start, int len, String delim) {
111         this(bytes, start, len, delim, false);
112     }
113 
114     /**
115      * Constructs a byte tokenizer for the specified byte array using the specified encoding.
116      *
117      * @param bytes a byte array to be parsed.
118      * @param start the first byte in the array
119      * @param len the number of bytes to parse
120      * @param delim the delimiters.
121      * @param encoding the encoding for which to return the bytes.
122      * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
123      */
124     public ByteTokenizer(byte[] bytes, int start, int len, String delim, String encoding) throws UnsupportedEncodingException {
125         this(bytes, start, len, delim, false, encoding);
126     }
127 
128     /**
129      * Constructs a byte tokenizer for the specified byte array. The tokenizer uses the default delimiter set, which is
130      * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character, the tab character, the newline character, the
131      * carriage-return character, and the form-feed character. Delimiter characters themselves will not be treated as
132      * tokens.
133      *
134      * @param bytes a byte array to be parsed.
135      * @param start the first byte in the array
136      * @param len the number of bytes to parse
137      */
138     public ByteTokenizer(byte[] bytes, int start, int len) {
139         this(bytes, start, len, " \t\n\r\f", false);
140     }
141 
142     /**
143      * Constructs a byte tokenizer for the specified byte array. All characters in the <code>delim</code> argument are the
144      * delimiters for separating tokens. Characters must be in the range of 0 - 255.
145      * <p>
146      * If the <code>returnDelims</code> flag is <code>true</code>, then the delimiter characters are also returned as
147      * tokens. Each delimiter is returned as a byte[] or String of length one. If the flag is <code>false</code>, the
148      * delimiter characters are skipped and only serve as separators between tokens.
149      *
150      * @param bytes a byte array to be parsed.
151      * @param delim the delimiters.
152      * @param returnDelims flag indicating whether to return the delimiters as tokens.
153      */
154     public ByteTokenizer(byte[] bytes, String delim, boolean returnDelims) {
155         this(bytes, 0, bytes.length, delim, returnDelims);
156     }
157 
158     /**
159      * Constructs a byte tokenizer for the specified byte array using the specified encoding.
160      *
161      * @param bytes a byte array to be parsed.
162      * @param delim the delimiters.
163      * @param returnDelims flag indicating whether to return the delimiters as tokens.
164      * @param encoding the encoding for which to return the bytes.
165      * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
166      */
167     public ByteTokenizer(byte[] bytes, String delim, boolean returnDelims, String encoding) throws UnsupportedEncodingException {
168         this(bytes, 0, bytes.length, delim, returnDelims, encoding);
169     }
170 
171     /**
172      * Constructs a byte tokenizer for the specified byte array. The characters in the <code>delim</code> argument are the
173      * delimiters for separating tokens. Delimiter characters themselves will not be treated as tokens.
174      *
175      * @param bytes a byte array to be parsed.
176      * @param delim the delimiters.
177      */
178     public ByteTokenizer(byte[] bytes, String delim) {
179         this(bytes, 0, bytes.length, delim);
180     }
181 
182     /**
183      * Constructs a byte tokenizer for the specified byte array using the specified encoding.
184      *
185      * @param bytes a byte array to be parsed.
186      * @param delim the delimiters.
187      * @param encoding the encoding for which to return the bytes.
188      * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
189      */
190     public ByteTokenizer(byte[] bytes, String delim, String encoding) throws UnsupportedEncodingException {
191         this(bytes, 0, bytes.length, delim, encoding);
192     }
193 
194     /**
195      * Constructs a byte tokenizer for the specified byte array. The tokenizer uses the default delimiter set, which is
196      * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character, the tab character, the newline character, the
197      * carriage-return character, and the form-feed character. Delimiter characters themselves will not be treated as
198      * tokens.
199      *
200      * @param bytes a byte array to be parsed.
201      */
202     public ByteTokenizer(byte[] bytes) {
203         this(bytes, 0, bytes.length);
204     }
205 
206     /**
207      * Skips delimiters starting from the specified position. If retDelims is false, returns the index of the first
208      * non-delimiter character at or after startPos. If retDelims is true, startPos is returned.
209      */
210     private int skipDelimiters(int startPos) {
211         if (delimiters == null) {
212             throw new NullPointerException();
213         }
214 
215         int position = startPos;
216         while (!retDelims && position < maxPosition) {
217             char c = (char) (0xFF & data[position]);
218             if ((c > maxDelimChar) || (delimiters.indexOf(c) < 0)) {
219                 break;
220             }
221             position++;
222         }
223         return position;
224     }
225 
226     /**
227      * Skips ahead from startPos and returns the index of the next delimiter character encountered, or maxPosition if no
228      * such delimiter is found.
229      */
230     private int scanToken(int startPos) {
231         int position = startPos;
232         while (position < maxPosition) {
233             char c = (char) (0xFF & data[position]);
234             if ((c <= maxDelimChar) && (delimiters.indexOf(c) >= 0)) {
235                 break;
236             }
237             position++;
238         }
239         if (retDelims && (startPos == position)) {
240             char c = (char) (0xFF & data[position]);
241             if ((c <= maxDelimChar) && (delimiters.indexOf(c) >= 0)) {
242                 position++;
243             }
244         }
245         return position;
246     }
247 
248     /**
249      * Tests if there are more tokens available from this tokenizer's string. If this method returns <code>true</code>, then
250      * a subsequent call to <code>next</code> with no argument will successfully return a token.
251      *
252      * @return <code>true</code> if and only if there is at least one token in the string after the current position;
253      *         <code>false</code> otherwise.
254      */
255     @Override
256     public boolean hasNext() {
257         /*
258          * Temporary store this position and use it in the following next() method only if the delimiters haven't been changed
259          * in that next() invocation.
260          */
261         newPosition = skipDelimiters(currentPosition);
262         return newPosition < maxPosition;
263     }
264 
265     /**
266      * Returns the next token from this string tokenizer.
267      *
268      * @return the next token from this string tokenizer.
269      * @exception NoSuchElementException if there are no more tokens in this tokenizer's string.
270      */
271     @Override
272     public String next() {
273         /*
274          * If next position already computed in hasMoreElements() and delimiters have changed between the computation and this
275          * invocation, then use the computed value.
276          */
277 
278         currentPosition = (newPosition >= 0 && !delimsChanged) ? newPosition : skipDelimiters(currentPosition);
279 
280         /* Reset these anyway */
281         delimsChanged = false;
282         newPosition = -1;
283 
284         if (currentPosition >= maxPosition) {
285             throw new NoSuchElementException();
286         }
287         int start = currentPosition;
288         currentPosition = scanToken(currentPosition);
289 
290         String token = null;
291         try {
292             if (encoding != null) {
293                 token = new String(data, start, currentPosition - start, encoding);
294             } else {
295                 token = new String(data, start, currentPosition - start);
296             }
297         } catch (UnsupportedEncodingException ignored) {
298             // cannot happen...we already verified in constructor
299         }
300         return token;
301     }
302 
303     /**
304      * Returns the next token in this string tokenizer's string. First, the set of characters considered to be delimiters by
305      * this <code>ByteTokenizer</code> object is changed to be the characters in the string <code>delim</code>. Then the
306      * next token in the string after the current position is returned. The current position is advanced beyond the
307      * recognized token. The new delimiter set remains the default after this call.
308      *
309      * @param delim the new delimiters.
310      * @return the next token, after switching to the new delimiter set.
311      * @exception NoSuchElementException if there are no more tokens in this tokenizer's string.
312      */
313     public String next(String delim) {
314         delimiters = delim;
315 
316         /* delimiter string specified, so set the appropriate flag. */
317         delimsChanged = true;
318 
319         setMaxDelimChar();
320         return next();
321     }
322 
323     /**
324      * Calculates the number of times that this tokenizer's <code>next</code> method can be called before it generates an
325      * exception. The current position is not advanced.
326      *
327      * @return the number of tokens remaining in the string using the current delimiter set.
328      * @see ByteTokenizer#next()
329      */
330     public int countTokens() {
331         int count = 0;
332         int currpos = currentPosition;
333         while (currpos < maxPosition) {
334             currpos = skipDelimiters(currpos);
335             if (currpos >= maxPosition) {
336                 break;
337             }
338             currpos = scanToken(currpos);
339             count++;
340         }
341         return count;
342     }
343 }