1 package emissary.util.search; 2 3 import org.slf4j.Logger; 4 import org.slf4j.LoggerFactory; 5 6 import java.io.UnsupportedEncodingException; 7 import java.nio.charset.Charset; 8 import java.util.Iterator; 9 import java.util.NoSuchElementException; 10 11 /** 12 * The byte tokenizer class allows an application to break a byte buffer into tokens. This was modified from the 13 * java.util.StringTokenizer implementation. Note that all characters in the delimiter set are considered to be 14 * characters in the range 0 - 255. In other words the ISO8859-1 encoding is used to match the delimiters to the byte 15 * array. 16 */ 17 public class ByteTokenizer implements Iterator<String> { 18 private int currentPosition; 19 private int newPosition; 20 private final int maxPosition; 21 private final byte[] data; 22 private String delimiters; 23 private final boolean retDelims; 24 private boolean delimsChanged; 25 private String encoding; 26 27 private static final Logger logger = LoggerFactory.getLogger(ByteTokenizer.class); 28 29 /** 30 * maxDelimChar stores the value of the delimiter character with the highest value. It is used to optimize the detection 31 * of delimiter characters. 32 */ 33 private char maxDelimChar; 34 35 /** 36 * Set maxDelimChar to the highest char in the delimiter set. 37 */ 38 private void setMaxDelimChar() { 39 if (delimiters == null) { 40 maxDelimChar = 0; 41 return; 42 } 43 44 char m = 0; 45 for (int i = 0; i < delimiters.length(); i++) { 46 char c = delimiters.charAt(i); 47 if (m < c) { 48 m = c; 49 } 50 } 51 maxDelimChar = m; 52 } 53 54 /** 55 * Constructs a byte tokenizer for the specified byte array. All characters in the <code>delim</code> argument are the 56 * delimiters for separating tokens. Characters must be in the range of 0 - 255. 57 * <p> 58 * If the <code>returnDelims</code> flag is <code>true</code>, then the delimiter characters are also returned as 59 * tokens. Each delimiter is returned as a byte[] or String of length one. If the flag is <code>false</code>, the 60 * delimiter characters are skipped and only serve as separators between tokens. 61 * 62 * @param bytes a byte array to be parsed. 63 * @param start the first byte in the array 64 * @param len the number of bytes to parse 65 * @param delim the delimiters. 66 * @param returnDelims flag indicating whether to return the delimiters as tokens. 67 */ 68 public ByteTokenizer(byte[] bytes, int start, int len, String delim, boolean returnDelims) { 69 currentPosition = start; 70 newPosition = -1; 71 delimsChanged = false; 72 data = bytes; 73 maxPosition = start + len; 74 delimiters = delim; 75 retDelims = returnDelims; 76 setMaxDelimChar(); 77 } 78 79 /** 80 * Constructs a byte tokenizer for the specified byte array using the specified encoding. 81 * 82 * @param bytes a byte array to be parsed. 83 * @param start the first byte in the array 84 * @param len the number of bytes to parse 85 * @param delim the delimiters. 86 * @param returnDelims flag indicating whether to return the delimiters as tokens. 87 * @param encoding the encoding for which to return the bytes. 88 * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported. 89 */ 90 public ByteTokenizer(byte[] bytes, int start, int len, String delim, boolean returnDelims, String encoding) throws UnsupportedEncodingException { 91 this(bytes, start, len, delim, returnDelims); 92 try { 93 Charset c = Charset.forName(encoding); 94 logger.debug("Loaded charset {}", c); 95 } catch (IllegalArgumentException ex) { 96 throw new UnsupportedEncodingException(ex.toString()); 97 } 98 this.encoding = encoding; 99 } 100 101 /** 102 * Constructs a byte tokenizer for the specified byte array. The characters in the <code>delim</code> argument are the 103 * delimiters for separating tokens. Delimiter characters themselves will not be treated as tokens. 104 * 105 * @param bytes a byte array to be parsed. 106 * @param start the first byte in the array 107 * @param len the number of bytes to parse 108 * @param delim the delimiters. 109 */ 110 public ByteTokenizer(byte[] bytes, int start, int len, String delim) { 111 this(bytes, start, len, delim, false); 112 } 113 114 /** 115 * Constructs a byte tokenizer for the specified byte array using the specified encoding. 116 * 117 * @param bytes a byte array to be parsed. 118 * @param start the first byte in the array 119 * @param len the number of bytes to parse 120 * @param delim the delimiters. 121 * @param encoding the encoding for which to return the bytes. 122 * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported. 123 */ 124 public ByteTokenizer(byte[] bytes, int start, int len, String delim, String encoding) throws UnsupportedEncodingException { 125 this(bytes, start, len, delim, false, encoding); 126 } 127 128 /** 129 * Constructs a byte tokenizer for the specified byte array. The tokenizer uses the default delimiter set, which is 130 * <code>" \t\n\r\f"</code>: the space character, the tab character, the newline character, the 131 * carriage-return character, and the form-feed character. Delimiter characters themselves will not be treated as 132 * tokens. 133 * 134 * @param bytes a byte array to be parsed. 135 * @param start the first byte in the array 136 * @param len the number of bytes to parse 137 */ 138 public ByteTokenizer(byte[] bytes, int start, int len) { 139 this(bytes, start, len, " \t\n\r\f", false); 140 } 141 142 /** 143 * Constructs a byte tokenizer for the specified byte array. All characters in the <code>delim</code> argument are the 144 * delimiters for separating tokens. Characters must be in the range of 0 - 255. 145 * <p> 146 * If the <code>returnDelims</code> flag is <code>true</code>, then the delimiter characters are also returned as 147 * tokens. Each delimiter is returned as a byte[] or String of length one. If the flag is <code>false</code>, the 148 * delimiter characters are skipped and only serve as separators between tokens. 149 * 150 * @param bytes a byte array to be parsed. 151 * @param delim the delimiters. 152 * @param returnDelims flag indicating whether to return the delimiters as tokens. 153 */ 154 public ByteTokenizer(byte[] bytes, String delim, boolean returnDelims) { 155 this(bytes, 0, bytes.length, delim, returnDelims); 156 } 157 158 /** 159 * Constructs a byte tokenizer for the specified byte array using the specified encoding. 160 * 161 * @param bytes a byte array to be parsed. 162 * @param delim the delimiters. 163 * @param returnDelims flag indicating whether to return the delimiters as tokens. 164 * @param encoding the encoding for which to return the bytes. 165 * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported. 166 */ 167 public ByteTokenizer(byte[] bytes, String delim, boolean returnDelims, String encoding) throws UnsupportedEncodingException { 168 this(bytes, 0, bytes.length, delim, returnDelims, encoding); 169 } 170 171 /** 172 * Constructs a byte tokenizer for the specified byte array. The characters in the <code>delim</code> argument are the 173 * delimiters for separating tokens. Delimiter characters themselves will not be treated as tokens. 174 * 175 * @param bytes a byte array to be parsed. 176 * @param delim the delimiters. 177 */ 178 public ByteTokenizer(byte[] bytes, String delim) { 179 this(bytes, 0, bytes.length, delim); 180 } 181 182 /** 183 * Constructs a byte tokenizer for the specified byte array using the specified encoding. 184 * 185 * @param bytes a byte array to be parsed. 186 * @param delim the delimiters. 187 * @param encoding the encoding for which to return the bytes. 188 * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported. 189 */ 190 public ByteTokenizer(byte[] bytes, String delim, String encoding) throws UnsupportedEncodingException { 191 this(bytes, 0, bytes.length, delim, encoding); 192 } 193 194 /** 195 * Constructs a byte tokenizer for the specified byte array. The tokenizer uses the default delimiter set, which is 196 * <code>" \t\n\r\f"</code>: the space character, the tab character, the newline character, the 197 * carriage-return character, and the form-feed character. Delimiter characters themselves will not be treated as 198 * tokens. 199 * 200 * @param bytes a byte array to be parsed. 201 */ 202 public ByteTokenizer(byte[] bytes) { 203 this(bytes, 0, bytes.length); 204 } 205 206 /** 207 * Skips delimiters starting from the specified position. If retDelims is false, returns the index of the first 208 * non-delimiter character at or after startPos. If retDelims is true, startPos is returned. 209 */ 210 private int skipDelimiters(int startPos) { 211 if (delimiters == null) { 212 throw new NullPointerException(); 213 } 214 215 int position = startPos; 216 while (!retDelims && position < maxPosition) { 217 char c = (char) (0xFF & data[position]); 218 if ((c > maxDelimChar) || (delimiters.indexOf(c) < 0)) { 219 break; 220 } 221 position++; 222 } 223 return position; 224 } 225 226 /** 227 * Skips ahead from startPos and returns the index of the next delimiter character encountered, or maxPosition if no 228 * such delimiter is found. 229 */ 230 private int scanToken(int startPos) { 231 int position = startPos; 232 while (position < maxPosition) { 233 char c = (char) (0xFF & data[position]); 234 if ((c <= maxDelimChar) && (delimiters.indexOf(c) >= 0)) { 235 break; 236 } 237 position++; 238 } 239 if (retDelims && (startPos == position)) { 240 char c = (char) (0xFF & data[position]); 241 if ((c <= maxDelimChar) && (delimiters.indexOf(c) >= 0)) { 242 position++; 243 } 244 } 245 return position; 246 } 247 248 /** 249 * Tests if there are more tokens available from this tokenizer's string. If this method returns <code>true</code>, then 250 * a subsequent call to <code>next</code> with no argument will successfully return a token. 251 * 252 * @return <code>true</code> if and only if there is at least one token in the string after the current position; 253 * <code>false</code> otherwise. 254 */ 255 @Override 256 public boolean hasNext() { 257 /* 258 * Temporary store this position and use it in the following next() method only if the delimiters haven't been changed 259 * in that next() invocation. 260 */ 261 newPosition = skipDelimiters(currentPosition); 262 return newPosition < maxPosition; 263 } 264 265 /** 266 * Returns the next token from this string tokenizer. 267 * 268 * @return the next token from this string tokenizer. 269 * @exception NoSuchElementException if there are no more tokens in this tokenizer's string. 270 */ 271 @Override 272 public String next() { 273 /* 274 * If next position already computed in hasMoreElements() and delimiters have changed between the computation and this 275 * invocation, then use the computed value. 276 */ 277 278 currentPosition = (newPosition >= 0 && !delimsChanged) ? newPosition : skipDelimiters(currentPosition); 279 280 /* Reset these anyway */ 281 delimsChanged = false; 282 newPosition = -1; 283 284 if (currentPosition >= maxPosition) { 285 throw new NoSuchElementException(); 286 } 287 int start = currentPosition; 288 currentPosition = scanToken(currentPosition); 289 290 String token = null; 291 try { 292 if (encoding != null) { 293 token = new String(data, start, currentPosition - start, encoding); 294 } else { 295 token = new String(data, start, currentPosition - start); 296 } 297 } catch (UnsupportedEncodingException ignored) { 298 // cannot happen...we already verified in constructor 299 } 300 return token; 301 } 302 303 /** 304 * Returns the next token in this string tokenizer's string. First, the set of characters considered to be delimiters by 305 * this <code>ByteTokenizer</code> object is changed to be the characters in the string <code>delim</code>. Then the 306 * next token in the string after the current position is returned. The current position is advanced beyond the 307 * recognized token. The new delimiter set remains the default after this call. 308 * 309 * @param delim the new delimiters. 310 * @return the next token, after switching to the new delimiter set. 311 * @exception NoSuchElementException if there are no more tokens in this tokenizer's string. 312 */ 313 public String next(String delim) { 314 delimiters = delim; 315 316 /* delimiter string specified, so set the appropriate flag. */ 317 delimsChanged = true; 318 319 setMaxDelimChar(); 320 return next(); 321 } 322 323 /** 324 * Calculates the number of times that this tokenizer's <code>next</code> method can be called before it generates an 325 * exception. The current position is not advanced. 326 * 327 * @return the number of tokens remaining in the string using the current delimiter set. 328 * @see ByteTokenizer#next() 329 */ 330 public int countTokens() { 331 int count = 0; 332 int currpos = currentPosition; 333 while (currpos < maxPosition) { 334 currpos = skipDelimiters(currpos); 335 if (currpos >= maxPosition) { 336 break; 337 } 338 currpos = scanToken(currpos); 339 count++; 340 } 341 return count; 342 } 343 }