// ByteTokenizer.java
package emissary.util.search;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * The byte tokenizer class allows an application to break a byte buffer into tokens. This was modified from the
 * java.util.StringTokenizer implementation. Note that all characters in the delimiter set are considered to be
 * characters in the range 0 - 255. In other words the ISO8859-1 encoding is used to match the delimiters to the byte
 * array.
 * <p>
 * Tokens are returned as strings. When an encoding is supplied to the constructor the token bytes are decoded with
 * that charset; otherwise they are decoded with the JVM's default charset. The byte array handed to the constructor
 * is not copied, and the tokenizer keeps a mutable cursor into it.
 */
public class ByteTokenizer implements Iterator<String> {

    private static final Logger logger = LoggerFactory.getLogger(ByteTokenizer.class);

    /** Delimiter set used when none is supplied: space, tab, newline, carriage return and form feed. */
    private static final String DEFAULT_DELIMITERS = " \t\n\r\f";

    /** Index of the next byte to be examined. */
    private int currentPosition;

    /** Position computed by {@link #hasNext()} for reuse by {@link #next()}, or -1 when nothing is cached. */
    private int newPosition;

    /** Exclusive upper bound (start + len) of the region being tokenized. */
    private final int maxPosition;

    private final byte[] data;
    private String delimiters;
    private final boolean retDelims;

    /** Set by {@link #next(String)} so that a position cached by {@link #hasNext()} is not reused. */
    private boolean delimsChanged;

    /** Charset used to decode tokens, or {@code null} to decode with the JVM's default charset. */
    private Charset charset;

    /**
     * maxDelimChar stores the value of the delimiter character with the highest value. It is used to optimize the
     * detection of delimiter characters.
     */
    private char maxDelimChar;

    /**
     * Set maxDelimChar to the highest char in the delimiter set, or to 0 when there is no delimiter set.
     */
    private void setMaxDelimChar() {
        if (delimiters == null) {
            maxDelimChar = 0;
            return;
        }

        char highest = 0;
        for (int i = 0; i < delimiters.length(); i++) {
            char candidate = delimiters.charAt(i);
            if (candidate > highest) {
                highest = candidate;
            }
        }
        maxDelimChar = highest;
    }

    /**
     * Constructs a byte tokenizer for the specified byte array. All characters in the <code>delim</code> argument are
     * the delimiters for separating tokens. Characters must be in the range of 0 - 255.
     * <p>
     * If the <code>returnDelims</code> flag is <code>true</code>, then the delimiter characters are also returned as
     * tokens. Each delimiter is returned as a String of length one. If the flag is <code>false</code>, the delimiter
     * characters are skipped and only serve as separators between tokens.
     * <p>
     * A <code>null</code> delimiter set is accepted here, but results in a {@link NullPointerException} once tokens
     * are requested.
     *
     * @param bytes a byte array to be parsed.
     * @param start the first byte in the array
     * @param len the number of bytes to parse
     * @param delim the delimiters.
     * @param returnDelims flag indicating whether to return the delimiters as tokens.
     */
    public ByteTokenizer(byte[] bytes, int start, int len, String delim, boolean returnDelims) {
        currentPosition = start;
        newPosition = -1;
        delimsChanged = false;
        data = bytes;
        maxPosition = start + len;
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimChar();
    }

    /**
     * Constructs a byte tokenizer for the specified byte array using the specified encoding.
     *
     * @param bytes a byte array to be parsed.
     * @param start the first byte in the array
     * @param len the number of bytes to parse
     * @param delim the delimiters.
     * @param returnDelims flag indicating whether to return the delimiters as tokens.
     * @param encoding the name of the charset used to decode the returned tokens.
     * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
     */
    public ByteTokenizer(byte[] bytes, int start, int len, String delim, boolean returnDelims, String encoding)
            throws UnsupportedEncodingException {
        this(bytes, start, len, delim, returnDelims);
        final Charset resolved;
        try {
            resolved = Charset.forName(encoding);
            logger.debug("Loaded charset {}", resolved);
        } catch (IllegalArgumentException ex) {
            // Keep the historical message but preserve the original failure as the cause.
            UnsupportedEncodingException uee = new UnsupportedEncodingException(ex.toString());
            uee.initCause(ex);
            throw uee;
        }
        // Hold on to the resolved charset so that next() never has to look the name up again.
        this.charset = resolved;
    }

    /**
     * Constructs a byte tokenizer for the specified byte array. The characters in the <code>delim</code> argument are
     * the delimiters for separating tokens. Delimiter characters themselves will not be treated as tokens.
     *
     * @param bytes a byte array to be parsed.
     * @param start the first byte in the array
     * @param len the number of bytes to parse
     * @param delim the delimiters.
     */
    public ByteTokenizer(byte[] bytes, int start, int len, String delim) {
        this(bytes, start, len, delim, false);
    }

    /**
     * Constructs a byte tokenizer for the specified byte array using the specified encoding. Delimiter characters
     * themselves will not be treated as tokens.
     *
     * @param bytes a byte array to be parsed.
     * @param start the first byte in the array
     * @param len the number of bytes to parse
     * @param delim the delimiters.
     * @param encoding the name of the charset used to decode the returned tokens.
     * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
     */
    public ByteTokenizer(byte[] bytes, int start, int len, String delim, String encoding)
            throws UnsupportedEncodingException {
        this(bytes, start, len, delim, false, encoding);
    }

    /**
     * Constructs a byte tokenizer for the specified byte array. The tokenizer uses the default delimiter set, which is
     * <code>" \t\n\r\f"</code>: the space character, the tab character, the newline character, the
     * carriage-return character, and the form-feed character. Delimiter characters themselves will not be treated as
     * tokens.
     *
     * @param bytes a byte array to be parsed.
     * @param start the first byte in the array
     * @param len the number of bytes to parse
     */
    public ByteTokenizer(byte[] bytes, int start, int len) {
        this(bytes, start, len, DEFAULT_DELIMITERS, false);
    }

    /**
     * Constructs a byte tokenizer over the whole of the specified byte array. All characters in the
     * <code>delim</code> argument are the delimiters for separating tokens. Characters must be in the range of 0 -
     * 255.
     * <p>
     * If the <code>returnDelims</code> flag is <code>true</code>, then the delimiter characters are also returned as
     * tokens. Each delimiter is returned as a String of length one. If the flag is <code>false</code>, the delimiter
     * characters are skipped and only serve as separators between tokens.
     *
     * @param bytes a byte array to be parsed.
     * @param delim the delimiters.
     * @param returnDelims flag indicating whether to return the delimiters as tokens.
     */
    public ByteTokenizer(byte[] bytes, String delim, boolean returnDelims) {
        this(bytes, 0, bytes.length, delim, returnDelims);
    }

    /**
     * Constructs a byte tokenizer over the whole of the specified byte array using the specified encoding.
     *
     * @param bytes a byte array to be parsed.
     * @param delim the delimiters.
     * @param returnDelims flag indicating whether to return the delimiters as tokens.
     * @param encoding the name of the charset used to decode the returned tokens.
     * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
     */
    public ByteTokenizer(byte[] bytes, String delim, boolean returnDelims, String encoding)
            throws UnsupportedEncodingException {
        this(bytes, 0, bytes.length, delim, returnDelims, encoding);
    }

    /**
     * Constructs a byte tokenizer over the whole of the specified byte array. The characters in the
     * <code>delim</code> argument are the delimiters for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     *
     * @param bytes a byte array to be parsed.
     * @param delim the delimiters.
     */
    public ByteTokenizer(byte[] bytes, String delim) {
        this(bytes, 0, bytes.length, delim);
    }

    /**
     * Constructs a byte tokenizer over the whole of the specified byte array using the specified encoding. Delimiter
     * characters themselves will not be treated as tokens.
     *
     * @param bytes a byte array to be parsed.
     * @param delim the delimiters.
     * @param encoding the name of the charset used to decode the returned tokens.
     * @exception UnsupportedEncodingException thrown if the supplied encoding is unsupported.
     */
    public ByteTokenizer(byte[] bytes, String delim, String encoding) throws UnsupportedEncodingException {
        this(bytes, 0, bytes.length, delim, encoding);
    }

    /**
     * Constructs a byte tokenizer over the whole of the specified byte array. The tokenizer uses the default
     * delimiter set, which is <code>" \t\n\r\f"</code>: the space character, the tab character, the newline
     * character, the carriage-return character, and the form-feed character. Delimiter characters themselves will not
     * be treated as tokens.
     *
     * @param bytes a byte array to be parsed.
     */
    public ByteTokenizer(byte[] bytes) {
        this(bytes, 0, bytes.length);
    }

    /**
     * Tells whether the byte at the given index, read as an unsigned value in the range 0 - 255, is a member of the
     * current delimiter set. The caller must ensure that the delimiter set is not null whenever maxDelimChar allows
     * the lookup to be reached.
     */
    private boolean isDelimiter(int position) {
        char c = (char) (0xFF & data[position]);
        // Cheap range rejection first; only consult the delimiter string for plausible candidates.
        return c <= maxDelimChar && delimiters.indexOf(c) >= 0;
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims is false, returns the index of the first
     * non-delimiter character at or after startPos. If retDelims is true, startPos is returned.
     *
     * @throws NullPointerException if the delimiter set is null
     */
    private int skipDelimiters(int startPos) {
        if (delimiters == null) {
            throw new NullPointerException();
        }
        if (retDelims) {
            // Delimiters are tokens in their own right, so there is nothing to skip.
            return startPos;
        }

        int position = startPos;
        while (position < maxPosition && isDelimiter(position)) {
            position++;
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter character encountered, or maxPosition if
     * no such delimiter is found. When delimiters are returned as tokens and startPos itself is a delimiter, the
     * index just past that single delimiter is returned instead.
     */
    private int scanToken(int startPos) {
        int position = startPos;
        while (position < maxPosition && !isDelimiter(position)) {
            position++;
        }

        // No progress means we are sitting on a delimiter (or at the end of the region, hence the bounds guard).
        // When delimiters are tokens, consume exactly one of them.
        if (retDelims && position == startPos && position < maxPosition && isDelimiter(position)) {
            position++;
        }
        return position;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's data. If this method returns <code>true</code>,
     * then a subsequent call to <code>next</code> with no argument will successfully return a token.
     *
     * @return <code>true</code> if and only if there is at least one token in the data after the current position;
     *         <code>false</code> otherwise.
     * @throws NullPointerException if the delimiter set is null
     */
    @Override
    public boolean hasNext() {
        /*
         * Temporarily store this position and use it in the following next() call only if the delimiters haven't
         * been changed in that next() invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return newPosition < maxPosition;
    }

    /**
     * Returns the next token from this tokenizer. The token bytes are decoded with the charset given at construction
     * time, or with the JVM's default charset when none was given.
     *
     * @return the next token from this tokenizer.
     * @exception NoSuchElementException if there are no more tokens in this tokenizer's data.
     * @throws NullPointerException if the delimiter set is null
     */
    @Override
    public String next() {
        /*
         * If the next position was already computed by hasNext() and the delimiters have NOT changed between that
         * computation and this invocation, reuse the computed value; otherwise skip delimiters again.
         */
        boolean cachedPositionUsable = newPosition >= 0 && !delimsChanged;
        currentPosition = cachedPositionUsable ? newPosition : skipDelimiters(currentPosition);

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition) {
            throw new NoSuchElementException();
        }

        int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        int length = currentPosition - start;

        // Decoding through the Charset object resolved in the constructor cannot raise a checked exception,
        // so this method never yields a null token.
        if (charset != null) {
            return new String(data, start, length, charset);
        }
        return new String(data, start, length);
    }

    /**
     * Returns the next token in this tokenizer's data. First, the set of characters considered to be delimiters by
     * this <code>ByteTokenizer</code> object is changed to be the characters in the string <code>delim</code>. Then
     * the next token in the data after the current position is returned. The current position is advanced beyond the
     * recognized token. The new delimiter set remains the default after this call.
     *
     * @param delim the new delimiters.
     * @return the next token, after switching to the new delimiter set.
     * @exception NoSuchElementException if there are no more tokens in this tokenizer's data.
     * @throws NullPointerException if <code>delim</code> is null
     */
    public String next(String delim) {
        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimChar();
        return next();
    }

    /**
     * Calculates the number of times that this tokenizer's <code>next</code> method can be called before it generates
     * an exception. The current position is not advanced.
     *
     * @return the number of tokens remaining in the data using the current delimiter set.
     * @see ByteTokenizer#next()
     */
    public int countTokens() {
        int count = 0;
        int cursor = currentPosition;
        while (cursor < maxPosition) {
            cursor = skipDelimiters(cursor);
            if (cursor >= maxPosition) {
                break;
            }
            cursor = scanToken(cursor);
            count++;
        }
        return count;
    }
}