KeywordScanner.java
package emissary.util.search;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
/**
* Provides the ability to find specified {@code byte[]} patterns inside a larger {@code byte[]}.
*/
public class KeywordScanner {
private final int[] skip = new int[256];
private int dataLength = -1;
private byte[] data;
private byte[] pattern;
private int patternLength = -1;
private int lastByte = 0;
private int lastPosition = 0;
private boolean caseSensitive = true;
public KeywordScanner() {
this(new byte[0]);
}
/**
* Initializes a new {@code KeywordScanner} object with the provided data bytes.
*
* @param data the data to be scanned
*/
public KeywordScanner(final byte[] data) {
resetData(data);
}
public void resetData(String data) {
resetData(data, StandardCharsets.UTF_8);
}
public void resetData(String data, String charsetName) {
resetData(data, Charset.forName(charsetName));
}
public void resetData(String data, Charset charset) {
resetData(data.getBytes(charset));
}
/**
* Reset the byte array. Use of this method avoids having to instantiate a new KeywordScanner.
*
* @param data - bytes to match against
*/
public void resetData(@Nullable byte[] data) {
this.data = data;
if (data != null) {
this.dataLength = data.length;
} else {
this.dataLength = -1;
}
}
/**
* Returns the first occurrence of the provided pattern in the data.
*
* @param patternArg the byte pattern to scan for, null returns -1
* @return the index in the data where the pattern begins, -1 if not found
*/
public int indexOf(final byte[] patternArg) {
return indexOf(patternArg, 0, this.dataLength);
}
/**
* Returns the first occurrence of the provided pattern in the data, starting from the specified index.
* <p>
* There is no restriction on the value of {@code start}. If it is negative, it has the same effect as if it were zero:
* the entire data will be scanned. If it is greater than the length of the data, it has the same effect as if it were
* equal to the length of the data: -1 is returned.
*
* @param patternArg the byte pattern to scan for, null returns -1
* @param start the index to start searching from, negative values treated as 0
* @return the index in the data where the pattern begins, -1 if not found
*/
public int indexOf(final byte[] patternArg, final int start) {
return indexOf(patternArg, start, this.dataLength);
}
/**
* Returns the first occurrence of the provided pattern in the data, starting from the specified index and stopping at
* the specified index.
* <p>
* There is no restriction on the value of {@code start}. If it is negative, it has the same effect as if it were zero:
* the entire data may be scanned. If it is greater than the length of the data, it has the same effect as if it were
* equal to the length of the data: -1 is returned.
* <p>
* If the value of {@code stop} is negative, greater than the data length, or less than or equal to the start value, -1
* is returned.
*
* @param patternArg the byte pattern to scan for, null returns -1
* @param start the index to start searching from, negative values treated as 0
* @param stop the index to stop searching at, exclusive, negative value returns -1
* @return the index in the data where the pattern begins, -1 if not found
*/
public int indexOf(@Nullable final byte[] patternArg, final int start, final int stop) {
if ((start >= this.dataLength) || (stop > this.dataLength) || (patternArg == null)) {
return -1;
}
// Adjust the actual start index to 0 if a negative value is provided.
final int actualStart = Math.max(start, 0);
this.pattern = patternArg;
this.patternLength = patternArg.length;
analyze();
final int position = match(actualStart, stop);
this.lastPosition = position;
return position;
}
/**
* Returns a list of occurrences of the provided pattern in the data.
*
* @param patternArg the byte pattern to scan for, null returns empty list
* @return index list of positions in the data where the pattern begins, empty list if not found
*/
public List<Integer> listIndexOf(final byte[] patternArg) {
return listIndexOf(patternArg, 0, this.dataLength);
}
/**
* Returns a list of occurrences of the provided pattern in the data, starting from the specified index.
* <p>
* There is no restriction on the value of {@code start}. If it is negative, it has the same effect as if it were zero:
* the entire data will be scanned. If it is greater than the length of the data, it has the same effect as if it were
* equal to the length of the data: empty list returned.
*
* @param patternArg the byte pattern to scan for, null returns empty list
* @param start the index to start searching from, negative values treated as 0
* @return index list of positions in the data where the pattern begins, empty list if not found
*/
public List<Integer> listIndexOf(final byte[] patternArg, final int start) {
return listIndexOf(patternArg, start, this.dataLength);
}
/**
* Returns a list of occurrences of the provided pattern in the data, starting from the specified index and stopping at
* the specified index.
* <p>
* There is no restriction on the value of {@code start}. If it is negative, it has the same effect as if it were zero:
* the entire data may be scanned. If it is greater than the length of the data, it has the same effect as if it were
* equal to the length of the data: an empty list is returned.
* <p>
* If the value of {@code stop} is negative, greater than the data length, or less than or equal to the start value, an
* empty list is returned
*
* @param patternArg the byte pattern to scan for, null returns empty list
* @param start the index to start searching from, negative values treated as 0
* @param stop the index to stop searching at, exclusive, negative value returns empty list
* @return index list of positions in the data where the pattern begins, empty list if not found
*/
public List<Integer> listIndexOf(@Nullable final byte[] patternArg, final int start, final int stop) {
List<Integer> matches = new ArrayList<>();
if ((start >= this.dataLength) || (stop > this.dataLength) || (patternArg == null)) {
return List.of();
}
int newStart = start;
int actualStart;
int position = 0;
this.pattern = patternArg;
this.patternLength = patternArg.length;
while (position > -1) {
actualStart = Math.max(newStart, 0);
analyze();
position = match(actualStart, stop);
this.lastPosition = position;
if (position == -1) {
break;
} else {
matches.add(position);
newStart = position + this.patternLength;
}
}
return matches;
}
/**
* Find the next occurrence of the set pattern, stopping at the specified index.
* <p>
* If the value of {@code stop} is negative, greater than or equal to the data length, or less than the previously found
* index: -1 is returned.
* <p>
* This method should follow a call to one of the {@code indexOf} methods. These methods will set the pattern and return
* the index of the first occurrence of the provided pattern. Calls to this method will then return the index of
* subsequent occurrences. Without first establishing a pattern in this way, -1 will be returned.
*
* @param stop the index to stop searching at, exclusive, negative value returns -1
* @return the index, less than the stop index, where the next occurrence of the pattern is found, -1 if not found
*/
public int findNext(final int stop) {
if (stop >= this.dataLength) {
return -1;
}
if ((this.lastPosition > stop) || (this.lastPosition < 0)) {
return -1;
}
// if a pattern has not been set, just return -1
if (this.pattern == null) {
return -1;
}
final int position = match((this.lastPosition + 1), stop);
this.lastPosition = position;
return position;
}
/**
* Find the next occurrence of the set pattern.
* <p>
* This method should follow a call to one of the {@code indexOf} methods. These methods will set the pattern and return
* the index of the first occurrence of the provided pattern. Calls to this method will then return the index of
* subsequent occurrences. Without first establishing a pattern in this way, -1 will be returned.
*
* @return the index where the next occurrence of the pattern is found, -1 if not found
*/
public int findNext() {
return findNext(this.dataLength - 1);
}
/**
* Sets the case sensitivity of the scanner.
*
* @param theCase if set to false, the scanner will ignore case. Default is true.
*/
public void setCaseSensitive(final boolean theCase) {
this.caseSensitive = theCase;
}
/**
* Returns the case sensitivity set for the scanner.
*
* @return true if the scanner is case-sensitive, false otherwise
*/
public boolean isCaseSensitive() {
return this.caseSensitive;
}
private int get256Value(final byte b) {
return caseSensitive ? Byte.toUnsignedInt(b) : lowercase(Byte.toUnsignedInt(b));
}
private void analyze() {
for (int i = 0; i < 256; i++) {
this.skip[i] = this.patternLength;
}
for (int i = 0; i < this.patternLength - 1; i++) {
this.skip[get256Value(this.pattern[i])] = this.patternLength - i - 1;
}
this.lastByte = get256Value(this.pattern[this.patternLength - 1]);
}
private int match(final int start, final int stop) {
int matchIndex = -1;
int position = start + this.patternLength - 1;
while (position < stop) {
int currentByte = get256Value(this.data[position]);
if (currentByte == this.lastByte && isSame(position)) {
matchIndex = position + 1 - this.patternLength;
break;
}
position += this.skip[currentByte];
}
return matchIndex;
}
private static int lowercase(final int i) {
if ((i >= 'A') && (i <= 'Z')) {
return i + 32;
} else {
return i;
}
}
private boolean isSame(final int pos) {
for (int i = 0; i < this.patternLength; i++) {
int patternByte = this.pattern[i];
int dataByte = this.data[pos - this.patternLength + 1 + i];
if (!this.caseSensitive) {
patternByte = lowercase(patternByte);
dataByte = lowercase(dataByte);
}
if (patternByte != dataByte) {
return false;
}
}
return true;
}
}