/* * Copyright 2010 Tom Gibara * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.tomgibara.crinch.hashing; import java.math.BigInteger; import java.util.Arrays; import java.util.Comparator; /** *
* A "minimal perfect hash" for Strings. After construction with an array of * n unique non-null strings, an instance of this class will return a * unique hash value h (0 <= h < n) for any string s in the * array. A negative hash value will typically be returned for a string that is * not in the array. *
* ** However, the supplied array is not retained. This means that the * implementation cannot necessarily confirm that a string is not in the * supplied array. Where this implementation cannot distinguish that a string is * not in the array, a 'valid' hash value may be returned. Under no * circumstances will a hash value be returned that is greater than or equal to * n. *
* *
* IMPORTANT NOTE: The array of strings supplied to the
* constructor will be mutated: it is re-ordered so that
* {@code hash(a[i]) == i}. Application code must generally use this
* information to map hash values back onto the appropriate string value.
*
* NOTE: Good performance of this algorithm is predicated on
* string hash values being cached by the {@code String} class. Experience
* indicates that this is a good assumption.
*
* After construction the supplied array satisfies {@code hash(values[i]) == i}.
*/
/**
 * Constructs a minimal perfect hash over the supplied strings. The array is
 * sorted in place with the class comparator, and the position of each string
 * in the sorted array becomes its hash value.
 *
 * @param values
 *            the strings to be hashed; the array is reordered by this
 *            constructor
 * @throws IllegalArgumentException
 *             if the array is empty
 */
public PerfectStringHash(final String values[]) {
    final int count = values.length;
    if (count == 0) throw new IllegalArgumentException("No values supplied");

    final int[] codes = new int[count];
    // two ints per string: [2k] = start of its decision tree (or -1),
    // [2k + 1] = distance from k back to the first string sharing its hash code
    final int[] offsetTable = new int[2 * count];
    final int[] runs = new int[count];

    // after sorting we can assume ordering by hashcode, length and char[]
    Arrays.sort(values, comparator);

    // collect the hash codes so they can be scanned for duplicates
    for (int k = 0; k < count; k++) {
        codes[k] = values[k].hashCode();
    }

    // scan the sorted codes one run of equal values at a time
    int nextTree = 0;
    if (count > 1) {
        int runStart = 0;
        while (runStart < count) {
            final int code = codes[runStart];
            int runEnd = runStart + 1;
            while (runEnd < count && codes[runEnd] == code) {
                runEnd++;
            }
            final int run = runEnd - runStart;
            if (run == 1) {
                // a hash code owned by a single string needs no tree
                runs[runStart] = 1;
                offsetTable[runStart << 1] = -1;
            } else {
                for (int k = runStart; k < runEnd; k++) {
                    final int slot = k << 1;
                    runs[k] = run;
                    // every member of the run points at the same tree
                    offsetTable[slot] = nextTree;
                    // number of indices back to the first duplicate
                    offsetTable[slot + 1] = k - runStart;
                }
                // reserve the tree's entries, including the extra one that records depth
                nextTree += Integer.highestOneBit(run - 1) << 1;
            }
            runStart = runEnd;
        }
    }

    final int[] tree;
    final int[] treeOffsets;
    if (nextTree == 0) {
        // every hash code is unique: lookups need nothing beyond the sorted codes
        tree = null;
        treeOffsets = null;
    } else {
        // build one decision tree per run of colliding strings
        tree = new int[nextTree * 2];
        int pos = 0;
        while (pos < count) {
            final int run = runs[pos];
            if (run > 1) {
                generatePivots(values, pos, run, tree, offsetTable[pos << 1]);
            }
            pos += run;
        }
        treeOffsets = offsetTable;
    }

    // publish the state
    this.hashes = codes;
    this.offsets = treeOffsets;
    this.pivots = tree;
    this.range = new HashRange(0, count - 1);
}
// hash generator methods
/**
 * Reports the range of hash values produced by this instance.
 *
 * @return the range stored in this instance's {@code range} field
 */
@Override
public HashRange getRange() {
    return this.range;
}
/**
 * Computes the hash of the supplied string as a {@link BigInteger}.
 *
 * @param value
 *            the string to hash, not null
 * @return the result of {@code hash(value)} wrapped in a BigInteger
 */
@Override
public BigInteger hashAsBigInt(String value) {
    final long hashValue = hash(value);
    return BigInteger.valueOf(hashValue);
}
//TODO decide whether to throw an IAE if -1 is returned from hash
/**
 * Computes the hash of the supplied string as an int.
 *
 * @param value
 *            the string to hash, not null
 * @return the result of {@code hash(value)}, unchanged
 */
@Override
public int hashAsInt(String value) {
    final int hashValue = hash(value);
    return hashValue;
}
/**
 * Computes the hash of the supplied string as a long.
 *
 * @param value
 *            the string to hash, not null
 * @return the result of {@code hash(value)} widened to a long
 */
@Override
public long hashAsLong(String value) {
    return (long) hash(value);
}
/**
 * Generates a hashcode for the supplied string.
 *
 * The string's own hash code is located in the sorted {@code hashes} array by
 * binary search. If that hash code is shared by several of the constructed
 * strings, the decision tree recorded for that group is walked to pick one
 * position within the group.
 *
 * A string that was not supplied at construction may still be passed in. It
 * never causes an index error here: a string too short to have a character at
 * a pivot position is simply treated as taking the left branch.
 *
 * @param value
 *            any string, not null
 * @return a minimal hashcode for the supplied string, or a negative value if
 *         the string is recognised as not being one of the constructed
 *         strings. The negative value is the raw result of
 *         {@link Arrays#binarySearch(int[], int)} when the string's hash code
 *         is unknown, and -1 when the decision tree rejects it; callers should
 *         test for {@code < 0} rather than {@code == -1}.
 */
private int hash(String value) {
    final int h = value.hashCode();
    final int index = Arrays.binarySearch(hashes, h);
    final int[] pivots = this.pivots;
    // no decision trees exist (all hash codes unique), or the hash code is unknown:
    // the search result is already the answer
    if (pivots == null || index < 0) return index;
    final int offset = offsets[index << 1];
    // the constructor stores -1 for a hash code that belongs to exactly one string
    if (offset == -1) return index;
    // the pair at the tree's offset holds the number of levels to descend and
    // the bound that leaf positions are checked against below
    final int depth = pivots[(offset << 1)    ];
    final int count = pivots[(offset << 1) + 1];
    final int valueLength = value.length();
    int i = 0;
    for (int d = 0; d < depth; d++) {
        // node i of level d sits (1 << d) + i entries after the tree's offset
        final int t = (offset + (1 << d) + i) << 1;
        final int part = pivots[t    ];
        final int comp = pivots[t + 1];
        final boolean right;
        if (part == Integer.MIN_VALUE) { //easy case - no right value
            right = false;
        } else if (part == -1) { //compare length
            right = valueLength > comp;
        } else { //compare the character at the pivot position
            // A string that is not one of the constructed values can be shorter
            // than the pivot position; guard so that it branches left instead of
            // throwing StringIndexOutOfBoundsException.
            // NOTE(review): left is chosen because the constructor orders by
            // length before characters, so a shorter string sorts first -
            // confirm against generatePivots.
            right = part < valueLength && value.charAt(part) > (char) comp;
        }
        i <<= 1;
        if (right) i++;
    }
    // leaf positions at or beyond count do not correspond to a string; otherwise
    // step back to the first string of the group and add the leaf position
    return i >= count ? -1 : index + i - offsets[(index << 1) + 1];
}
}