forked from I2P_Developers/i2p.i2p
318 lines
11 KiB
Java
318 lines
11 KiB
Java
/*
|
|
* Copyright 2010 Tom Gibara
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
*/
|
|
package com.tomgibara.crinch.hashing;
|
|
|
|
import java.math.BigInteger;
|
|
import java.util.Arrays;
|
|
import java.util.Comparator;
|
|
|
|
/**
|
|
* <p>
|
|
* A "minimal perfect hash" for Strings. After construction with an array of
|
|
* <em>n</em> unique non-null strings, an instance of this class will return a
|
|
* unique hash value <em>h</em> (0 <= h < n) for any string <em>s</em> in the
|
|
* array. A negative has value will typically be returned for a string that is
|
|
* not in the array.
|
|
* </p>
|
|
*
|
|
* <p>
|
|
* However, the supplied array is <em>not</em> retained. This means that the
|
|
* implementation cannot necessarily confirm that a string is not in the
|
|
* supplied array. Where this implementation cannot distinguish that a string is
|
|
* not in the array, a 'valid' hash value may be returned. Under no
|
|
* circumstances will a hash value be returned that is greater than or equal to
|
|
* <em>n</em>.
|
|
* </p>
|
|
*
|
|
* <p>
|
|
* <strong>IMPORTANT NOTE:</strong> The array of strings supplied to the
|
|
* constructor will be mutated: it is re-ordered so that
|
|
* <code>hash(a[i]) == i</code>. Application code must generally use this
|
|
* information to map hash values back onto the appropriate string value.
|
|
* </p>
|
|
*
|
|
* <p>
|
|
* <strong>NOTE:</strong> Good performance of this algorithm is predicated on
|
|
* string hash values being cached by the <code>String</code> class. Experience
|
|
* indicates that is is a good assumption.
|
|
* </p>
|
|
*
|
|
*
|
|
* @author Tom Gibara
|
|
*/
|
|
|
|
public class PerfectStringHash implements Hash<String> {
|
|
|
|
// statics
|
|
|
|
/**
|
|
* Comparator used to order the supplied string array. Hashcodes take
|
|
* priority, we will do a binary search on those. Otherwise, lengths take
|
|
* priority over character ordering because the hash algorithm prefers to
|
|
* compare lengths, it's faster.
|
|
*/
|
|
|
|
private static final Comparator<String> comparator = new Comparator<String>() {
|
|
@Override
|
|
public int compare(String s1, String s2) {
|
|
final int h1 = s1.hashCode();
|
|
final int h2 = s2.hashCode();
|
|
if (h1 == h2) {
|
|
final int d = s1.length() - s2.length();
|
|
return d == 0 ? s1.compareTo(s2) : d;
|
|
}
|
|
return h1 < h2 ? -1 : 1;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Builds a (typically v. small) decision tree for distinguishing strings
|
|
* that share the same hash value.
|
|
*
|
|
* @param values
|
|
* the string values to distinguish
|
|
* @param start
|
|
* the index from which the values should be read
|
|
* @param length
|
|
* the number of string values that need to be distinguished
|
|
* @param pivots
|
|
* the array that will hold our decision nodes
|
|
* @param pivotIndex
|
|
* the index at which the tree should be written
|
|
*/
|
|
private static void generatePivots(String[] values, int start, int length, int[] pivots, int pivotIndex) {
|
|
final int capacity = Integer.highestOneBit(length - 1) << 1;
|
|
final int depth = Integer.numberOfTrailingZeros(capacity);
|
|
pivots[ pivotIndex << 1 ] = depth;
|
|
pivots[(pivotIndex << 1) + 1] = length;
|
|
pivotIndex++;
|
|
//build the array
|
|
for (int i = 0; i < depth; i++) {
|
|
int step = capacity >> i;
|
|
for (int j = (1 << (depth-i-1)) - 1; j < capacity; j += step) {
|
|
final int part;
|
|
final int comp;
|
|
if (j >= length - 1) {
|
|
part = Integer.MIN_VALUE;
|
|
comp = 0;
|
|
} else {
|
|
final String v1 = values[start + j];
|
|
final String v2 = values[start + j + 1];
|
|
final int l1 = v1.length();
|
|
final int l2 = v2.length();
|
|
if (l1 == l2) {
|
|
int tPart = -1;
|
|
int tComp = -1;
|
|
for (int k = 0; k < l1; k++) {
|
|
final char c1 = v1.charAt(k);
|
|
final char c2 = v2.charAt(k);
|
|
if (c1 == c2) continue;
|
|
if (c1 < c2) { //must occur at some point because we have already checked that the two strings are unequal
|
|
tPart = k;
|
|
tComp = c1;
|
|
} else {
|
|
//shouldn't be possible - we've sorted the strings to avoid this case
|
|
throw new IllegalStateException();
|
|
}
|
|
break;
|
|
}
|
|
//check if we've been passed a duplicated value
|
|
if (tPart == -1) throw new IllegalArgumentException("duplicate value: " + v1);
|
|
part = tPart;
|
|
comp = tComp;
|
|
} else {
|
|
part = -1;
|
|
comp = l1;
|
|
}
|
|
}
|
|
pivots[ pivotIndex<<1 ] = part;
|
|
pivots[(pivotIndex<<1) + 1] = comp;
|
|
pivotIndex++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// fields
|
|
|
|
/**
|
|
* The hashcodes of the supplied strings.
|
|
*/
|
|
|
|
private final int[] hashes;
|
|
|
|
/**
|
|
* Stores two ints for every string, an offset into the pivot array (-1 if
|
|
* not necessary) and the depth of the decision tree that is rooted there.
|
|
*/
|
|
|
|
private final int[] offsets;
|
|
|
|
/**
|
|
* Stores two ints for every decision, the index at which a character
|
|
* comparison needs to be made, followed by the character value to be
|
|
* compared against; or -1 to indicate a length comparison, followed by the
|
|
* length to be compared against.
|
|
*/
|
|
|
|
private final int[] pivots;
|
|
|
|
/**
|
|
* Cache a range object which indicates the range of hash values generated.
|
|
*/
|
|
|
|
private final HashRange range;
|
|
|
|
/**
|
|
* Constructs a minimal perfect string hashing over the supplied strings.
|
|
*
|
|
* @param values
|
|
* an array of unique non-null strings that will be reordered
|
|
* such that <code>hash(values[i]) == i</code>.
|
|
*/
|
|
|
|
public PerfectStringHash(final String values[]) {
|
|
final int length = values.length;
|
|
if (length == 0) throw new IllegalArgumentException("No values supplied");
|
|
|
|
final int[] hashes = new int[length];
|
|
final int[] offsets = new int[2 * length];
|
|
final int[] runLengths = new int[length];
|
|
|
|
//sort values so that we can assume ordering by hashcode, length and char[]
|
|
Arrays.sort(values, comparator);
|
|
|
|
//pull the hashcodes into an array for analysis
|
|
for (int i = 0; i < length; i++) hashes[i] = values[i].hashCode();
|
|
|
|
//test for unique hashes
|
|
int offset = 0;
|
|
if (length > 1) {
|
|
int previousHash = hashes[0];
|
|
int runLength = 1;
|
|
for (int i = 1; i <= length; i++) {
|
|
int currentHash = i == length ? ~previousHash : hashes[i];
|
|
if (currentHash == previousHash) {
|
|
runLength++;
|
|
} else {
|
|
if (runLength > 1) {
|
|
final int firstIndex = i - runLength;
|
|
for (int j = i - 1; j >= firstIndex; j--) {
|
|
runLengths[j] = runLength;
|
|
//offset points to the first node in decision tree
|
|
offsets[ j<<1 ] = offset;
|
|
//adjustment is number of indices to first duplicate
|
|
offsets[(j<<1) + 1] = j - firstIndex;
|
|
}
|
|
//extra one for recording depth
|
|
offset += (Integer.highestOneBit(runLength - 1) << 1);
|
|
runLength = 1;
|
|
} else {
|
|
runLengths[i-1] = 1;
|
|
offsets[(i-1)<<1] = -1;
|
|
}
|
|
}
|
|
previousHash = currentHash;
|
|
}
|
|
}
|
|
|
|
//shortcut for when all hashes are unique
|
|
if (offset == 0) {
|
|
this.hashes = hashes;
|
|
this.offsets = null;
|
|
this.pivots = null;
|
|
this.range = new HashRange(0, length - 1);
|
|
return;
|
|
}
|
|
|
|
//build the decision trees
|
|
final int[] pivots = new int[offset * 2];
|
|
for (int i = 0; i < length;) {
|
|
final int runLength = runLengths[i];
|
|
if (runLength > 1) generatePivots(values, i, runLength, pivots, (int) offsets[i << 1]);
|
|
i += runLength;
|
|
}
|
|
|
|
//setup our state
|
|
this.pivots = pivots;
|
|
this.offsets = offsets;
|
|
this.hashes = hashes;
|
|
this.range = new HashRange(0, length - 1);
|
|
}
|
|
|
|
// hash generator methods
|
|
|
|
@Override
|
|
public HashRange getRange() {
|
|
return range;
|
|
}
|
|
|
|
@Override
|
|
public BigInteger hashAsBigInt(String value) {
|
|
return BigInteger.valueOf(hash(value));
|
|
}
|
|
|
|
//TODO decide whether to throw an IAE if -1 is returned from hash
|
|
@Override
|
|
public int hashAsInt(String value) {
|
|
return hash(value);
|
|
}
|
|
|
|
@Override
|
|
public long hashAsLong(String value) {
|
|
return hash(value);
|
|
}
|
|
|
|
/**
|
|
* Generates a hashcode for the supplied string.
|
|
*
|
|
* @param value
|
|
* any string, not null
|
|
* @return a minimal hashcode for the supplied string, or -1
|
|
*/
|
|
|
|
private int hash(String value) {
|
|
final int h = value.hashCode();
|
|
final int index = Arrays.binarySearch(hashes, h);
|
|
final int[] pivots = this.pivots;
|
|
if (pivots == null || index < 0) return index;
|
|
|
|
final int offset = offsets[index << 1];
|
|
if (offset == -1) return index;
|
|
|
|
final int depth = pivots[(offset << 1) ];
|
|
final int count = pivots[(offset << 1) + 1];
|
|
int i = 0;
|
|
for (int d = 0; d < depth; d++) {
|
|
final int t = (offset + (1 << d) + i) << 1;
|
|
final int part = pivots[t ];
|
|
final int comp = pivots[t + 1];
|
|
final boolean right;
|
|
if (part == Integer.MIN_VALUE) { //easy case - no right value
|
|
right = false;
|
|
} else if (part == -1) { //compare length
|
|
right = value.length() > comp;
|
|
} else { //lengths are equal, compare character
|
|
right = value.charAt(part) > (char) comp;
|
|
}
|
|
i <<= 1;
|
|
if (right) i++;
|
|
}
|
|
return i >= count ? -1 : index + i - offsets[(index << 1) + 1];
|
|
}
|
|
|
|
}
|