-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speedup PriorityQueue a little #13936
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -117,7 +117,8 @@ public PriorityQueue(int maxSize, Supplier<T> sentinelObjectSupplier) { | |
* ArrayIndexOutOfBoundsException} is thrown. | ||
*/ | ||
public void addAll(Collection<T> elements) { | ||
if (this.size + elements.size() > this.maxSize) { | ||
int size = this.size; | ||
if (size + elements.size() > this.maxSize) { | ||
throw new ArrayIndexOutOfBoundsException( | ||
"Cannot add " | ||
+ elements.size() | ||
|
@@ -128,15 +129,17 @@ public void addAll(Collection<T> elements) { | |
// Heap with size S always takes first S elements of the array, | ||
// and thus it's safe to fill array further - no actual non-sentinel value will be overwritten. | ||
Iterator<T> iterator = elements.iterator(); | ||
var heap = this.heap; | ||
while (iterator.hasNext()) { | ||
this.heap[size + 1] = iterator.next(); | ||
this.size++; | ||
heap[size + 1] = iterator.next(); | ||
size++; | ||
} | ||
|
||
// The loop goes down to 1 as heap is 1-based not 0-based. | ||
for (int i = (size >>> 1); i >= 1; i--) { | ||
downHeap(i); | ||
downHeap(i, heap, size); | ||
} | ||
this.size = size; | ||
} | ||
|
||
/** | ||
|
@@ -156,9 +159,10 @@ public void addAll(Collection<T> elements) { | |
public final T add(T element) { | ||
// don't modify size until we know heap access didn't throw AIOOB. | ||
int index = size + 1; | ||
var heap = this.heap; | ||
heap[index] = element; | ||
size = index; | ||
upHeap(index); | ||
upHeap(index, heap); | ||
return heap[1]; | ||
} | ||
|
||
|
@@ -193,12 +197,15 @@ public final T top() { | |
|
||
/** Removes and returns the least element of the PriorityQueue in log(size) time. */ | ||
public final T pop() { | ||
if (size > 0) { | ||
int s = size; | ||
if (s > 0) { | ||
var heap = this.heap; | ||
T result = heap[1]; // save first value | ||
heap[1] = heap[size]; // move last to first | ||
heap[size] = null; // permit GC of objects | ||
size--; | ||
downHeap(1); // adjust heap | ||
heap[1] = heap[s]; // move last to first | ||
heap[s] = null; // permit GC of objects | ||
s--; | ||
size = s; | ||
downHeap(1, heap, s); // adjust heap | ||
return result; | ||
} else { | ||
return null; | ||
|
@@ -225,7 +232,8 @@ public final T pop() { | |
* @return the new 'top' element. | ||
*/ | ||
public final T updateTop() { | ||
downHeap(1); | ||
var heap = this.heap; | ||
downHeap(1, heap, size); | ||
return heap[1]; | ||
} | ||
|
||
|
@@ -242,7 +250,8 @@ public final int size() { | |
|
||
/** Removes all entries from the PriorityQueue. */ | ||
public final void clear() { | ||
for (int i = 0; i <= size; i++) { | ||
var heap = this.heap; | ||
for (int i = 0, to = size; i <= to; i++) { | ||
heap[i] = null; | ||
} | ||
size = 0; | ||
|
@@ -254,14 +263,15 @@ public final void clear() { | |
* constant remove time but the trade-off would be extra cost to all additions/insertions) | ||
*/ | ||
public final boolean remove(T element) { | ||
for (int i = 1; i <= size; i++) { | ||
var heap = this.heap; | ||
for (int i = 1, to = size; i <= to; i++) { | ||
if (heap[i] == element) { | ||
heap[i] = heap[size]; | ||
heap[size] = null; // permit GC of objects | ||
size--; | ||
if (i <= size) { | ||
if (!upHeap(i)) { | ||
downHeap(i); | ||
if (!upHeap(i, heap)) { | ||
downHeap(i, heap, size); | ||
} | ||
} | ||
return true; | ||
|
@@ -270,7 +280,7 @@ public final boolean remove(T element) { | |
return false; | ||
} | ||
|
||
private final boolean upHeap(int origPos) { | ||
private boolean upHeap(int origPos, T[] heap) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd create a local heap variable (var heap = this.heap) locally in this method, not pass it as an argument. It is confusing why you'd want it as an argument. I agree with Robert here that we should perhaps see long-term maintenance as worth the tiny performance benefit (although I think assigning to a local variable within the method would yield the same result). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not quite, the idea was that I already have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
With this class in particular I'm not sure the argument holds. Isn't the whole point of it the ability to mutate top and resort via There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I did understand the intention but I think the difference, if any, will be noticeable only if the loop doesn't hoist out the field read (which, I think it should?). My suggestion keeps the variables local, which helps in understanding of what it does. But anyway. I'm not entirely sold on these low-level optimizations that target c2/hotspot. There is so many moving parts here... operating system and CPU included. Eh.
I believe the differences were also functional - insertWithOverflow is one particular example that comes to mind and would require more complex logic in the JDK's PQ. Another is resigning from one level of indirection (method instead of Comparator) - these choices predate a lot of newer Java's offerings - perhaps it could be implemented in a different way now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whether or not it's worth doing this kind of optimization for the observed gain is a tricky question. From the perspective of a user of a large (and read-heavy) ES, Opensearch or similar deployment, an O(1%) gain might translate into a lot of dollars saved and this kind of thing is well worth the effort. That said :) as far as the technical details go I don't think it can hoist out those reads and it's not an exclusively C2/hotstpot specific thing either. Since Java allows using reflection to update final field values (except for fields that are either static, on a record or on a hidden classs) the compiler can't hoist the field access out of the loop I think (maybe in some happy cases escape analysis helps here). resultsTaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value HighTermTitleBDVSort 25.72 (7.0%) 25.77 (6.5%) 0.2% ( -12% - 14%) 0.897 BrowseRandomLabelSSDVFacets 3.37 (6.0%) 3.40 (4.2%) 1.0% ( -8% - 11%) 0.409 OrHighMed 214.06 (3.7%) 216.37 (3.8%) 1.1% ( -6% - 8%) 0.206 OrHighNotHigh 353.55 (8.4%) 358.52 (8.9%) 1.4% ( -14% - 20%) 0.475 AndHighHigh 111.32 (4.9%) 113.08 (5.5%) 1.6% ( -8% - 12%) 0.179 OrNotHighHigh 567.88 (4.8%) 577.94 (4.9%) 1.8% ( -7% - 12%) 0.108 PKLookup 241.21 (2.1%) 245.53 (2.1%) 1.8% ( -2% - 6%) 0.000 HighTerm 455.94 (6.6%) 464.35 (7.5%) 1.8% ( -11% - 17%) 0.250 MedTerm 590.06 (6.5%) 601.24 (6.0%) 1.9% ( -9% - 15%) 0.182 AndHighMed 156.22 (3.1%) 159.19 (2.9%) 1.9% ( -3% - 8%) 0.005 LowTerm 750.87 (4.6%) 765.45 (4.2%) 1.9% ( -6% - 11%) 0.052 BrowseRandomLabelTaxoFacets 4.48 (8.6%) 4.57 (3.9%) 2.0% ( -9% - 15%) 0.182 OrNotHighMed 479.29 (4.6%) 489.00 (5.4%) 2.0% ( -7% - 12%) 0.074 HighTermMonthSort 1515.68 (6.4%) 1546.97 (7.0%) 2.1% ( -10% - 16%) 0.171 OrHighHigh 85.48 (4.6%) 87.32 (5.3%) 2.2% ( -7% - 12%) 0.055 MedTermDayTaxoFacets 19.13 (3.0%) 19.55 (4.1%) 2.2% ( -4% - 9%) 0.007 MedIntervalsOrdered 28.59 (6.3%) 29.23 (4.7%) 2.2% ( -8% - 14%) 0.079 OrHighLow 610.70 (5.0%) 624.94 (5.0%) 2.3% ( -7% - 13%) 0.040 OrHighNotMed 474.52 (5.5%) 485.78 (5.7%) 2.4% ( -8% - 14%) 0.061 Fuzzy2 66.51 (3.2%) 68.09 (3.0%) 2.4% ( -3% - 8%) 0.001 BrowseDateSSDVFacets 1.24 (7.7%) 1.27 (8.1%) 2.4% ( -12% - 19%) 0.181 MedSpanNear 119.05 (4.4%) 121.94 (4.4%) 2.4% ( -6% - 11%) 0.016 HighTermTitleSort 76.83 (4.8%) 78.72 (3.7%) 2.5% ( -5% - 11%) 0.011 AndHighHighDayTaxoFacets 14.60 (3.8%) 14.96 (3.5%) 2.5% ( -4% - 10%) 0.003 BrowseMonthTaxoFacets 11.04 (38.5%) 11.32 (40.3%) 2.5% ( -55% - 132%) 0.778 OrNotHighLow 1089.24 (4.0%) 1117.30 (4.0%) 2.6% ( -5% - 10%) 0.004 TermDTSort 188.79 (4.6%) 193.74 (4.9%) 2.6% ( -6% - 12%) 0.015 Wildcard 426.59 (4.2%) 437.79 (4.2%) 2.6% ( -5% - 11%) 0.006 MedPhrase 78.10 (3.4%) 80.38 (3.2%) 2.9% ( -3% - 9%) 0.000 Prefix3 1068.70 (7.7%) 1100.07 (7.7%) 2.9% ( -11% - 19%) 0.094 AndHighLow 1546.10 (5.3%) 1591.97 (6.0%) 3.0% ( -7% - 15%) 0.020 LowIntervalsOrdered 134.11 (6.2%) 138.10 (5.0%) 3.0% ( -7% - 15%) 0.019 MedSloppyPhrase 47.07 (4.5%) 48.49 (3.7%) 3.0% ( -5% - 11%) 0.001 AndHighMedDayTaxoFacets 65.36 (2.3%) 67.38 (2.3%) 3.1% ( -1% - 7%) 0.000 LowSpanNear 175.93 (3.7%) 181.36 (4.7%) 3.1% ( -5% - 11%) 0.001 HighPhrase 131.54 (7.2%) 135.70 (5.8%) 3.2% ( -9% - 17%) 0.033 Fuzzy1 108.08 (3.4%) 111.62 (2.1%) 3.3% ( -2% - 9%) 0.000 BrowseDayOfYearSSDVFacets 4.52 (7.7%) 4.67 (7.9%) 3.4% ( -11% - 20%) 0.056 OrHighNotLow 550.21 (7.0%) 569.01 (7.9%) 3.4% ( -10% - 19%) 0.043 HighTermDayOfYearSort 380.03 (7.6%) 393.27 (6.6%) 3.5% ( -9% - 19%) 0.030 HighSpanNear 11.37 (4.5%) 11.77 (6.0%) 3.5% ( -6% - 14%) 0.004 Respell 54.77 (1.6%) 56.69 (1.7%) 3.5% ( 0% - 6%) 0.000 HighSloppyPhrase 30.28 (5.2%) 31.40 (4.8%) 3.7% ( -5% - 14%) 0.001 LowPhrase 76.63 (5.6%) 79.65 (5.6%) 3.9% ( -6% - 16%) 0.002 OrHighMedDayTaxoFacets 6.78 (6.2%) 7.05 (6.9%) 4.0% ( -8% - 18%) 0.007 IntNRQ 78.26 (6.5%) 81.38 (7.2%) 4.0% ( -9% - 18%) 0.010 LowSloppyPhrase 65.45 (6.6%) 68.14 (6.0%) 4.1% ( -7% - 17%) 0.004 HighIntervalsOrdered 9.16 (6.5%) 9.59 (5.9%) 4.6% ( -7% - 18%) 0.001 BrowseMonthSSDVFacets 4.48 (10.4%) 4.70 (12.4%) 4.8% ( -16% - 30%) 0.062 BrowseDateTaxoFacets 5.38 (10.8%) 5.67 (12.7%) 5.4% ( -16% - 32%) 0.043 BrowseDayOfYearTaxoFacets 5.44 (10.6%) 5.74 (12.7%) 5.5% ( -16% - 32%) 0.039 So to me it feels like manually hoisting field access is a generally valid optimization in a world that has reflective writes to final fields. To me, reducing field access is not in the same category as e.g. extracting cold paths artifically to make a method inline or other such tricks that are specific to C2 and hardware. This is just giving the compiler input that it cannot practically work out with the constraints imposed by the language and the JIT's runtime cost needing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think there's anything in the spec preventing it from doing so. The final keyword is indeed for the java compiler, not for the jvm, but... you know - it's easy to show that c2 can happily hoist out field reads, try it.
This aside, I am not rejecting the change - I just suggested to rename one local variable (s) and to remove method parameter in favor of a single local variable read - this should result in identical code to what your 1% gain was producing, if my gut feeling is right. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We've done such optimizations in the past for very hot hotspots in Lucene, e.g. |
||
int i = origPos; | ||
T node = heap[i]; // save bottom node | ||
int j = i >>> 1; | ||
|
@@ -283,7 +293,7 @@ private final boolean upHeap(int origPos) { | |
return i != origPos; | ||
} | ||
|
||
private final void downHeap(int i) { | ||
private void downHeap(int i, T[] heap, int size) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are we removing |
||
T node = heap[i]; // save top node | ||
int j = i << 1; // find smaller child | ||
int k = j + 1; | ||
|
@@ -313,7 +323,7 @@ protected final Object[] getHeapArray() { | |
|
||
@Override | ||
public Iterator<T> iterator() { | ||
return new Iterator<T>() { | ||
return new Iterator<>() { | ||
|
||
int i = 1; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add comments explaining that the local variable assignment is done on purpose for performance reasons? We don't want a future refactoring to "simplify" this code and cut back to
this.size
.