Skip to content

Commit 880c551

Browse files
committed
re2: hoist a few loads out of BitState ShouldVisit
Caching the fields from prog_ in the outer loop instead of reloading them inside each call to ShouldVisit makes the fast search path of BitState noticeably faster.

Thanks to @nafi3000 for the idea and the initial patch.

Summary (change in sec/op vs base; "~" means no statistically significant difference):

benchmark \ host                                s7         mac
                                                vs base    vs base
Search_Success1_BitState/size=8                 ~          ~
Search_Success1_BitState/size=64                -3.63%     -2.04%
Search_Success1_BitState/size=512               -15.83%    -6.83%
Search_Success1_BitState/size=4096              -25.59%    -10.74%
Search_Success1_BitState/size=32768             -27.85%    -11.85%
Search_Success1_BitState/size=262144            -28.03%    -11.94%
Search_Success1_BitState/size=2097152           -27.99%    -11.80%
Search_Success1_CachedBitState/size=8           -17.81%    -8.16%
Search_Success1_CachedBitState/size=64          -27.17%    -10.69%
Search_Success1_CachedBitState/size=512         -27.63%    -11.56%
Search_Success1_CachedBitState/size=4096        -27.88%    -11.61%
Search_Success1_CachedBitState/size=32768       -27.90%    -11.60%
Search_Success1_CachedBitState/size=262144      -27.73%    -11.55%
Search_Success1_CachedBitState/size=2097152     -27.88%    -11.54%
Search_AltMatch_BitState/size=8                 +0.83%     ~
Search_AltMatch_BitState/size=64                +0.88%     ~
Search_AltMatch_BitState/size=512               +0.88%     ~
Search_AltMatch_BitState/size=4096              +1.02%     ~
Search_AltMatch_BitState/size=32768             +1.02%     ~
Search_AltMatch_BitState/size=262144            +1.22%     ~
Search_AltMatch_BitState/size=2097152           +1.10%     ~
Search_AltMatch_BitState/size=16777216          ~          ~
Search_AltMatch_CachedBitState/size=8           -1.79%     ~
Search_AltMatch_CachedBitState/size=64          -3.23%     ~
Search_AltMatch_CachedBitState/size=512         -3.39%     ~
Search_AltMatch_CachedBitState/size=4096        ~          ~
Search_AltMatch_CachedBitState/size=32768       ~          ~
Search_AltMatch_CachedBitState/size=262144      ~          ~
Search_AltMatch_CachedBitState/size=2097152     ~          ~
Search_AltMatch_CachedBitState/size=16777216    ~          ~

host: s7
                                              │ old            │ new                                           │
                                              │ sec/op         │ sec/op          vs base                       │
Search_Success1_BitState/size=8                 50.55µ ± 0%      50.58µ ± 0%     ~ (p=0.806 n=25)
Search_Success1_BitState/size=64                58.28µ ± 0%      56.16µ ± 0%     -3.63% (p=0.000 n=25)
Search_Success1_BitState/size=512               119.6µ ± 0%      100.7µ ± 0%     -15.83% (p=0.000 n=25)
Search_Success1_BitState/size=4096              612.7µ ± 0%      455.9µ ± 0%     -25.59% (p=0.000 n=25)
Search_Success1_BitState/size=32768             4.563m ± 0%      3.292m ± 0%     -27.85% (p=0.000 n=25)
Search_Success1_BitState/size=262144            36.09m ± 0%      25.98m ± 0%     -28.03% (p=0.000 n=25)
Search_Success1_BitState/size=2097152           288.6m ± 0%      207.8m ± 0%     -27.99% (p=0.000 n=25)
Search_Success1_CachedBitState/size=8           1.971µ ± 1%      1.620µ ± 0%     -17.81% (p=0.000 n=25)
Search_Success1_CachedBitState/size=64          9.829µ ± 1%      7.158µ ± 0%     -27.17% (p=0.000 n=25)
Search_Success1_CachedBitState/size=512         71.19µ ± 0%      51.52µ ± 0%     -27.63% (p=0.000 n=25)
Search_Success1_CachedBitState/size=4096        563.1µ ± 0%      406.1µ ± 0%     -27.88% (p=0.000 n=25)
Search_Success1_CachedBitState/size=32768       4.492m ± 0%      3.239m ± 0%     -27.90% (p=0.000 n=25)
Search_Success1_CachedBitState/size=262144      35.88m ± 0%      25.93m ± 0%     -27.73% (p=0.000 n=25)
Search_Success1_CachedBitState/size=2097152     287.3m ± 0%      207.2m ± 0%     -27.88% (p=0.000 n=25)
Search_AltMatch_BitState/size=8                 16.54µ ± 0%      16.68µ ± 0%     +0.83% (p=0.000 n=25)
Search_AltMatch_BitState/size=64                16.53µ ± 0%      16.68µ ± 0%     +0.88% (p=0.000 n=25)
Search_AltMatch_BitState/size=512               16.53µ ± 0%      16.68µ ± 0%     +0.88% (p=0.000 n=25)
Search_AltMatch_BitState/size=4096              16.62µ ± 0%      16.79µ ± 0%     +1.02% (p=0.000 n=25)
Search_AltMatch_BitState/size=32768             16.63µ ± 0%      16.80µ ± 0%     +1.02% (p=0.000 n=25)
Search_AltMatch_BitState/size=262144            17.05µ ± 0%      17.26µ ± 0%     +1.22% (p=0.000 n=25)
Search_AltMatch_BitState/size=2097152           22.36µ ± 0%      22.61µ ± 0%     +1.10% (p=0.000 n=25)
Search_AltMatch_BitState/size=16777216          508.3µ ± 1%      510.5µ ± 0%     ~ (p=0.202 n=25)
Search_AltMatch_CachedBitState/size=8           616.0n ± 1%      605.0n ± 2%     -1.79% (p=0.000 n=25)
Search_AltMatch_CachedBitState/size=64          620.0n ± 1%      600.0n ± 1%     -3.23% (p=0.000 n=25)
Search_AltMatch_CachedBitState/size=512         620.0n ± 1%      599.0n ± 1%     -3.39% (p=0.000 n=25)
Search_AltMatch_CachedBitState/size=4096        629.0n ± 1%      629.0n ± 0%     ~ (p=0.747 n=25)
Search_AltMatch_CachedBitState/size=32768       690.0n ± 0%      688.0n ± 1%     ~ (p=0.874 n=25)
Search_AltMatch_CachedBitState/size=262144      1.212µ ± 0%      1.210µ ± 0%     ~ (p=0.415 n=25)
Search_AltMatch_CachedBitState/size=2097152     5.988µ ± 2%      5.931µ ± 4%     ~ (p=0.066 n=25)
Search_AltMatch_CachedBitState/size=16777216    510.8µ ± 0%      512.0µ ± 0%     ~ (p=0.164 n=25)
geomean                                         70.97µ           62.83µ          -11.47%

host: mac
                                              │ old            │ new                                           │
                                              │ sec/op         │ sec/op          vs base                       │
Search_Success1_BitState/size=8                 33.09µ ± 1%      33.02µ ± 1%     ~ (p=0.003 n=25)
Search_Success1_BitState/size=64                37.52µ ± 1%      36.75µ ± 0%     -2.04% (p=0.000 n=25)
Search_Success1_BitState/size=512               71.32µ ± 1%      66.45µ ± 0%     -6.83% (p=0.000 n=25)
Search_Success1_BitState/size=4096              341.2µ ± 0%      304.6µ ± 0%     -10.74% (p=0.000 n=25)
Search_Success1_BitState/size=32768             2.505m ± 0%      2.208m ± 0%     -11.85% (p=0.000 n=25)
Search_Success1_BitState/size=262144            19.79m ± 0%      17.43m ± 0%     -11.94% (p=0.000 n=25)
Search_Success1_BitState/size=2097152           158.1m ± 0%      139.5m ± 0%     -11.80% (p=0.000 n=25)
Search_Success1_CachedBitState/size=8           1.201µ ± 0%      1.103µ ± 0%     -8.16% (p=0.000 n=25)
Search_Success1_CachedBitState/size=64          5.442µ ± 0%      4.860µ ± 0%     -10.69% (p=0.000 n=25)
Search_Success1_CachedBitState/size=512         39.18µ ± 0%      34.65µ ± 0%     -11.56% (p=0.000 n=25)
Search_Success1_CachedBitState/size=4096        308.7µ ± 0%      272.9µ ± 0%     -11.61% (p=0.000 n=25)
Search_Success1_CachedBitState/size=32768       2.466m ± 0%      2.180m ± 0%     -11.60% (p=0.000 n=25)
Search_Success1_CachedBitState/size=262144      19.71m ± 0%      17.44m ± 0%     -11.55% (p=0.000 n=25)
Search_Success1_CachedBitState/size=2097152     157.8m ± 0%      139.6m ± 0%     -11.54% (p=0.000 n=25)
Search_AltMatch_BitState/size=8                 11.48µ ± 0%      11.40µ ± 1%     ~ (p=0.573 n=25)
Search_AltMatch_BitState/size=64                11.51µ ± 0%      11.39µ ± 2%     ~ (p=0.200 n=25)
Search_AltMatch_BitState/size=512               11.41µ ± 1%      11.39µ ± 1%     ~ (p=0.866 n=25)
Search_AltMatch_BitState/size=4096              11.51µ ± 1%      11.39µ ± 2%     ~ (p=0.328 n=25)
Search_AltMatch_BitState/size=32768             11.60µ ± 0%      11.61µ ± 2%     ~ (p=0.711 n=25)
Search_AltMatch_BitState/size=262144            12.33µ ± 1%      12.20µ ± 1%     ~ (p=0.044 n=25)
Search_AltMatch_BitState/size=2097152           24.24µ ± 2%      25.27µ ± 7%     ~ (p=0.095 n=25)
Search_AltMatch_BitState/size=16777216          310.3µ ± 7%      280.1µ ± 2%     ~ (p=0.001 n=25)
Search_AltMatch_CachedBitState/size=8           375.0n ± 1%      375.0n ± 0%     ~ (p=0.260 n=25)
Search_AltMatch_CachedBitState/size=64          414.0n ± 6%      380.0n ± 14%    ~ (p=0.025 n=25)
Search_AltMatch_CachedBitState/size=512         377.0n ± 0%      378.0n ± 1%     ~ (p=0.775 n=25)
Search_AltMatch_CachedBitState/size=4096        414.0n ± 1%      412.0n ± 1%     ~ (p=0.008 n=25)
Search_AltMatch_CachedBitState/size=32768       524.0n ± 1%      526.0n ± 1%     ~ (p=0.248 n=25)
Search_AltMatch_CachedBitState/size=262144      2.825µ ± 4%      2.904µ ± 6%     ~ (p=0.342 n=25)
Search_AltMatch_CachedBitState/size=2097152     22.63µ ± 31%     22.63µ ± 9%     ~ (p=0.399 n=25)
Search_AltMatch_CachedBitState/size=16777216    304.0µ ± 6%      281.2µ ± 4%     ~ (p=0.052 n=25)
geomean                                         49.43µ           46.81µ          -5.31%
1 parent dcd1f64 commit 880c551

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

re2/bitstate.cc

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class BitState {
4949
bool longest, absl::string_view* submatch, int nsubmatch);
5050

5151
private:
52-
inline bool ShouldVisit(int id, const char* p);
52+
static inline bool ShouldVisit(absl::string_view text, uint64_t* visited, uint16_t id, const char* p);
5353
void Push(int id, const char* p);
5454
void GrowStack();
5555
bool TrySearch(int id, const char* p);
@@ -85,16 +85,19 @@ BitState::BitState(Prog* prog)
8585
njob_(0) {
8686
}
8787

88-
// Given id, which *must* be a list head, we can look up its list ID.
89-
// Then the question is: Should the search visit the (list ID, p) pair?
88+
// Given the text being searched and current visited state,
89+
// as well as a list ID, should the search visit the (list ID, p) pair?
9090
// If so, remember that it was visited so that the next time,
9191
// we don't repeat the visit.
92-
bool BitState::ShouldVisit(int id, const char* p) {
93-
int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
94-
static_cast<int>(p-text_.data());
95-
if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
92+
// We pass text and visited to this as a static method so that the
93+
// caller can do those loads once instead of this code dereferencing
94+
// them multiple times.
95+
bool BitState::ShouldVisit(absl::string_view text, uint64_t* visited, uint16_t list_id, const char* p) {
96+
int n = list_id * static_cast<int>(text.size()+1) +
97+
static_cast<int>(p-text.data());
98+
if (visited[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
9699
return false;
97-
visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
100+
visited[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
98101
return true;
99102
}
100103

@@ -140,10 +143,12 @@ void BitState::Push(int id, const char* p) {
140143
bool BitState::TrySearch(int id0, const char* p0) {
141144
bool matched = false;
142145
const char* end = text_.data() + text_.size();
146+
uint16_t* list_heads = prog_->list_heads();
147+
uint64_t* visited = visited_.data();
143148
njob_ = 0;
144149
// Push() no longer checks ShouldVisit(),
145150
// so we must perform the check ourselves.
146-
if (ShouldVisit(id0, p0))
151+
if (ShouldVisit(text_, visited, list_heads[id0], p0))
147152
Push(id0, p0);
148153
while (njob_ > 0) {
149154
// Pop job off stack.
@@ -237,7 +242,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
237242
// Sanity check: id is the head of its list, which must
238243
// be the case if id-1 is the last of *its* list. :)
239244
ABSL_DCHECK(id == 0 || prog_->inst(id-1)->last());
240-
if (ShouldVisit(id, p))
245+
if (ShouldVisit(text_, visited, list_heads[id], p))
241246
goto Loop;
242247
break;
243248

0 commit comments

Comments
 (0)