Skip to content

Commit cc65bd5

Browse files
authored
Implement optional flag for rotating index (#37)
LoonyRotate by default - add in ReadMe Add compilation flags to readme Pad loonyqueue to cachelines Add extra compilation assertions for flags 0.3.1
1 parent e420241 commit cc65bd5

File tree

5 files changed

+72
-10
lines changed

5 files changed

+72
-10
lines changed

README.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ tests and further documentation are to follow when time allows.
104104

105105
[The full API documentation is kept up-to-date on GitHub.](https://nim-works.github.io/loony/loony.html)
106106

107-
[The API documentation for the Ward submodule is found here.](https://nim-works.github.io/loony/loony/ward.html)
107+
[~~The API documentation for the Ward submodule is found here.~~](https://nim-works.github.io/loony/loony/ward.html) ~~*Wards are untested and are unlikely to remain in the library*~~
108108

109109
#### Memory Safety & Cache Coherence
110110

@@ -114,6 +114,19 @@ committed on the push operation and read on the pop operation; this is a
114114
higher-cost primitive. You can use `unsafePush` and `unsafePop` to manipulate
115115
a `LoonyQueue` without regard to cache coherency for ultimate performance.
116116

117+
The LoonyQueue itself is padded across cachelines, and by default, the slots
118+
are read and written to in a cyclic fashion over cachelines to reduce false
119+
sharing.
120+
121+
```
122+
Visual representation of rotating index (default 1024 slots)
123+
124+
| 64 slots | 64 slots | 64 slots |... (16 groups)
125+
| 0------- | 1------- | 2------- |...
126+
| -16----- | -17----- | -18----- |...
127+
| --32---- | --33---- | --34---- |...
128+
```
129+
117130
### Debugging
118131

119132
Pass `--d:loonyDebug` in compilation or with a config nimscript to use debug
@@ -140,8 +153,20 @@ debugNodeCounter:
140153
We recommend against changing these values unless you know what you are doing. The suggested max alignment is 16 to achieve drastically higher contention capacities. Compilation will fail if your alignment does not fit the slot count index.
141154

142155
`-d:loonyNodeAlignment=11` - Adjust node alignment to increase/decrease contention capacity
156+
143157
`-d:loonySlotCount=1024` - Adjust the number of slots in each node
144158

159+
`-d:loonyDebug=false` - Toggle debug counters and templates, see
160+
[debugging](#debugging). False by default.
161+
162+
`-d:loonyRotate=true` - Toggle rotation of the slot index, so that
163+
consecutive reads and writes to the slots of a loony queue land on
164+
different cachelines in a cyclic manner. True by default.
165+
166+
> While loonyRotate is enabled, the slot count must be a
167+
> power of 2. Error messages will indicate whether this
168+
> is a cause of compilation failure.
169+
145170
## What are Continuations?
146171

147172
If you've somehow missed the next big thing for Nim, see [CPS](https://github.com/nim-works/cps)

loony.nim

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,16 @@ type
2525

2626
LoonyQueue*[T] = ref LoonyQueueImpl[T]
2727
LoonyQueueImpl*[T] = object
28-
head : Atomic[TagPtr] ## Whereby node contains the slots and idx
29-
tail : Atomic[TagPtr] ## is the uint16 index of the slot array
30-
currTail : Atomic[NodePtr] ## 8 bytes Current NodePtr
28+
head {.align: 128.}: Atomic[TagPtr] ## Whereby node contains the slots and idx
29+
tail {.align: 128.}: Atomic[TagPtr] ## is the uint16 index of the slot array
30+
currTail {.align: 128.}: Atomic[NodePtr] ## 8 bytes Current NodePtr
31+
# Align to 128 bytes to avoid false sharing, see:
32+
# https://stackoverflow.com/questions/72126606/should-the-cache-padding-size-of-x86-64-be-128-bytes
33+
# Plenty of architectural differences can impact whether
34+
# or not 128 bytes is superior alignment to 64 bytes, but
35+
# considering the small cost that this change adds to the
36+
# memory consumption of the loony queue object, it is
37+
# recommended.
3138

3239
## Result types for the private
3340
## advHead and advTail functions

loony.nimble

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version = "0.3.0"
1+
version = "0.3.1"
22
author = "cabboose"
33
description = "Fast mpmc queue with sympathetic memory behavior"
44
license = "MIT"

loony/node.nim

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,17 @@ else:
6969
template incEnqPathCounter*(): untyped = discard
7070
template incDeqPathCounter*(): untyped = discard
7171

72+
template prn*(idx: uint16): uint16 =
  ## prn ('pro re nata', i.e. "when required") maps a logical slot index
  ## to the index that is actually used to address the slot array.
  ##
  ## With `loonyRotate` enabled the index is shifted left by `lShiftBits`,
  ## masked with `loonySlotCount - 1`, and OR-ed with the index shifted
  ## right by `rShiftBits`. With it disabled the index is returned as is.
  ##
  ## `idx` appears twice in the expansion, so pass a side-effect free
  ## expression.
  when not loonyRotate:
    idx
  else:
    # `and` binds tighter than `or` in Nim; the extra parentheses only
    # spell that grouping out.
    ((idx shl lShiftBits) and (loonySlotCount - 1)) or (idx shr rShiftBits)
82+
7283
template toNodePtr*(pt: uint | ptr Node): NodePtr =
7384
# Convert ptr Node into NodePtr uint
7485
cast[NodePtr](pt)
@@ -105,7 +116,7 @@ proc fetchAddSlot*(t: var Node, idx: uint16, w: uint, moorder: MemoryOrder): uin
105116
## Remembering that the pointer has 3 tail bits clear; these are
106117
## reserved and increased atomically to indicate RESUME, READER, WRITER
107118
## statuship.
108-
t.slots[idx].fetchAdd(w, order = moorder)
119+
t.slots[prn idx].fetchAdd(w, order = moorder)
109120
110121
proc compareAndSwapNext*(t: var Node, expect: var uint, swap: uint): bool =
111122
t.next.compareExchange(expect, swap, moRelease, moRelaxed)
@@ -131,7 +142,7 @@ proc allocNode*[T](pel: T): ptr Node =
131142
proc tryReclaim*(node: var Node; start: uint16) =
132143
block done:
133144
for i in start..<N:
134-
template s: Atomic[uint] = node.slots[i]
145+
template s: Atomic[uint] = node.slots[prn i]
135146
if (s.load(order = moAcquire) and CONSUMED) != CONSUMED:
136147
var prev = s.fetchAdd(RESUME, order = moRelaxed) and CONSUMED
137148
if prev != CONSUMED:

loony/spec.nim

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,36 @@
1-
import std/atomics
1+
import std/[atomics, math, strformat]
22

33
const
4-
loonyNodeAlignment {.intdefine.} = 11
5-
loonySlotCount {.intdefine.} = 1024
4+
loonyNodeAlignment* {.intdefine.} = 11
5+
loonySlotCount* {.intdefine.} = 1024
66

77
loonyIsolated* {.booldefine.} = false ## Indicate that loony should
88
## assert that all references passing through the queue have a single
99
## owner. Note that in particular, child Continuations have cycles,
1010
## which will trigger a failure of this assertion.
1111

12+
loonyRotate* {.booldefine.} = true ## Indicate that loony should rotate
13+
## the slots in the queue to avoid contention on the same cache line.
14+
## This is useful when the queue is shared between multiple threads.
15+
## Note that this will only work if the number of slots is a power of 2.
16+
17+
when loonyRotate:
  # TODO Impl dynamic cache line size detection
  const
    # Hard-coded until the TODO above is addressed.
    cacheLineSize = 64
    # log2(cacheLineSize), truncated to int.
    lShiftBits* = int(log2(float(cacheLineSize)))
    # log2(loonySlotCount), truncated to int, less lShiftBits.
    rShiftBits* = int(log2(float(loonySlotCount))) - lShiftBits
23+
1224
# Compile-time validation of the user-tunable flags. Every check uses
# doAssert so it is enforced regardless of the build mode.
static:
  doAssert (1 shl loonyNodeAlignment) > loonySlotCount,
    "Your LoonySlot count exceeds your alignment!"
  doAssert loonySlotCount > 1,
    "Your LoonySlot count must be greater than 1!"
  when loonyRotate:
    # The rotated index is built from a mask with `loonySlotCount - 1`,
    # which is only a valid modulo when the count is a power of 2.
    doAssert (loonySlotCount and (loonySlotCount - 1)) == 0,
      fmt"Your LoonySlot count of {loonySlotCount} is not a power of 2!" &
      " Either disable loonyRotate (-d:loonyRotate=false) or" &
      " change the slot count."
    # rShiftBits is log2(loonySlotCount) - log2(cacheLineSize); a slot
    # count below cacheLineSize would make that shift amount negative.
    doAssert loonySlotCount >= cacheLineSize,
      fmt"Your LoonySlot count of {loonySlotCount} is less than" &
      fmt" {cacheLineSize}, which makes the rotation shift negative!" &
      " Either disable loonyRotate (-d:loonyRotate=false) or" &
      " change the slot count."
1534

1635
const
1736
## Slot flag constants

0 commit comments

Comments
 (0)