@@ -28,45 +28,60 @@ namespace {
28
28
// ----------------------------------------------------------------------
29
29
// Rank implementation
30
30
31
- template <typename ValueSelector,
32
- typename T = std::decay_t <std::invoke_result_t <ValueSelector, int64_t >>>
31
// Highest bit of a 64-bit sort index. MarkDuplicates() ORs it into an index to
// flag the entry as a duplicate; it must be masked off again before the value
// is used as an index.
constexpr uint64_t kDuplicateMask = uint64_t{1} << 63;
32
+
33
+ template <typename ValueSelector>
34
+ void MarkDuplicates (const NullPartitionResult& sorted, ValueSelector&& value_selector) {
35
+ using T = std::decay_t <decltype (value_selector (uint64_t (0 )))>;
36
+
37
+ // Process non-nulls
38
+ if (sorted.non_nulls_end != sorted.non_nulls_begin ) {
39
+ auto it = sorted.non_nulls_begin ;
40
+ T prev_value = value_selector (*it);
41
+ T curr_value{};
42
+ while (++it < sorted.non_nulls_end ) {
43
+ curr_value = value_selector (*it);
44
+ if (curr_value == prev_value) {
45
+ // Mark as duplicate
46
+ *it |= kDuplicateMask ;
47
+ }
48
+ prev_value = curr_value;
49
+ }
50
+ }
51
+ // Process nulls
52
+ if (sorted.nulls_end != sorted.nulls_begin ) {
53
+ auto it = sorted.nulls_begin ;
54
+ // Mark all other nulles as duplicate
55
+ while (++it < sorted.nulls_end ) {
56
+ *it |= kDuplicateMask ;
57
+ }
58
+ }
59
+ }
60
+
61
+ bool NeedsDuplicates (RankOptions::Tiebreaker tiebreaker) {
62
+ return tiebreaker != RankOptions::First;
63
+ }
64
+
33
65
Result<Datum> CreateRankings (ExecContext* ctx, const NullPartitionResult& sorted,
34
66
const NullPlacement null_placement,
35
- const RankOptions::Tiebreaker tiebreaker,
36
- ValueSelector&& value_selector) {
67
+ const RankOptions::Tiebreaker tiebreaker) {
37
68
auto length = sorted.overall_end () - sorted.overall_begin ();
38
69
ARROW_ASSIGN_OR_RAISE (auto rankings,
39
70
MakeMutableUInt64Array (length, ctx->memory_pool ()));
40
71
auto out_begin = rankings->GetMutableValues <uint64_t >(1 );
41
72
uint64_t rank;
42
73
74
+ auto is_duplicate = [](uint64_t index ) { return (index & kDuplicateMask ) != 0 ; };
75
+ auto original_index = [](uint64_t index ) { return index & ~kDuplicateMask ; };
76
+
43
77
switch (tiebreaker) {
44
78
case RankOptions::Dense: {
45
- T curr_value, prev_value{};
46
79
rank = 0 ;
47
-
48
- if (null_placement == NullPlacement::AtStart && sorted.null_count () > 0 ) {
49
- rank++;
50
- for (auto it = sorted.nulls_begin ; it < sorted.nulls_end ; it++) {
51
- out_begin[*it] = rank;
52
- }
53
- }
54
-
55
- for (auto it = sorted.non_nulls_begin ; it < sorted.non_nulls_end ; it++) {
56
- curr_value = value_selector (*it);
57
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
58
- rank++;
59
- }
60
-
61
- out_begin[*it] = rank;
62
- prev_value = curr_value;
63
- }
64
-
65
- if (null_placement == NullPlacement::AtEnd) {
66
- rank++;
67
- for (auto it = sorted.nulls_begin ; it < sorted.nulls_end ; it++) {
68
- out_begin[*it] = rank;
80
+ for (auto it = sorted.overall_begin (); it < sorted.overall_end (); ++it) {
81
+ if (!is_duplicate (*it)) {
82
+ ++rank;
69
83
}
84
+ out_begin[original_index (*it)] = rank;
70
85
}
71
86
break ;
72
87
}
@@ -80,62 +95,27 @@ Result<Datum> CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted
80
95
}
81
96
82
97
case RankOptions::Min: {
83
- T curr_value, prev_value{};
84
98
rank = 0 ;
85
-
86
- if (null_placement == NullPlacement::AtStart) {
87
- rank++;
88
- for (auto it = sorted.nulls_begin ; it < sorted.nulls_end ; it++) {
89
- out_begin[*it] = rank;
90
- }
91
- }
92
-
93
- for (auto it = sorted.non_nulls_begin ; it < sorted.non_nulls_end ; it++) {
94
- curr_value = value_selector (*it);
95
- if (it == sorted.non_nulls_begin || curr_value != prev_value) {
99
+ for (auto it = sorted.overall_begin (); it < sorted.overall_end (); ++it) {
100
+ if (!is_duplicate (*it)) {
96
101
rank = (it - sorted.overall_begin ()) + 1 ;
97
102
}
98
- out_begin[*it] = rank;
99
- prev_value = curr_value;
100
- }
101
-
102
- if (null_placement == NullPlacement::AtEnd) {
103
- rank = sorted.non_null_count () + 1 ;
104
- for (auto it = sorted.nulls_begin ; it < sorted.nulls_end ; it++) {
105
- out_begin[*it] = rank;
106
- }
103
+ out_begin[original_index (*it)] = rank;
107
104
}
108
105
break ;
109
106
}
110
107
111
108
case RankOptions::Max: {
112
- // The algorithm for Max is just like Min, but in reverse order.
113
- T curr_value, prev_value{};
114
109
rank = length;
115
-
116
- if (null_placement == NullPlacement::AtEnd) {
117
- for (auto it = sorted.nulls_begin ; it < sorted.nulls_end ; it++) {
118
- out_begin[*it] = rank;
119
- }
120
- }
121
-
122
- for (auto it = sorted.non_nulls_end - 1 ; it >= sorted.non_nulls_begin ; it--) {
123
- curr_value = value_selector (*it);
124
-
125
- if (it == sorted.non_nulls_end - 1 || curr_value != prev_value) {
126
- rank = (it - sorted.overall_begin ()) + 1 ;
110
+ for (auto it = sorted.overall_end () - 1 ; it >= sorted.overall_begin (); --it) {
111
+ out_begin[original_index (*it)] = rank;
112
+ // If the current index isn't marked as duplicate, then it's the last
113
+ // tie in a row (since we iterate in reverse order), so update rank
114
+ // for the next row of ties.
115
+ if (!is_duplicate (*it)) {
116
+ rank = it - sorted.overall_begin ();
127
117
}
128
- out_begin[*it] = rank;
129
- prev_value = curr_value;
130
118
}
131
-
132
- if (null_placement == NullPlacement::AtStart) {
133
- rank = sorted.null_count ();
134
- for (auto it = sorted.nulls_begin ; it < sorted.nulls_end ; it++) {
135
- out_begin[*it] = rank;
136
- }
137
- }
138
-
139
119
break ;
140
120
}
141
121
}
@@ -212,8 +192,11 @@ class Ranker<Array> : public RankerMixin<Array, Ranker<Array>> {
212
192
auto value_selector = [&array](int64_t index ) {
213
193
return GetView::LogicalValue (array.GetView (index ));
214
194
};
215
- ARROW_ASSIGN_OR_RAISE (*output_, CreateRankings (ctx_, sorted, null_placement_,
216
- tiebreaker_, value_selector));
195
+ if (NeedsDuplicates (tiebreaker_)) {
196
+ MarkDuplicates (sorted, value_selector);
197
+ }
198
+ ARROW_ASSIGN_OR_RAISE (*output_,
199
+ CreateRankings (ctx_, sorted, null_placement_, tiebreaker_));
217
200
218
201
return Status::OK ();
219
202
}
@@ -242,8 +225,11 @@ class Ranker<ChunkedArray> : public RankerMixin<ChunkedArray, Ranker<ChunkedArra
242
225
auto value_selector = [resolver = ChunkedArrayResolver (span (arrays))](int64_t index ) {
243
226
return resolver.Resolve (index ).Value <InType>();
244
227
};
245
- ARROW_ASSIGN_OR_RAISE (*output_, CreateRankings (ctx_, sorted, null_placement_,
246
- tiebreaker_, value_selector));
228
+ if (NeedsDuplicates (tiebreaker_)) {
229
+ MarkDuplicates (sorted, value_selector);
230
+ }
231
+ ARROW_ASSIGN_OR_RAISE (*output_,
232
+ CreateRankings (ctx_, sorted, null_placement_, tiebreaker_));
247
233
248
234
return Status::OK ();
249
235
}
0 commit comments