1
+ library(tibble )
2
+ library(dplyr )
3
+ library(lubridate )
4
+ library(glue )
5
+ library(tidyr )
6
+
1
7
# Report whether the code is running outside of GitHub Actions.
#
# Reads the GITHUB_ACTIONS environment variable and returns TRUE unless it is
# exactly "true" (an unset variable reads as "" and so counts as dev).
#
# Returns a single logical value.
is_dev <- function() {
  Sys.getenv("GITHUB_ACTIONS", unset = "") != "true"
}
@@ -40,14 +46,24 @@ make_nice_names <- function(x) {
40
46
toTitleCase(gsub(" _" , " " , names(x )))
41
47
}
42
48
43
- arrow_build_table <- function (nightly_data , type , task ) {
49
# Look up the commit recorded on the row(s) carrying a given failure label.
#
# df:    data frame with `arrow_commit` and `fail_label` columns.
# label: label to match against `fail_label`.
#
# Returns the `arrow_commit` values of the matching rows, or a zero-length
# vector when no row matches.
get_commit <- function(df, label) {
  # which() discards NA comparisons, so rows with a missing label are skipped
  # instead of surfacing as NA commits in the result.
  df$arrow_commit[which(df$fail_label == label)]
}
52
+
53
+ arrow_build_table <- function (nightly_data , type , task , to_day = today()) {
54
+ # Filter data for a specific build type and task
44
55
type_task_data <- nightly_data %> %
45
56
filter(build_type == type ) %> %
46
57
filter(task_name == task )
47
58
48
- # # filter for when the most recent run is a failure
49
- day_window <- today() - 2
59
+ # Look at yesterday's date to determine recent failures
60
+ # This is used as a window for identifying tasks that failed recently
61
+ day_window <- to_day - 1
62
+
63
+ # Get records where the task failed recently, order by date (newest first)
64
+ # and standardize task status values to "pass" and "fail"
50
65
ordered_only_recent_fails <- type_task_data %> %
66
+ # Only keep records where the task name appears in yesterday's failures
51
67
filter(
52
68
task_name %in%
53
69
task_name [nightly_date == day_window & task_status != " success" ]
@@ -61,15 +77,22 @@ arrow_build_table <- function(nightly_data, type, task) {
61
77
)
62
78
)
63
79
80
+ # If there are no recent failures, return a success summary or a null summary if the task is not active
64
81
if (nrow(ordered_only_recent_fails ) == 0 ) {
65
- # # if there are no failures, return a version of the table that reflects that
82
+ # Calculate days since the last run (regardless of status)
66
83
days <- as.numeric(
67
84
difftime(
68
- ymd(Sys.Date() , tz = " UTC" ),
85
+ ymd(to_day , tz = " UTC" ),
69
86
max(type_task_data $ nightly_date )
70
87
)
71
88
)
89
+ # Create a summary with success information
72
90
success_df <- type_task_data %> %
91
+ # Remove stale data by filtering out everything but the last ~2 days of runs
92
+ # this makes it so that jobs that have been deleted (but are still in the 120 day look back)
93
+ # don't continue to show up.
94
+ filter(nightly_date > = to_day - 2 ) %> %
95
+ # Then, take the most recent run since that's all we care about if there are no failures.
73
96
slice_max(order_by = nightly_date ) %> %
74
97
mutate(
75
98
since_last_successful_build = days ,
@@ -87,33 +110,39 @@ arrow_build_table <- function(nightly_data, type, task) {
87
110
last_successful_build ,
88
111
build_type
89
112
)
113
+
90
114
return (success_df )
91
115
}
92
116
93
- # # find first failure index
117
+ # Find the length of the most recent consecutive failure streak
118
+ # This uses run length encoding to identify the first sequence of failures
94
119
idx_recent_fail <- rle(ordered_only_recent_fails $ task_status )$ lengths [1 ]
95
120
96
- # # expand failure index and give it some names
121
+ # Create labels for the failure streak timeline
122
+ # This builds a dataframe with positions and labels for the recent failure sequence
97
123
failure_df <- tibble(fails_plus_one = seq(1 , idx_recent_fail + 1 )) %> %
98
124
mutate(
99
125
fail_label = case_when(
100
- fails_plus_one == idx_recent_fail ~ " first_failure" ,
101
- fails_plus_one == 1 ~ " most_recent_failure" ,
102
- fails_plus_one == idx_recent_fail + 1 ~ " last_successful_build" ,
103
- TRUE ~ paste0(fails_plus_one , " days ago" )
126
+ fails_plus_one == idx_recent_fail ~ " first_failure" , # Where the failures began
127
+ fails_plus_one == 1 ~ " most_recent_failure" , # The most recent failure
128
+ fails_plus_one == idx_recent_fail + 1 ~ " last_successful_build" , # Last successful build before failures
129
+ TRUE ~ paste0(fails_plus_one , " days ago" ) # General failure timeline
104
130
)
105
131
) %> %
132
+ # Only keep the most recent 9 days of failures or specific labeled events
106
133
filter(fails_plus_one < = 9 | grepl(" failure|build" , fail_label ))
107
134
108
- # # inner_join to ordered data
135
+ # Join the failure timeline labels with the actual build data
109
136
df <- ordered_only_recent_fails %> %
110
137
rowid_to_column() %> %
111
138
inner_join(failure_df , by = c(" rowid" = " fails_plus_one" ))
112
139
140
+ # Calculate days since last successful build
113
141
if (all(type_task_data $ task_status %in% " failure" )) {
114
142
days <- NA_real_
115
143
} else {
116
- # # days since last successful build (need to add one)
144
+ # Calculate days between most recent failure and last successful build
145
+ # Adding 1 to include the day of the failure
117
146
days <- sum(
118
147
as.numeric(
119
148
difftime(
@@ -125,10 +154,7 @@ arrow_build_table <- function(nightly_data, type, task) {
125
154
)
126
155
}
127
156
128
- get_commit <- function (label ) {
129
- df $ arrow_commit [df $ fail_label == label ]
130
- }
131
-
157
+ # Format the final result as a table with build status information (one row per task)
132
158
df %> %
133
159
arrange(desc(fail_label )) %> %
134
160
mutate(
@@ -137,12 +163,14 @@ arrow_build_table <- function(nightly_data, type, task) {
137
163
)
138
164
) %> %
139
165
select(task_name , build_type , build_links , fail_label ) %> %
166
+ # Reshape data to have one column for each failure stage
140
167
pivot_wider(names_from = fail_label , values_from = build_links ) %> %
168
+ # Add additional context columns
141
169
mutate(
142
170
since_last_successful_build = days ,
143
171
last_successful_commit = arrow_compare_links(
144
- get_commit(" last_successful_build" ),
145
- get_commit(" first_failure" )
172
+ get_commit(df , " last_successful_build" ),
173
+ get_commit(df , " first_failure" )
146
174
),
147
175
most_recent_status = " failing" ,
148
176
.after = build_type
0 commit comments