Skip to content

Commit

Permalink
Add docs, unit tests, and refactor repro test infra
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Nov 15, 2024
1 parent 1d372ce commit 52484ac
Show file tree
Hide file tree
Showing 3 changed files with 455 additions and 57 deletions.
5 changes: 4 additions & 1 deletion reproducibility_tests/move_output.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,8 @@ else
end

if buildkite_ci && in_merge_queue
    # Only prune stored reference folders when running on Buildkite in the
    # merge queue. The set of folders to remove is decided entirely by
    # `get_reference_paths_to_delete`; this block just deletes them.
    folders = get_reference_paths_to_delete(; root_path = cluster_data_prefix)
    for f in folders
        # `force = true`: a folder that has already disappeared is not an error.
        rm(f; recursive = true, force = true)
    end
end
228 changes: 184 additions & 44 deletions reproducibility_tests/reproducibility_utils.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,44 @@
#=
################################################################################
Reproducibility Terminology.
Consider the following set of reproducibility
folders, prefixed by "reference counters", which
allow users to compare against other reproducible
states in that column.
Note that reference counter changes can "rewind"
(which may happen in the case of reverted commits).
In such cases, we do consider the rewound state as
an entirely new state, in order to fully preserve
the history (to some depth).
An important consequence of this is that precise
terminology is required to avoid ambiguous descriptions.
For example, "comparable references per reference counter"
is not well defined, because the reference counter can
be reverted. So, let's introduce the concept of a "bin",
which can be defined as a collection of folders
created in a period with the same reference counter.
Folders created before and after that bin have a different
reference counter. Also, `n_bins == n_reference_changes + 1`
(modulo the edge case of when there are no bins)
because, if the reference counter doesn't change, new results
are put into the same bin.
```
comparable states
|                      ref counter changes ---->                       | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
################################################################################
=#

import Dates

"""
    read_ref_counter(filename)

Return the reference counter stored in `filename`: the first line
of the file, parsed as an `Int`. Any further lines are ignored.
"""
function read_ref_counter(filename)
    first_line = first(readlines(filename))
    return parse(Int, first_line)
end
Expand Down Expand Up @@ -118,57 +159,156 @@ function latest_comparable_paths(;
return comparable_paths
end

function reason(path)
f = joinpath(path, "ref_counter.jl")
if !isfile(f)
return "ref_counter.jl does not exist"
else
ref_counter = parse(Int, first(readlines(f)))
return "ref_counter: $ref_counter"
"""
invalid_reference_folders(; root_path)
Returns all subfolders in `root_path`
that meet the following criteria:
- A `ref_counter.jl` file is missing
"""
function invalid_reference_folders(; root_path)
    # A folder is valid only if it contains a `ref_counter.jl` file.
    has_ref_counter(folder) = isfile(joinpath(folder, "ref_counter.jl"))
    # Keep the folders that fail the check, in the order in which
    # `sorted_dataset_folder` returned them.
    return filter(!has_ref_counter, sorted_dataset_folder(; dir = root_path))
end

function cleanup_central(cluster_data_prefix)
@warn "Cleaning up old files on central"
# Get (sorted) array of paths, `pop!(sorted_paths)`
# is the most recent merged folder.
sorted_paths = sorted_dataset_folder(; dir = cluster_data_prefix)
keep_latest_n = 0
keep_latest_ref_counters = 5
if !isempty(sorted_paths)
N = length(sorted_paths) - keep_latest_n
paths_to_delete = []
ref_counters_main = ref_counters_per_path(sorted_paths)
i_largest_reference = argmax(ref_counters_main)
path = sorted_paths[i_largest_reference]
ref_counter_file_main = joinpath(path, "ref_counter.jl")
@assert isfile(ref_counter_file_main)
ref_counter_main = parse(Int, first(readlines(ref_counter_file_main)))

for i in 1:N
path = sorted_paths[i]
ref_counter_file = joinpath(path, "ref_counter.jl")
if !isfile(ref_counter_file)
push!(paths_to_delete, path)
"""
compute_bins(root_path::String)
compute_bins(sorted_paths::Vector{String})
Returns a vector of reproducibility bins.
Each bin is sorted such that
`bin[1]` is the newest and `bin[end]`
is the oldest comparable reference.
```
comparable states
|                      ref counter changes ---->                       | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
"""
compute_bins(root_path::String) =
    compute_bins(reverse(sorted_dataset_folder(; dir = root_path)))
function compute_bins(sorted_paths::Vector{String})
    # Group *consecutive* paths that share a reference counter into bins.
    # Only neighbours are compared, so a counter value that reappears later
    # (e.g. after a reverted commit) starts a new bin instead of being merged
    # into the earlier bin with the same value.
    #
    # Paths keep their input order inside each bin, and bins are returned in
    # the order they are encountered. Empty input yields an empty vector.
    bins = Vector{String}[]
    # Counter of the bin currently being filled; `nothing` until the first
    # path is seen (an `Int` never compares equal to `nothing`).
    current_counter = nothing
    for path in sorted_paths
        # Each folder's counter file is read exactly once.
        counter = read_ref_counter(joinpath(path, "ref_counter.jl"))
        if counter == current_counter
            push!(last(bins), path)
        else
            # First path, or the counter changed: open a new bin.
            push!(bins, String[path])
            current_counter = counter
        end
    end
    return bins
end

"""
get_reference_paths_to_delete(;
root_path,
keep_n_comparable_states = 5,
keep_n_bins_back = 7,
)
Returns a list of folders to delete.
Our reference folders are saved, and can
therefore build up significantly.
Consider a collection of folders whose
names are prepended by the reference
counter:
```
keep_n_comparable_states
|                        <---- keep_n_bins_back                        | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
With these folders, and given a reference
counter of 10, we'll see the following
behavior:
```
get_reference_paths_to_delete(;
keep_n_comparable_states = 4,
keep_n_bins_back = 3
) -> [02_49f92, 04_36ebe, 04_d6e48, 04_4c042]
get_reference_paths_to_delete(;
keep_n_comparable_states = 1,
keep_n_bins_back = 5
) -> [02_49f92, 04_d6e48, 04_4c042, 06_d6d73, 08_1cc58]
```
Note:
`keep_n_bins_back` counts bins _chronologically_
(newest first), not by reference counter value,
in order to correctly operate in the case of
reverted pull requests. In other words, the above
references may look like this:
```
keep_n_comparable_states
|                        <---- keep_n_bins_back                        | oldest
|                                                                      |
| bin 1     bin 2     bin 3     bin 4     bin 5     bin 6     bin 7    |
|                                                                      |
| 02_49f92  04_36ebe  05_beb8a  06_4d837  05_8c311  08_45875  10_bc1e0 |
|           04_d6e48            06_d6d73            08_1cc58           |
v           04_4c042                                                   v newest
```
"""
function get_reference_paths_to_delete(;
    root_path,
    keep_n_comparable_states = 5,
    keep_n_bins_back = 7,
)
    # Binning reads `ref_counter.jl` from every folder, so refuse to proceed
    # if any folder under `root_path` is missing that file.
    @assert isempty(invalid_reference_folders(; root_path))
    paths_to_delete = String[]
    # Now, sorted_paths[1] is newest, sorted_paths[end] is oldest
    sorted_paths = reverse(sorted_dataset_folder(; dir = root_path))
    isempty(sorted_paths) && return paths_to_delete
    bins = compute_bins(sorted_paths)
    for (i, bin) in enumerate(bins), (j, path) in enumerate(bin)
        # A path survives only if it sits in one of the first
        # `keep_n_bins_back` bins AND is among the first
        # `keep_n_comparable_states` entries of that bin.
        # Everything else is scheduled for deletion.
        keep = i <= keep_n_bins_back && j <= keep_n_comparable_states
        keep || push!(paths_to_delete, path)
    end
    return paths_to_delete
end
Loading

0 comments on commit 52484ac

Please sign in to comment.