Skip to content

Commit 29ada72

Browse files
authored
Drop the deprecated binary format. (#11307)
- Drop support for the deprecated binary format. - Add compatibility tests for categorical features. - Add compatibility tests for AFT survival training. - Use the same set of models for Python and R tests.
1 parent d603953 commit 29ada72

File tree

30 files changed

+578
-809
lines changed

30 files changed

+578
-809
lines changed

R-package/tests/testthat/test_io.R

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,14 @@ test_that("load/save raw works", {
1818

1919
json_bytes <- xgb.save.raw(booster, raw_format = "json")
2020
ubj_bytes <- xgb.save.raw(booster, raw_format = "ubj")
21-
old_bytes <- xgb.save.raw(booster, raw_format = "deprecated")
2221

2322
from_json <- xgb.load.raw(json_bytes)
2423
from_ubj <- xgb.load.raw(ubj_bytes)
2524

26-
json2old <- xgb.save.raw(from_json, raw_format = "deprecated")
27-
ubj2old <- xgb.save.raw(from_ubj, raw_format = "deprecated")
25+
json2ubj <- xgb.save.raw(from_json, raw_format = "ubj")
26+
ubj2ubj <- xgb.save.raw(from_ubj, raw_format = "ubj")
2827

29-
expect_equal(json2old, ubj2old)
30-
expect_equal(json2old, old_bytes)
28+
expect_equal(json2ubj, ubj2ubj)
3129
})
3230

3331
test_that("saveRDS preserves C and R attributes", {
Lines changed: 104 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
context("Models from previous versions of XGBoost can be loaded")
22

33
metadata <- list(
4-
kRounds = 2,
4+
kRounds = 4,
55
kRows = 1000,
66
kCols = 4,
77
kForests = 2,
@@ -10,87 +10,130 @@ metadata <- list(
1010
)
1111

1212
run_model_param_check <- function(config) {
13-
testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')
14-
testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree')
13+
testthat::expect_equal(config$learner$learner_model_param$num_feature, "4")
14+
testthat::expect_equal(config$learner$learner_train_param$booster, "gbtree")
15+
}
16+
17+
get_n_rounds <- function(model_file) {
18+
is_10 <- grepl("1.0.0rc1", model_file, fixed = TRUE)
19+
if (is_10) {
20+
2
21+
} else {
22+
metadata$kRounds
23+
}
1524
}
1625

1726
get_num_tree <- function(booster) {
1827
dump <- xgb.dump(booster)
19-
m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
28+
m <- regexec("booster\\[[0-9]+\\]", dump, perl = TRUE)
2029
m <- regmatches(dump, m)
21-
num_tree <- Reduce('+', lapply(m, length))
22-
return(num_tree)
30+
num_tree <- Reduce("+", lapply(m, length))
31+
num_tree
2332
}
2433

25-
run_booster_check <- function(booster, name) {
34+
run_booster_check <- function(booster, model_file) {
2635
config <- xgb.config(booster)
2736
run_model_param_check(config)
28-
if (name == 'cls') {
29-
testthat::expect_equal(get_num_tree(booster),
30-
metadata$kForests * metadata$kRounds * metadata$kClasses)
31-
testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5)
32-
testthat::expect_equal(config$learner$learner_train_param$objective, 'multi:softmax')
33-
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class),
34-
metadata$kClasses)
35-
} else if (name == 'logitraw') {
36-
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
37-
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
38-
testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logitraw')
39-
} else if (name == 'logit') {
40-
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
41-
testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
42-
testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logistic')
43-
} else if (name == 'ltr') {
44-
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
45-
testthat::expect_equal(config$learner$learner_train_param$objective, 'rank:ndcg')
37+
is_model <- function(typ) {
38+
grepl(typ, model_file, fixed = TRUE)
39+
}
40+
n_rounds <- get_n_rounds(model_file = model_file)
41+
if (is_model("cls")) {
42+
testthat::expect_equal(
43+
get_num_tree(booster), metadata$kForests * n_rounds * metadata$kClasses
44+
)
45+
testthat::expect_equal(
46+
as.numeric(config$learner$learner_model_param$base_score), 0.5
47+
)
48+
testthat::expect_equal(
49+
config$learner$learner_train_param$objective, "multi:softmax"
50+
)
51+
testthat::expect_equal(
52+
as.numeric(config$learner$learner_model_param$num_class),
53+
metadata$kClasses
54+
)
55+
} else if (is_model("logitraw")) {
56+
testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
57+
testthat::expect_equal(
58+
as.numeric(config$learner$learner_model_param$num_class), 0
59+
)
60+
testthat::expect_equal(
61+
config$learner$learner_train_param$objective, "binary:logitraw"
62+
)
63+
} else if (is_model("logit")) {
64+
testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
65+
testthat::expect_equal(
66+
as.numeric(config$learner$learner_model_param$num_class), 0
67+
)
68+
testthat::expect_equal(
69+
config$learner$learner_train_param$objective, "binary:logistic"
70+
)
71+
} else if (is_model("ltr")) {
72+
testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
73+
testthat::expect_equal(
74+
config$learner$learner_train_param$objective, "rank:ndcg"
75+
)
76+
} else if (is_model("aft")) {
77+
testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
78+
testthat::expect_equal(
79+
config$learner$learner_train_param$objective, "survival:aft"
80+
)
4681
} else {
47-
testthat::expect_equal(name, 'reg')
48-
testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
49-
testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5)
50-
testthat::expect_equal(config$learner$learner_train_param$objective, 'reg:squarederror')
82+
testthat::expect_true(is_model("reg"))
83+
testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
84+
testthat::expect_equal(
85+
as.numeric(config$learner$learner_model_param$base_score), 0.5
86+
)
87+
testthat::expect_equal(
88+
config$learner$learner_train_param$objective, "reg:squarederror"
89+
)
5190
}
5291
}
5392

5493
test_that("Models from previous versions of XGBoost can be loaded", {
55-
bucket <- 'xgboost-ci-jenkins-artifacts'
56-
region <- 'us-west-2'
57-
file_name <- 'xgboost_r_model_compatibility_test.zip'
94+
bucket <- "xgboost-ci-jenkins-artifacts"
95+
region <- "us-west-2"
96+
file_name <- "xgboost_model_compatibility_tests-3.0.2.zip"
5897
zipfile <- tempfile(fileext = ".zip")
5998
extract_dir <- tempdir()
60-
download.file(paste('https://', bucket, '.s3-', region, '.amazonaws.com/', file_name, sep = ''),
61-
destfile = zipfile, mode = 'wb', quiet = TRUE)
99+
result <- tryCatch(
100+
{
101+
download.file(
102+
paste(
103+
"https://", bucket, ".s3-", region, ".amazonaws.com/", file_name,
104+
sep = ""
105+
),
106+
destfile = zipfile, mode = "wb", quiet = TRUE
107+
)
108+
zipfile
109+
},
110+
error = function(e) {
111+
print(e)
112+
NA_character_
113+
}
114+
)
115+
if (is.na(result)) {
116+
print("Failed to download old models.")
117+
return()
118+
}
119+
62120
unzip(zipfile, exdir = extract_dir, overwrite = TRUE)
63-
model_dir <- file.path(extract_dir, 'models')
121+
model_dir <- file.path(extract_dir, "models")
64122

65-
pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4), nthread = 2)
123+
pred_data <- xgb.DMatrix(
124+
matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4),
125+
nthread = 2
126+
)
66127

67128
lapply(list.files(model_dir), function(x) {
68129
model_file <- file.path(model_dir, x)
69-
m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
70-
m <- regmatches(model_file, m)[[1]]
71-
model_xgb_ver <- m[2]
72-
name <- m[3]
73-
is_rds <- endsWith(model_file, '.rds')
74-
is_json <- endsWith(model_file, '.json')
75-
# TODO: update this test for new RDS format
76-
if (is_rds) {
77-
return(NULL)
78-
}
79-
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
80-
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
81-
booster <- readRDS(model_file)
82-
expect_warning(predict(booster, newdata = pred_data))
83-
booster <- readRDS(model_file)
84-
expect_warning(run_booster_check(booster, name))
85-
} else {
86-
if (is_rds) {
87-
booster <- readRDS(model_file)
88-
} else {
89-
booster <- xgb.load(model_file)
90-
xgb.model.parameters(booster) <- list(nthread = 2)
91-
}
92-
predict(booster, newdata = pred_data)
93-
run_booster_check(booster, name)
130+
is_skl <- grepl("scikit", model_file, fixed = TRUE)
131+
if (is_skl) {
132+
return()
94133
}
134+
booster <- xgb.load(model_file)
135+
xgb.model.parameters(booster) <- list(nthread = 2)
136+
predict(booster, newdata = pred_data)
137+
run_booster_check(booster, model_file)
95138
})
96139
})

demo/CLI/binary_classification/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,9 @@ xgboost also supports monitoring multiple metrics, suppose we also want to monit
150150
If you want to save the model every two rounds, simply set save_period=2. You will find 0002.model in the current folder. If you want to change the output folder of models, add model_dir=foldername. By default, xgboost saves the model of the last round.
151151

152152
#### Continue from Existing Model
153-
If you want to continue boosting from existing model, say 0002.model, use
153+
If you want to continue boosting from existing model, say 0002.ubj, use
154154
```
155-
../../xgboost mushroom.conf model_in=0002.model num_round=2 model_out=continue.model
155+
../../xgboost mushroom.conf model_in=0002.ubj num_round=2 model_out=continue.ubj
156156
```
157157
xgboost will load the existing model, continue boosting for 2 rounds, and save the output to continue.ubj. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
158158
#### Use Multi-Threading

demo/CLI/binary_classification/runexp.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ XGBOOST=../../../xgboost
99
# training and output the models
1010
$XGBOOST mushroom.conf
1111
# output prediction task=pred
12-
$XGBOOST mushroom.conf task=pred model_in=0002.model
13-
# print the boosters of 00002.model in dump.raw.txt
14-
$XGBOOST mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
12+
$XGBOOST mushroom.conf task=pred model_in=0002.ubj
13+
# print the boosters of 0002.ubj in dump.raw.txt
14+
$XGBOOST mushroom.conf task=dump model_in=0002.ubj name_dump=dump.raw.txt
1515
# use the feature map in printing for better visualization
16-
$XGBOOST mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
16+
$XGBOOST mushroom.conf task=dump model_in=0002.ubj fmap=featmap.txt name_dump=dump.nice.txt
1717
cat dump.nice.txt

doc/tutorials/saving_model.rst

Lines changed: 30 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,6 @@ Since 2.1.0, the default model format for XGBoost is the UBJSON format, the opti
66
enabled for serializing models to file, serializing models to buffer, and for memory
77
snapshot (pickle and alike).
88

9-
In XGBoost 1.0.0, we introduced support of using `JSON
10-
<https://www.json.org/json-en.html>`_ for saving/loading XGBoost models and related
11-
hyper-parameters for training, aiming to replace the old binary internal format with an
12-
open format that can be easily reused. Later in XGBoost 1.6.0, additional support for
13-
`Universal Binary JSON <https://ubjson.org/>`__ is added as an optimization for more
14-
efficient model IO, which is set to default in 2.1.
15-
169
JSON and UBJSON have the same document structure with different representations, and we
1710
will refer to them collectively as the JSON format. This tutorial aims to share some basic
1811
insights into the JSON serialisation method used in XGBoost. Without explicitly
@@ -27,41 +20,33 @@ which means inside XGBoost, there are 2 distinct parts:
2720
1. The model consisting of trees and
2821
2. Hyperparameters and configurations used for building the model.
2922

30-
If you come from Deep Learning community, then it should be
31-
clear to you that there are differences between the neural network structures composed of
32-
weights with fixed tensor operations, and the optimizers (like RMSprop) used to train them.
23+
If you come from the Deep Learning community, then it should be clear to you that there
24+
are differences between the neural network structures composed of weights with fixed
25+
tensor operations, and the optimizers (like RMSprop) used to train them.
3326

3427
So when one calls ``booster.save_model`` (``xgb.save`` in R), XGBoost saves the trees,
3528
some model parameters like number of input columns in trained trees, and the objective
3629
function, which together represent the concept of "model" in XGBoost. As for why
3730
we are saving the objective as part of the model, that's because the objective controls the transformation
38-
of global bias (called ``base_score`` in XGBoost) and task-specific information. Users
39-
can share this model with others for prediction, evaluation or continue the training with
40-
a different set of hyper-parameters etc.
31+
of global bias (called ``base_score`` or the intercept in XGBoost) and task-specific
32+
information. Users can share this model with others for inference, evaluation or continue
33+
the training with a different set of hyper-parameters etc.
4134

4235
However, this is not the end of the story. There are cases where we need to save something
4336
more than just the model itself. For example, in distributed training, XGBoost performs
4437
checkpointing operations. Or, for some reason, your favorite distributed computing
4538
framework decides to copy the model from one worker to another and continue the training in
46-
there. In such cases, the serialisation output is required to contain enough information
39+
there. In such cases, the serialisation output is required to contain enough information
4740
to continue previous training without user providing any parameters again. We consider
48-
such scenario as **memory snapshot** (or memory based serialisation method) and distinguish it
49-
with normal model IO operation. Currently, memory snapshot is used in the following places:
41+
such a scenario a **memory snapshot** (or memory-based serialisation method) and
42+
distinguish it from normal model IO operations. Currently, memory snapshot is used in the
43+
following places:
5044

5145
* Python package: when the ``Booster`` object is pickled with the built-in ``pickle`` module.
5246
* R package: when the ``xgb.Booster`` object is persisted with the built-in functions ``saveRDS``
5347
or ``save``.
5448
* JVM packages: when the ``Booster`` object is serialized with the built-in functions ``saveModel``.
5549

56-
Other language bindings are still working in progress.
57-
58-
.. note::
59-
60-
The old binary format doesn't distinguish difference between model and raw memory
61-
serialisation format, it's a mix of everything, which is part of the reason why we want
62-
to replace it with a more robust serialisation method. JVM Package has its own memory
63-
based serialisation methods.
64-
6550
To enable JSON format support for model IO (saving only the trees and objective), provide
6651
a filename with ``.json`` or ``.ubj`` as file extension, the latter is the extension for
6752
`Universal Binary JSON <https://ubjson.org/>`__
@@ -88,10 +73,9 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten
8873
JSON files that were produced by an external source may lead to undefined behaviors
8974
and crashes.
9075

91-
While for memory snapshot, UBJSON is the default starting with xgboost 1.6. When loading
92-
the model back, XGBoost recognizes the file extensions ``.json`` and ``.ubj``, and can
93-
dispatch accordingly. If the extension is not specified, XGBoost tries to guess the right
94-
one.
76+
When loading the model back, XGBoost recognizes the file extensions ``.json`` and
77+
``.ubj``, and can dispatch accordingly. If the extension is not specified, XGBoost tries
78+
to guess the right one.
9579

9680
***************************************************************
9781
A note on backward compatibility of models and memory snapshots
@@ -234,18 +218,18 @@ You can load it back to the model generated by same version of XGBoost by:
234218
235219
bst.load_config(config)
236220
237-
This way users can study the internal representation more closely. Please note that some
221+
This way users can study the internal representation more closely. Please note that some
238222
JSON generators make use of locale dependent floating point serialization methods, which
239223
is not supported by XGBoost.
240224

241225
*************************************************
242226
Difference between saving model and dumping model
243227
*************************************************
244228

245-
XGBoost has a function called ``dump_model`` in Booster object, which lets you to export
246-
the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary
247-
use case for it is for model interpretation or visualization, and is not supposed to be
248-
loaded back to XGBoost. The JSON version has a `schema
229+
XGBoost has a function called ``dump_model`` in the Booster class, which lets you
230+
export the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The
231+
primary use case for it is model interpretation and visualization, and the output is not supposed
232+
to be loaded back to XGBoost. The JSON version has a `schema
249233
<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`__. See next section for
250234
more info.
251235

@@ -263,3 +247,15 @@ array.
263247

264248
.. include:: ../model.schema
265249
:code: json
250+
251+
252+
*************
253+
Brief History
254+
*************
255+
256+
- The JSON format was introduced in 1.0, aiming to replace the now removed old binary
257+
internal format with an open format that can be easily reused.
258+
- Later in XGBoost 1.6.0, additional support for Universal Binary JSON was introduced as
259+
an optimization for more efficient model IO.
260+
- UBJSON has been set to default in 2.1.
261+
- The old binary format was removed in 3.1.

0 commit comments

Comments
 (0)