dmlc
diff --git a/‎R-package/tests/testthat/test_io.R
Lines changed: 3 additions & 5 deletions b/‎R-package/tests/testthat/test_io.R
Lines changed: 3 additions & 5 deletions
diff --git a/‎R-package/tests/testthat/test_model_compatibility.R
Lines changed: 104 additions & 61 deletions b/‎R-package/tests/testthat/test_model_compatibility.R
Lines changed: 104 additions & 61 deletions
diff --git a/‎demo/CLI/binary_classification/README.md
Lines changed: 2 additions & 2 deletions b/‎demo/CLI/binary_classification/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎demo/CLI/binary_classification/runexp.sh
Lines changed: 4 additions & 4 deletions b/‎demo/CLI/binary_classification/runexp.sh
Lines changed: 4 additions & 4 deletions
diff --git a/‎doc/tutorials/saving_model.rst
Lines changed: 30 additions & 34 deletions b/‎doc/tutorials/saving_model.rst
Lines changed: 30 additions & 34 deletions
@@ -18,16 +18,14 @@ test_that("load/save raw works", {
 
   json_bytes <- xgb.save.raw(booster, raw_format = "json")
   ubj_bytes <- xgb.save.raw(booster, raw_format = "ubj")
-  old_bytes <- xgb.save.raw(booster, raw_format = "deprecated")
 
   from_json <- xgb.load.raw(json_bytes)
   from_ubj <- xgb.load.raw(ubj_bytes)
 
-  json2old <- xgb.save.raw(from_json, raw_format = "deprecated")
-  ubj2old <- xgb.save.raw(from_ubj, raw_format = "deprecated")
+  json2ubj <- xgb.save.raw(from_json, raw_format = "ubj")
+  ubj2ubj <- xgb.save.raw(from_ubj, raw_format = "ubj")
 
-  expect_equal(json2old, ubj2old)
-  expect_equal(json2old, old_bytes)
+  expect_equal(json2ubj, ubj2ubj)
 })
 
 test_that("saveRDS preserves C and R attributes", {
 
@@ -1,7 +1,7 @@
 context("Models from previous versions of XGBoost can be loaded")
 
 metadata <- list(
-  kRounds = 2,
+  kRounds = 4,
   kRows = 1000,
   kCols = 4,
   kForests = 2,
@@ -10,87 +10,130 @@ metadata <- list(
 )
 
 run_model_param_check <- function(config) {
-  testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')
-  testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree')
+  testthat::expect_equal(config$learner$learner_model_param$num_feature, "4")
+  testthat::expect_equal(config$learner$learner_train_param$booster, "gbtree")
+}
+
+get_n_rounds <- function(model_file) {
+  is_10 <- grepl("1.0.0rc1", model_file, fixed = TRUE)
+  if (is_10) {
+    2
+  } else {
+    metadata$kRounds
+  }
 }
 
 get_num_tree <- function(booster) {
   dump <- xgb.dump(booster)
-  m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
+  m <- regexec("booster\\[[0-9]+\\]", dump, perl = TRUE)
   m <- regmatches(dump, m)
-  num_tree <- Reduce('+', lapply(m, length))
-  return(num_tree)
+  num_tree <- Reduce("+", lapply(m, length))
+  num_tree
 }
 
-run_booster_check <- function(booster, name) {
+run_booster_check <- function(booster, model_file) {
   config <- xgb.config(booster)
   run_model_param_check(config)
-  if (name == 'cls') {
-    testthat::expect_equal(get_num_tree(booster),
-                           metadata$kForests * metadata$kRounds * metadata$kClasses)
-    testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5)
-    testthat::expect_equal(config$learner$learner_train_param$objective, 'multi:softmax')
-    testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class),
-                           metadata$kClasses)
-  } else if (name == 'logitraw') {
-    testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
-    testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
-    testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logitraw')
-  } else if (name == 'logit') {
-    testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
-    testthat::expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0)
-    testthat::expect_equal(config$learner$learner_train_param$objective, 'binary:logistic')
-  } else if (name == 'ltr') {
-    testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
-    testthat::expect_equal(config$learner$learner_train_param$objective, 'rank:ndcg')
+  is_model <- function(typ) {
+    grepl(typ, model_file, fixed = TRUE)
+  }
+  n_rounds <- get_n_rounds(model_file = model_file)
+  if (is_model("cls")) {
+    testthat::expect_equal(
+      get_num_tree(booster), metadata$kForests * n_rounds * metadata$kClasses
+    )
+    testthat::expect_equal(
+      as.numeric(config$learner$learner_model_param$base_score), 0.5
+    )
+    testthat::expect_equal(
+      config$learner$learner_train_param$objective, "multi:softmax"
+    )
+    testthat::expect_equal(
+      as.numeric(config$learner$learner_model_param$num_class),
+      metadata$kClasses
+    )
+  } else if (is_model("logitraw")) {
+    testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
+    testthat::expect_equal(
+      as.numeric(config$learner$learner_model_param$num_class), 0
+    )
+    testthat::expect_equal(
+      config$learner$learner_train_param$objective, "binary:logitraw"
+    )
+  } else if (is_model("logit")) {
+    testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
+    testthat::expect_equal(
+      as.numeric(config$learner$learner_model_param$num_class), 0
+    )
+    testthat::expect_equal(
+      config$learner$learner_train_param$objective, "binary:logistic"
+    )
+  } else if (is_model("ltr")) {
+    testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
+    testthat::expect_equal(
+      config$learner$learner_train_param$objective, "rank:ndcg"
+    )
+  } else if (is_model("aft")) {
+    testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
+    testthat::expect_equal(
+      config$learner$learner_train_param$objective, "survival:aft"
+    )
   } else {
-    testthat::expect_equal(name, 'reg')
-    testthat::expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds)
-    testthat::expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5)
-    testthat::expect_equal(config$learner$learner_train_param$objective, 'reg:squarederror')
+    testthat::expect_true(is_model("reg"))
+    testthat::expect_equal(get_num_tree(booster), metadata$kForests * n_rounds)
+    testthat::expect_equal(
+      as.numeric(config$learner$learner_model_param$base_score), 0.5
+    )
+    testthat::expect_equal(
+      config$learner$learner_train_param$objective, "reg:squarederror"
+    )
   }
 }
 
 test_that("Models from previous versions of XGBoost can be loaded", {
-  bucket <- 'xgboost-ci-jenkins-artifacts'
-  region <- 'us-west-2'
-  file_name <- 'xgboost_r_model_compatibility_test.zip'
+  bucket <- "xgboost-ci-jenkins-artifacts"
+  region <- "us-west-2"
+  file_name <- "xgboost_model_compatibility_tests-3.0.2.zip"
   zipfile <- tempfile(fileext = ".zip")
   extract_dir <- tempdir()
-  download.file(paste('https://', bucket, '.s3-', region, '.amazonaws.com/', file_name, sep = ''),
-                destfile = zipfile, mode = 'wb', quiet = TRUE)
+  result <- tryCatch(
+    {
+      download.file(
+        paste(
+          "https://", bucket, ".s3-", region, ".amazonaws.com/", file_name,
+          sep = ""
+        ),
+        destfile = zipfile, mode = "wb", quiet = TRUE
+      )
+      zipfile
+    },
+    error = function(e) {
+      print(e)
+      NA_character_
+    }
+  )
+  if (is.na(result)) {
+    print("Failed to download old models.")
+    return()
+  }
+
   unzip(zipfile, exdir = extract_dir, overwrite = TRUE)
-  model_dir <- file.path(extract_dir, 'models')
+  model_dir <- file.path(extract_dir, "models")
 
-  pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4), nthread = 2)
+  pred_data <- xgb.DMatrix(
+    matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4),
+    nthread = 2
+  )
 
   lapply(list.files(model_dir), function(x) {
     model_file <- file.path(model_dir, x)
-    m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
-    m <- regmatches(model_file, m)[[1]]
-    model_xgb_ver <- m[2]
-    name <- m[3]
-    is_rds <- endsWith(model_file, '.rds')
-    is_json <- endsWith(model_file, '.json')
-    # TODO: update this test for new RDS format
-    if (is_rds) {
-      return(NULL)
-    }
-    # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
-    if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
-      booster <- readRDS(model_file)
-      expect_warning(predict(booster, newdata = pred_data))
-      booster <- readRDS(model_file)
-      expect_warning(run_booster_check(booster, name))
-    } else {
-      if (is_rds) {
-        booster <- readRDS(model_file)
-      } else {
-        booster <- xgb.load(model_file)
-        xgb.model.parameters(booster) <- list(nthread = 2)
-      }
-      predict(booster, newdata = pred_data)
-      run_booster_check(booster, name)
+    is_skl <- grepl("scikit", model_file, fixed = TRUE)
+    if (is_skl) {
+      return()
     }
+    booster <- xgb.load(model_file)
+    xgb.model.parameters(booster) <- list(nthread = 2)
+    predict(booster, newdata = pred_data)
+    run_booster_check(booster, model_file)
   })
 })
@@ -150,9 +150,9 @@ xgboost also supports monitoring multiple metrics, suppose we also want to monit
 If you want to save model every two round, simply set save_period=2. You will find 0002.model in the current folder. If you want to change the output folder of models, add model_dir=foldername. By default xgboost saves the model of last round.
 
 #### Continue from Existing Model
-If you want to continue boosting from existing model, say 0002.model, use
+If you want to continue boosting from existing model, say 0002.ubj, use
 ```
-../../xgboost mushroom.conf model_in=0002.model num_round=2 model_out=continue.model
+../../xgboost mushroom.conf model_in=0002.ubj num_round=2 model_out=continue.ubj
 ```
 xgboost will load from 0002.model continue boosting for 2 rounds, and save output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
 #### Use Multi-Threading
 
@@ -9,9 +9,9 @@ XGBOOST=../../../xgboost
 # training and output the models
 $XGBOOST mushroom.conf
 # output prediction task=pred
-$XGBOOST mushroom.conf task=pred model_in=0002.model
-# print the boosters of 00002.model in dump.raw.txt
-$XGBOOST mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
+$XGBOOST mushroom.conf task=pred model_in=0002.ubj
+# print the boosters of 0002.ubj in dump.raw.txt
+$XGBOOST mushroom.conf task=dump model_in=0002.ubj name_dump=dump.raw.txt
 # use the feature map in printing for better visualization
-$XGBOOST mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
+$XGBOOST mushroom.conf task=dump model_in=0002.ubj fmap=featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
@@ -6,13 +6,6 @@ Since 2.1.0, the default model format for XGBoost is the UBJSON format, the opti
 enabled for serializing models to file, serializing models to buffer, and for memory
 snapshot (pickle and alike).
 
-In XGBoost 1.0.0, we introduced support of using `JSON
-<https://www.json.org/json-en.html>`_ for saving/loading XGBoost models and related
-hyper-parameters for training, aiming to replace the old binary internal format with an
-open format that can be easily reused.  Later in XGBoost 1.6.0, additional support for
-`Universal Binary JSON <https://ubjson.org/>`__ is added as an optimization for more
-efficient model IO, which is set to default in 2.1.
-
 JSON and UBJSON have the same document structure with different representations, and we
 will refer them collectively as the JSON format. This tutorial aims to share some basic
 insights into the JSON serialisation method used in XGBoost.  Without explicitly
@@ -27,41 +20,33 @@ which means inside XGBoost, there are 2 distinct parts:
 1. The model consisting of trees and
 2. Hyperparameters and configurations used for building the model.
 
-If you come from Deep Learning community, then it should be
-clear to you that there are differences between the neural network structures composed of
-weights with fixed tensor operations, and the optimizers (like RMSprop) used to train them.
+If you come from the Deep Learning community, then it should be clear to you that there
+are differences between the neural network structures composed of weights with fixed
+tensor operations, and the optimizers (like RMSprop) used to train them.
 
 So when one calls ``booster.save_model`` (``xgb.save`` in R), XGBoost saves the trees,
 some model parameters like number of input columns in trained trees, and the objective
 function, which combined to represent the concept of "model" in XGBoost.  As for why are
 we saving the objective as part of model, that's because objective controls transformation
-of global bias (called ``base_score`` in XGBoost) and task-specific information.  Users
-can share this model with others for prediction, evaluation or continue the training with
-a different set of hyper-parameters etc.
+of global bias (called ``base_score`` or the intercept in XGBoost) and task-specific
+information.  Users can share this model with others for inference, evaluation, or
+continued training with a different set of hyper-parameters, etc.
 
 However, this is not the end of story.  There are cases where we need to save something
 more than just the model itself.  For example, in distributed training, XGBoost performs
 checkpointing operation.  Or for some reasons, your favorite distributed computing
 framework decide to copy the model from one worker to another and continue the training in
-there.  In such cases, the serialisation output is required to contain enough information
+there. In such cases, the serialisation output is required to contain enough information
 to continue previous training without user providing any parameters again.  We consider
-such scenario as **memory snapshot** (or memory based serialisation method) and distinguish it
-with normal model IO operation. Currently, memory snapshot is used in the following places:
+such a scenario as a **memory snapshot** (or memory-based serialisation method) and
+distinguish it from the normal model IO operation. Currently, memory snapshot is used in
+the following places:
 
 * Python package: when the ``Booster`` object is pickled with the built-in ``pickle`` module.
 * R package: when the ``xgb.Booster`` object is persisted with the built-in functions ``saveRDS``
   or ``save``.
 * JVM packages: when the ``Booster`` object is serialized with the built-in functions ``saveModel``.
 
-Other language bindings are still working in progress.
-
-.. note::
-
-  The old binary format doesn't distinguish difference between model and raw memory
-  serialisation format, it's a mix of everything, which is part of the reason why we want
-  to replace it with a more robust serialisation method.  JVM Package has its own memory
-  based serialisation methods.
-
 To enable JSON format support for model IO (saving only the trees and objective), provide
 a filename with ``.json`` or ``.ubj`` as file extension, the latter is the extension for
 `Universal Binary JSON <https://ubjson.org/>`__
@@ -88,10 +73,9 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten
   JSON files that were produced by an external source may lead to undefined behaviors
   and crashes.
 
-While for memory snapshot, UBJSON is the default starting with xgboost 1.6. When loading
-the model back, XGBoost recognizes the file extensions ``.json`` and ``.ubj``, and can
-dispatch accordingly. If the extension is not specified, XGBoost tries to guess the right
-one.
+When loading the model back, XGBoost recognizes the file extensions ``.json`` and
+``.ubj``, and can dispatch accordingly. If the extension is not specified, XGBoost tries
+to guess the right one.
 
 ***************************************************************
 A note on backward compatibility of models and memory snapshots
@@ -234,18 +218,18 @@ You can load it back to the model generated by same version of XGBoost by:
 
   bst.load_config(config)
 
-This way users can study the internal representation more closely.  Please note that some
+This way users can study the internal representation more closely. Please note that some
 JSON generators make use of locale dependent floating point serialization methods, which
 is not supported by XGBoost.
 
 *************************************************
 Difference between saving model and dumping model
 *************************************************
 
-XGBoost has a function called ``dump_model`` in Booster object, which lets you to export
-the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz).  The primary
-use case for it is for model interpretation or visualization, and is not supposed to be
-loaded back to XGBoost.  The JSON version has a `schema
+XGBoost has a function called ``dump_model`` in the Booster class, which lets you export
+the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz).  The primary
+use case for it is model interpretation and visualization; the dump is not supposed to be
+loaded back to XGBoost.  The JSON version has a `schema
 <https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`__.  See next section for
 more info.
 
@@ -263,3 +247,15 @@ array.
 
 .. include:: ../model.schema
    :code: json
+
+
+*************
+Brief History
+*************
+
+- The JSON format was introduced in 1.0, aiming to replace the now-removed old binary
+  internal format with an open format that can be easily reused.
+- Later in XGBoost 1.6.0, additional support for Universal Binary JSON was introduced as
+  an optimization for more efficient model IO.
+- UBJSON became the default format in 2.1.
+- The old binary format was removed in 3.1.