Commit 06379db

[DOCS] Adds descriptions to ML APIs (#2245)
Co-authored-by: Abdon Pijpelink <[email protected]>
szabosteve and abdonpijpelink authored Aug 24, 2023
1 parent a2e0bd9 commit 06379db
Showing 11 changed files with 754 additions and 66 deletions.
253 changes: 187 additions & 66 deletions output/schema/schema.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions specification/_doc_ids/table.csv
@@ -248,6 +248,7 @@ ml-delete-snapshot,https://www.elastic.co/guide/en/elasticsearch/reference/{bran
ml-feature-importance,https://www.elastic.co/guide/en/machine-learning/{branch}/ml-feature-importance.html
ml-flush-job,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-flush-job.html
ml-forecast,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-forecast.html
ml-functions,https://www.elastic.co/guide/en/machine-learning/{branch}/ml-functions.html
ml-get-bucket,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-bucket.html
ml-get-calendar-event,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-calendar-event.html
ml-get-calendar,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-calendar.html
57 changes: 57 additions & 0 deletions specification/ml/_types/Analysis.ts
@@ -77,16 +77,73 @@ whole number of buckets in one day. If the anomaly detection job uses a datafeed
}

export class AnalysisConfigRead implements OverloadOf<AnalysisConfig> {
/**
* The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`.
*/
bucket_span: Duration
/**
* If `categorization_field_name` is specified, you can also define the analyzer that the categorization process uses to interpret the categorization field.
* This property cannot be used at the same time as `categorization_filters`.
*/
categorization_analyzer?: CategorizationAnalyzer
/**
* If this property is specified, the values of the specified field will be categorized.
* The resulting categories must be used in a detector by setting `by_field_name`, `over_field_name`, or `partition_field_name` to the keyword `mlcategory`.
*/
categorization_field_name?: Field
/**
* If `categorization_field_name` is specified, you can also define optional filters.
* This property expects an array of regular expressions.
* The expressions are used to filter out matching sequences from the categorization field values.
*/
categorization_filters?: string[]
/**
* An array of detector configuration objects.
* Detector configuration objects specify which data fields a job analyzes.
* They also specify which analytical functions are used.
* You can specify multiple detectors for a job.
*/
detectors: DetectorRead[]
/**
* A comma-separated list of influencer field names.
* Typically these are the `by`, `over`, or `partition` fields that are used in the detector configuration.
* You might also want to use a field name that is not specifically named in a detector, but is available as part of the input data.
* When you use multiple detectors, the use of influencers is recommended as it aggregates results for each influencer entity.
*/
influencers: Field[]
/**
* Advanced configuration option.
* Affects the pruning of models that have not been updated for the given time duration.
* The value must be set to a multiple of the `bucket_span`.
* If set too low, important information may be removed from the model.
* Typically, set to `30d` or longer.
* If not set, model pruning only occurs if the model memory status reaches the soft limit or the hard limit.
* For jobs created in 8.1 and later, the default value is the greater of `30d` or 20 times `bucket_span`.
*/
model_prune_window?: Duration
/**
* The size of the window in which to expect data that is out of time order.
* Defaults to no latency.
* If you specify a non-zero value, it must be greater than or equal to one second.
* @server_default 0
*/
latency?: Duration
/**
* This functionality is reserved for internal use.
* It is not supported for use in customer environments and is not subject to the support SLA of official GA features.
* If set to `true`, the analysis will automatically find correlations between metrics for a given by field value and report anomalies when those correlations cease to hold.
*/
multivariate_by_fields?: boolean
/**
* Settings related to how categorization interacts with partition fields.
*/
per_partition_categorization?: PerPartitionCategorization
/**
* If this property is specified, the data that is fed to the job is expected to be pre-summarized.
* This property value is the name of the field that contains the count of raw data points that have been summarized.
* The same `summary_count_field_name` applies to all detectors in the job.
*/
summary_count_field_name?: Field
}
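
// A minimal sketch (not part of the commit) of a value conforming to
// AnalysisConfigRead, illustrating how the categorization properties fit
// together: the detector references the `mlcategory` keyword produced from
// `categorization_field_name`. All field names and the filter pattern are
// hypothetical.
const exampleAnalysisConfig: AnalysisConfigRead = {
  bucket_span: '15m',
  categorization_field_name: 'message',
  categorization_filters: ['\\[statement:.*\\]'],
  detectors: [{ function: 'count', by_field_name: 'mlcategory' }],
  influencers: ['host'],
  model_prune_window: '30d'
}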

59 changes: 59 additions & 0 deletions specification/ml/_types/Datafeed.ts
@@ -138,36 +138,95 @@ export enum DatafeedState {
}

export class DatafeedStats {
/**
* For started datafeeds only, contains messages relating to the selection of a node.
*/
assignment_explanation?: string
/**
* A unique identifier for the datafeed.
* This identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores.
* It must start and end with alphanumeric characters.
*/
datafeed_id: Id
/**
* For started datafeeds only, this information pertains to the node upon which the datafeed is started.
* @availability stack
*/
node?: DiscoveryNode
/**
* The status of the datafeed, which can be one of the following values: `starting`, `started`, `stopping`, `stopped`.
*/
state: DatafeedState
/**
* An object that provides statistical information about the timing aspects of this datafeed.
*/
timing_stats: DatafeedTimingStats
/**
* An object containing the running state for this datafeed.
* It is only provided if the datafeed is started.
*/
running_state?: DatafeedRunningState
}
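
// A sketch (an assumption, not from the commit) of the datafeed identifier
// rule described above: lowercase alphanumerics, hyphens, and underscores,
// starting and ending with an alphanumeric character.
const DATAFEED_ID_PATTERN = /^[a-z0-9](?:[a-z0-9_-]*[a-z0-9])?$/
DATAFEED_ID_PATTERN.test('my-datafeed_1') // true
DATAFEED_ID_PATTERN.test('-starts-with-hyphen') // false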

export class DatafeedTimingStats {
/**
* The number of buckets processed.
*/
bucket_count: long
/**
* The exponential average search time per hour, in milliseconds.
*/
exponential_average_search_time_per_hour_ms: DurationValue<UnitFloatMillis>
/**
* Identifier for the anomaly detection job.
*/
job_id: Id
/**
* The number of searches run by the datafeed.
*/
search_count: long
/**
* The total time the datafeed spent searching, in milliseconds.
*/
total_search_time_ms: DurationValue<UnitFloatMillis>
/**
* The average search time per bucket, in milliseconds.
*/
average_search_time_per_bucket_ms?: DurationValue<UnitFloatMillis>
}
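
// A sketch of how `average_search_time_per_bucket_ms` relates to the other
// timing fields, assuming it is total search time divided by the number of
// buckets processed. This is an illustration, not part of the specification.
function averageSearchTimePerBucket(stats: DatafeedTimingStats): number | undefined {
  return stats.bucket_count > 0
    ? stats.total_search_time_ms / stats.bucket_count
    : undefined
}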

export class DatafeedRunningState {
/**
* Indicates whether the datafeed is "real-time", meaning that it has no configured `end` time.
*/
real_time_configured: boolean
/**
* Indicates whether the datafeed has finished running on the available past data.
* For datafeeds without a configured `end` time, this means that the datafeed is now running on "real-time" data.
*/
real_time_running: boolean
/**
* Provides the latest time interval the datafeed has searched.
*/
search_interval?: RunningStateSearchInterval
}

export class RunningStateSearchInterval {
/**
* The end time.
*/
end?: Duration
/**
* The end time as an epoch in milliseconds.
*/
end_ms: DurationValue<UnitMillis>
/**
* The start time.
*/
start?: Duration
/**
* The start time as an epoch in milliseconds.
*/
start_ms: DurationValue<UnitMillis>
}
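
// A sketch (an assumption, not from the commit) of interpreting the running
// state: a datafeed with no configured `end` time that has caught up with
// the available past data is now tailing live, real-time data.
function isTailingLiveData(state: DatafeedRunningState): boolean {
  return state.real_time_configured && state.real_time_running
}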

139 changes: 139 additions & 0 deletions specification/ml/_types/DataframeAnalytics.ts
@@ -381,43 +381,182 @@ export class DataframeAnalyticsStatsContainer {
}

export class DataframeAnalyticsStatsHyperparameters {
/**
* An object containing the parameters of the classification analysis job.
*/
hyperparameters: Hyperparameters
/** The number of iterations of the analysis. */
iteration: integer
/**
* The timestamp when the statistics were reported, in milliseconds since the epoch.
*/
timestamp: EpochTime<UnitMillis>
/**
* An object containing time statistics about the data frame analytics job.
*/
timing_stats: TimingStats
/**
* An object containing information about validation loss.
*/
validation_loss: ValidationLoss
}

export class DataframeAnalyticsStatsOutlierDetection {
/**
* The list of job parameters specified by the user or determined by algorithmic heuristics.
*/
parameters: OutlierDetectionParameters
/**
* The timestamp when the statistics were reported, in milliseconds since the epoch.
*/
timestamp: EpochTime<UnitMillis>
/**
* An object containing time statistics about the data frame analytics job.
*/
timing_stats: TimingStats
}

export class Hyperparameters {
/**
* Advanced configuration option.
* Machine learning uses loss guided tree growing, which means that the decision trees grow where the regularized loss decreases most quickly.
* This parameter affects loss calculations by acting as a multiplier of the tree depth.
* Higher alpha values result in shallower trees and faster training times.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than or equal to zero.
*/
alpha?: double
/**
* Advanced configuration option.
* Regularization parameter to prevent overfitting on the training data set.
* Multiplies an L2 regularization term which applies to leaf weights of the individual trees in the forest.
* A high lambda value causes training to favor small leaf weights.
* This behavior makes the prediction function smoother at the expense of potentially not being able to capture relevant relationships between the features and the dependent variable.
* A small lambda value results in large individual trees and slower training.
* By default, this value is calculated during hyperparameter optimization.
* It must be a nonnegative value.
*/
lambda?: double
/**
* Advanced configuration option.
* Regularization parameter to prevent overfitting on the training data set.
* Multiplies a linear penalty associated with the size of individual trees in the forest.
* A high gamma value causes training to prefer small trees.
* A small gamma value results in larger individual trees and slower training.
* By default, this value is calculated during hyperparameter optimization.
* It must be a nonnegative value.
*/
gamma?: double
/**
* Advanced configuration option.
* The shrinkage applied to the weights.
* Smaller values result in larger forests which have a better generalization error.
* However, larger forests cause slower training.
* By default, this value is calculated during hyperparameter optimization.
* It must be a value between `0.001` and `1`.
*/
eta?: double
/**
* Advanced configuration option.
* Specifies the rate at which `eta` increases for each new tree that is added to the forest.
* For example, a rate of 1.05 increases `eta` by 5% for each extra tree.
* By default, this value is calculated during hyperparameter optimization.
* It must be between `0.5` and `2`.
*/
eta_growth_rate_per_tree?: double
/**
* Advanced configuration option.
* Defines the fraction of features that will be used when selecting a random bag for each candidate split.
* By default, this value is calculated during hyperparameter optimization.
*/
feature_bag_fraction?: double
/**
* Advanced configuration option.
* Controls the fraction of data that is used to compute the derivatives of the loss function for tree training.
* A small value results in the use of a small fraction of the data.
* If this value is set to be less than 1, accuracy typically improves.
* However, too small a value may result in poor convergence for the ensemble and so require more trees.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than zero and less than or equal to 1.
*/
downsample_factor?: double
/**
* If the algorithm fails to determine a non-trivial tree (more than a single leaf), this parameter determines how many such consecutive failures are tolerated.
* Once the number of attempts exceeds the threshold, the forest training stops.
*/
max_attempts_to_add_tree?: integer
/**
* Advanced configuration option.
* A multiplier responsible for determining the maximum number of hyperparameter optimization steps in the Bayesian optimization procedure.
* The maximum number of steps is determined based on the number of undefined hyperparameters times the maximum optimization rounds per hyperparameter.
* By default, this value is calculated during hyperparameter optimization.
*/
max_optimization_rounds_per_hyperparameter?: integer
/**
* Advanced configuration option.
* Defines the maximum number of decision trees in the forest.
* The maximum value is 2000.
* By default, this value is calculated during hyperparameter optimization.
*/
max_trees?: integer
/**
* The maximum number of folds for the cross-validation procedure.
*/
num_folds?: integer
/**
* Determines the maximum number of splits for every feature that can occur in a decision tree when the tree is trained.
*/
num_splits_per_feature?: integer
/**
* Advanced configuration option.
* Machine learning uses loss guided tree growing, which means that the decision trees grow where the regularized loss decreases most quickly.
* This soft limit combines with the `soft_tree_depth_tolerance` to penalize trees that exceed the specified depth; the regularized loss increases quickly beyond this depth.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than or equal to 0.
*/
soft_tree_depth_limit?: integer
/**
* Advanced configuration option.
* This option controls how quickly the regularized loss increases when the tree depth exceeds `soft_tree_depth_limit`.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than or equal to 0.01.
*/
soft_tree_depth_tolerance?: double
}
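
// A sketch of the geometric `eta` schedule implied by the description of
// `eta_growth_rate_per_tree`: each extra tree scales the shrinkage by the
// growth rate, so with eta = 0.1 and a rate of 1.05, tree 1 uses 0.1,
// tree 2 uses 0.105, tree 3 uses about 0.110, and so on. This is an
// illustration of the description above, not the implementation used by
// the analysis.
function etaForTree(eta: number, etaGrowthRatePerTree: number, treeIndex: number): number {
  // treeIndex is 1-based: the first tree uses the base shrinkage.
  return eta * Math.pow(etaGrowthRatePerTree, treeIndex - 1)
}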

export class OutlierDetectionParameters {
/**
* Specifies whether the feature influence calculation is enabled.
* @server_default true
*/
compute_feature_influence?: boolean
/**
* The minimum outlier score that a document needs to have in order to calculate its feature influence score.
* Value range: 0-1
* @server_default 0.1
*/
feature_influence_threshold?: double
/**
* The method that outlier detection uses.
* Available methods are `lof`, `ldof`, `distance_kth_nn`, `distance_knn`, and `ensemble`.
* The default value is `ensemble`, which means that outlier detection uses an ensemble of different methods and normalizes and combines their individual outlier scores to obtain the overall outlier score.
*/
method?: string
/**
* Defines how many nearest neighbors each method of outlier detection uses to calculate its outlier score.
* When the value is not set, different values are used for different ensemble members.
* This default behavior helps improve the diversity in the ensemble; only override it if you are confident that the value you choose is appropriate for the data set.
*/
n_neighbors?: integer
/**
* The proportion of the data set that is assumed to be outlying prior to outlier detection.
* For example, 0.05 means it is assumed that 5% of values are real outliers and 95% are inliers.
*/
outlier_fraction?: double
/**
* If `true`, the following operation is performed on the columns before computing outlier scores: (x_i - mean(x_i)) / sd(x_i).
* @server_default true
*/
standardization_enabled?: boolean
}
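
// A sketch of the standardization described for `standardization_enabled`:
// each column is centered and scaled as (x_i - mean(x_i)) / sd(x_i). This
// helper is illustrative, not part of the specification.
function standardize(column: number[]): number[] {
  const mean = column.reduce((sum, x) => sum + x, 0) / column.length
  const variance = column.reduce((sum, x) => sum + (x - mean) ** 2, 0) / column.length
  const sd = Math.sqrt(variance)
  return column.map(x => (sd === 0 ? 0 : (x - mean) / sd))
}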
