Commit 06379db

[DOCS] Adds descriptions to ML APIs (#2245)
Co-authored-by: Abdon Pijpelink <[email protected]>
szabosteve and abdonpijpelink authored Aug 24, 2023
1 parent a2e0bd9 commit 06379db
Showing 11 changed files with 754 additions and 66 deletions.
253 changes: 187 additions & 66 deletions output/schema/schema.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions specification/_doc_ids/table.csv
@@ -248,6 +248,7 @@ ml-delete-snapshot,https://www.elastic.co/guide/en/elasticsearch/reference/{bran
ml-feature-importance,https://www.elastic.co/guide/en/machine-learning/{branch}/ml-feature-importance.html
ml-flush-job,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-flush-job.html
ml-forecast,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-forecast.html
ml-functions,https://www.elastic.co/guide/en/machine-learning/{branch}/ml-functions.html
ml-get-bucket,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-bucket.html
ml-get-calendar-event,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-calendar-event.html
ml-get-calendar,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-calendar.html
57 changes: 57 additions & 0 deletions specification/ml/_types/Analysis.ts
@@ -77,16 +77,73 @@ whole number of buckets in one day. If the anomaly detection job uses a datafeed
}

export class AnalysisConfigRead implements OverloadOf<AnalysisConfig> {
/**
* The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`.
*/
bucket_span: Duration
/**
* If `categorization_field_name` is specified, you can also define the analyzer that the categorization process uses to interpret the categorization field.
* This property cannot be used at the same time as `categorization_filters`.
*/
categorization_analyzer?: CategorizationAnalyzer
/**
* If this property is specified, the values of the specified field will be categorized.
* The resulting categories must be used in a detector by setting `by_field_name`, `over_field_name`, or `partition_field_name` to the keyword `mlcategory`.
*/
categorization_field_name?: Field
/**
* If `categorization_field_name` is specified, you can also define optional filters.
* This property expects an array of regular expressions.
* The expressions are used to filter out matching sequences from the categorization field values.
*/
categorization_filters?: string[]
/**
* An array of detector configuration objects.
* Detector configuration objects specify which data fields a job analyzes.
* They also specify which analytical functions are used.
* You can specify multiple detectors for a job.
*/
detectors: DetectorRead[]
/**
* A comma-separated list of influencer field names.
* Typically these are the `by`, `over`, or `partition` fields that are used in the detector configuration.
* You might also want to use a field name that is not specifically named in a detector, but is available as part of the input data.
* When you use multiple detectors, the use of influencers is recommended as it aggregates results for each influencer entity.
*/
influencers: Field[]
/**
* Advanced configuration option.
* Affects the pruning of models that have not been updated for the given time duration.
* The value must be set to a multiple of the `bucket_span`.
* If set too low, important information may be removed from the model.
* Typically, set to `30d` or longer.
* If not set, model pruning only occurs if the model memory status reaches the soft limit or the hard limit.
* For jobs created in 8.1 and later, the default value is the greater of `30d` or 20 times `bucket_span`.
*/
model_prune_window?: Duration
/**
* The size of the window in which to expect data that is out of time order.
* Defaults to no latency.
* If you specify a non-zero value, it must be greater than or equal to one second.
* @server_default 0
*/
latency?: Duration
/**
* This functionality is reserved for internal use.
* It is not supported for use in customer environments and is not subject to the support SLA of official GA features.
* If set to `true`, the analysis will automatically find correlations between metrics for a given by field value and report anomalies when those correlations cease to hold.
*/
multivariate_by_fields?: boolean
/**
* Settings related to how categorization interacts with partition fields.
*/
per_partition_categorization?: PerPartitionCategorization
/**
* If this property is specified, the data that is fed to the job is expected to be pre-summarized.
* This property value is the name of the field that contains the count of raw data points that have been summarized.
* The same `summary_count_field_name` applies to all detectors in the job.
*/
summary_count_field_name?: Field
}
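
// A minimal sketch (not part of the commit) of a value conforming to
// AnalysisConfigRead, illustrating how the categorization properties fit
// together: the detector references the `mlcategory` keyword produced from
// `categorization_field_name`. All field names and the filter pattern are
// hypothetical.
const exampleAnalysisConfig: AnalysisConfigRead = {
  bucket_span: '15m',
  categorization_field_name: 'message',
  categorization_filters: ['\\[statement:.*\\]'],
  detectors: [{ function: 'count', by_field_name: 'mlcategory' }],
  influencers: ['host'],
  model_prune_window: '30d'
}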

59 changes: 59 additions & 0 deletions specification/ml/_types/Datafeed.ts
@@ -138,36 +138,95 @@ export enum DatafeedState {
}

export class DatafeedStats {
/**
* For started datafeeds only, contains messages relating to the selection of a node.
*/
assignment_explanation?: string
/**
* A unique identifier for the datafeed.
* This identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores.
* It must start and end with alphanumeric characters.
*/
datafeed_id: Id
/**
* For started datafeeds only, this information pertains to the node upon which the datafeed is started.
* @availability stack
*/
node?: DiscoveryNode
/**
* The status of the datafeed, which can be one of the following values: `starting`, `started`, `stopping`, `stopped`.
*/
state: DatafeedState
/**
* An object that provides statistical information about the timing aspects of this datafeed.
*/
timing_stats: DatafeedTimingStats
/**
* An object containing the running state for this datafeed.
* It is only provided if the datafeed is started.
*/
running_state?: DatafeedRunningState
}
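
// A sketch (an assumption, not from the commit) of the datafeed identifier
// rule described above: lowercase alphanumerics, hyphens, and underscores,
// starting and ending with an alphanumeric character.
const DATAFEED_ID_PATTERN = /^[a-z0-9](?:[a-z0-9_-]*[a-z0-9])?$/
DATAFEED_ID_PATTERN.test('my-datafeed_1') // true
DATAFEED_ID_PATTERN.test('-starts-with-hyphen') // false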

export class DatafeedTimingStats {
/**
* The number of buckets processed.
*/
bucket_count: long
/**
* The exponential average search time per hour, in milliseconds.
*/
exponential_average_search_time_per_hour_ms: DurationValue<UnitFloatMillis>
/**
* Identifier for the anomaly detection job.
*/
job_id: Id
/**
* The number of searches run by the datafeed.
*/
search_count: long
/**
* The total time the datafeed spent searching, in milliseconds.
*/
total_search_time_ms: DurationValue<UnitFloatMillis>
/**
* The average search time per bucket, in milliseconds.
*/
average_search_time_per_bucket_ms?: DurationValue<UnitFloatMillis>
}
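
// A sketch of how `average_search_time_per_bucket_ms` relates to the other
// timing fields, assuming it is total search time divided by the number of
// buckets processed. This is an illustration, not part of the specification.
function averageSearchTimePerBucket(stats: DatafeedTimingStats): number | undefined {
  return stats.bucket_count > 0
    ? stats.total_search_time_ms / stats.bucket_count
    : undefined
}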

export class DatafeedRunningState {
/**
* Indicates whether the datafeed is "real-time", meaning that it has no configured `end` time.
*/
real_time_configured: boolean
/**
* Indicates whether the datafeed has finished running on the available past data.
* For datafeeds without a configured `end` time, this means that the datafeed is now running on "real-time" data.
*/
real_time_running: boolean
/**
* Provides the latest time interval the datafeed has searched.
*/
search_interval?: RunningStateSearchInterval
}

export class RunningStateSearchInterval {
/**
* The end time.
*/
end?: Duration
/**
* The end time as an epoch in milliseconds.
*/
end_ms: DurationValue<UnitMillis>
/**
* The start time.
*/
start?: Duration
/**
* The start time as an epoch in milliseconds.
*/
start_ms: DurationValue<UnitMillis>
}
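
// A sketch (an assumption, not from the commit) of interpreting the running
// state: a datafeed with no configured `end` time that has caught up with
// the available past data is now tailing live, real-time data.
function isTailingLiveData(state: DatafeedRunningState): boolean {
  return state.real_time_configured && state.real_time_running
}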

139 changes: 139 additions & 0 deletions specification/ml/_types/DataframeAnalytics.ts
@@ -381,43 +381,182 @@ export class DataframeAnalyticsStatsContainer {
}

export class DataframeAnalyticsStatsHyperparameters {
/**
* An object containing the parameters of the classification analysis job.
*/
hyperparameters: Hyperparameters
/** The number of iterations of the analysis. */
iteration: integer
/**
* The timestamp when the statistics were reported, in milliseconds since the epoch.
*/
timestamp: EpochTime<UnitMillis>
/**
* An object containing time statistics about the data frame analytics job.
*/
timing_stats: TimingStats
/**
* An object containing information about validation loss.
*/
validation_loss: ValidationLoss
}

export class DataframeAnalyticsStatsOutlierDetection {
/**
* The list of job parameters specified by the user or determined by algorithmic heuristics.
*/
parameters: OutlierDetectionParameters
/**
* The timestamp when the statistics were reported, in milliseconds since the epoch.
*/
timestamp: EpochTime<UnitMillis>
/**
* An object containing time statistics about the data frame analytics job.
*/
timing_stats: TimingStats
}

export class Hyperparameters {
/**
* Advanced configuration option.
* Machine learning uses loss guided tree growing, which means that the decision trees grow where the regularized loss decreases most quickly.
* This parameter affects loss calculations by acting as a multiplier of the tree depth.
* Higher alpha values result in shallower trees and faster training times.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than or equal to zero.
*/
alpha?: double
/**
* Advanced configuration option.
* Regularization parameter to prevent overfitting on the training data set.
* Multiplies an L2 regularization term which applies to leaf weights of the individual trees in the forest.
* A high lambda value causes training to favor small leaf weights.
* This behavior makes the prediction function smoother at the expense of potentially not being able to capture relevant relationships between the features and the dependent variable.
* A small lambda value results in large individual trees and slower training.
* By default, this value is calculated during hyperparameter optimization.
* It must be a nonnegative value.
*/
lambda?: double
/**
* Advanced configuration option.
* Regularization parameter to prevent overfitting on the training data set.
* Multiplies a linear penalty associated with the size of individual trees in the forest.
* A high gamma value causes training to prefer small trees.
* A small gamma value results in larger individual trees and slower training.
* By default, this value is calculated during hyperparameter optimization.
* It must be a nonnegative value.
*/
gamma?: double
/**
* Advanced configuration option.
* The shrinkage applied to the weights.
* Smaller values result in larger forests which have a better generalization error.
* However, larger forests cause slower training.
* By default, this value is calculated during hyperparameter optimization.
* It must be a value between `0.001` and `1`.
*/
eta?: double
/**
* Advanced configuration option.
* Specifies the rate at which `eta` increases for each new tree that is added to the forest.
* For example, a rate of 1.05 increases `eta` by 5% for each extra tree.
* By default, this value is calculated during hyperparameter optimization.
* It must be between `0.5` and `2`.
*/
eta_growth_rate_per_tree?: double
/**
* Advanced configuration option.
* Defines the fraction of features that will be used when selecting a random bag for each candidate split.
* By default, this value is calculated during hyperparameter optimization.
*/
feature_bag_fraction?: double
/**
* Advanced configuration option.
* Controls the fraction of data that is used to compute the derivatives of the loss function for tree training.
* A small value results in the use of a small fraction of the data.
* If this value is set to be less than 1, accuracy typically improves.
* However, too small a value may result in poor convergence for the ensemble and so require more trees.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than zero and less than or equal to 1.
*/
downsample_factor?: double
/**
* If the algorithm fails to determine a non-trivial tree (more than a single leaf), this parameter determines how many such consecutive failures are tolerated.
* Once the number of attempts exceeds the threshold, the forest training stops.
*/
max_attempts_to_add_tree?: integer
/**
* Advanced configuration option.
* A multiplier responsible for determining the maximum number of hyperparameter optimization steps in the Bayesian optimization procedure.
* The maximum number of steps is determined based on the number of undefined hyperparameters times the maximum optimization rounds per hyperparameter.
* By default, this value is calculated during hyperparameter optimization.
*/
max_optimization_rounds_per_hyperparameter?: integer
/**
* Advanced configuration option.
* Defines the maximum number of decision trees in the forest.
* The maximum value is 2000.
* By default, this value is calculated during hyperparameter optimization.
*/
max_trees?: integer
/**
* The maximum number of folds for the cross-validation procedure.
*/
num_folds?: integer
/**
* Determines the maximum number of splits for every feature that can occur in a decision tree when the tree is trained.
*/
num_splits_per_feature?: integer
/**
* Advanced configuration option.
* Machine learning uses loss guided tree growing, which means that the decision trees grow where the regularized loss decreases most quickly.
* This soft limit combines with the `soft_tree_depth_tolerance` to penalize trees that exceed the specified depth; the regularized loss increases quickly beyond this depth.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than or equal to 0.
*/
soft_tree_depth_limit?: integer
/**
* Advanced configuration option.
* This option controls how quickly the regularized loss increases when the tree depth exceeds `soft_tree_depth_limit`.
* By default, this value is calculated during hyperparameter optimization.
* It must be greater than or equal to 0.01.
*/
soft_tree_depth_tolerance?: double
}
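
// A sketch of the geometric `eta` schedule implied by the description of
// `eta_growth_rate_per_tree`: each extra tree scales the shrinkage by the
// growth rate, so with eta = 0.1 and a rate of 1.05, tree 1 uses 0.1,
// tree 2 uses 0.105, tree 3 uses about 0.110, and so on. This is an
// illustration of the description above, not the implementation used by
// the analysis.
function etaForTree(eta: number, etaGrowthRatePerTree: number, treeIndex: number): number {
  // treeIndex is 1-based: the first tree uses the base shrinkage.
  return eta * Math.pow(etaGrowthRatePerTree, treeIndex - 1)
}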

export class OutlierDetectionParameters {
/**
* Specifies whether the feature influence calculation is enabled.
* @server_default true
*/
compute_feature_influence?: boolean
/**
* The minimum outlier score that a document needs to have in order to calculate its feature influence score.
* Value range: 0-1
* @server_default 0.1
*/
feature_influence_threshold?: double
/**
* The method that outlier detection uses.
* Available methods are `lof`, `ldof`, `distance_kth_nn`, `distance_knn`, and `ensemble`.
* The default value is `ensemble`, which means that outlier detection uses an ensemble of different methods and normalizes and combines their individual outlier scores to obtain the overall outlier score.
*/
method?: string
/**
* Defines how many nearest neighbors each method of outlier detection uses to calculate its outlier score.
* When the value is not set, different values are used for different ensemble members.
* This default behavior helps improve the diversity in the ensemble; only override it if you are confident that the value you choose is appropriate for the data set.
*/
n_neighbors?: integer
/**
* The proportion of the data set that is assumed to be outlying prior to outlier detection.
* For example, 0.05 means it is assumed that 5% of values are real outliers and 95% are inliers.
*/
outlier_fraction?: double
/**
* If `true`, the following operation is performed on the columns before computing outlier scores: (x_i - mean(x_i)) / sd(x_i).
* @server_default true
*/
standardization_enabled?: boolean
}
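
// A sketch of the standardization described for `standardization_enabled`:
// each column is centered and scaled as (x_i - mean(x_i)) / sd(x_i). This
// helper is illustrative, not part of the specification.
function standardize(column: number[]): number[] {
  const mean = column.reduce((sum, x) => sum + x, 0) / column.length
  const variance = column.reduce((sum, x) => sum + (x - mean) ** 2, 0) / column.length
  const sd = Math.sqrt(variance)
  return column.map(x => (sd === 0 ? 0 : (x - mean) / sd))
}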
