17 changes: 9 additions & 8 deletions measurements/label_distribution/README.md
@@ -11,7 +11,7 @@ tags:
- evaluate
- measurement
description: >-
-  Returns the label distribution and skew of the input data.
+  Returns the label distribution and entropy of the input data.
---

# Measurement Card for Label Distribution
@@ -41,13 +41,14 @@ The measurement takes a list of labels as input:
### Output Values
By default, this metric outputs a dictionary that contains :
-**label_distribution** (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label.
--**label_skew** (`scalar`) : the asymmetry of the label distribution.
+-**label_entropy** (`float`) : the Shannon entropy of the label distribution (in nats). Maximized at log(k) for k classes when labels are uniformly distributed, and 0 when all labels are the same.
+-**label_entropy_normalized** (`float`) : the Shannon entropy normalized by log(k), giving a value between 0 and 1. A value of 1.0 means perfectly balanced; a value close to 0 means highly imbalanced.

```python
-{'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_skew': 0.7417688338666573}
+{'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_entropy': 0.8979457248567798, 'label_entropy_normalized': 0.8173454221465101}
```

-If skewness is 0, the dataset is perfectly balanced; if it is less than -1 or greater than 1, the distribution is highly skewed; anything in between can be considered moderately skewed.
+If normalized entropy is 1.0, the dataset is perfectly balanced; values closer to 0 indicate increasing imbalance. Unlike skewness, entropy is permutation-invariant and correctly measures uniformity for categorical variables.
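As a quick sanity check on the example output above (not part of the PR), the entropy and its normalization can be reproduced in a few lines of pure Python; `shannon_entropy` is an illustrative helper, not a function exported by the module:

```python
import math

def shannon_entropy(fractions):
    """Shannon entropy in nats; terms with p == 0 contribute nothing."""
    return -sum(p * math.log(p) for p in fractions if p > 0)

fractions = [0.1, 0.6, 0.3]            # from the example output above
h = shannon_entropy(fractions)
h_norm = h / math.log(len(fractions))  # normalize by log(k), here k = 3
print(round(h, 6))       # 0.897946
print(round(h_norm, 6))  # 0.817345
```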

#### Values from Popular Papers

@@ -60,7 +61,7 @@ Calculating the label distribution of a dataset with binary labels:
>>> distribution = evaluate.load("label_distribution")
>>> results = distribution.compute(data=data)
>>> print(results)
-{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}}
+{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_entropy': 0.6829081047004717, 'label_entropy_normalized': 0.9852281360342515}
```

Calculating the label distribution of the test subset of the [IMDb dataset](https://huggingface.co/datasets/imdb):
@@ -70,9 +71,9 @@ Calculating the label distribution of the test subset of the [IMDb dataset](http
>>> distribution = evaluate.load("label_distribution")
>>> results = distribution.compute(data=imdb['label'])
>>> print(results)
-{'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_skew': 0.0}
+{'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_entropy': 0.6931471805599453, 'label_entropy_normalized': 1.0}
```
-N.B. The IMDb dataset is perfectly balanced.
+N.B. The IMDb dataset is perfectly balanced (normalized entropy = 1.0).
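The uniform case can be verified directly: with k equally likely classes each fraction is 1/k, so the entropy is log(k) and the normalized value is 1.0. A minimal check for the two-class IMDb split:

```python
import math

fractions = [0.5, 0.5]  # two perfectly balanced classes
h = -sum(p * math.log(p) for p in fractions)
print(math.isclose(h, math.log(2)))              # True
print(round(h / math.log(len(fractions)), 12))   # 1.0
```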

The output of the measurement can easily be passed to matplotlib to plot a histogram of each label:

@@ -91,4 +92,4 @@ While label distribution can be a useful signal for analyzing datasets and choos

## Further References
- [Facing Imbalanced Data Recommendations for the Use of Performance Metrics](https://sites.pitt.edu/~jeffcohn/skew/PID2829477.pdf)
-- [Scipy Stats Skew Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html#scipy-stats-skew)
+- [Scipy Stats Entropy Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)
30 changes: 18 additions & 12 deletions measurements/label_distribution/label_distribution.py
@@ -16,14 +16,14 @@
from collections import Counter

import datasets
-import pandas as pd
-from scipy import stats
+import numpy as np
+from scipy.stats import entropy

import evaluate


_DESCRIPTION = """
-Returns the label ratios of the dataset labels, as well as a scalar for skewness.
+Returns the label ratios of the dataset labels, as well as the Shannon entropy of the label distribution.
"""

_KWARGS_DESCRIPTION = """
@@ -32,13 +32,14 @@

Returns:
`label_distribution` (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label.
-    `label_skew` (`scalar`) : the asymmetry of the label distribution.
+    `label_entropy` (`float`) : the Shannon entropy of the label distribution (in nats). Maximized at log(k) for k classes when labels are uniformly distributed, and 0 when all labels are the same.
+    `label_entropy_normalized` (`float`) : the Shannon entropy normalized by log(k), giving a value between 0 and 1. A value of 1.0 means perfectly balanced; a value close to 0 means highly imbalanced.
Examples:
>>> data = [1, 0, 1, 1, 0, 1, 0]
>>> distribution = evaluate.load("label_distribution")
>>> results = distribution.compute(data=data)
>>> print(results)
-    {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_skew': -0.2886751345948127}
+    {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_entropy': 0.6829081047004717, 'label_entropy_normalized': 0.9852281360342515}
"""

_CITATION = """\
@@ -83,11 +84,16 @@ def _info(self):
)

def _compute(self, data):
-    """Returns the fraction of each label present in the data"""
+    """Returns the fraction of each label present in the data and the entropy of the distribution."""
    c = Counter(data)
-    label_distribution = {"labels": [k for k in c.keys()], "fractions": [f / len(data) for f in c.values()]}
-    if isinstance(data[0], str):
-        label2id = {label: id for id, label in enumerate(label_distribution["labels"])}
-        data = [label2id[d] for d in data]
-    skew = stats.skew(data)
-    return {"label_distribution": label_distribution, "label_skew": skew}
+    label_distribution = {"labels": list(c.keys()), "fractions": [f / len(data) for f in c.values()]}
+    label_entropy = float(entropy(label_distribution["fractions"]))
+    if len(c) > 1:
+        label_entropy_normalized = float(label_entropy / np.log(len(c)))
+    else:
+        label_entropy_normalized = 0.0
+    return {
+        "label_distribution": label_distribution,
+        "label_entropy": label_entropy,
+        "label_entropy_normalized": label_entropy_normalized,
+    }
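Outside the diff, the new `_compute` logic can be sketched as a standalone function, substituting `math.log` for `scipy.stats.entropy` (the two agree, since `entropy` computes the same -Σ p·log p in nats on an already-normalized distribution). The snippet also demonstrates the permutation-invariance property the skew-based version lacked:

```python
import math
from collections import Counter

def label_distribution(data):
    """Sketch of the new _compute: fractions, entropy (nats), normalized entropy."""
    c = Counter(data)
    fractions = [n / len(data) for n in c.values()]
    h = -sum(p * math.log(p) for p in fractions if p > 0)
    h_norm = h / math.log(len(c)) if len(c) > 1 else 0.0
    return {
        "label_distribution": {"labels": list(c.keys()), "fractions": fractions},
        "label_entropy": h,
        "label_entropy_normalized": h_norm,
    }

# Same class counts (2, 5, 2) under two different integer labelings:
# scipy.stats.skew on the raw labels would differ, but entropy depends
# only on the counts, so both datasets score identically.
a = label_distribution([0, 0, 1, 1, 1, 1, 1, 2, 2])
b = label_distribution([0, 0, 1, 1, 2, 2, 2, 2, 2])
print(math.isclose(a["label_entropy"], b["label_entropy"]))  # True
```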
97 changes: 97 additions & 0 deletions measurements/label_distribution/test_label_distribution.py
@@ -0,0 +1,97 @@
# Copyright 2025 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for the label_distribution measurement."""

import math
import unittest

from label_distribution import LabelDistribution


measurement = LabelDistribution()


class TestLabelDistribution(unittest.TestCase):
def test_uniform_binary(self):
"""Perfectly balanced binary labels should have normalized entropy of 1.0."""
data = [0, 1, 0, 1, 0, 1]
result = measurement.compute(data=data)
self.assertAlmostEqual(result["label_entropy_normalized"], 1.0)
self.assertAlmostEqual(result["label_entropy"], math.log(2))
self.assertEqual(result["label_distribution"]["fractions"], [0.5, 0.5])

def test_uniform_multiclass(self):
"""Perfectly balanced 3-class labels should have normalized entropy of 1.0."""
data = [0, 1, 2, 0, 1, 2, 0, 1, 2]
result = measurement.compute(data=data)
self.assertAlmostEqual(result["label_entropy_normalized"], 1.0)
self.assertAlmostEqual(result["label_entropy"], math.log(3))

def test_single_class(self):
"""All labels the same should have entropy 0."""
data = [1, 1, 1, 1, 1]
result = measurement.compute(data=data)
self.assertAlmostEqual(result["label_entropy"], 0.0)
self.assertAlmostEqual(result["label_entropy_normalized"], 0.0)

def test_imbalanced(self):
"""Imbalanced labels should have normalized entropy less than 1."""
data = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
result = measurement.compute(data=data)
self.assertGreater(result["label_entropy"], 0.0)
self.assertLess(result["label_entropy_normalized"], 1.0)

def test_permutation_invariance(self):
"""Entropy should be the same regardless of which integer is assigned to which class.

This is the key property that skewness lacked: [0,0,1,1,1,1,1,2,2] and
[0,0,1,1,2,2,2,2,2] have the same class distribution (2,5,2) but
different skewness. Entropy must be identical for both.
"""
data_a = [0, 0, 1, 1, 1, 1, 1, 2, 2]
data_b = [0, 0, 1, 1, 2, 2, 2, 2, 2]
result_a = measurement.compute(data=data_a)
result_b = measurement.compute(data=data_b)
self.assertAlmostEqual(result_a["label_entropy"], result_b["label_entropy"])
self.assertAlmostEqual(result_a["label_entropy_normalized"], result_b["label_entropy_normalized"])

def test_string_labels(self):
"""String labels should work the same as integer labels."""
data = ["cat", "dog", "cat", "cat", "dog"]
result = measurement.compute(data=data)
self.assertGreater(result["label_entropy"], 0.0)
self.assertLess(result["label_entropy_normalized"], 1.0)
self.assertIn("cat", result["label_distribution"]["labels"])
self.assertIn("dog", result["label_distribution"]["labels"])

def test_output_keys(self):
"""Output should contain label_distribution, label_entropy, and label_entropy_normalized."""
data = [0, 1, 2]
result = measurement.compute(data=data)
self.assertIn("label_distribution", result)
self.assertIn("label_entropy", result)
self.assertIn("label_entropy_normalized", result)
self.assertIn("labels", result["label_distribution"])
self.assertIn("fractions", result["label_distribution"])

def test_fractions_sum_to_one(self):
"""Label fractions should always sum to 1."""
data = [0, 0, 1, 2, 2, 2, 3]
result = measurement.compute(data=data)
self.assertAlmostEqual(sum(result["label_distribution"]["fractions"]), 1.0)


if __name__ == "__main__":
unittest.main()