Skip to content

Commit

Permalink
Update notebooks and experimental results (#12)
Browse files Browse the repository at this point in the history
* added plots to compare calibration errors

* updated plotting notebook

* added plots for comparison between prompting schemes
  • Loading branch information
AndreFCruz authored Sep 4, 2024
1 parent e78de48 commit 02f1db5
Show file tree
Hide file tree
Showing 19 changed files with 1,079 additions and 651 deletions.
1,450 changes: 799 additions & 651 deletions notebooks/paper-plots-and-tables.ipynb

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
% Per-model metrics (ece, brier score loss, roc auc, accuracy) for two prompting schemes:
% columns 2-5 are the "mult. choice" scheme, columns 6-9 the "num" scheme.
% NOTE(review): the second group was unlabeled in the generated output; it is labeled "num"
% because its values equal the "(num)" columns of the companion table -- confirm in the notebook.
% Shading: cyan appears to mark the best and orange the worst LLM values per column
% (intensity capped at 25) -- presumably rank-scaled; verify against the generating code.
% The LR / GBM / XGBoost rows carry the same values in both groups and are never shaded.
% Requires booktabs (\toprule, \cmidrule, ...) and colortbl or xcolor's table option (\cellcolor).
\begin{tabular}{lllllllll}
\toprule
 & \multicolumn{4}{c}{mult. choice} & \multicolumn{4}{c}{num} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Model & ece & brier score loss & roc auc & accuracy & ece & brier score loss & roc auc & accuracy \\
\midrule
GPT 4o mini (it) & 0.28 & 0.29 & 0.79 & 0.65 & 0.23 & 0.23 & 0.80 & 0.73 \\
Llama 3 70B (it) & 0.17 & \cellcolor{cyan!17.1} 0.19 & \cellcolor{cyan!23.3} 0.85 & \cellcolor{cyan!15.4} 0.73 & \cellcolor{cyan!0.7} 0.05 & \cellcolor{cyan!25.0} 0.14 & \cellcolor{cyan!25.0} 0.88 & \cellcolor{cyan!25.0} 0.81 \\
Llama 3 70B & 0.25 & 0.26 & \cellcolor{cyan!2.3} 0.82 & 0.52 & 0.05 & \cellcolor{cyan!16.7} 0.15 & \cellcolor{cyan!13.6} 0.86 & \cellcolor{cyan!5.8} 0.78 \\
Llama 3 8B (it) & 0.07 & \cellcolor{cyan!25.0} 0.19 & 0.79 & \cellcolor{cyan!25.0} 0.74 & 0.08 & 0.17 & 0.82 & 0.77 \\
Llama 3 8B & 0.34 & 0.34 & 0.76 & \cellcolor{orange!25.0} 0.45 & 0.15 & 0.23 & 0.75 & \cellcolor{orange!25.0} 0.46 \\
Mixtral 8x22B (it) & \cellcolor{orange!23.0} 0.38 & \cellcolor{orange!8.0} 0.39 & 0.60 & 0.51 & 0.06 & \cellcolor{cyan!21.9} 0.14 & \cellcolor{cyan!24.4} 0.87 & \cellcolor{cyan!16.5} 0.79 \\
Mixtral 8x22B & 0.21 & 0.24 & \cellcolor{cyan!25.0} 0.86 & 0.52 & 0.15 & 0.18 & 0.82 & \cellcolor{cyan!20.7} 0.80 \\
Mixtral 8x7B (it) & 0.22 & 0.24 & \cellcolor{cyan!6.8} 0.82 & \cellcolor{cyan!13.7} 0.73 & 0.07 & \cellcolor{cyan!16.7} 0.15 & \cellcolor{cyan!23.2} 0.87 & \cellcolor{cyan!7.2} 0.78 \\
Mixtral 8x7B & 0.30 & 0.31 & \cellcolor{cyan!1.1} 0.81 & \cellcolor{orange!24.1} 0.45 & 0.08 & 0.17 & 0.81 & 0.73 \\
Mistral 7B (it) & \cellcolor{orange!6.6} 0.35 & 0.36 & 0.72 & 0.63 & \cellcolor{cyan!7.4} 0.04 & 0.19 & 0.79 & 0.69 \\
Mistral 7B & 0.26 & 0.30 & 0.76 & \cellcolor{orange!25.0} 0.45 & 0.14 & 0.19 & 0.80 & \cellcolor{cyan!12.9} 0.79 \\
Yi 34B (it) & 0.14 & 0.21 & 0.79 & 0.69 & 0.15 & 0.21 & 0.81 & 0.51 \\
Yi 34B & 0.08 & 0.23 & 0.70 & 0.62 & 0.13 & 0.23 & 0.66 & 0.50 \\
Gemma 7B (it) & \cellcolor{orange!10.0} 0.36 & 0.38 & 0.59 & 0.58 & \cellcolor{cyan!8.8} 0.04 & 0.22 & 0.71 & 0.60 \\
Gemma 7B & 0.15 & 0.25 & 0.65 & \cellcolor{orange!4.1} 0.48 & \cellcolor{orange!25.0} 0.35 & \cellcolor{orange!25.0} 0.38 & \cellcolor{orange!1.6} 0.50 & 0.51 \\
Gemma 2B (it) & \cellcolor{orange!25.0} 0.38 & \cellcolor{orange!25.0} 0.41 & \cellcolor{orange!25.0} 0.42 & \cellcolor{orange!16.3} 0.46 & 0.12 & 0.27 & \cellcolor{orange!25.0} 0.46 & \cellcolor{orange!25.0} 0.46 \\
Gemma 2B & \cellcolor{cyan!25.0} 0.01 & 0.24 & 0.63 & 0.54 & \cellcolor{cyan!25.0} 0.01 & 0.23 & 0.57 & 0.53 \\
LR & 0.02 & 0.15 & 0.86 & 0.78 & 0.02 & 0.15 & 0.86 & 0.78 \\
GBM & 0.00 & 0.12 & 0.91 & 0.83 & 0.00 & 0.12 & 0.91 & 0.83 \\
XGBoost & 0.00 & 0.12 & 0.91 & 0.83 & 0.00 & 0.12 & 0.91 & 0.83 \\
\bottomrule
\end{tabular}

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
% Per-model metrics (ece, brier score loss, roc auc, accuracy) for two prompting schemes:
% columns 2-5 are the "num" scheme, columns 6-9 the "mult. choice" scheme.
% NOTE(review): the second group was unlabeled in the generated output; it is labeled
% "mult. choice" because its values equal the "(mult. choice)" columns of the companion
% table -- confirm in the notebook.
% Shading: cyan appears to mark the best and orange the worst LLM values per column
% (intensity capped at 25) -- presumably rank-scaled; verify against the generating code.
% The LR / GBM / XGBoost rows carry the same values in both groups and are never shaded.
% Requires booktabs (\toprule, \cmidrule, ...) and colortbl or xcolor's table option (\cellcolor).
\begin{tabular}{lllllllll}
\toprule
 & \multicolumn{4}{c}{num} & \multicolumn{4}{c}{mult. choice} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Model & ece & brier score loss & roc auc & accuracy & ece & brier score loss & roc auc & accuracy \\
\midrule
GPT 4o mini (it) & 0.23 & 0.23 & 0.80 & 0.73 & 0.28 & 0.29 & 0.79 & 0.65 \\
Llama 3 70B (it) & \cellcolor{cyan!0.7} 0.05 & \cellcolor{cyan!25.0} 0.14 & \cellcolor{cyan!25.0} 0.88 & \cellcolor{cyan!25.0} 0.81 & 0.17 & \cellcolor{cyan!17.1} 0.19 & \cellcolor{cyan!23.3} 0.85 & \cellcolor{cyan!15.4} 0.73 \\
Llama 3 70B & 0.05 & \cellcolor{cyan!16.7} 0.15 & \cellcolor{cyan!13.6} 0.86 & \cellcolor{cyan!5.8} 0.78 & 0.25 & 0.26 & \cellcolor{cyan!2.3} 0.82 & 0.52 \\
Llama 3 8B (it) & 0.08 & 0.17 & 0.82 & 0.77 & 0.07 & \cellcolor{cyan!25.0} 0.19 & 0.79 & \cellcolor{cyan!25.0} 0.74 \\
Llama 3 8B & 0.15 & 0.23 & 0.75 & \cellcolor{orange!25.0} 0.46 & 0.34 & 0.34 & 0.76 & \cellcolor{orange!25.0} 0.45 \\
Mixtral 8x22B (it) & 0.06 & \cellcolor{cyan!21.9} 0.14 & \cellcolor{cyan!24.4} 0.87 & \cellcolor{cyan!16.5} 0.79 & \cellcolor{orange!23.0} 0.38 & \cellcolor{orange!8.0} 0.39 & 0.60 & 0.51 \\
Mixtral 8x22B & 0.15 & 0.18 & 0.82 & \cellcolor{cyan!20.7} 0.80 & 0.21 & 0.24 & \cellcolor{cyan!25.0} 0.86 & 0.52 \\
Mixtral 8x7B (it) & 0.07 & \cellcolor{cyan!16.7} 0.15 & \cellcolor{cyan!23.2} 0.87 & \cellcolor{cyan!7.2} 0.78 & 0.22 & 0.24 & \cellcolor{cyan!6.8} 0.82 & \cellcolor{cyan!13.7} 0.73 \\
Mixtral 8x7B & 0.08 & 0.17 & 0.81 & 0.73 & 0.30 & 0.31 & \cellcolor{cyan!1.1} 0.81 & \cellcolor{orange!24.1} 0.45 \\
Mistral 7B (it) & \cellcolor{cyan!7.4} 0.04 & 0.19 & 0.79 & 0.69 & \cellcolor{orange!6.6} 0.35 & 0.36 & 0.72 & 0.63 \\
Mistral 7B & 0.14 & 0.19 & 0.80 & \cellcolor{cyan!12.9} 0.79 & 0.26 & 0.30 & 0.76 & \cellcolor{orange!25.0} 0.45 \\
Yi 34B (it) & 0.15 & 0.21 & 0.81 & 0.51 & 0.14 & 0.21 & 0.79 & 0.69 \\
Yi 34B & 0.13 & 0.23 & 0.66 & 0.50 & 0.08 & 0.23 & 0.70 & 0.62 \\
Gemma 7B (it) & \cellcolor{cyan!8.8} 0.04 & 0.22 & 0.71 & 0.60 & \cellcolor{orange!10.0} 0.36 & 0.38 & 0.59 & 0.58 \\
Gemma 7B & \cellcolor{orange!25.0} 0.35 & \cellcolor{orange!25.0} 0.38 & \cellcolor{orange!1.6} 0.50 & 0.51 & 0.15 & 0.25 & 0.65 & \cellcolor{orange!4.1} 0.48 \\
Gemma 2B (it) & 0.12 & 0.27 & \cellcolor{orange!25.0} 0.46 & \cellcolor{orange!25.0} 0.46 & \cellcolor{orange!25.0} 0.38 & \cellcolor{orange!25.0} 0.41 & \cellcolor{orange!25.0} 0.42 & \cellcolor{orange!16.3} 0.46 \\
Gemma 2B & \cellcolor{cyan!25.0} 0.01 & 0.23 & 0.57 & 0.53 & \cellcolor{cyan!25.0} 0.01 & 0.24 & 0.63 & 0.54 \\
LR & 0.02 & 0.15 & 0.86 & 0.78 & 0.02 & 0.15 & 0.86 & 0.78 \\
GBM & 0.00 & 0.12 & 0.91 & 0.83 & 0.00 & 0.12 & 0.91 & 0.83 \\
XGBoost & 0.00 & 0.12 & 0.91 & 0.83 & 0.00 & 0.12 & 0.91 & 0.83 \\
\bottomrule
\end{tabular}

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
% Per-model metrics (ece, brier score loss, roc auc, accuracy) for two prompting schemes:
% columns 2-5 are the "mult. choice" scheme, columns 6-9 the "num" scheme.
% NOTE(review): the second group was unlabeled in the generated output; it is labeled "num"
% because its values equal the "(num)" columns of the companion table -- confirm in the notebook.
% Shading: cyan appears to mark the best and orange the worst LLM values per column
% (intensity capped at 25) -- presumably rank-scaled; verify against the generating code.
% The LR / GBM / XGBoost rows carry the same values in both groups and are never shaded.
% Requires booktabs (\toprule, \cmidrule, ...) and colortbl or xcolor's table option (\cellcolor).
\begin{tabular}{lllllllll}
\toprule
 & \multicolumn{4}{c}{mult. choice} & \multicolumn{4}{c}{num} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Model & ece & brier score loss & roc auc & accuracy & ece & brier score loss & roc auc & accuracy \\
\midrule
GPT 4o mini (it) & 0.24 & 0.24 & \cellcolor{cyan!17.6} 0.85 & 0.74 & \cellcolor{cyan!25.0} 0.05 & \cellcolor{cyan!25.0} 0.16 & \cellcolor{cyan!20.6} 0.83 & \cellcolor{cyan!24.4} 0.78 \\
Llama 3 70B (it) & 0.27 & 0.27 & \cellcolor{cyan!25.0} 0.86 & 0.69 & 0.25 & 0.23 & \cellcolor{cyan!22.1} 0.84 & 0.67 \\
Llama 3 70B & 0.20 & \cellcolor{cyan!14.9} 0.20 & \cellcolor{cyan!20.8} 0.86 & 0.70 & 0.27 & 0.24 & \cellcolor{cyan!6.7} 0.82 & 0.54 \\
Llama 3 8B (it) & 0.32 & 0.30 & \cellcolor{cyan!13.3} 0.85 & 0.62 & 0.23 & 0.23 & \cellcolor{cyan!0.1} 0.81 & 0.67 \\
Llama 3 8B & 0.25 & 0.26 & 0.81 & \cellcolor{orange!20.8} 0.38 & 0.14 & 0.24 & 0.63 & \cellcolor{orange!3.7} 0.40 \\
Mixtral 8x22B (it) & 0.21 & \cellcolor{cyan!3.6} 0.22 & \cellcolor{cyan!11.2} 0.85 & \cellcolor{cyan!11.1} 0.76 & 0.11 & \cellcolor{cyan!15.5} 0.17 & \cellcolor{cyan!25.0} 0.84 & \cellcolor{cyan!18.9} 0.77 \\
Mixtral 8x22B & \cellcolor{cyan!13.2} 0.17 & \cellcolor{cyan!21.6} 0.19 & \cellcolor{cyan!16.5} 0.85 & 0.68 & 0.13 & \cellcolor{cyan!3.6} 0.18 & \cellcolor{cyan!9.6} 0.82 & \cellcolor{cyan!2.4} 0.74 \\
Mixtral 8x7B (it) & \cellcolor{cyan!16.8} 0.16 & \cellcolor{cyan!25.0} 0.18 & \cellcolor{cyan!19.7} 0.86 & \cellcolor{cyan!25.0} 0.78 & 0.10 & \cellcolor{cyan!15.5} 0.17 & \cellcolor{cyan!22.8} 0.84 & \cellcolor{cyan!12.8} 0.76 \\
Mixtral 8x7B & \cellcolor{cyan!10.6} 0.17 & \cellcolor{cyan!11.5} 0.21 & 0.83 & 0.65 & \cellcolor{cyan!4.0} 0.07 & \cellcolor{cyan!13.1} 0.17 & \cellcolor{cyan!3.7} 0.81 & \cellcolor{cyan!25.0} 0.78 \\
Mistral 7B (it) & 0.21 & \cellcolor{cyan!4.7} 0.22 & 0.83 & \cellcolor{cyan!16.5} 0.77 & 0.16 & 0.19 & \cellcolor{cyan!14.7} 0.83 & 0.70 \\
Mistral 7B & 0.20 & 0.23 & 0.80 & 0.73 & \cellcolor{orange!15.7} 0.36 & 0.32 & 0.75 & 0.49 \\
Yi 34B (it) & \cellcolor{cyan!1.3} 0.19 & \cellcolor{cyan!18.8} 0.19 & \cellcolor{cyan!19.7} 0.86 & 0.72 & 0.22 & 0.21 & 0.80 & 0.48 \\
Yi 34B & 0.25 & \cellcolor{cyan!2.5} 0.22 & \cellcolor{cyan!13.3} 0.85 & 0.62 & 0.15 & 0.19 & \cellcolor{cyan!17.7} 0.83 & 0.61 \\
Gemma 7B (it) & \cellcolor{orange!14.7} 0.61 & \cellcolor{orange!4.7} 0.59 & \cellcolor{cyan!3.8} 0.84 & \cellcolor{orange!25.0} 0.37 & 0.33 & 0.30 & 0.78 & 0.42 \\
Gemma 7B & 0.24 & 0.27 & 0.76 & \cellcolor{orange!25.0} 0.37 & 0.15 & 0.20 & 0.80 & 0.73 \\
Gemma 2B (it) & \cellcolor{orange!25.0} 0.63 & \cellcolor{orange!25.0} 0.63 & 0.73 & \cellcolor{orange!25.0} 0.37 & 0.28 & 0.31 & \cellcolor{orange!25.0} 0.50 & \cellcolor{orange!25.0} 0.37 \\
Gemma 2B & \cellcolor{cyan!25.0} 0.14 & 0.25 & \cellcolor{orange!25.0} 0.62 & 0.45 & \cellcolor{orange!25.0} 0.37 & \cellcolor{orange!25.0} 0.37 & \cellcolor{orange!25.0} 0.50 & 0.63 \\
LR & 0.03 & 0.18 & 0.79 & 0.74 & 0.03 & 0.18 & 0.79 & 0.74 \\
GBM & 0.01 & 0.13 & 0.89 & 0.81 & 0.01 & 0.13 & 0.89 & 0.81 \\
XGBoost & 0.00 & 0.13 & 0.90 & 0.82 & 0.00 & 0.13 & 0.90 & 0.82 \\
\bottomrule
\end{tabular}

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
% Per-model metrics (ece, brier score loss, roc auc, accuracy) for two prompting schemes:
% columns 2-5 are the "num" scheme, columns 6-9 the "mult. choice" scheme.
% NOTE(review): the second group was unlabeled in the generated output; it is labeled
% "mult. choice" because its values equal the "(mult. choice)" columns of the companion
% table -- confirm in the notebook.
% Shading: cyan appears to mark the best and orange the worst LLM values per column
% (intensity capped at 25) -- presumably rank-scaled; verify against the generating code.
% The LR / GBM / XGBoost rows carry the same values in both groups and are never shaded.
% Requires booktabs (\toprule, \cmidrule, ...) and colortbl or xcolor's table option (\cellcolor).
\begin{tabular}{lllllllll}
\toprule
 & \multicolumn{4}{c}{num} & \multicolumn{4}{c}{mult. choice} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Model & ece & brier score loss & roc auc & accuracy & ece & brier score loss & roc auc & accuracy \\
\midrule
GPT 4o mini (it) & \cellcolor{cyan!25.0} 0.05 & \cellcolor{cyan!25.0} 0.16 & \cellcolor{cyan!20.6} 0.83 & \cellcolor{cyan!24.4} 0.78 & 0.24 & 0.24 & \cellcolor{cyan!17.6} 0.85 & 0.74 \\
Llama 3 70B (it) & 0.25 & 0.23 & \cellcolor{cyan!22.1} 0.84 & 0.67 & 0.27 & 0.27 & \cellcolor{cyan!25.0} 0.86 & 0.69 \\
Llama 3 70B & 0.27 & 0.24 & \cellcolor{cyan!6.7} 0.82 & 0.54 & 0.20 & \cellcolor{cyan!14.9} 0.20 & \cellcolor{cyan!20.8} 0.86 & 0.70 \\
Llama 3 8B (it) & 0.23 & 0.23 & \cellcolor{cyan!0.1} 0.81 & 0.67 & 0.32 & 0.30 & \cellcolor{cyan!13.3} 0.85 & 0.62 \\
Llama 3 8B & 0.14 & 0.24 & 0.63 & \cellcolor{orange!3.7} 0.40 & 0.25 & 0.26 & 0.81 & \cellcolor{orange!20.8} 0.38 \\
Mixtral 8x22B (it) & 0.11 & \cellcolor{cyan!15.5} 0.17 & \cellcolor{cyan!25.0} 0.84 & \cellcolor{cyan!18.9} 0.77 & 0.21 & \cellcolor{cyan!3.6} 0.22 & \cellcolor{cyan!11.2} 0.85 & \cellcolor{cyan!11.1} 0.76 \\
Mixtral 8x22B & 0.13 & \cellcolor{cyan!3.6} 0.18 & \cellcolor{cyan!9.6} 0.82 & \cellcolor{cyan!2.4} 0.74 & \cellcolor{cyan!13.2} 0.17 & \cellcolor{cyan!21.6} 0.19 & \cellcolor{cyan!16.5} 0.85 & 0.68 \\
Mixtral 8x7B (it) & 0.10 & \cellcolor{cyan!15.5} 0.17 & \cellcolor{cyan!22.8} 0.84 & \cellcolor{cyan!12.8} 0.76 & \cellcolor{cyan!16.8} 0.16 & \cellcolor{cyan!25.0} 0.18 & \cellcolor{cyan!19.7} 0.86 & \cellcolor{cyan!25.0} 0.78 \\
Mixtral 8x7B & \cellcolor{cyan!4.0} 0.07 & \cellcolor{cyan!13.1} 0.17 & \cellcolor{cyan!3.7} 0.81 & \cellcolor{cyan!25.0} 0.78 & \cellcolor{cyan!10.6} 0.17 & \cellcolor{cyan!11.5} 0.21 & 0.83 & 0.65 \\
Mistral 7B (it) & 0.16 & 0.19 & \cellcolor{cyan!14.7} 0.83 & 0.70 & 0.21 & \cellcolor{cyan!4.7} 0.22 & 0.83 & \cellcolor{cyan!16.5} 0.77 \\
Mistral 7B & \cellcolor{orange!15.7} 0.36 & 0.32 & 0.75 & 0.49 & 0.20 & 0.23 & 0.80 & 0.73 \\
Yi 34B (it) & 0.22 & 0.21 & 0.80 & 0.48 & \cellcolor{cyan!1.3} 0.19 & \cellcolor{cyan!18.8} 0.19 & \cellcolor{cyan!19.7} 0.86 & 0.72 \\
Yi 34B & 0.15 & 0.19 & \cellcolor{cyan!17.7} 0.83 & 0.61 & 0.25 & \cellcolor{cyan!2.5} 0.22 & \cellcolor{cyan!13.3} 0.85 & 0.62 \\
Gemma 7B (it) & 0.33 & 0.30 & 0.78 & 0.42 & \cellcolor{orange!14.7} 0.61 & \cellcolor{orange!4.7} 0.59 & \cellcolor{cyan!3.8} 0.84 & \cellcolor{orange!25.0} 0.37 \\
Gemma 7B & 0.15 & 0.20 & 0.80 & 0.73 & 0.24 & 0.27 & 0.76 & \cellcolor{orange!25.0} 0.37 \\
Gemma 2B (it) & 0.28 & 0.31 & \cellcolor{orange!25.0} 0.50 & \cellcolor{orange!25.0} 0.37 & \cellcolor{orange!25.0} 0.63 & \cellcolor{orange!25.0} 0.63 & 0.73 & \cellcolor{orange!25.0} 0.37 \\
Gemma 2B & \cellcolor{orange!25.0} 0.37 & \cellcolor{orange!25.0} 0.37 & \cellcolor{orange!25.0} 0.50 & 0.63 & \cellcolor{cyan!25.0} 0.14 & 0.25 & \cellcolor{orange!25.0} 0.62 & 0.45 \\
LR & 0.03 & 0.18 & 0.79 & 0.74 & 0.03 & 0.18 & 0.79 & 0.74 \\
GBM & 0.01 & 0.13 & 0.89 & 0.81 & 0.01 & 0.13 & 0.89 & 0.81 \\
XGBoost & 0.00 & 0.13 & 0.90 & 0.82 & 0.00 & 0.13 & 0.90 & 0.82 \\
\bottomrule
\end{tabular}

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
% Per-model metrics (ece, brier score loss, roc auc, accuracy) for two prompting schemes:
% columns 2-5 are the "mult. choice" scheme, columns 6-9 the "num" scheme.
% NOTE(review): the second group was unlabeled in the generated output; it is labeled "num"
% because its values equal the "(num)" columns of the companion table -- confirm in the notebook.
% Shading: cyan appears to mark the best and orange the worst LLM values per column
% (intensity capped at 25) -- presumably rank-scaled; verify against the generating code.
% The LR / GBM / XGBoost rows carry the same values in both groups and are never shaded.
% Requires booktabs (\toprule, \cmidrule, ...) and colortbl or xcolor's table option (\cellcolor).
\begin{tabular}{lllllllll}
\toprule
 & \multicolumn{4}{c}{mult. choice} & \multicolumn{4}{c}{num} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Model & ece & brier score loss & roc auc & accuracy & ece & brier score loss & roc auc & accuracy \\
\midrule
GPT 4o mini (it) & 0.26 & 0.26 & \cellcolor{cyan!0.6} 0.57 & \cellcolor{cyan!25.0} 0.73 & 0.22 & 0.25 & 0.49 & \cellcolor{cyan!25.0} 0.73 \\
Llama 3 70B (it) & 0.20 & \cellcolor{cyan!1.5} 0.25 & 0.57 & 0.58 & 0.05 & \cellcolor{cyan!10.7} 0.20 & 0.52 & \cellcolor{cyan!25.0} 0.73 \\
Llama 3 70B & 0.22 & \cellcolor{cyan!4.8} 0.24 & 0.55 & 0.53 & 0.06 & \cellcolor{cyan!7.1} 0.20 & 0.53 & \cellcolor{cyan!20.7} 0.73 \\
Llama 3 8B (it) & 0.15 & \cellcolor{cyan!13.7} 0.22 & 0.56 & \cellcolor{cyan!5.3} 0.70 & 0.11 & 0.21 & 0.49 & \cellcolor{cyan!23.4} 0.73 \\
Llama 3 8B & \cellcolor{cyan!12.5} 0.10 & \cellcolor{cyan!21.7} 0.20 & 0.55 & \cellcolor{cyan!24.5} 0.73 & 0.14 & 0.21 & 0.51 & \cellcolor{cyan!19.1} 0.72 \\
Mixtral 8x22B (it) & 0.40 & 0.40 & \cellcolor{orange!12.8} 0.51 & 0.39 & 0.05 & \cellcolor{cyan!17.9} 0.20 & \cellcolor{cyan!25.0} 0.54 & \cellcolor{cyan!25.0} 0.73 \\
Mixtral 8x22B & \cellcolor{cyan!10.2} 0.11 & \cellcolor{cyan!21.2} 0.21 & 0.55 & \cellcolor{cyan!25.0} 0.73 & 0.13 & 0.22 & 0.49 & \cellcolor{cyan!25.0} 0.73 \\
Mixtral 8x7B (it) & 0.26 & 0.26 & \cellcolor{cyan!12.8} 0.58 & \cellcolor{cyan!25.0} 0.73 & 0.11 & 0.21 & 0.51 & \cellcolor{cyan!25.0} 0.73 \\
Mixtral 8x7B & \cellcolor{cyan!0.8} 0.14 & \cellcolor{cyan!18.9} 0.21 & 0.57 & \cellcolor{cyan!25.0} 0.73 & 0.24 & 0.25 & \cellcolor{orange!25.0} 0.48 & \cellcolor{cyan!25.0} 0.73 \\
Mistral 7B (it) & 0.26 & 0.26 & 0.57 & \cellcolor{cyan!25.0} 0.73 & 0.17 & 0.23 & 0.49 & \cellcolor{cyan!24.5} 0.73 \\
Mistral 7B & 0.20 & \cellcolor{cyan!8.6} 0.23 & 0.53 & \cellcolor{cyan!23.4} 0.73 & \cellcolor{orange!25.0} 0.27 & \cellcolor{orange!25.0} 0.27 & 0.50 & \cellcolor{cyan!25.0} 0.73 \\
Yi 34B (it) & \cellcolor{cyan!18.2} 0.09 & \cellcolor{cyan!23.1} 0.20 & 0.56 & \cellcolor{cyan!15.4} 0.72 & 0.23 & 0.25 & 0.50 & \cellcolor{orange!25.0} 0.27 \\
Yi 34B & \cellcolor{cyan!25.0} 0.07 & \cellcolor{cyan!25.0} 0.20 & 0.57 & \cellcolor{cyan!20.7} 0.73 & 0.15 & 0.23 & 0.52 & 0.44 \\
Gemma 7B (it) & 0.25 & 0.26 & \cellcolor{cyan!25.0} 0.58 & \cellcolor{cyan!21.8} 0.73 & \cellcolor{orange!8.9} 0.25 & 0.26 & \cellcolor{orange!3.1} 0.49 & \cellcolor{cyan!22.3} 0.73 \\
Gemma 7B & 0.41 & 0.37 & \cellcolor{orange!25.0} 0.50 & \cellcolor{orange!25.0} 0.27 & 0.19 & 0.24 & 0.49 & \cellcolor{cyan!25.0} 0.73 \\
Gemma 2B (it) & \cellcolor{orange!25.0} 0.73 & \cellcolor{orange!25.0} 0.73 & 0.52 & \cellcolor{orange!25.0} 0.27 & \cellcolor{cyan!25.0} 0.02 & \cellcolor{cyan!25.0} 0.20 & 0.50 & \cellcolor{cyan!25.0} 0.73 \\
Gemma 2B & 0.25 & 0.26 & 0.51 & 0.34 & \cellcolor{orange!25.0} 0.27 & \cellcolor{orange!25.0} 0.27 & 0.50 & \cellcolor{cyan!25.0} 0.73 \\
LR & 0.02 & 0.19 & 0.61 & 0.74 & 0.02 & 0.19 & 0.61 & 0.74 \\
GBM & 0.01 & 0.17 & 0.74 & 0.76 & 0.01 & 0.17 & 0.74 & 0.76 \\
XGBoost & 0.00 & 0.16 & 0.74 & 0.76 & 0.00 & 0.16 & 0.74 & 0.76 \\
\bottomrule
\end{tabular}

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
% Per-model metrics (ece, brier score loss, roc auc, accuracy) for two prompting schemes:
% columns 2-5 are the "num" scheme, columns 6-9 the "mult. choice" scheme.
% NOTE(review): the second group was unlabeled in the generated output; it is labeled
% "mult. choice" because its values equal the "(mult. choice)" columns of the companion
% table -- confirm in the notebook.
% Shading: cyan appears to mark the best and orange the worst LLM values per column
% (intensity capped at 25) -- presumably rank-scaled; verify against the generating code.
% The LR / GBM / XGBoost rows carry the same values in both groups and are never shaded.
% Requires booktabs (\toprule, \cmidrule, ...) and colortbl or xcolor's table option (\cellcolor).
\begin{tabular}{lllllllll}
\toprule
 & \multicolumn{4}{c}{num} & \multicolumn{4}{c}{mult. choice} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Model & ece & brier score loss & roc auc & accuracy & ece & brier score loss & roc auc & accuracy \\
\midrule
GPT 4o mini (it) & 0.22 & 0.25 & 0.49 & \cellcolor{cyan!25.0} 0.73 & 0.26 & 0.26 & \cellcolor{cyan!0.6} 0.57 & \cellcolor{cyan!25.0} 0.73 \\
Llama 3 70B (it) & 0.05 & \cellcolor{cyan!10.7} 0.20 & 0.52 & \cellcolor{cyan!25.0} 0.73 & 0.20 & \cellcolor{cyan!1.5} 0.25 & 0.57 & 0.58 \\
Llama 3 70B & 0.06 & \cellcolor{cyan!7.1} 0.20 & 0.53 & \cellcolor{cyan!20.7} 0.73 & 0.22 & \cellcolor{cyan!4.8} 0.24 & 0.55 & 0.53 \\
Llama 3 8B (it) & 0.11 & 0.21 & 0.49 & \cellcolor{cyan!23.4} 0.73 & 0.15 & \cellcolor{cyan!13.7} 0.22 & 0.56 & \cellcolor{cyan!5.3} 0.70 \\
Llama 3 8B & 0.14 & 0.21 & 0.51 & \cellcolor{cyan!19.1} 0.72 & \cellcolor{cyan!12.5} 0.10 & \cellcolor{cyan!21.7} 0.20 & 0.55 & \cellcolor{cyan!24.5} 0.73 \\
Mixtral 8x22B (it) & 0.05 & \cellcolor{cyan!17.9} 0.20 & \cellcolor{cyan!25.0} 0.54 & \cellcolor{cyan!25.0} 0.73 & 0.40 & 0.40 & \cellcolor{orange!12.8} 0.51 & 0.39 \\
Mixtral 8x22B & 0.13 & 0.22 & 0.49 & \cellcolor{cyan!25.0} 0.73 & \cellcolor{cyan!10.2} 0.11 & \cellcolor{cyan!21.2} 0.21 & 0.55 & \cellcolor{cyan!25.0} 0.73 \\
Mixtral 8x7B (it) & 0.11 & 0.21 & 0.51 & \cellcolor{cyan!25.0} 0.73 & 0.26 & 0.26 & \cellcolor{cyan!12.8} 0.58 & \cellcolor{cyan!25.0} 0.73 \\
Mixtral 8x7B & 0.24 & 0.25 & \cellcolor{orange!25.0} 0.48 & \cellcolor{cyan!25.0} 0.73 & \cellcolor{cyan!0.8} 0.14 & \cellcolor{cyan!18.9} 0.21 & 0.57 & \cellcolor{cyan!25.0} 0.73 \\
Mistral 7B (it) & 0.17 & 0.23 & 0.49 & \cellcolor{cyan!24.5} 0.73 & 0.26 & 0.26 & 0.57 & \cellcolor{cyan!25.0} 0.73 \\
Mistral 7B & \cellcolor{orange!25.0} 0.27 & \cellcolor{orange!25.0} 0.27 & 0.50 & \cellcolor{cyan!25.0} 0.73 & 0.20 & \cellcolor{cyan!8.6} 0.23 & 0.53 & \cellcolor{cyan!23.4} 0.73 \\
Yi 34B (it) & 0.23 & 0.25 & 0.50 & \cellcolor{orange!25.0} 0.27 & \cellcolor{cyan!18.2} 0.09 & \cellcolor{cyan!23.1} 0.20 & 0.56 & \cellcolor{cyan!15.4} 0.72 \\
Yi 34B & 0.15 & 0.23 & 0.52 & 0.44 & \cellcolor{cyan!25.0} 0.07 & \cellcolor{cyan!25.0} 0.20 & 0.57 & \cellcolor{cyan!20.7} 0.73 \\
Gemma 7B (it) & \cellcolor{orange!8.9} 0.25 & 0.26 & \cellcolor{orange!3.1} 0.49 & \cellcolor{cyan!22.3} 0.73 & 0.25 & 0.26 & \cellcolor{cyan!25.0} 0.58 & \cellcolor{cyan!21.8} 0.73 \\
Gemma 7B & 0.19 & 0.24 & 0.49 & \cellcolor{cyan!25.0} 0.73 & 0.41 & 0.37 & \cellcolor{orange!25.0} 0.50 & \cellcolor{orange!25.0} 0.27 \\
Gemma 2B (it) & \cellcolor{cyan!25.0} 0.02 & \cellcolor{cyan!25.0} 0.20 & 0.50 & \cellcolor{cyan!25.0} 0.73 & \cellcolor{orange!25.0} 0.73 & \cellcolor{orange!25.0} 0.73 & 0.52 & \cellcolor{orange!25.0} 0.27 \\
Gemma 2B & \cellcolor{orange!25.0} 0.27 & \cellcolor{orange!25.0} 0.27 & 0.50 & \cellcolor{cyan!25.0} 0.73 & 0.25 & 0.26 & 0.51 & 0.34 \\
LR & 0.02 & 0.19 & 0.61 & 0.74 & 0.02 & 0.19 & 0.61 & 0.74 \\
GBM & 0.01 & 0.17 & 0.74 & 0.76 & 0.01 & 0.17 & 0.74 & 0.76 \\
XGBoost & 0.00 & 0.16 & 0.74 & 0.76 & 0.00 & 0.16 & 0.74 & 0.76 \\
\bottomrule
\end{tabular}

Loading

0 comments on commit 02f1db5

Please sign in to comment.