|
2 | 2 |
|
3 | 3 | **Learning objectives:** |
4 | 4 |
|
5 | | -- THESE ARE NICE TO HAVE BUT NOT ABSOLUTELY NECESSARY |
| 5 | +- Plot strike zone |
| 6 | +- Learn how to model called strike probability |
| 7 | +- Learn how to model catcher framing |
6 | 8 |
|
7 | | -## SLIDE 1 {-} |
| 9 | +## Background |
8 | 10 |
|
9 | | -- ADD SLIDES AS SECTIONS (`##`). |
10 | | -- TRY TO KEEP THEM RELATIVELY SLIDE-LIKE; THESE ARE NOTES, NOT THE BOOK ITSELF. |
| 11 | +- "Historically, scouts and coaches insisted that certain catchers had the ability to “frame” pitches for umpires. The idea was that by holding the glove relatively still, you could trick the umpire into calling a pitch a strike even if it was technically outside of the strike zone" |
| 12 | +- "Part of the problem was that until the mid-2000s, pitch-level data was hard to come by. With the advent of PITCHf/x, more sophisticated modeling techniques became viable on these more granular data." |
| 13 | + |
| 14 | +## Framing Examples |
| 15 | + |
| 16 | + |
| 17 | + |
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | + |
| 22 | + |
| 23 | + |
| 24 | + |
| 25 | +## Getting the data |
| 26 | + |
| 27 | +```{r, eval = FALSE} |
| 28 | +sc2022 <- here::here("data_large/statcast_rds/statcast_2022.rds") |> |
| 29 | + read_rds() |
| 30 | +sc2022 <- sc2022 |> |
| 31 | + mutate( |
| 32 | + Outcome = case_match( |
| 33 | + description, |
| 34 | + c("ball", "blocked_ball", "pitchout", |
| 35 | + "hit_by_pitch") ~ "ball", |
| 36 | + c("swinging_strike", "swinging_strike_blocked", |
| 37 | + "foul", "foul_bunt", "foul_tip", |
| 38 | + "hit_into_play", "missed_bunt" ) ~ "swing", |
| 39 | + "called_strike" ~ "called_strike"), |
| 40 | + Home = ifelse(inning_topbot == "Bot", 1, 0), |
| 41 | + Count = paste(balls, strikes, sep = "-") |
| 42 | + ) |
| 43 | +``` |
| 44 | + |
| 45 | +```{r, eval = FALSE} |
| 46 | +taken <- sc2022 |> |
| 47 | + filter(Outcome != "swing") |
| 48 | +taken_select <- select( |
| 49 | + taken, pitch_type, release_speed, |
| 50 | + description, stand, p_throws, Outcome, |
| 51 | + plate_x, plate_z, fielder_2_1, |
| 52 | + pitcher, batter, Count, Home, zone |
| 53 | +) |
| 54 | +write_rds( |
| 55 | + taken_select, |
| 56 | + here::here("data/sc_taken_2022.rds"), |
| 57 | + compress = "xz" |
| 58 | +) |
| 59 | +``` |
| 60 | + |
| 61 | +## Where is the Strike Zone? |
| 62 | + |
| 63 | +- Only part of the ball needs to cross home plate to be a strike |
| 64 | +- Home plate is 17 inches wide and ball's circumference is 9 inches |
| 65 | +- Outside edges of strike zone vary by plus or minus 0.947 feet |
| 66 | +- Top and bottom of strike zone varies by batter height (Midpoint between top of shoulders and top of players pants down to hollow beneath kneecap) |
| 67 | + |
| 68 | + |
| 69 | + |
| 70 | +```{r, eval = FALSE} |
| 71 | +plate_width <- 17 + 2 * (9/pi) |
| 72 | +k_zone_plot <- ggplot( |
| 73 | + NULL, aes(x = plate_x, y = plate_z) |
| 74 | +) + |
| 75 | + geom_rect( |
| 76 | + xmin = -(plate_width/2)/12, |
| 77 | + xmax = (plate_width/2)/12, |
| 78 | + ymin = 1.5, |
| 79 | + ymax = 3.6, color = crcblue, alpha = 0 |
| 80 | + ) + |
| 81 | + coord_equal() + |
| 82 | + scale_x_continuous( |
| 83 | + "Horizontal location (ft.)", |
| 84 | + limits = c(-2, 2) |
| 85 | + ) + |
| 86 | + scale_y_continuous( |
| 87 | + "Vertical location (ft.)", |
| 88 | + limits = c(0, 5) |
| 89 | + ) |
| 90 | +``` |
| 91 | + |
| 92 | +```{r, eval = FALSE} |
| 93 | +k_zone_plot %+% |
| 94 | + sample_n(taken, size = 2000) + |
| 95 | + aes(color = Outcome) + |
| 96 | + geom_point(alpha = 0.2) + |
| 97 | + scale_color_manual(values = crc_fc) |
| 98 | +``` |
| 99 | + |
| 100 | + |
| 101 | + |
| 102 | +```{r, eval=FALSE} |
| 103 | +zones <- taken |> |
| 104 | + group_by(zone) |> |
| 105 | + summarize( |
| 106 | + N = n(), |
| 107 | + right_edge = min(1.5, max(plate_x)), |
| 108 | + left_edge = max(-1.5, min(plate_x)), |
| 109 | + top_edge = min(5, quantile(plate_z, 0.95, na.rm = TRUE)), |
| 110 | + bottom_edge = max(0, quantile(plate_z, 0.05, na.rm = TRUE)), |
| 111 | + strike_pct = sum(Outcome == "called_strike") / n(), |
| 112 | + plate_x = mean(plate_x), |
| 113 | + plate_z = mean(plate_z) |
| 114 | + ) |
| 115 | +``` |
| 116 | + |
| 117 | +```{r, eval=FALSE} |
| 118 | +library(ggrepel) |
| 119 | +k_zone_plot %+% zones + |
| 120 | + geom_rect( |
| 121 | + aes( |
| 122 | + xmax = right_edge, xmin = left_edge, |
| 123 | + ymax = top_edge, ymin = bottom_edge, |
| 124 | + fill = strike_pct, alpha = strike_pct |
| 125 | + ), |
| 126 | + color = "lightgray" |
| 127 | + ) + |
| 128 | + geom_text_repel( |
| 129 | + size = 3, |
| 130 | + aes( |
| 131 | + label = round(strike_pct, 2), |
| 132 | + color = strike_pct < 0.5 |
| 133 | + ) |
| 134 | + ) + |
| 135 | + scale_fill_gradient(low = "gray70", high = crcblue) + |
| 136 | + scale_color_manual(values = crc_fc) + |
| 137 | + guides(color = FALSE, alpha = FALSE) |
| 138 | +``` |
| 139 | + |
| 140 | + |
| 141 | + |
| 142 | +## Modeling Called Strike Percentage |
| 143 | + |
| 144 | +- We use a Generalized Additive Model with binomial family |
| 145 | + |
| 146 | +```{r, eval=FALSE} |
| 147 | +library(mgcv) |
| 148 | +strike_mod <- gam( |
| 149 | + Outcome == "called_strike" ~ s(plate_x, plate_z), |
| 150 | + family = binomial, |
| 151 | + data = taken |
| 152 | +) |
| 153 | +``` |
| 154 | + |
| 155 | +```{r, eval=FALSE} |
| 156 | +library(broom) |
| 157 | +hats <- strike_mod |> |
| 158 | + augment(type.predict = "response") |
| 159 | +
|
| 160 | +k_zone_plot %+% sample_n(hats, 10000) + |
| 161 | + geom_point(aes(color = .fitted), alpha = 0.1) + |
| 162 | + scale_color_gradient(low = "gray70", high = crcblue) |
| 163 | +``` |
| 164 | + |
| 165 | + |
| 166 | + |
| 167 | +- We can build a continuous grid |
| 168 | + |
| 169 | +```{r, eval=FALSE} |
| 170 | +library(modelr) |
| 171 | +grid <- taken |> |
| 172 | + data_grid( |
| 173 | + plate_x = seq_range(plate_x, n = 100), |
| 174 | + plate_z = seq_range(plate_z, n = 100) |
| 175 | + ) |
| 176 | +
|
| 177 | +grid_hats <- strike_mod |> |
| 178 | + augment(type.predict = "response", newdata = grid) |
| 179 | +
|
| 180 | +tile_plot <- k_zone_plot %+% grid_hats + |
| 181 | + geom_tile(aes(fill = .fitted), alpha = 0.7) + |
| 182 | + scale_fill_gradient(low = "gray92", high = crcblue) |
| 183 | +tile_plot |
| 184 | +``` |
| 185 | + |
| 186 | + |
| 187 | + |
| 188 | +- Batter and pitcher handedness may have an effect, let's add it to our GAM |
| 189 | + |
| 190 | +```{r, eval=FALSE} |
| 191 | +hand_mod <- gam( |
| 192 | + Outcome == "called_strike" ~ |
| 193 | + p_throws + stand + s(plate_x, plate_z), |
| 194 | + family = binomial, |
| 195 | + data = taken |
| 196 | +) |
| 197 | +
|
| 198 | +hand_grid <- taken |> |
| 199 | + data_grid( |
| 200 | + plate_x = seq_range(plate_x, n = 100), |
| 201 | + plate_z = seq_range(plate_z, n = 100), |
| 202 | + p_throws, |
| 203 | + stand |
| 204 | + ) |
| 205 | +hand_grid_hats <- hand_mod |> |
| 206 | + augment(type.predict = "response", newdata = hand_grid) |
| 207 | +
|
| 208 | +diffs <- hand_grid_hats |> |
| 209 | + group_by(plate_x, plate_z) |> |
| 210 | + summarize( |
| 211 | + N = n(), |
| 212 | + .fitted = sd(.fitted), |
| 213 | + .groups = "drop" |
| 214 | + ) |
| 215 | +tile_plot %+% diffs |
| 216 | +``` |
| 217 | + |
| 218 | + |
| 219 | + |
| 220 | +## Modeling Catcher Framing |
| 221 | + |
| 222 | + |
| 223 | +```{r, eval=FALSE} |
| 224 | +taken <- taken |> |
| 225 | + filter( |
| 226 | + is.na(plate_x) == FALSE, |
| 227 | + is.na(plate_z) == FALSE |
| 228 | + ) |> |
| 229 | + mutate( |
| 230 | + strike_prob = predict( |
| 231 | + strike_mod, |
| 232 | + type = "response" |
| 233 | + ) |
| 234 | + ) |
| 235 | +``` |
| 236 | + |
| 237 | +$$\log \frac{p_j}{1 - p_j} = \beta_0 + \beta_1 \cdot strike\_prob_j + \alpha_{c(j)}$$ |
| 238 | + |
| 239 | +We fit a generalized linear mixed model using fixed effects from the catcher. |
| 240 | + |
| 241 | +```{r, eval=FALSE} |
| 242 | +library(lme4) |
| 243 | +mod_a <- glmer( |
| 244 | + Outcome == "called_strike" ~ |
| 245 | + strike_prob + (1|fielder_2_1), |
| 246 | + data = taken, |
| 247 | + family = binomial |
| 248 | +) |
| 249 | +``` |
| 250 | + |
| 251 | +```{r, eval=FALSE} |
| 252 | +fixed.effects(mod_a) |
| 253 | +
|
| 254 | +# (Intercept) strike_prob |
| 255 | +# -4.00 7.67 |
| 256 | +
|
| 257 | +VarCorr(mod_a) |
| 258 | +
|
| 259 | +# Groups Name Std.Dev. |
| 260 | +# fielder_2_1 (Intercept) 0.218 |
| 261 | +``` |
| 262 | + |
| 263 | +```{r, eval=FALSE} |
| 264 | +c_effects <- mod_a |> |
| 265 | + ranef() |> |
| 266 | + as_tibble() |> |
| 267 | + transmute( |
| 268 | + id = as.numeric(levels(grp)), |
| 269 | + effect = condval |
| 270 | + ) |
| 271 | +``` |
| 272 | + |
| 273 | +```{r, eval=FALSE} |
| 274 | +master_id <- baseballr::chadwick_player_lu() |> |
| 275 | + mutate( |
| 276 | + mlb_name = paste(name_first, name_last), |
| 277 | + mlb_id = key_mlbam |
| 278 | + ) |> |
| 279 | + select(mlb_id, mlb_name) |> |
| 280 | + filter(!is.na(mlb_id)) |
| 281 | +
|
| 282 | +c_effects <- c_effects |> |
| 283 | + left_join( |
| 284 | + select(master_id, mlb_id, mlb_name), |
| 285 | + join_by(id == mlb_id) |
| 286 | + ) |> |
| 287 | + arrange(desc(effect)) |
| 288 | +
|
| 289 | +c_effects |> slice_head(n = 6) |
| 290 | +
|
| 291 | +# A tibble: 6 × 3 |
| 292 | +# id effect mlb_name |
| 293 | +# <dbl> <dbl> <chr> |
| 294 | +# 1 664848 0.358 Donny Sands |
| 295 | +# 2 669004 0.294 MJ Melendez |
| 296 | +# 3 642020 0.287 Chuckie Robinson |
| 297 | +# 4 672832 0.275 Israel Pineda |
| 298 | +# 5 571912 0.260 Luke Maile |
| 299 | +# 6 575929 0.243 Willson Contreras |
| 300 | +
|
| 301 | +c_effects |> slice_tail(n = 6) |
| 302 | +
|
| 303 | +# A tibble: 6 × 3 |
| 304 | +# id effect mlb_name |
| 305 | +# <dbl> <dbl> <chr> |
| 306 | +# 1 664731 -0.293 P. J. Higgins |
| 307 | +# 2 455139 -0.304 Robinson Chirinos |
| 308 | +# 3 661388 -0.336 William Contreras |
| 309 | +# 4 608360 -0.357 Chris Okey |
| 310 | +# 5 435559 -0.357 Kurt Suzuki |
| 311 | +# 6 595956 -0.390 Cam Gallagher |
| 312 | +``` |
| 313 | + |
| 314 | +$$\log \frac{p_j}{1 - p_j} = \beta_0 + \beta_1 strike\_prob_j + \alpha_{c(j)} + \gamma_{p(j)} + \delta_{b(j)}$$ |
| 315 | + |
| 316 | +We add to the model with pitcher and batter effects. |
| 317 | + |
| 318 | +```{r, eval=FALSE} |
| 319 | +mod_b <- glmer( |
| 320 | + Outcome == "called_strike" ~ strike_prob + |
| 321 | + (1|fielder_2_1) + |
| 322 | + (1|batter) + (1|pitcher), |
| 323 | + data = taken, |
| 324 | + family = binomial |
| 325 | +) |
| 326 | +
|
| 327 | +VarCorr(mod_b) |
| 328 | +
|
| 329 | +# Groups Name Std.Dev. |
| 330 | +# pitcher (Intercept) 0.267 |
| 331 | +# batter (Intercept) 0.251 |
| 332 | +# fielder_2_1 (Intercept) 0.209 |
| 333 | +``` |
| 334 | + |
| 335 | + |
| 336 | +```{r, eval=FALSE} |
| 337 | +c_effects <- mod_b |> |
| 338 | + ranef() |> |
| 339 | + as_tibble() |> |
| 340 | + filter(grpvar == "fielder_2_1") |> |
| 341 | + transmute( |
| 342 | + id = as.numeric(as.character(grp)), |
| 343 | + effect = condval |
| 344 | + ) |
| 345 | +c_effects <- c_effects |> |
| 346 | + left_join( |
| 347 | + select(master_id, mlb_id, mlb_name), |
| 348 | + join_by(id == mlb_id) |
| 349 | + ) |> |
| 350 | + arrange(desc(effect)) |
| 351 | +
|
| 352 | +c_effects |> slice_head(n = 6) |
| 353 | +
|
| 354 | +# A tibble: 6 × 3 |
| 355 | +# id effect mlb_name |
| 356 | +# <dbl> <dbl> <chr> |
| 357 | +# 1 624431 0.313 Jose Trevino |
| 358 | +# 2 669221 0.277 Sean Murphy |
| 359 | +# 3 425877 0.263 Yadier Molina |
| 360 | +# 4 664874 0.253 Seby Zavala |
| 361 | +# 5 543309 0.229 Kyle Higashioka |
| 362 | +# 6 608700 0.221 Kevin Plawecki |
| 363 | +
|
| 364 | +c_effects |> slice_tail(n = 6) |
| 365 | +
|
| 366 | +# A tibble: 6 × 3 |
| 367 | +# id effect mlb_name |
| 368 | +# <dbl> <dbl> <chr> |
| 369 | +# 1 596117 -0.277 Garrett Stubbs |
| 370 | +# 2 435559 -0.281 Kurt Suzuki |
| 371 | +# 3 521692 -0.291 Salvador Perez |
| 372 | +# 4 553869 -0.327 Elias Díaz |
| 373 | +# 5 455139 -0.336 Robinson Chirinos |
| 374 | +# 6 669004 -0.347 MJ Melendez |
| 375 | +``` |
| 376 | + |
| 377 | +## Further Reading |
| 378 | + |
| 379 | +- Turkenkopf [(2008)](https://www.beyondtheboxscore.com/2008/4/5/389840/framing-the-debate) |
| 380 | +- Fast [(2011)](https://www.baseballprospectus.com/news/article/15093/spinning-yarn-removing-the-mask-encore-presentation/) |
| 381 | +- Lindbergh [(2013)](http://grantland.com/features/studying-art-pitch-framing-catchers-such-francisco-cervelli-chris-stewart-jose-molina-others/) |
| 382 | +- Brooks and Pavlidis [(2014)](https://www.baseballprospectus.com/news/article/22934/framing-and-blocking-pitches-a-regressed-probabilistic-model-a-new-method-for-measuring-catcher-defense/) |
| 383 | +- Brooks, Pavilidis, and Judge [(2015)](https://www.baseballprospectus.com/news/article/25514/moving-beyond-wowy-a-mixed-approach-to-measuring-catcher-framing/) |
| 384 | +- Deshpande and Wyner [(2017)](https://doi.org/10.1515/jqas-2017-0027) |
| 385 | +- Judge [(2018)](https://www.baseballprospectus.com/news/article/38289/bayesian-bagging-generate-uncertainty-intervals-catcher-framing-story/) |
0 commit comments