pytorch
diff --git a/‎src/torchcodec/_core/Encoder.cpp
Lines changed: 18 additions & 19 deletions b/‎src/torchcodec/_core/Encoder.cpp
Lines changed: 18 additions & 19 deletions
diff --git a/‎src/torchcodec/_core/Encoder.h
Lines changed: 9 additions & 12 deletions b/‎src/torchcodec/_core/Encoder.h
Lines changed: 9 additions & 12 deletions
diff --git a/‎src/torchcodec/_core/FFMPEGCommon.cpp
Lines changed: 43 additions & 44 deletions b/‎src/torchcodec/_core/FFMPEGCommon.cpp
Lines changed: 43 additions & 44 deletions
@@ -101,9 +101,8 @@ AudioEncoder::AudioEncoder(
     const torch::Tensor samples,
     int sampleRate,
     std::string_view fileName,
-    std::optional<int64_t> bitRate,
-    std::optional<int64_t> numChannels)
-    : samples_(validateSamples(samples)) {
+    const AudioStreamOptions& audioStreamOptions)
+: samples_(validateSamples(samples)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -126,16 +125,15 @@ AudioEncoder::AudioEncoder(
       ", make sure it's a valid path? ",
       getFFMPEGErrorStringFromErrorCode(status));
 
-  initializeEncoder(sampleRate, bitRate, numChannels);
+  initializeEncoder(sampleRate, audioStreamOptions);
 }
 
 AudioEncoder::AudioEncoder(
     const torch::Tensor samples,
     int sampleRate,
     std::string_view formatName,
     std::unique_ptr<AVIOToTensorContext> avioContextHolder,
-    std::optional<int64_t> bitRate,
-    std::optional<int64_t> numChannels)
+    const AudioStreamOptions& audioStreamOptions)
     : samples_(validateSamples(samples)), avioContextHolder_(std::move(avioContextHolder)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
@@ -153,13 +151,12 @@ AudioEncoder::AudioEncoder(
 
   avFormatContext_->pb = avioContextHolder_->getAVIOContext();
 
-  initializeEncoder(sampleRate, bitRate, numChannels);
+  initializeEncoder(sampleRate, audioStreamOptions);
 }
 
 void AudioEncoder::initializeEncoder(
     int sampleRate,
-    std::optional<int64_t> bitRate,
-    std::optional<int64_t> numChannels) {
+    const AudioStreamOptions& audioStreamOptions) {
   // We use the AVFormatContext's default codec for that
   // specific format/container.
   const AVCodec* avCodec =
@@ -170,18 +167,20 @@ void AudioEncoder::initializeEncoder(
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
   avCodecContext_.reset(avCodecContext);
 
-  if (bitRate.has_value()) {
-    TORCH_CHECK(*bitRate >= 0, "bit_rate=", *bitRate, " must be >= 0.");
+  auto desiredBitRate = audioStreamOptions.bitRate;
+  if (desiredBitRate.has_value()) {
+    TORCH_CHECK(
+        *desiredBitRate >= 0, "bit_rate=", *desiredBitRate, " must be >= 0.");
   }
   // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
   // well when "-b:a" isn't specified.
-  avCodecContext_->bit_rate = bitRate.value_or(0);
-
-  desiredNumChannels_ = static_cast<int>(numChannels.value_or(samples_.sizes()[0]));
-  validateNumChannels(*avCodec, desiredNumChannels_);
+  avCodecContext_->bit_rate = desiredBitRate.value_or(0);
+  outNumChannels_ =
+      static_cast<int>(audioStreamOptions.numChannels.value_or(samples_.sizes()[0]));
+  validateNumChannels(*avCodec, outNumChannels_);
   // The avCodecContext layout defines the layout of the encoded output, it's
   // not related to the input samples.
-  setDefaultChannelLayout(avCodecContext_, desiredNumChannels_);
+  setDefaultChannelLayout(avCodecContext_, outNumChannels_);
 
   validateSampleRate(*avCodec, sampleRate);
   avCodecContext_->sample_rate = sampleRate;
@@ -306,7 +305,7 @@ void AudioEncoder::encodeInnerLoop(
   bool mustConvert =
       (srcAVFrame != nullptr &&
        (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP ||
-        getNumChannels(srcAVFrame) != desiredNumChannels_));
+        getNumChannels(srcAVFrame) != outNumChannels_));
 
   UniqueAVFrame convertedAVFrame;
   if (mustConvert) {
@@ -317,14 +316,14 @@ void AudioEncoder::encodeInnerLoop(
           srcAVFrame->sample_rate, // No sample rate conversion
           srcAVFrame->sample_rate,
           srcAVFrame,
-          desiredNumChannels_));
+          outNumChannels_));
     }
     convertedAVFrame = convertAudioAVFrameSamples(
         swrContext_,
         srcAVFrame,
         avCodecContext_->sample_fmt,
         srcAVFrame->sample_rate, // No sample rate conversion
-        desiredNumChannels_);
+        outNumChannels_);
     TORCH_CHECK(
         convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
         "convertedAVFrame->nb_samples=",
 
@@ -2,6 +2,7 @@
 #include <torch/types.h>
 #include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 class AudioEncoder {
@@ -13,34 +14,30 @@ class AudioEncoder {
   // like passing 0, which results in choosing the minimum supported bit rate.
   // Passing 44_100 could result in output being 44000 if only 44000 is
   // supported.
-  //
-  // TODO-ENCODING: bundle the optional params like bitRate, numChannels, etc.
-  // into an AudioStreamOptions struct, or similar.
   AudioEncoder(
       const torch::Tensor samples,
+      // TODO-ENCODING: update this comment when we support an output sample
+      // rate. This will become the input sample rate.
       // The *output* sample rate. We can't really decide for the user what it
       // should be. Particularly, the sample rate of the input samples should
       // match this, and that's up to the user. If sample rates don't match,
       // encoding will still work but audio will be distorted.
       int sampleRate,
       std::string_view fileName,
-      std::optional<int64_t> bitRate = std::nullopt,
-      std::optional<int64_t> numChannels = std::nullopt);
+      const AudioStreamOptions& audioStreamOptions);
   AudioEncoder(
       const torch::Tensor samples,
       int sampleRate,
       std::string_view formatName,
       std::unique_ptr<AVIOToTensorContext> avioContextHolder,
-      std::optional<int64_t> bitRate = std::nullopt,
-      std::optional<int64_t> numChannels = std::nullopt);
+      const AudioStreamOptions& audioStreamOptions);
   void encode();
   torch::Tensor encodeToTensor();
 
  private:
   void initializeEncoder(
       int sampleRate,
-      std::optional<int64_t> bitRate = std::nullopt,
-      std::optional<int64_t> numChannels = std::nullopt);
+      const AudioStreamOptions& audioStreamOptions);
   void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
       const UniqueAVFrame& srcAVFrame);
@@ -50,9 +47,9 @@ class AudioEncoder {
   UniqueAVCodecContext avCodecContext_;
   int streamIndex_;
   UniqueSwrContext swrContext_;
-  // TODO-ENCODING: desiredNumChannels should just be part of an options struct,
-  // see other TODO above.
-  int desiredNumChannels_ = -1;
+  AudioStreamOptions audioStreamOptions;
+
+  int outNumChannels_ = -1;
 
   const torch::Tensor samples_;
 
 
@@ -159,74 +159,74 @@ namespace {
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
 
 // Returns:
-// - the srcAVFrame's channel layout if srcAVFrame has desiredNumChannels
-// - the default channel layout with desiredNumChannels otherwise.
-AVChannelLayout getDesiredChannelLayout(
-    int desiredNumChannels,
+// - srcAVFrame's own channel layout if it already has outNumChannels channels
+// - the default channel layout for outNumChannels channels otherwise.
+AVChannelLayout getOutputChannelLayout(
+    int outNumChannels,
     const UniqueAVFrame& srcAVFrame) {
-  AVChannelLayout desiredLayout;
-  if (desiredNumChannels == getNumChannels(srcAVFrame)) {
-    desiredLayout = srcAVFrame->ch_layout;
+  AVChannelLayout outLayout;
+  if (outNumChannels == getNumChannels(srcAVFrame)) {
+    outLayout = srcAVFrame->ch_layout;
   } else {
-    av_channel_layout_default(&desiredLayout, desiredNumChannels);
+    av_channel_layout_default(&outLayout, outNumChannels);
   }
-  return desiredLayout;
+  return outLayout;
 }
 
 #else
 
 // Same as above
-int64_t getDesiredChannelLayout(
-    int desiredNumChannels,
+int64_t getOutputChannelLayout(
+    int outNumChannels,
     const UniqueAVFrame& srcAVFrame) {
-  int64_t desiredLayout;
-  if (desiredNumChannels == getNumChannels(srcAVFrame)) {
-    desiredLayout = srcAVFrame->channel_layout;
+  int64_t outLayout;
+  if (outNumChannels == getNumChannels(srcAVFrame)) {
+    outLayout = srcAVFrame->channel_layout;
   } else {
-    desiredLayout = av_get_default_channel_layout(desiredNumChannels);
+    outLayout = av_get_default_channel_layout(outNumChannels);
   }
-  return desiredLayout;
+  return outLayout;
 }
 #endif
 } // namespace
 
-// Sets dstAVFrame' channel layout to getDesiredChannelLayout(): see doc above
+// Sets dstAVFrame's channel layout to getOutputChannelLayout(): see doc above
 void setChannelLayout(
     UniqueAVFrame& dstAVFrame,
     const UniqueAVFrame& srcAVFrame,
-    int desiredNumChannels) {
+    int outNumChannels) {
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
-  AVChannelLayout desiredLayout =
-      getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
-  auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &desiredLayout);
+  AVChannelLayout outLayout =
+      getOutputChannelLayout(outNumChannels, srcAVFrame);
+  auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &outLayout);
   TORCH_CHECK(
       status == AVSUCCESS,
       "Couldn't copy channel layout to avFrame: ",
       getFFMPEGErrorStringFromErrorCode(status));
 #else
   dstAVFrame->channel_layout =
-      getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
-  dstAVFrame->channels = desiredNumChannels;
+      getOutputChannelLayout(outNumChannels, srcAVFrame);
+  dstAVFrame->channels = outNumChannels;
 #endif
 }
 
 SwrContext* createSwrContext(
     AVSampleFormat srcSampleFormat,
-    AVSampleFormat desiredSampleFormat,
+    AVSampleFormat outSampleFormat,
     int srcSampleRate,
-    int desiredSampleRate,
+    int outSampleRate,
     const UniqueAVFrame& srcAVFrame,
-    int desiredNumChannels) {
+    int outNumChannels) {
   SwrContext* swrContext = nullptr;
   int status = AVSUCCESS;
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
-  AVChannelLayout desiredLayout =
-      getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
+  AVChannelLayout outLayout =
+      getOutputChannelLayout(outNumChannels, srcAVFrame);
   status = swr_alloc_set_opts2(
       &swrContext,
-      &desiredLayout,
-      desiredSampleFormat,
-      desiredSampleRate,
+      &outLayout,
+      outSampleFormat,
+      outSampleRate,
       &srcAVFrame->ch_layout,
       srcSampleFormat,
       srcSampleRate,
@@ -238,13 +238,12 @@ SwrContext* createSwrContext(
       "Couldn't create SwrContext: ",
       getFFMPEGErrorStringFromErrorCode(status));
 #else
-  int64_t desiredLayout =
-      getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
+  int64_t outLayout = getOutputChannelLayout(outNumChannels, srcAVFrame);
   swrContext = swr_alloc_set_opts(
       nullptr,
-      desiredLayout,
-      desiredSampleFormat,
-      desiredSampleRate,
+      outLayout,
+      outSampleFormat,
+      outSampleRate,
       srcAVFrame->channel_layout,
       srcSampleFormat,
       srcSampleRate,
@@ -267,19 +266,19 @@ SwrContext* createSwrContext(
 UniqueAVFrame convertAudioAVFrameSamples(
     const UniqueSwrContext& swrContext,
     const UniqueAVFrame& srcAVFrame,
-    AVSampleFormat desiredSampleFormat,
-    int desiredSampleRate,
-    int desiredNumChannels) {
+    AVSampleFormat outSampleFormat,
+    int outSampleRate,
+    int outNumChannels) {
   UniqueAVFrame convertedAVFrame(av_frame_alloc());
   TORCH_CHECK(
       convertedAVFrame,
       "Could not allocate frame for sample format conversion.");
 
-  convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
+  convertedAVFrame->format = static_cast<int>(outSampleFormat);
 
-  convertedAVFrame->sample_rate = desiredSampleRate;
+  convertedAVFrame->sample_rate = outSampleRate;
   int srcSampleRate = srcAVFrame->sample_rate;
-  if (srcSampleRate != desiredSampleRate) {
+  if (srcSampleRate != outSampleRate) {
     // Note that this is an upper bound on the number of output samples.
     // `swr_convert()` will likely not fill convertedAVFrame with that many
     // samples if sample rate conversion is needed. It will buffer the last few
@@ -290,14 +289,14 @@ UniqueAVFrame convertAudioAVFrameSamples(
     // tighter bound.
     convertedAVFrame->nb_samples = av_rescale_rnd(
         swr_get_delay(swrContext.get(), srcSampleRate) + srcAVFrame->nb_samples,
-        desiredSampleRate,
+        outSampleRate,
         srcSampleRate,
         AV_ROUND_UP);
   } else {
     convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
   }
 
-  setChannelLayout(convertedAVFrame, srcAVFrame, desiredNumChannels);
+  setChannelLayout(convertedAVFrame, srcAVFrame, outNumChannels);
 
   auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
   TORCH_CHECK(