Skip to content

Commit f655357

Browse files
authored
Merge branch 'main' into wf-rename
2 parents 52098c9 + ba44fdb commit f655357

File tree

6 files changed

+101
-95
lines changed

6 files changed

+101
-95
lines changed

src/torchcodec/_core/Encoder.cpp

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,8 @@ AudioEncoder::AudioEncoder(
101101
const torch::Tensor samples,
102102
int sampleRate,
103103
std::string_view fileName,
104-
std::optional<int64_t> bitRate,
105-
std::optional<int64_t> numChannels)
106-
: samples_(validateSamples(samples)) {
104+
const AudioStreamOptions& audioStreamOptions)
105+
: samples_(validateSamples(samples)) {
107106
setFFmpegLogLevel();
108107
AVFormatContext* avFormatContext = nullptr;
109108
int status = avformat_alloc_output_context2(
@@ -126,16 +125,15 @@ AudioEncoder::AudioEncoder(
126125
", make sure it's a valid path? ",
127126
getFFMPEGErrorStringFromErrorCode(status));
128127

129-
initializeEncoder(sampleRate, bitRate, numChannels);
128+
initializeEncoder(sampleRate, audioStreamOptions);
130129
}
131130

132131
AudioEncoder::AudioEncoder(
133132
const torch::Tensor samples,
134133
int sampleRate,
135134
std::string_view formatName,
136135
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
137-
std::optional<int64_t> bitRate,
138-
std::optional<int64_t> numChannels)
136+
const AudioStreamOptions& audioStreamOptions)
139137
: samples_(validateSamples(samples)), avioContextHolder_(std::move(avioContextHolder)) {
140138
setFFmpegLogLevel();
141139
AVFormatContext* avFormatContext = nullptr;
@@ -153,13 +151,12 @@ AudioEncoder::AudioEncoder(
153151

154152
avFormatContext_->pb = avioContextHolder_->getAVIOContext();
155153

156-
initializeEncoder(sampleRate, bitRate, numChannels);
154+
initializeEncoder(sampleRate, audioStreamOptions);
157155
}
158156

159157
void AudioEncoder::initializeEncoder(
160158
int sampleRate,
161-
std::optional<int64_t> bitRate,
162-
std::optional<int64_t> numChannels) {
159+
const AudioStreamOptions& audioStreamOptions) {
163160
// We use the AVFormatContext's default codec for that
164161
// specific format/container.
165162
const AVCodec* avCodec =
@@ -170,18 +167,20 @@ void AudioEncoder::initializeEncoder(
170167
TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
171168
avCodecContext_.reset(avCodecContext);
172169

173-
if (bitRate.has_value()) {
174-
TORCH_CHECK(*bitRate >= 0, "bit_rate=", *bitRate, " must be >= 0.");
170+
auto desiredBitRate = audioStreamOptions.bitRate;
171+
if (desiredBitRate.has_value()) {
172+
TORCH_CHECK(
173+
*desiredBitRate >= 0, "bit_rate=", *desiredBitRate, " must be >= 0.");
175174
}
176175
// bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
177176
// well when "-b:a" isn't specified.
178-
avCodecContext_->bit_rate = bitRate.value_or(0);
179-
180-
desiredNumChannels_ = static_cast<int>(numChannels.value_or(samples_.sizes()[0]));
181-
validateNumChannels(*avCodec, desiredNumChannels_);
177+
avCodecContext_->bit_rate = desiredBitRate.value_or(0);
178+
outNumChannels_ =
179+
static_cast<int>(audioStreamOptions.numChannels.value_or(samples_.sizes()[0]));
180+
validateNumChannels(*avCodec, outNumChannels_);
182181
// The avCodecContext layout defines the layout of the encoded output, it's
183182
// not related to the input samples.
184-
setDefaultChannelLayout(avCodecContext_, desiredNumChannels_);
183+
setDefaultChannelLayout(avCodecContext_, outNumChannels_);
185184

186185
validateSampleRate(*avCodec, sampleRate);
187186
avCodecContext_->sample_rate = sampleRate;
@@ -306,7 +305,7 @@ void AudioEncoder::encodeInnerLoop(
306305
bool mustConvert =
307306
(srcAVFrame != nullptr &&
308307
(avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP ||
309-
getNumChannels(srcAVFrame) != desiredNumChannels_));
308+
getNumChannels(srcAVFrame) != outNumChannels_));
310309

311310
UniqueAVFrame convertedAVFrame;
312311
if (mustConvert) {
@@ -317,14 +316,14 @@ void AudioEncoder::encodeInnerLoop(
317316
srcAVFrame->sample_rate, // No sample rate conversion
318317
srcAVFrame->sample_rate,
319318
srcAVFrame,
320-
desiredNumChannels_));
319+
outNumChannels_));
321320
}
322321
convertedAVFrame = convertAudioAVFrameSamples(
323322
swrContext_,
324323
srcAVFrame,
325324
avCodecContext_->sample_fmt,
326325
srcAVFrame->sample_rate, // No sample rate conversion
327-
desiredNumChannels_);
326+
outNumChannels_);
328327
TORCH_CHECK(
329328
convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
330329
"convertedAVFrame->nb_samples=",

src/torchcodec/_core/Encoder.h

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <torch/types.h>
33
#include "src/torchcodec/_core/AVIOBytesContext.h"
44
#include "src/torchcodec/_core/FFMPEGCommon.h"
5+
#include "src/torchcodec/_core/StreamOptions.h"
56

67
namespace facebook::torchcodec {
78
class AudioEncoder {
@@ -13,34 +14,30 @@ class AudioEncoder {
1314
// like passing 0, which results in choosing the minimum supported bit rate.
1415
// Passing 44_100 could result in output being 44000 if only 44000 is
1516
// supported.
16-
//
17-
// TODO-ENCODING: bundle the optional params like bitRate, numChannels, etc.
18-
// into an AudioStreamOptions struct, or similar.
1917
AudioEncoder(
2018
const torch::Tensor samples,
19+
// TODO-ENCODING: update this comment when we support an output sample
20+
// rate. This will become the input sample rate.
2121
// The *output* sample rate. We can't really decide for the user what it
2222
// should be. Particularly, the sample rate of the input samples should
2323
// match this, and that's up to the user. If sample rates don't match,
2424
// encoding will still work but audio will be distorted.
2525
int sampleRate,
2626
std::string_view fileName,
27-
std::optional<int64_t> bitRate = std::nullopt,
28-
std::optional<int64_t> numChannels = std::nullopt);
27+
const AudioStreamOptions& audioStreamOptions);
2928
AudioEncoder(
3029
const torch::Tensor samples,
3130
int sampleRate,
3231
std::string_view formatName,
3332
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
34-
std::optional<int64_t> bitRate = std::nullopt,
35-
std::optional<int64_t> numChannels = std::nullopt);
33+
const AudioStreamOptions& audioStreamOptions);
3634
void encode();
3735
torch::Tensor encodeToTensor();
3836

3937
private:
4038
void initializeEncoder(
4139
int sampleRate,
42-
std::optional<int64_t> bitRate = std::nullopt,
43-
std::optional<int64_t> numChannels = std::nullopt);
40+
const AudioStreamOptions& audioStreamOptions);
4441
void encodeInnerLoop(
4542
AutoAVPacket& autoAVPacket,
4643
const UniqueAVFrame& srcAVFrame);
@@ -50,9 +47,9 @@ class AudioEncoder {
5047
UniqueAVCodecContext avCodecContext_;
5148
int streamIndex_;
5249
UniqueSwrContext swrContext_;
53-
// TODO-ENCODING: desiredNumChannels should just be part of an options struct,
54-
// see other TODO above.
55-
int desiredNumChannels_ = -1;
50+
AudioStreamOptions audioStreamOptions;
51+
52+
int outNumChannels_ = -1;
5653

5754
const torch::Tensor samples_;
5855

src/torchcodec/_core/FFMPEGCommon.cpp

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -159,74 +159,74 @@ namespace {
159159
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
160160

161161
// Returns:
162-
// - the srcAVFrame's channel layout if srcAVFrame has desiredNumChannels
163-
// - the default channel layout with desiredNumChannels otherwise.
164-
AVChannelLayout getDesiredChannelLayout(
165-
int desiredNumChannels,
162+
// - the srcAVFrame's channel layout if srcAVFrame has outNumChannels
163+
// - the default channel layout with outNumChannels otherwise.
164+
AVChannelLayout getOutputChannelLayout(
165+
int outNumChannels,
166166
const UniqueAVFrame& srcAVFrame) {
167-
AVChannelLayout desiredLayout;
168-
if (desiredNumChannels == getNumChannels(srcAVFrame)) {
169-
desiredLayout = srcAVFrame->ch_layout;
167+
AVChannelLayout outLayout;
168+
if (outNumChannels == getNumChannels(srcAVFrame)) {
169+
outLayout = srcAVFrame->ch_layout;
170170
} else {
171-
av_channel_layout_default(&desiredLayout, desiredNumChannels);
171+
av_channel_layout_default(&outLayout, outNumChannels);
172172
}
173-
return desiredLayout;
173+
return outLayout;
174174
}
175175

176176
#else
177177

178178
// Same as above
179-
int64_t getDesiredChannelLayout(
180-
int desiredNumChannels,
179+
int64_t getOutputChannelLayout(
180+
int outNumChannels,
181181
const UniqueAVFrame& srcAVFrame) {
182-
int64_t desiredLayout;
183-
if (desiredNumChannels == getNumChannels(srcAVFrame)) {
184-
desiredLayout = srcAVFrame->channel_layout;
182+
int64_t outLayout;
183+
if (outNumChannels == getNumChannels(srcAVFrame)) {
184+
outLayout = srcAVFrame->channel_layout;
185185
} else {
186-
desiredLayout = av_get_default_channel_layout(desiredNumChannels);
186+
outLayout = av_get_default_channel_layout(outNumChannels);
187187
}
188-
return desiredLayout;
188+
return outLayout;
189189
}
190190
#endif
191191
} // namespace
192192

193-
// Sets dstAVFrame' channel layout to getDesiredChannelLayout(): see doc above
193+
// Sets dstAVFrame's channel layout to getOutputChannelLayout(): see doc above
194194
void setChannelLayout(
195195
UniqueAVFrame& dstAVFrame,
196196
const UniqueAVFrame& srcAVFrame,
197-
int desiredNumChannels) {
197+
int outNumChannels) {
198198
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
199-
AVChannelLayout desiredLayout =
200-
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
201-
auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &desiredLayout);
199+
AVChannelLayout outLayout =
200+
getOutputChannelLayout(outNumChannels, srcAVFrame);
201+
auto status = av_channel_layout_copy(&dstAVFrame->ch_layout, &outLayout);
202202
TORCH_CHECK(
203203
status == AVSUCCESS,
204204
"Couldn't copy channel layout to avFrame: ",
205205
getFFMPEGErrorStringFromErrorCode(status));
206206
#else
207207
dstAVFrame->channel_layout =
208-
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
209-
dstAVFrame->channels = desiredNumChannels;
208+
getOutputChannelLayout(outNumChannels, srcAVFrame);
209+
dstAVFrame->channels = outNumChannels;
210210
#endif
211211
}
212212

213213
SwrContext* createSwrContext(
214214
AVSampleFormat srcSampleFormat,
215-
AVSampleFormat desiredSampleFormat,
215+
AVSampleFormat outSampleFormat,
216216
int srcSampleRate,
217-
int desiredSampleRate,
217+
int outSampleRate,
218218
const UniqueAVFrame& srcAVFrame,
219-
int desiredNumChannels) {
219+
int outNumChannels) {
220220
SwrContext* swrContext = nullptr;
221221
int status = AVSUCCESS;
222222
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
223-
AVChannelLayout desiredLayout =
224-
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
223+
AVChannelLayout outLayout =
224+
getOutputChannelLayout(outNumChannels, srcAVFrame);
225225
status = swr_alloc_set_opts2(
226226
&swrContext,
227-
&desiredLayout,
228-
desiredSampleFormat,
229-
desiredSampleRate,
227+
&outLayout,
228+
outSampleFormat,
229+
outSampleRate,
230230
&srcAVFrame->ch_layout,
231231
srcSampleFormat,
232232
srcSampleRate,
@@ -238,13 +238,12 @@ SwrContext* createSwrContext(
238238
"Couldn't create SwrContext: ",
239239
getFFMPEGErrorStringFromErrorCode(status));
240240
#else
241-
int64_t desiredLayout =
242-
getDesiredChannelLayout(desiredNumChannels, srcAVFrame);
241+
int64_t outLayout = getOutputChannelLayout(outNumChannels, srcAVFrame);
243242
swrContext = swr_alloc_set_opts(
244243
nullptr,
245-
desiredLayout,
246-
desiredSampleFormat,
247-
desiredSampleRate,
244+
outLayout,
245+
outSampleFormat,
246+
outSampleRate,
248247
srcAVFrame->channel_layout,
249248
srcSampleFormat,
250249
srcSampleRate,
@@ -267,19 +266,19 @@ SwrContext* createSwrContext(
267266
UniqueAVFrame convertAudioAVFrameSamples(
268267
const UniqueSwrContext& swrContext,
269268
const UniqueAVFrame& srcAVFrame,
270-
AVSampleFormat desiredSampleFormat,
271-
int desiredSampleRate,
272-
int desiredNumChannels) {
269+
AVSampleFormat outSampleFormat,
270+
int outSampleRate,
271+
int outNumChannels) {
273272
UniqueAVFrame convertedAVFrame(av_frame_alloc());
274273
TORCH_CHECK(
275274
convertedAVFrame,
276275
"Could not allocate frame for sample format conversion.");
277276

278-
convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
277+
convertedAVFrame->format = static_cast<int>(outSampleFormat);
279278

280-
convertedAVFrame->sample_rate = desiredSampleRate;
279+
convertedAVFrame->sample_rate = outSampleRate;
281280
int srcSampleRate = srcAVFrame->sample_rate;
282-
if (srcSampleRate != desiredSampleRate) {
281+
if (srcSampleRate != outSampleRate) {
283282
// Note that this is an upper bound on the number of output samples.
284283
// `swr_convert()` will likely not fill convertedAVFrame with that many
285284
// samples if sample rate conversion is needed. It will buffer the last few
@@ -290,14 +289,14 @@ UniqueAVFrame convertAudioAVFrameSamples(
290289
// tighter bound.
291290
convertedAVFrame->nb_samples = av_rescale_rnd(
292291
swr_get_delay(swrContext.get(), srcSampleRate) + srcAVFrame->nb_samples,
293-
desiredSampleRate,
292+
outSampleRate,
294293
srcSampleRate,
295294
AV_ROUND_UP);
296295
} else {
297296
convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
298297
}
299298

300-
setChannelLayout(convertedAVFrame, srcAVFrame, desiredNumChannels);
299+
setChannelLayout(convertedAVFrame, srcAVFrame, outNumChannels);
301300

302301
auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
303302
TORCH_CHECK(

0 commit comments

Comments
 (0)