@@ -42,36 +42,53 @@ def wav2(i: BytesIO, o: BufferedWriter, format: str):
42
42
43
43
def load_audio (
44
44
file : Union [str , BytesIO , Path ],
45
- sr : Optional [int ]= None ,
46
- format : Optional [str ]= None ,
47
- mono = True
45
+ sr : Optional [int ] = None ,
46
+ format : Optional [str ] = None ,
47
+ mono = True ,
48
48
) -> Union [np .ndarray , Tuple [np .ndarray , int ]]:
49
49
"""
50
50
https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/412a9950a1e371a018c381d1bfb8579c4b0de329/infer/lib/audio.py#L39
51
51
"""
52
- if (isinstance (file , str ) and not Path (file ).exists ()) or (isinstance (file , Path ) and not file .exists ()):
52
+ if (isinstance (file , str ) and not Path (file ).exists ()) or (
53
+ isinstance (file , Path ) and not file .exists ()
54
+ ):
53
55
raise FileNotFoundError (f"File not found: { file } " )
54
56
rate = 0
55
57
56
58
container = av .open (file , format = format )
57
59
audio_stream = next (s for s in container .streams if s .type == "audio" )
58
60
channels = 1 if audio_stream .layout == "mono" else 2
59
61
container .seek (0 )
60
- resampler = AudioResampler (format = "fltp" , layout = audio_stream .layout , rate = sr ) if sr is not None else None
62
+ resampler = (
63
+ AudioResampler (format = "fltp" , layout = audio_stream .layout , rate = sr )
64
+ if sr is not None
65
+ else None
66
+ )
61
67
62
68
# Estimated maximum total number of samples to pre-allocate the array
63
69
# AV stores length in microseconds by default
64
- estimated_total_samples = int (container .duration * sr // 1_000_000 ) if sr is not None else 48000
65
- decoded_audio = np .zeros (estimated_total_samples + 1 if channels == 1 else (channels , estimated_total_samples + 1 ), dtype = np .float32 )
70
+ estimated_total_samples = (
71
+ int (container .duration * sr // 1_000_000 ) if sr is not None else 48000
72
+ )
73
+ decoded_audio = np .zeros (
74
+ (
75
+ estimated_total_samples + 1
76
+ if channels == 1
77
+ else (channels , estimated_total_samples + 1 )
78
+ ),
79
+ dtype = np .float32 ,
80
+ )
66
81
67
82
offset = 0
68
83
69
84
def process_packet (packet : List [AudioFrame ]):
70
85
frames_data = []
71
86
rate = 0
72
87
for frame in packet :
73
- frame .pts = None # 清除时间戳,避免重新采样问题
74
- resampled_frames = resampler .resample (frame ) if resampler is not None else [frame ]
88
+ # frame.pts = None # 清除时间戳,避免重新采样问题
89
+ resampled_frames = (
90
+ resampler .resample (frame ) if resampler is not None else [frame ]
91
+ )
75
92
for resampled_frame in resampled_frames :
76
93
frame_data = resampled_frame .to_ndarray ()
77
94
rate = resampled_frame .rate
@@ -83,16 +100,21 @@ def frame_iter(container):
83
100
yield p .decode ()
84
101
85
102
for r , frames_data in map (process_packet , frame_iter (container )):
86
- if not rate : rate = r
103
+ if not rate :
104
+ rate = r
87
105
for frame_data in frames_data :
88
106
end_index = offset + len (frame_data [0 ])
89
107
90
108
# 检查 decoded_audio 是否有足够的空间,并在必要时调整大小
91
109
if end_index > decoded_audio .shape [1 ]:
92
- decoded_audio = np .resize (decoded_audio , (decoded_audio .shape [0 ], end_index * 4 ))
110
+ decoded_audio = np .resize (
111
+ decoded_audio , (decoded_audio .shape [0 ], end_index * 4 )
112
+ )
93
113
94
114
np .copyto (decoded_audio [..., offset :end_index ], frame_data )
95
115
offset += len (frame_data [0 ])
116
+
117
+ container .close ()
96
118
97
119
# Truncate the array to the actual size
98
120
decoded_audio = decoded_audio [..., :offset ]
0 commit comments