Skip to content

Mindnlp打卡任务:mimi模型迁移 #1937

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2024 Your Company. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Package initializer for the Mimi model: registers this package as a lazily
# resolved module so importing it does not eagerly import every submodule.
from typing import TYPE_CHECKING

from mindnlp.utils import LazyModule
from mindnlp.utils.import_utils import define_import_structure

if TYPE_CHECKING:
    # `TYPE_CHECKING` is False at runtime, so these star imports are only
    # seen by static type checkers and IDEs.
    from .configuration_mimi import *
    from .modeling_mimi import *
else:
    import sys

    # Swap this package's entry in `sys.modules` for a `LazyModule` built from
    # the import structure derived from this file's path.
    # NOTE(review): presumably LazyModule defers submodule imports until an
    # attribute is first accessed — confirm against mindnlp.utils.
    _file = globals()["__file__"]
    sys.modules[__name__] = LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
114 changes: 114 additions & 0 deletions configuration_mimi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import math
import numpy as np
from mindnlp.configs import MindNLPConfig

class MimiConfig(MindNLPConfig):
    r"""
    Configuration container for an [`MimiModel`].

    Every constructor argument is stored on the instance under the same name,
    with the following adjustments:

    - `upsampling_ratios`: a falsy value (including `None`) is replaced by
      `[8, 6, 5, 4]`.
    - `codebook_dim`: `None` is replaced by `hidden_size`.
    - `head_dim`: a falsy value (including `None`) is replaced by
      `hidden_size // num_attention_heads`.

    Any additional keyword arguments are handed to the parent class
    constructor.

    Raises:
        ValueError: If `num_semantic_quantizers` is not strictly lower than
            `num_quantizers`.
    """

    model_type = "mimi"

    def __init__(
        self,
        sampling_rate=24_000,
        frame_rate=12.5,
        audio_channels=1,
        hidden_size=512,
        num_filters=64,
        num_residual_layers=1,
        upsampling_ratios=None,
        kernel_size=7,
        last_kernel_size=3,
        residual_kernel_size=3,
        dilation_growth_rate=2,
        use_causal_conv=True,
        pad_mode="constant",
        compress=2,
        trim_right_ratio=1.0,
        codebook_size=2048,
        codebook_dim=256,
        num_quantizers=32,
        use_conv_shortcut=False,
        vector_quantization_hidden_dimension=256,
        num_semantic_quantizers=1,
        upsample_groups=512,
        num_hidden_layers=8,
        intermediate_size=2048,
        num_attention_heads=8,
        num_key_value_heads=8,
        head_dim=None,
        hidden_act="gelu",
        max_position_embeddings=8000,
        initializer_range=0.02,
        norm_eps=1e-5,
        use_cache=False,
        rope_theta=10000.0,
        sliding_window=250,
        attention_dropout=0.0,
        layer_scale_initial_scale=0.01,
        attention_bias=False,
        **kwargs,
    ):
        # Resolve the arguments whose effective value falls back to a default
        # or is derived from another argument.
        if not upsampling_ratios:
            upsampling_ratios = [8, 6, 5, 4]
        if codebook_dim is None:
            codebook_dim = hidden_size
        if not head_dim:
            head_dim = hidden_size // num_attention_heads

        # --- Audio processing ---
        self.sampling_rate = sampling_rate
        self.frame_rate = frame_rate
        self.audio_channels = audio_channels

        # --- Network architecture ---
        self.hidden_size = hidden_size
        self.num_filters = num_filters
        self.num_residual_layers = num_residual_layers
        self.upsampling_ratios = upsampling_ratios
        self.kernel_size = kernel_size
        self.last_kernel_size = last_kernel_size
        self.residual_kernel_size = residual_kernel_size
        self.dilation_growth_rate = dilation_growth_rate
        self.use_causal_conv = use_causal_conv
        self.pad_mode = pad_mode
        self.compress = compress
        self.trim_right_ratio = trim_right_ratio
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers
        self.use_conv_shortcut = use_conv_shortcut
        self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
        self.upsample_groups = upsample_groups

        # --- Transformer architecture ---
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.sliding_window = sliding_window
        self.attention_dropout = attention_dropout
        self.layer_scale_initial_scale = layer_scale_initial_scale
        self.attention_bias = attention_bias

        # The semantic quantizers must be a strict subset of all quantizers.
        semantic_count_is_valid = num_semantic_quantizers < self.num_quantizers
        if not semantic_count_is_valid:
            raise ValueError(
                f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {num_semantic_quantizers}."
            )
        self.num_semantic_quantizers = num_semantic_quantizers

        super().__init__(**kwargs)

    @property
    def encodec_frame_rate(self) -> int:
        """Sampling rate divided by the product of `upsampling_ratios`, rounded up."""
        return math.ceil(self.sampling_rate / np.prod(self.upsampling_ratios))

    @property
    def num_codebooks(self) -> int:
        """Alias of `num_quantizers`."""
        return self.num_quantizers

# Names exported when this module is star-imported.
__all__ = ["MimiConfig"]
Loading