diff --git a/.gitignore b/.gitignore
index c18db5d..c97405f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ infer_out/
 *.onnx
 data/
 checkpoints/
+ckpts/
 processcmd.py
 .vscode
 WPy64-38100
@@ -17,4 +18,4 @@ Winpython64-3.8.10.0dot.exe
 *.wav
 *.json
 *.flac
-*.xmp
\ No newline at end of file
+*.xmp
diff --git a/batch.py b/batch.py
index 07b283e..ec5eea9 100644
--- a/batch.py
+++ b/batch.py
@@ -18,8 +18,8 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
 
 if __name__ == '__main__':
     # 工程文件夹名,训练时用的那个
     project_name = "firefox"
-    model_path = f'./checkpoints/{project_name}/clean_model_ckpt_steps_100000.ckpt'
-    config_path = f'./checkpoints/{project_name}/config.yaml'
+    model_path = f'./ckpts/{project_name}/clean_model_ckpt_steps_100000.ckpt'
+    config_path = f'./ckpts/{project_name}/config.yaml'
     # 支持多个wav/ogg文件,放在raw文件夹下,带扩展名
     file_names = infer_tool.get_end_file("./batch", "wav")
diff --git a/doc/train_and_inference.markdown b/doc/train_and_inference.markdown
index eed0d6e..229ee96 100644
--- a/doc/train_and_inference.markdown
+++ b/doc/train_and_inference.markdown
@@ -16,14 +16,14 @@ pip install -r requirements_short.txt
 在第一个block中修改如下参数:
 ```
 config_path='checkpoints压缩包中config.yaml的位置'
-如'./checkpoints/nyaru/config.yaml'
+如'./ckpts/nyaru/config.yaml'
 config和checkpoints是一一对应的,请不要使用其他config
 
 project_name='这个项目的名称'
 如'nyaru'
 
 model_path='ckpt文件的全路径'
-如'./checkpoints/nyaru/model_ckpt_steps_112000.ckpt'
+如'./ckpts/nyaru/model_ckpt_steps_112000.ckpt'
 
 hubert_gpu=True
 推理时是否使用gpu推理hubert(模型中的一个模块),不影响模型的其他部分
@@ -35,7 +35,7 @@
 ```
 wav_fn='xxx.wav'#传入音频的路径,默认在项目根目录中
 
-use_crepe=True 
+use_crepe=True
 #crepe是一个F0算法,效果好但速度慢,改成False会使用效果稍逊于crepe但较快的parselmouth算法
 
 thre=0.05
@@ -98,7 +98,7 @@ test_prefixes:
 endless_ds:False
 如果你的数据集过小,每个epoch时间很短,请将此项打开,将把正常的1000epoch作为一个epoch计算
 
-hubert_path: checkpoints/hubert/hubert.pt
+hubert_path: ckpts/hubert/hubert.pt
 hubert模型的存放地址,确保这个路径是对的,一般解压checkpoints包之后就是这个路径不需要改,现已使用torch版本推理
 hubert_gpu:True
 是否在预处理时使用gpu运行hubert(模型的一个模块),关闭后使用cpu,但耗时会显著增加。另外模型训练完推理时hubert是否用gpu是在inference中单独控制的,不受此处影响。目前hubert改为torch版后已经可以做到在1060 6G显存gpu上进行预处理,与直接推理1分钟内的音频不超出显存限制,一般不需要关了。
@@ -117,7 +117,7 @@ max_sentences: 88
 max_tokens: 128000
 #batchsize是由这几个参数动态算出来的,如果不太清楚具体含义,可以只改动max_sentences这个参数,填入batchsize的最大限制值,以免炸显存
 
-pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
+pe_ckpt: ckpts/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 #pe模型路径,确保这个文件存在,具体作用参考inference部分
 
 raw_data_dir: data/raw/nyaru
@@ -136,10 +136,10 @@ use_crepe: true
 val_check_interval: 2000
 #每2000steps推理测试集并保存ckpt
 
-vocoder_ckpt:checkpoints/0109_hifigan_bigpopcs_hop128
+vocoder_ckpt: ckpts/0109_hifigan_bigpopcs_hop128
 #24kHz下为对应声码器的目录, 44.1kHz下为对应声码器的文件名, 注意不要填错
 
-work_dir: checkpoints/nyaru
+work_dir: ckpts/nyaru
 #修改后缀为工程名(也可以删掉或完全留空自动生成,但别乱填)
 no_fs2: true
 #对网络encoder的精简,能缩减模型体积,加快训练,且并未发现有对网络表现损害的直接证据。默认打开
@@ -152,7 +152,7 @@ no_fs2: true
 #windows
 ```
 set PYTHONPATH=.
-set CUDA_VISIBLE_DEVICES=0 
+set CUDA_VISIBLE_DEVICES=0
 python preprocessing/binarize.py --config training/config.yaml
 ```
 #linux
 ```
 CUDA_VISIBLE_DEVICES=0 python preprocessing/binarize.py --config training/config.yaml
 ```
@@ -165,12 +165,12 @@ CUDA_VISIBLE_DEVICES=0 python preprocessing/binarize.py --config training/config
 ### 2.4 训练
 #windows
 ```
-set CUDA_VISIBLE_DEVICES=0 
-python run.py --config training/config.yaml --exp_name nyaru --reset 
+set CUDA_VISIBLE_DEVICES=0
+python run.py --config training/config.yaml --exp_name nyaru --reset
 ```
 #linux
 ```
-CUDA_VISIBLE_DEVICES=0 python run.py --config training/config.yaml --exp_name nyaru --reset 
+CUDA_VISIBLE_DEVICES=0 python run.py --config training/config.yaml --exp_name nyaru --reset
 ```
 >需要将exp_name改为你的工程名,并修改config路径,请确保和预处理使用的是同一个config文件\
 *重要* :训练完成后,若之前不是在本地数据预处理,除了需要下载对应的ckpt文件,也需要将config文件下载下来,作为推理时使用的config,不可以使用本地之前上传上去那份。因为预处理时会向config文件中写入内容。推理时要保持使用的config和预处理使用的config是同一份。
diff --git a/doc/training_and_inference_EN.markdown b/doc/training_and_inference_EN.markdown
index d7a406d..7026de2 100644
--- a/doc/training_and_inference_EN.markdown
+++ b/doc/training_and_inference_EN.markdown
@@ -1,7 +1,7 @@
 # Diff-SVC(train/inference by yourself)
 ## 0. Setting up the environment
->Notice: The requirements files have been updated, and there are now three versions to choose from. 
- 
+>Notice: The requirements files have been updated, and there are now three versions to choose from.
+
 1. requirements.txt contains the entire environment during development and testing. It includes Torch1.12.1+cu113, and you can use pip to install it directly or remove the packages related to PyTorch inside (torch/torchvision) and then use pip to install it and use your own torch environment.
 ```
 pip install -r requirements.txt
@@ -17,32 +17,32 @@ Edit the parameters below in the first block:
 ```
 config_path= 'location of config.yaml in the checkpoints archive'
-# E.g.: './checkpoints/nyaru/config.yaml'
+# E.g.: './ckpts/nyaru/config.yaml'
 # The config and checkpoints are one-to-one correspondences. Please do not use other config files.
 
 project_name='name of the current project'
 # E.g.: 'nyaru'
 
 model_path='full path to the ckpt file'
-# E.g.: './checkpoints/nyaru/model_ckpt_steps_112000.ckpt'
+# E.g.: './ckpts/nyaru/model_ckpt_steps_112000.ckpt'
 
 hubert_gpu=True
-# Whether or not to use GPU for HuBERT (a module in the model) during inference. It will not affect any other parts of the model. 
+# Whether or not to use GPU for HuBERT (a module in the model) during inference. It will not affect any other parts of the model.
 # The current version significantly reduces the GPU usage for inferencing the HuBERT module. As full inference can be made on a 1060 6G GPU, there is no need to turn it off.
 # Also, auto-slice of long audio is now supported (both inference.ipynb and infer.py support this). Audio longer than 30 seconds will be automatically sliced at silences, thanks to @IceKyrin's code.
 ```
 
 ### Adjustable parameters:
 ```
-wav_fn='xxx.wav' 
+wav_fn='xxx.wav'
 # The path to the input audio. The default path is in the project's root directory.
 
-use_crepe=True 
+use_crepe=True
 # CREPE is an F0 extraction algorithm. It has good performance but is slow. Changing this to False will use the slightly inferior but much faster Parselmouth algorithm.
 
-thre=0.05 
+thre=0.05
 # CREPE's noise filtering threshold. It can be increased if the input audio is clean, but if the input audio is noisy, keep this value or decrease it. This parameter will have no effect if the previous parameter is set to False.
 
-pndm_speedup=20 
+pndm_speedup=20
 # Inference acceleration multiplier. The default number of diffusion steps is 1000, so changing this value to 10 means synthesizing in 100 steps. The default, 20, is a moderate value. This value can go up to 50x (synthesizing in 20 steps) without obvious loss in quality, but any higher may result in a significant quality loss. Note: if use_gt_mel below is enabled, make sure this value is lower than add_noise_step. The number of diffusion steps should also be divisible by this value.
 
 key=0
@@ -50,7 +50,7 @@ key=0
 
 use_pe=True
 # F0 extraction algorithm for synthesizing audio from the Mel spectrogram. Changing this to False will use the input audio's F0.
-# There is a slight difference in results between using True and False. Usually, setting it to True is better, but not always. It has almost no effect on the synthesizing speed. 
+# There is a slight difference in results between using True and False. Usually, setting it to True is better, but not always. It has almost no effect on the synthesizing speed.
 # (Regardless of what the value of the key parameter is, this value is always changeable and does not affect it)
 # This function is not supported in 44.1kHz models and will be turned off automatically. Leaving it on will not cause any errors as well.
@@ -63,7 +63,7 @@ add_noise_step=500
 
 wav_gen='yyy.wav'
-# The path to the output audio. The default is in the project's root directory. The file type can be changed by changing the file extension here. 
+# The path to the output audio. The default is in the project's root directory. The file type can be changed by changing the file extension here.
 ```
 
 If using infer.py, the way to change parameters is similar. Change values inside `__name__=='__main__'`, then run `python infer.py` in the project's root directory.
@@ -102,7 +102,7 @@ test_prefixes:
 endless_ds:False
 # If your dataset is too small, each epoch will pass very fast. Setting this to True will treat 1000 epochs as a single one.
 
-hubert_path: checkpoints/hubert/hubert.pt
+hubert_path: ckpts/hubert/hubert.pt
 # The path to the HuBERT model, make sure this path is correct. In most cases, the decompressed checkpoints.zip archive would put the model under the right path, so no edits are needed. The torch version is now used for inference.
 
 hubert_gpu:True
@@ -122,7 +122,7 @@ max_sentences: 88
 max_tokens: 128000
 # The batch size is calculated dynamically based on these parameters. If unsure about their exact meaning, you can change the max_sentences parameter only, which sets the maximum limit for the batch size to avoid exceeding VRAM limits.
 
-pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
+pe_ckpt: ckpts/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 # Path to the pe model. Make sure this file exists. Refer to the inference section for its purpose.
 
 raw_data_dir: data/raw/nyaru
@@ -130,7 +130,7 @@ raw_data_dir: data/raw/nyaru
 
 residual_channels: 384
 residual_layers: 20
-# A group of parameters that control the core network size. The higher the values, the more parameters the network has and the slower it trains, but this does not necessarily lead to better results. For larger datasets, you can change the first parameter to 512. You can experiment with them on your own. However, it is best to leave them as they are if you are not sure what you are doing. 
+# A group of parameters that control the core network size. The higher the values, the more parameters the network has and the slower it trains, but this does not necessarily lead to better results. For larger datasets, you can change the first parameter to 512. You can experiment with them on your own. However, it is best to leave them as they are if you are not sure what you are doing.
 
 speaker_id: nyaru
 # The name of the target speaker. Currently, only single-speaker is supported. (This parameter is for reference only and has no functional impact)
@@ -141,17 +141,17 @@ use_crepe: true
 val_check_interval: 2000
 # Inference on the test set and save checkpoints every 2000 steps.
 
-vocoder_ckpt:checkpoints/0109_hifigan_bigpopcs_hop128
+vocoder_ckpt: ckpts/0109_hifigan_bigpopcs_hop128
 # For 24kHz models, this should be the path to the directory of the corresponding vocoder. For 44.1kHz models, this should be the path to the corresponding vocoder file itself. Be careful, do not put the wrong one.
 
-work_dir: checkpoints/nyaru
+work_dir: ckpts/nyaru
 # Change the last part to the project name. (Or it can also be deleted or left completely empty to generate this directory automatically, but do not put some random names)
 
 no_fs2: true
 # Simplification of the network encoder. It can reduce the model size and speed up training. No direct evidence of damage to the network performance has been found so far. Enabled by default.
 ```
-> Do not edit the other parameters if you do not know that they do, even if you think you may know by judging from their names. 
+> Do not edit the other parameters if you do not know what they do, even if you think you may know by judging from their names.
 
 ### 2.3 Data pre-processing
 Run the following commands under the diff-svc directory: \
@@ -172,14 +172,14 @@ For pre-processing, @IceKyrin has prepared a code for processing HuBERT and othe
 #windows
 ```
 set CUDA_VISIBLE_DEVICES=0
-python run.py --config training/config.yaml --exp_name nyaru --reset 
+python run.py --config training/config.yaml --exp_name nyaru --reset
 ```
 #linux
 ```
 CUDA_VISIBLE_DEVICES=0 python run.py --config training/config.yaml --exp_name nyaru --reset
 ```
 >You need to change `exp_name` to your project name and edit the config path. Please make sure that the config file used for training is the same as the one used for pre-processing.\
-*Important*: After finishing training (on the cloud), if you did not pre-process the data locally, you will need to download the corresponding ckpt file AND the config file for inference. Do not use the one on your local machine since pre-processing writes data into the config file. Make sure the config file used for inference is the same as the one from pre-processing. 
+*Important*: After finishing training (on the cloud), if you did not pre-process the data locally, you will need to download the corresponding ckpt file AND the config file for inference. Do not use the one on your local machine since pre-processing writes data into the config file. Make sure the config file used for inference is the same as the one from pre-processing.
 
 ### 2.5 Possible issues:
@@ -216,4 +216,3 @@ Check if `use_crepe` is enabled in config. Turning it off can significantly incr
 Check if `hubert_gpu` is enabled in config.
 
 If there are any other questions, feel free to join the QQ channel or Discord server to ask.
-
diff --git a/flask_api.py b/flask_api.py
index eaecd0b..39061b7 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -41,8 +41,8 @@ def voice_change_model():
 
 if __name__ == '__main__':
     # 工程文件夹名,训练时用的那个
     project_name = "firefox"
-    model_path = f'./checkpoints/{project_name}/model_ckpt_steps_188000.ckpt'
-    config_path = f'./checkpoints/{project_name}/config.yaml'
+    model_path = f'./ckpts/{project_name}/model_ckpt_steps_188000.ckpt'
+    config_path = f'./ckpts/{project_name}/config.yaml'
     # 加速倍数
     accelerate = 50
diff --git a/infer.py b/infer.py
index a671ed0..38c35eb 100644
--- a/infer.py
+++ b/infer.py
@@ -74,8 +74,8 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
 
 if __name__ == '__main__':
     # 工程文件夹名,训练时用的那个
     project_name = "yilanqiu"
-    model_path = f'./checkpoints/{project_name}/model_ckpt_steps_246000.ckpt'
-    config_path = f'./checkpoints/{project_name}/config.yaml'
+    model_path = f'./ckpts/{project_name}/model_ckpt_steps_246000.ckpt'
+    config_path = f'./ckpts/{project_name}/config.yaml'
     # 支持多个wav/ogg文件,放在raw文件夹下,带扩展名
     file_names = ["青花瓷.wav"]
diff --git a/inference.ipynb b/inference.ipynb
index 129c5ee..a143430 100644
--- a/inference.ipynb
+++ b/inference.ipynb
@@ -43,8 +43,8 @@
     "\n",
     "# 工程文件夹名,训练时用的那个\n",
     "project_name = \"nyaru\"\n",
-    "model_path = f'./checkpoints/{project_name}/model_ckpt_steps_112000.ckpt'\n",
-    "config_path=f'./checkpoints/{project_name}/config.yaml'\n",
+    "model_path = f'./ckpts/{project_name}/model_ckpt_steps_112000.ckpt'\n",
+    "config_path=f'./ckpts/{project_name}/config.yaml'\n",
     "hubert_gpu=True\n",
     "svc_model = Svc(project_name,config_path,hubert_gpu, model_path)\n",
     "print('model loaded')"
@@ -218,7 +218,7 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "Python 3.8.13 ('diffsvc')",
+     "display_name": "Python 3",
      "language": "python",
      "name": "python3"
     },
@@ -232,11 +232,11 @@
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
-     "version": "3.8.13"
+     "version": "3.9.13 (main, Aug 28 2022, 13:01:04) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
     },
     "vscode": {
      "interpreter": {
-      "hash": "5cf89e54348a1bdadbb0ca2d227dcc30cc7e2d47cc75a8605923523671b5b7c7"
+      "hash": "ab41aa816281d0658b69de58af1224d73d8c5ea2e72429878c5d8dbd5d7844b5"
     }
    }
   },
diff --git a/network/hubert/vec_model.py b/network/hubert/vec_model.py
index ee4b7a1..fd87f72 100644
--- a/network/hubert/vec_model.py
+++ b/network/hubert/vec_model.py
@@ -46,7 +46,7 @@ def get_vec_units(con_model, audio_path, dev):
 
 if __name__ == '__main__':
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model_path = "../../checkpoints/checkpoint_best_legacy_500.pt"  # checkpoint_best_legacy_500.pt
+    model_path = "../../ckpts/checkpoint_best_legacy_500.pt"  # checkpoint_best_legacy_500.pt
     vec_model = load_model(model_path)
     # 这个不用改,自动在根目录下所有wav的同文件夹生成其对应的npy
     file_lists = list(Path("../../data/vecfox").rglob('*.wav'))
diff --git a/preprocessing/hubertinfer.py b/preprocessing/hubertinfer.py
index 1922e28..8bf6e4e 100644
--- a/preprocessing/hubertinfer.py
+++ b/preprocessing/hubertinfer.py
@@ -11,11 +11,11 @@
 
 class Hubertencoder():
-    def __init__(self, pt_path='checkpoints/hubert/hubert_soft.pt'):
+    def __init__(self, pt_path='ckpts/hubert/hubert_soft.pt'):
         if not 'use_vec' in hparams.keys():
             hparams['use_vec'] = False
         if hparams['use_vec']:
-            pt_path = "checkpoints/vec/checkpoint_best_legacy_500.pt"
+            pt_path = "ckpts/vec/checkpoint_best_legacy_500.pt"
             self.dev = torch.device("cuda")
             self.hbt_model = load_model(pt_path)
         else:
diff --git a/simplify.py b/simplify.py
index 75c187c..83187d9 100644
--- a/simplify.py
+++ b/simplify.py
@@ -4,7 +4,7 @@
 
 
 def simplify_pth(pth_name, project_name):
-    model_path = f'./checkpoints/{project_name}'
+    model_path = f'./ckpts/{project_name}'
     checkpoint_dict = torch.load(f'{model_path}/{pth_name}')
     torch.save({'epoch': checkpoint_dict['epoch'],
                 'state_dict': checkpoint_dict['state_dict'],
diff --git a/training/config.yaml b/training/config.yaml
index e8e3bfd..ca2abdc 100644
--- a/training/config.yaml
+++ b/training/config.yaml
@@ -61,7 +61,7 @@ gen_tgt_spk_id: -1
 hidden_size: 256
 hop_size: 128
 hubert_gpu: true
-hubert_path: checkpoints/hubert/hubert_soft.pt
+hubert_path: ckpts/hubert/hubert_soft.pt
 infer: false
 keep_bins: 80
 lambda_commit: 0.25
@@ -98,7 +98,7 @@ num_valid_plots: 10
 optimizer_adam_beta1: 0.9
 optimizer_adam_beta2: 0.98
 out_wav_norm: false
-pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
+pe_ckpt: ckpts/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 pe_enable: false
 perform_enhance: true
 pitch_ar: false
@@ -340,10 +340,10 @@ val_check_interval: 2000
 valid_num: 0
 valid_set_name: valid
 vocoder: network.vocoders.hifigan.HifiGAN
-vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+vocoder_ckpt: ckpts/0109_hifigan_bigpopcs_hop128
 warmup_updates: 2000
 wav2spec_eps: 1e-6
 weight_decay: 0
 win_size: 512
-work_dir: checkpoints/atri
+work_dir: ckpts/atri
 no_fs2: true
diff --git a/training/config_nsf.yaml b/training/config_nsf.yaml
index b93e9e4..d38ba9b 100644
--- a/training/config_nsf.yaml
+++ b/training/config_nsf.yaml
@@ -60,7 +60,7 @@ gen_dir_name: ''
 gen_tgt_spk_id: -1
 hidden_size: 256
 hop_size: 512
-hubert_path: checkpoints/hubert/hubert_soft.pt
+hubert_path: ckpts/hubert/hubert_soft.pt
 hubert_gpu: true
 infer: false
 keep_bins: 128
@@ -98,7 +98,7 @@ num_valid_plots: 10
 optimizer_adam_beta1: 0.9
 optimizer_adam_beta2: 0.98
 out_wav_norm: false
-pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
+pe_ckpt: ckpts/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 pe_enable: false
 perform_enhance: true
 pitch_ar: false
@@ -182,10 +182,10 @@ val_check_interval: 2000
 valid_num: 0
 valid_set_name: valid
 vocoder: network.vocoders.nsf_hifigan.NsfHifiGAN
-vocoder_ckpt: checkpoints/nsf_hifigan/model
+vocoder_ckpt: ckpts/nsf_hifigan/model
 warmup_updates: 2000
 wav2spec_eps: 1e-6
 weight_decay: 0
 win_size: 2048
-work_dir: checkpoints/nyaru
+work_dir: ckpts/nyaru
 no_fs2: true
diff --git a/utils/hparams.py b/utils/hparams.py
index 6d5e655..20bc9a8 100644
--- a/utils/hparams.py
+++ b/utils/hparams.py
@@ -46,7 +46,7 @@ def set_hparams(config='', exp_name='', hparams_str='', print_hparams=True, glob
     args_work_dir = ''
     if args.exp_name != '':
         args.work_dir = args.exp_name
-        args_work_dir = f'checkpoints/{args.work_dir}'
+        args_work_dir = f'ckpts/{args.work_dir}'
 
     config_chains = []
     loaded_config = set()
@@ -74,7 +74,7 @@ def load_config(config_fn):  # deep first
     global hparams
     assert args.config != '' or args_work_dir != ''
     saved_hparams = {}
-    if args_work_dir != 'checkpoints/':
+    if args_work_dir != 'ckpts/':
         ckpt_config_path = f'{args_work_dir}/config.yaml'
         if os.path.exists(ckpt_config_path):
             try:
@@ -88,7 +88,7 @@ def load_config(config_fn):  # deep first
     hparams_ = {}
     hparams_.update(load_config(args.config))
-    
+
     if not args.reset:
         hparams_.update(saved_hparams)
     hparams_['work_dir'] = args_work_dir
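
Taken together, the hunks above rename every model-storage path in the repo from `checkpoints/` to `ckpts/`; the behavioral pivot is the `utils/hparams.py` change, where `--exp_name` now resolves to a work dir under `ckpts/` and the `config.yaml` that preprocessing saved there is merged back in unless `--reset` is passed. Below is a minimal standalone sketch of that resolution logic after this diff — `resolve_work_dir` and `load_effective_config` are illustrative helpers written for this note, not functions from the repo, and PyYAML is assumed to be available:

```python
import os
import yaml  # PyYAML


def resolve_work_dir(exp_name: str) -> str:
    # After this diff, experiment dirs live under ckpts/ instead of checkpoints/.
    return f'ckpts/{exp_name}' if exp_name else ''


def load_effective_config(config_path: str, exp_name: str, reset: bool) -> dict:
    """Approximates set_hparams(): load the base config first, then let the
    config.yaml that preprocessing wrote into the work dir override it,
    unless --reset is given."""
    with open(config_path) as f:
        hparams = yaml.safe_load(f)

    work_dir = resolve_work_dir(exp_name)
    saved_path = os.path.join(work_dir, 'config.yaml')
    if not reset and work_dir and os.path.exists(saved_path):
        with open(saved_path) as f:
            hparams.update(yaml.safe_load(f))

    hparams['work_dir'] = work_dir
    return hparams


if __name__ == '__main__':
    # `python run.py --config training/config.yaml --exp_name nyaru --reset`
    # now writes its checkpoints to ckpts/nyaru rather than checkpoints/nyaru.
    print(resolve_work_dir('nyaru'))  # -> ckpts/nyaru
```

One practical consequence for existing setups: local model directories must be moved from `checkpoints/` to `ckpts/` (both names stay git-ignored, per the `.gitignore` hunk), and the `hubert_path`, `pe_ckpt`, and `vocoder_ckpt` entries in any previously saved `config.yaml` need to point at the new location, since a saved config overrides the base one whenever `--reset` is not used.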