examples/falcon/convert_checkpoint.py

import argparse
import json
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Optional, Tuple

import safetensors
import torch
from transformers import AutoModelForCausalLM, FalconConfig, FalconForCausalLM

import tensorrt_llm
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.llama.utils import (  # TODO: move the utils to common dir shared by models
    iterate_shard_files, load_state_dict, retrieved_layer_index_from_name)
from tensorrt_llm.quantization import QuantAlgo


def parse_arguments():
    """Parse the command-line options of the Falcon checkpoint converter.

    Besides parsing, this applies ``--log_level`` to the ``tensorrt_llm``
    logger before returning.

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str, default=None)
    parser.add_argument('--tp_size',
                        type=int,
                        default=1,
                        help='N-way tensor parallelism size')
    parser.add_argument('--pp_size',
                        type=int,
                        default=1,
                        help='N-way pipeline parallelism size')
    parser.add_argument('--dtype',
                        type=str,
                        default='float16',
                        choices=['float32', 'bfloat16', 'float16'])
    parser.add_argument(
        '--use_parallel_embedding',
        action="store_true",
        default=False,
        help=
        'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled.'
    )
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment of a multi-part help text must end with its own separator.
    parser.add_argument(
        '--embedding_sharding_dim',
        type=int,
        default=0,
        choices=[0, 1],
        help=
        'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). '
        'To shard it along hidden dimension, set embedding_sharding_dim=1. '
        'Note: embedding sharing is only enabled when embedding_sharding_dim = 0.'
    )
    parser.add_argument(
        '--use_embedding_sharing',
        action="store_true",
        default=False,
        help=
        'Try to reduce the engine size by sharing the embedding lookup table between two layers. '
        'Note: the flag might not take effect when the criteria are not met.')

    parser.add_argument(
        '--use_weight_only',
        default=False,
        action="store_true",
        help='Quantize weights for the various GEMMs to INT4/INT8. '
        'See --weight_only_precision to set the precision.')
    parser.add_argument(
        '--weight_only_precision',
        const='int8',
        type=str,
        nargs='?',
        default='int8',
        choices=['int8', 'int4'],
        help=
        'Define the precision for the weights when using weight-only quantization. '
        'You must also use --use_weight_only for that argument to have an impact.'
    )

    parser.add_argument('--load_by_shard',
                        action='store_true',
                        help='Load a pretrained model shard-by-shard.')
    parser.add_argument('--output_dir',
                        type=str,
                        default='tllm_checkpoint',
                        help='The path to save the TensorRT-LLM checkpoint')
    parser.add_argument(
        '--workers',
        type=int,
        default=1,
        help='The number of workers for converting checkpoint in parallel')
    parser.add_argument('--log_level', type=str, default='info')
    args = parser.parse_args()

    tensorrt_llm.logger.set_level(args.log_level)
    return args


def load_falcon_config(model_dir: str) -> FalconConfig:
    """ Load a FalconConfig and normalize legacy (modeling_RW.py) configs.

    Checkpoints produced with modeling_RW.py use a different config layout
    that is not compatible with `transformers.FalconConfig` and
    `transformers.FalconModel`, so the relevant fields are filled in by hand
    for the two legacy model types ('RefinedWeb' and 'RefinedWebModel').
    Any other model type is returned after the multi-query adjustment only.
    """
    config = FalconConfig.from_pretrained(model_dir)
    config.architectures = ["FalconForCausalLM"]

    # Falcon-7B config may not have num_kv_heads or n_head_kv.
    # Although Falcon-180B uses GQA (num_kv_heads=8), its config
    # has multi_query=True, hence the extra new_decoder_architecture check.
    uses_multi_query = getattr(config, 'multi_query', False)
    if uses_multi_query and not getattr(config, 'new_decoder_architecture',
                                        False):
        config.num_kv_heads = 1

    legacy_type = config.model_type
    if legacy_type not in ('RefinedWeb', 'RefinedWebModel'):
        return config

    # Both legacy layouts store the layer/head counts under the short names.
    config.num_hidden_layers = config.n_layer
    config.num_attention_heads = config.n_head

    # 'RefinedWeb'      -> Falcon-40B / Falcon-40B-instruct
    #   https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
    # 'RefinedWebModel' -> Falcon-7B / Falcon-7B-instruct
    #   https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
    is_refined_web = legacy_type == 'RefinedWeb'
    if is_refined_web:
        config.num_kv_heads = config.n_head_kv
    else:
        config.num_kv_heads = 1 if config.multi_query else config.n_head
    config.new_decoder_architecture = is_refined_web

    config.model_type = 'falcon'
    return config


def split(weight: torch.Tensor,
          tp_size: int,
          rank: int = 0,
          dim: int = 0) -> torch.Tensor:
    """ Return the `rank`-th of `tp_size` chunks of `weight`.

    With `tp_size == 1` the input tensor itself is returned (no copy).
    Otherwise the selected chunk is cloned. 1-D tensors are always chunked
    along their only dimension, regardless of `dim`.
    """
    if tp_size == 1:
        return weight
    chunk_dim = 0 if weight.ndim == 1 else dim
    shards = torch.chunk(weight, tp_size, dim=chunk_dim)
    return shards[rank].clone()


def reorder_qkv_weight_or_bias(weight: torch.Tensor,
                               head_dim: int,
                               num_heads: int,
                               num_kv_heads: Optional[int] = None,
                               tp_size: int = 1,
                               is_bias: bool = False) -> torch.Tensor:
    """ Rearrange a fused HF QKV weight/bias into per-rank TRT-LLM layout.

    The HF fused tensor stores heads group by group, each group holding its
    query heads followed by one key head and one value head, while TRT-LLM
    expects all queries, then all keys, then all values:
        HF     : [q1, k1, v1, ..., qh, kh, vh]
        TRT-LLM: [q1, ..., qh, k1, ..., kh, v1, ..., vh]
    (shown for the case of one query head per group).

    The result has a leading tensor-parallel dimension that callers index
    with their rank:
        weight: (T, Qh * D + 2 * KVh * D, H)
        bias  : (T, Qh * D + 2 * KVh * D, 1)
    where T=tp_size, Qh/KVh are the per-rank numbers of query and key/value
    heads, D=head_dim and H=num_heads * head_dim. When there are fewer K/V
    heads than ranks, K/V heads are replicated so every rank owns one.
    """
    # Supported head configurations:
    #  - multi-head attention:    num_kv_heads == num_heads (the default)
    #  - multi-query attention:   num_kv_heads == 1
    #  - grouped-query attention: num_heads % num_kv_heads == 0
    if num_kv_heads is None:
        num_kv_heads = num_heads
    assert num_heads % num_kv_heads == 0, \
        f'num_heads({num_heads}) must be divisible by '\
        f'num_kv_heads({num_kv_heads})).'

    # Every group consists of its query heads plus one key and one value head.
    q_per_group = num_heads // num_kv_heads
    num_group_heads = q_per_group + 2
    assert weight.shape[0] == num_kv_heads * num_group_heads * head_dim, \
        f'{weight.shape[0]} != {num_kv_heads} * {num_group_heads} * {head_dim}'

    # A bias has no input dimension; model it as a trailing axis of size 1.
    in_features = 1 if is_bias else num_heads * head_dim

    grouped = weight.reshape(num_kv_heads, num_group_heads, head_dim,
                             in_features)
    queries = grouped[:, :q_per_group, ...]
    keys = grouped[:, q_per_group:q_per_group + 1, ...]
    values = grouped[:, q_per_group + 1:, ...]

    if num_kv_heads < min(num_heads, tp_size):
        # Fewer K/V heads than ranks: replicate each K/V head so that every
        # rank gets one. E.g. num_heads=8, num_kv_heads=2, tp_size=4:
        #   in : [q0 q1 q2 q3 k0 v0 q4 q5 q6 q7 k1 v1]
        #   out: [[q0 q1 k0 v0], [q2 q3 k0 v0], [q4 q5 k1 v1], [q6 q7 k1 v1]]
        assert tp_size % num_kv_heads == 0
        replicas = tp_size // num_kv_heads
        replicated_shape = (num_kv_heads, replicas, head_dim, in_features)
        keys = torch.broadcast_to(keys, size=replicated_shape)
        values = torch.broadcast_to(values, size=replicated_shape)
        # From here on there is effectively one K/V head per rank.
        num_kv_heads = tp_size

    q_heads_per_rank = num_heads // tp_size
    kv_heads_per_rank = num_kv_heads // tp_size
    per_rank = torch.cat(
        [
            queries.reshape(tp_size, q_heads_per_rank, head_dim, in_features),
            keys.reshape(tp_size, kv_heads_per_rank, head_dim, in_features),
            values.reshape(tp_size, kv_heads_per_rank, head_dim, in_features),
        ],
        dim=1,
    )

    out_features = (num_heads + 2 * num_kv_heads) // tp_size * head_dim
    return per_rank.reshape((tp_size, out_features, -1))


def split_qkv_weight(weight: torch.Tensor,
                     hidden_size: int,
                     num_heads: int,
                     tp_size: int,
                     rank: int,
                     is_bias: bool,
                     num_kv_heads: Optional[int] = None) -> torch.Tensor:
    """ Return this rank's slice of a fused QKV weight or bias.

    The tensor is first reordered by `reorder_qkv_weight_or_bias`, then the
    slice for `rank` is selected along the leading dimension. A bias slice is
    flattened to 1-D.
    """
    per_rank = reorder_qkv_weight_or_bias(weight,
                                          head_dim=hidden_size // num_heads,
                                          num_heads=num_heads,
                                          num_kv_heads=num_kv_heads,
                                          tp_size=tp_size,
                                          is_bias=is_bias)

    shard = per_rank[rank, ...]
    if is_bias:
        shard = shard.ravel()

    # Return a copy rather than the view: a view keeps the whole reordered
    # buffer referenced, so it could not be garbage collected until the
    # returned tensor itself is released.
    return shard.clone()


def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
                 dim: int) -> torch.Tensor:
    """ Shard `weight` along `dim` for `rank`; thin wrapper around `split`. """
    shard = split(weight, tp_size, rank, dim=dim)
    return shard


def get_weight(params: Dict[str, torch.Tensor], prefix: str,
               dtype: torch.dtype) -> Optional[torch.Tensor]:
    """ Fetch `<prefix>.weight` from `params` as a detached CPU tensor.

    Args:
        params: Mapping from parameter name to tensor.
        prefix: Parameter name without the trailing `.weight`.
        dtype: Data type the tensor is converted to.

    Returns:
        The converted tensor, or None when `params` has no such entry.
    """
    key = f'{prefix}.weight'
    if key not in params:
        return None
    return params[key].to(dtype).detach().cpu()


def get_bias(params: Dict[str, torch.Tensor], prefix: str,
             dtype: torch.dtype) -> Optional[torch.Tensor]:
    """ Fetch `<prefix>.bias` from `params` as a detached CPU tensor.

    Args:
        params: Mapping from parameter name to tensor.
        prefix: Parameter name without the trailing `.bias`.
        dtype: Data type the tensor is converted to.

    Returns:
        The converted tensor, or None when `params` has no such entry.
    """
    key = f'{prefix}.bias'
    if key not in params:
        return None
    return params[key].to(dtype).detach().cpu()


def get_weight_and_bias(
    params: Dict[str, torch.Tensor], prefix: str, dtype: torch.dtype
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
    """ Return `(weight, bias)` for `prefix`, looked up via `get_weight` and
    `get_bias`. Either element is None when `params` lacks that entry.
    """
    return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)


def get_tllm_linear_weight(
    weight: torch.Tensor,
    prefix: str,
    bias: Optional[torch.Tensor] = None,
    use_weight_only: bool = False,
    plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
    """ Build the checkpoint entries for one linear layer.

    Without weight-only quantization the weight is stored as-is under
    `<prefix>.weight`. With it, the transposed weight is passed to the
    `trtllm` quantization op and its two outputs are stored under
    `<prefix>.weight` and `<prefix>.per_channel_scale`. A non-None bias is
    stored unchanged under `<prefix>.bias`.
    """
    weight_key = f'{prefix}.weight'
    if not use_weight_only:
        entries = {weight_key: weight}
    else:
        transposed = weight.t().contiguous()
        quantized, scales = \
            torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
                transposed, plugin_weight_only_quant_type)
        entries = {
            weight_key: quantized,
            f'{prefix}.per_channel_scale': scales,
        }

    if bias is not None:
        entries[f'{prefix}.bias'] = bias
    return entries


def get_tllm_param(
    param: torch.Tensor,
    name: str,
    use_weight_only: bool = False,
    plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
    """ Build the checkpoint entries for a single named parameter.

    Args:
        param: The (already sharded) tensor.
        name: Full TRT-LLM parameter name, e.g. ending in `.weight` or `.bias`.
        use_weight_only: Whether weight-only quantization is enabled.
        plugin_weight_only_quant_type: Quantized type passed to the `trtllm`
            quantization op.

    Returns:
        `{name: param}` unless `name` ends with `.weight` and weight-only
        quantization is enabled. In that case the transposed tensor is passed
        to the quantization op and its two outputs are stored under `name`
        and under the sibling `...per_channel_scale` name.
    """
    suffix = 'weight'
    if not (use_weight_only and name.endswith(f'.{suffix}')):
        return {name: param}

    transposed = param.t().contiguous()
    processed_torch_weights, torch_weight_scales = \
        torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
            transposed, plugin_weight_only_quant_type)

    # Swap only the trailing component: str.replace would also rewrite any
    # earlier occurrence of "weight" inside the parameter path.
    scale_name = name[:-len(suffix)] + 'per_channel_scale'
    return {
        name: processed_torch_weights,
        scale_name: torch_weight_scales,
    }


def convert_hf_falcon(hf_model: FalconForCausalLM,
                      hf_config: FalconConfig,
                      mapping: Mapping,
                      dtype: str = 'float32',
                      use_parallel_embedding: bool = False,
                      sharding_dim: int = 0,
                      share_embedding_table: bool = False,
                      use_weight_only: bool = False,
                      plugin_weight_only_quant_type: torch.dtype = torch.int8):
    """ Convert an in-memory HF Falcon model into one rank's TRT-LLM weights.

    Walks the decoder layers assigned to this pipeline rank
    (`mapping.pp_layers`), shards each linear layer for this tensor-parallel
    rank and renames the parameters to the TRT-LLM naming scheme. The
    embedding is added on the first pipeline rank; the LM head (unless the
    embedding table is shared) and the final layer norm on the last one.

    Returns:
        A dict mapping TRT-LLM parameter names to tensors.
    """
    start = time.time()
    weights = {}

    hf_params = dict(hf_model.named_parameters())
    torch_dtype = getattr(torch, dtype)
    num_heads = hf_config.num_attention_heads
    hidden_size = hf_config.hidden_size
    vocab_size = hf_config.vocab_size
    num_kv_heads = getattr(hf_config, 'num_kv_heads', num_heads)
    num_layers = hf_config.num_hidden_layers
    parallel_attention = hf_config.parallel_attn
    new_decoder_architecture = hf_config.new_decoder_architecture

    def fetch(hf_name):
        # (weight, bias) of an HF module; missing entries come back as None.
        return get_weight_and_bias(hf_params, hf_name, torch_dtype)

    def shard(tensor, dim):
        # Slice of `tensor` owned by this tensor-parallel rank.
        return split_matrix(tensor, mapping.tp_size, mapping.tp_rank, dim=dim)

    def shard_qkv(tensor, is_bias):
        # Reordered fused-QKV slice owned by this tensor-parallel rank.
        return split_qkv_weight(tensor,
                                hidden_size,
                                num_heads,
                                mapping.tp_size,
                                mapping.tp_rank,
                                is_bias=is_bias,
                                num_kv_heads=num_kv_heads)

    def add_linear(tllm_name, weight, bias):
        # Store a linear layer, quantizing the weight when requested.
        weights.update(
            get_tllm_linear_weight(weight, tllm_name, bias, use_weight_only,
                                   plugin_weight_only_quant_type))

    def add_layernorm(tllm_name, hf_name):
        # The weight is always stored; the bias only when the model has one.
        ln_weight, ln_bias = fetch(hf_name)
        weights[f'{tllm_name}.weight'] = ln_weight
        if ln_bias is not None:
            weights[f'{tllm_name}.bias'] = ln_bias

    layers_range = mapping.pp_layers(num_layers)
    for layer_idx in layers_range:
        hf_prefix = f'transformer.h.{layer_idx}'
        # Layer indices restart from 0 within each pipeline rank.
        tllm_prefix = f'transformer.layers.{layer_idx - layers_range[0]}'

        # Fused QKV projection.
        qkv_weight, qkv_bias = fetch(
            f'{hf_prefix}.self_attention.query_key_value')
        qkv_w = shard_qkv(qkv_weight, is_bias=False)
        qkv_b = None if qkv_bias is None else shard_qkv(qkv_bias,
                                                        is_bias=True)
        add_linear(f'{tllm_prefix}.attention.qkv', qkv_w, qkv_b)

        # Attention output projection: sharded along dim 1, bias kept whole.
        dense_weight, dense_bias = fetch(f'{hf_prefix}.self_attention.dense')
        add_linear(f'{tllm_prefix}.attention.dense', shard(dense_weight, 1),
                   dense_bias)

        # MLP up projection: weight and bias both sharded along dim 0.
        fc_weight, fc_bias = fetch(f'{hf_prefix}.mlp.dense_h_to_4h')
        fc_w = shard(fc_weight, 0)
        fc_b = None if fc_bias is None else shard(fc_bias, 0)
        add_linear(f'{tllm_prefix}.mlp.fc', fc_w, fc_b)

        # MLP down projection: sharded along dim 1, bias kept whole.
        proj_weight, proj_bias = fetch(f'{hf_prefix}.mlp.dense_4h_to_h')
        add_linear(f'{tllm_prefix}.mlp.proj', shard(proj_weight, 1),
                   proj_bias)

        if new_decoder_architecture:
            add_layernorm(f'{tllm_prefix}.input_layernorm',
                          f'{hf_prefix}.ln_attn')
            add_layernorm(f'{tllm_prefix}.mlp_layernorm',
                          f'{hf_prefix}.ln_mlp')
            continue

        add_layernorm(f'{tllm_prefix}.input_layernorm',
                      f'{hf_prefix}.input_layernorm')
        if parallel_attention:
            continue

        # Unlike the other layer norms, both entries are optional here.
        post_ln_weight, post_ln_bias = fetch(
            f'{hf_prefix}.post_attention_layernorm')
        if post_ln_weight is not None:
            weights[f'{tllm_prefix}.post_layernorm.weight'] = post_ln_weight
        if post_ln_bias is not None:
            weights[f'{tllm_prefix}.post_layernorm.bias'] = post_ln_bias

    embed_w = get_weight(hf_params, 'transformer.word_embeddings',
                         torch_dtype)
    if mapping.is_first_pp_rank():
        if use_parallel_embedding:
            if sharding_dim == 0:
                assert vocab_size % mapping.tp_size == 0
            else:
                assert hidden_size % mapping.tp_size == 0
            weights['transformer.vocab_embedding.weight'] = shard(
                embed_w, sharding_dim)
        else:
            weights['transformer.vocab_embedding.weight'] = embed_w

    if mapping.is_last_pp_rank():
        if not share_embedding_table:
            # The LM head is built from a copy of the embedding table.
            weights['lm_head.weight'] = shard(embed_w.clone(), 0)
        add_layernorm('transformer.ln_f', 'transformer.ln_f')

    elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
    print(f'Weights loaded. Total time: {elapsed}')
    return weights


def load_from_hf_falcon_checkpoint(
        hf_model_dir: str,
        hf_config: FalconConfig,
        mapping: Mapping,
        dtype: str = 'float32',
        use_parallel_embedding: bool = False,
        sharding_dim: int = 0,
        share_embedding_table: bool = False,
        use_weight_only: bool = False,
        plugin_weight_only_quant_type: torch.dtype = torch.int8):
    """ Convert an HF Falcon checkpoint shard-by-shard into one rank's weights.

    Each file yielded by `iterate_shard_files` is loaded with
    `load_state_dict`, and every parameter in it is renamed to the TRT-LLM
    scheme and sharded for this tensor-parallel rank. Layers not in
    `mapping.pp_layers(...)` are skipped. The embedding is kept on the first
    pipeline rank; the LM head (unless the embedding table is shared) and the
    final layer norm on the last one.

    Args:
        hf_model_dir: Directory containing the HF checkpoint files.
        hf_config: Falcon config supplying head, layer and vocabulary sizes.
        mapping: Tensor/pipeline parallel placement of the rank to convert.
        dtype: Name of the torch dtype passed to `load_state_dict`.
        use_parallel_embedding: Shard the embedding table across TP ranks.
        sharding_dim: Dimension along which the embedding table is sharded.
        share_embedding_table: When True no separate `lm_head.weight` is
            emitted.
        use_weight_only: Quantize `.weight` tensors of linear layers.
        plugin_weight_only_quant_type: Quantized type for weight-only mode.

    Returns:
        A dict mapping TRT-LLM parameter names to tensors.
    """

    weights = {}
    tik = time.time()

    dtype = getattr(torch, dtype)
    num_attention_heads = hf_config.num_attention_heads
    hidden_size = hf_config.hidden_size
    vocab_size = hf_config.vocab_size
    num_kv_heads = getattr(hf_config, 'num_kv_heads', num_attention_heads)
    num_hidden_layers = hf_config.num_hidden_layers

    layers_range = mapping.pp_layers(num_hidden_layers)
    for model_file in iterate_shard_files(hf_model_dir, mapping.tp_rank):
        state_dict = load_state_dict(model_file, dtype)
        for name, param in state_dict.items():
            l = retrieved_layer_index_from_name(name)
            if l is not None:
                # Per-layer parameter: keep only this pipeline rank's layers.
                if l not in layers_range:
                    continue
                # Layer indices restart from 0 within each pipeline rank.
                prefix = f'transformer.layers.{l-layers_range[0]}'

                # In every branch below a name not ending in 'weight' is
                # treated as the corresponding bias.
                if 'self_attention.query_key_value' in name:
                    if name.endswith('weight'):
                        qkv_w = split_qkv_weight(param,
                                                 hidden_size,
                                                 num_attention_heads,
                                                 mapping.tp_size,
                                                 mapping.tp_rank,
                                                 is_bias=False,
                                                 num_kv_heads=num_kv_heads)
                        weights.update(
                            get_tllm_param(qkv_w,
                                           f'{prefix}.attention.qkv.weight',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))
                    else:
                        qkv_b = split_qkv_weight(param,
                                                 hidden_size,
                                                 num_attention_heads,
                                                 mapping.tp_size,
                                                 mapping.tp_rank,
                                                 is_bias=True,
                                                 num_kv_heads=num_kv_heads)
                        weights.update(
                            get_tllm_param(qkv_b,
                                           f'{prefix}.attention.qkv.bias',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))

                elif 'self_attention.dense' in name:
                    if name.endswith('weight'):
                        attn_dense_w = split_matrix(param,
                                                    mapping.tp_size,
                                                    mapping.tp_rank,
                                                    dim=1)
                        weights.update(
                            get_tllm_param(attn_dense_w,
                                           f'{prefix}.attention.dense.weight',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))
                    else:
                        # The bias of this projection is stored unsharded.
                        weights.update(
                            get_tllm_param(param,
                                           f'{prefix}.attention.dense.bias',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))

                elif 'mlp.dense_h_to_4h' in name:
                    if name.endswith('weight'):
                        mlp_fc_w = split_matrix(param,
                                                mapping.tp_size,
                                                mapping.tp_rank,
                                                dim=0)
                        weights.update(
                            get_tllm_param(mlp_fc_w, f'{prefix}.mlp.fc.weight',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))
                    else:
                        mlp_fc_b = split_matrix(param,
                                                mapping.tp_size,
                                                mapping.tp_rank,
                                                dim=0)
                        weights.update(
                            get_tllm_param(mlp_fc_b, f'{prefix}.mlp.fc.bias',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))

                elif 'mlp.dense_4h_to_h' in name:
                    if name.endswith('weight'):
                        mlp_proj_w = split_matrix(param,
                                                  mapping.tp_size,
                                                  mapping.tp_rank,
                                                  dim=1)
                        weights.update(
                            get_tllm_param(mlp_proj_w,
                                           f'{prefix}.mlp.proj.weight',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))
                    else:
                        # The bias of this projection is stored unsharded.
                        weights.update(
                            get_tllm_param(param, f'{prefix}.mlp.proj.bias',
                                           use_weight_only,
                                           plugin_weight_only_quant_type))

                elif 'ln_attn' in name or 'input_layernorm' in name:
                    if name.endswith('weight'):
                        weights[f'{prefix}.input_layernorm.weight'] = param
                    else:
                        weights[f'{prefix}.input_layernorm.bias'] = param
                elif 'ln_mlp' in name:
                    if name.endswith('weight'):
                        weights[f'{prefix}.mlp_layernorm.weight'] = param
                    else:
                        weights[f'{prefix}.mlp_layernorm.bias'] = param
                elif 'post_attention_layernorm' in name:
                    if name.endswith('weight'):
                        weights[f'{prefix}.post_layernorm.weight'] = param
                    else:
                        weights[f'{prefix}.post_layernorm.bias'] = param
            elif 'word_embeddings' in name:
                if mapping.is_first_pp_rank():
                    if not use_parallel_embedding:
                        weights['transformer.vocab_embedding.weight'] = param
                    else:
                        if sharding_dim == 0:
                            assert vocab_size % mapping.tp_size == 0
                        else:
                            assert hidden_size % mapping.tp_size == 0
                        weights[
                            'transformer.vocab_embedding.weight'] = split_matrix(
                                param, mapping.tp_size, mapping.tp_rank,
                                sharding_dim)
                if mapping.is_last_pp_rank() and not share_embedding_table:
                    lm_head_w = split_matrix(param,
                                             mapping.tp_size,
                                             mapping.tp_rank,
                                             dim=0)
                    if lm_head_w is param:
                        # With tp_size == 1 the split hands back its input, so
                        # the LM head would be the very same tensor object as
                        # the embedding stored above. safetensors'
                        # torch.save_file rejects tensors that share memory,
                        # hence the copy (convert_hf_falcon clones as well).
                        lm_head_w = param.clone()
                    weights['lm_head.weight'] = lm_head_w
            elif 'ln_f' in name:
                if mapping.is_last_pp_rank():
                    if name.endswith('weight'):
                        weights['transformer.ln_f.weight'] = param
                    else:
                        weights['transformer.ln_f.bias'] = param
        # Drop the shard's state dict before the next one is loaded.
        del state_dict

    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    print(f'Weights loaded. Total time: {t}')
    return weights


if __name__ == '__main__':
    # Script entry point: write config.json plus one rank<N>.safetensors file
    # per rank (world_size = tp_size * pp_size) into --output_dir.
    #
    # TODO(qijun): Currently, the convert script depends on a torch op:
    # torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix,
    # which is included in tensorrt_llm Python package. Otherwise, the convert
    # script does not need to import tensorrt_llm. Will remove it after reimplementing
    # the op with PyTorch.
    print(tensorrt_llm.__version__)
    args = parse_arguments()
    world_size = args.tp_size * args.pp_size

    tik = time.time()

    # exist_ok avoids the check-then-create race of testing for the directory
    # first.
    os.makedirs(args.output_dir, exist_ok=True)

    # Map the weight-only CLI options to the quantized torch dtype and the
    # quantization algorithm recorded in config.json.
    quant_algo = None
    plugin_weight_only_quant_type = None
    if args.use_weight_only and args.weight_only_precision == 'int8':
        plugin_weight_only_quant_type = torch.int8
        quant_algo = QuantAlgo.W8A16
    elif args.use_weight_only and args.weight_only_precision == 'int4':
        plugin_weight_only_quant_type = torch.quint4x2
        quant_algo = QuantAlgo.W4A16

    hf_config = load_falcon_config(args.model_dir)
    config = {
        'architecture': hf_config.architectures[0],
        'dtype': args.dtype,
        'num_hidden_layers': hf_config.num_hidden_layers,
        'num_attention_heads': hf_config.num_attention_heads,
        'num_key_value_heads': hf_config.num_kv_heads,
        'hidden_size': hf_config.hidden_size,
        'norm_epsilon': hf_config.layer_norm_epsilon,
        'vocab_size': hf_config.vocab_size,
        'position_embedding_type':
        'alibi_with_scale' if hf_config.alibi else 'rope_gpt_neox',
        'max_position_embeddings': hf_config.max_position_embeddings,
        'hidden_act': 'gelu',
        'use_parallel_embedding': args.use_parallel_embedding,
        'embedding_sharding_dim': args.embedding_sharding_dim,
        'share_embedding_table': args.use_embedding_sharing,
        'quantization': {
            'quant_algo': quant_algo,
        },
        'mapping': {
            'world_size': world_size,
            'tp_size': args.tp_size,
            'pp_size': args.pp_size,
        },
        'bias': hf_config.bias,
        'parallel_attention': hf_config.parallel_attn,
        'new_decoder_architecture': hf_config.new_decoder_architecture,
    }

    with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)

    def covert_and_save(rank):
        # Convert the weights owned by `rank` and write them to
        # <output_dir>/rank<rank>.safetensors.
        mapping = Mapping(world_size=world_size,
                          rank=rank,
                          tp_size=args.tp_size,
                          pp_size=args.pp_size)

        if args.load_by_shard:
            weights = load_from_hf_falcon_checkpoint(
                args.model_dir,
                hf_config,
                mapping,
                dtype=args.dtype,
                use_parallel_embedding=args.use_parallel_embedding,
                sharding_dim=args.embedding_sharding_dim,
                share_embedding_table=args.use_embedding_sharing,
                use_weight_only=args.use_weight_only,
                plugin_weight_only_quant_type=plugin_weight_only_quant_type)
        else:
            # The full HF model is loaded anew for every rank and released
            # once that rank's weights have been extracted.
            hf_model = AutoModelForCausalLM.from_pretrained(
                args.model_dir, trust_remote_code=True, torch_dtype="auto")
            weights = convert_hf_falcon(
                hf_model,
                hf_config,
                mapping,
                dtype=args.dtype,
                use_parallel_embedding=args.use_parallel_embedding,
                sharding_dim=args.embedding_sharding_dim,
                share_embedding_table=args.use_embedding_sharing,
                use_weight_only=args.use_weight_only,
                plugin_weight_only_quant_type=plugin_weight_only_quant_type)
            del hf_model

        safetensors.torch.save_file(
            weights, os.path.join(args.output_dir, f'rank{rank}.safetensors'))

    if args.workers == 1:
        for rank in range(world_size):
            covert_and_save(rank)
    else:
        with ThreadPoolExecutor(max_workers=args.workers) as p:
            futures = [
                p.submit(covert_and_save, rank) for rank in range(world_size)
            ]
            exceptions = []
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    # Keep going so every failing rank gets its traceback
                    # printed; the run is failed as a whole below.
                    traceback.print_exc()
                    exceptions.append(e)
            # Raise explicitly instead of asserting: assert statements are
            # removed under `python -O`, which would let failed conversions
            # pass silently.
            if exceptions:
                raise RuntimeError(
                    "Checkpoint conversion failed, please check error log.")

    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    print(f'Total time of converting checkpoints: {t}')