#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

#[cfg(feature = "accelerate")]
extern crate accelerate_src;

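// Example invocation (assuming this file is wired up as a cargo example named
// `quantized-lfm2`; adjust the example name to match your Cargo setup):
//
//   cargo run --example quantized-lfm2 --release -- \
//     --which lfm2-350m-q4_k_m --prompt "Explain KV caching in one paragraph."
//
// All flags referenced above are defined on `Args` below; `--model` and
// `--tokenizer` accept local paths and skip the Hugging Face download.
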
use anyhow::Result;
use clap::{Parser, ValueEnum};
use std::io::Write;
use std::path::{Path, PathBuf};
use tokenizers::Tokenizer;

use candle::quantized::gguf_file;
use candle::Tensor;
use candle_transformers::generation::{LogitsProcessor, Sampling};

use candle_examples::token_output_stream::TokenOutputStream;
use candle_transformers::models::quantized_lfm2::ModelWeights;

const DEFAULT_PROMPT: &str = "Explain how Rotary Position Embeddings work in transformers.";

#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
enum Which {
    /// 350M base model, Q4_K_M quantization.
    #[value(name = "lfm2-350m-q4_k_m")]
    Lfm2_350MQ4KM,
    /// 350M base model, Q8_0 quantization.
    #[value(name = "lfm2-350m-q8_0")]
    Lfm2_350MQ8_0,
    /// 2.6B model, Q4_K_M quantization.
    #[value(name = "lfm2-2.6b-q4_k_m")]
    Lfm2_2_6BQ4KM,
    /// 2.6B model, Q8_0 quantization.
    #[value(name = "lfm2-2.6b-q8_0")]
    Lfm2_2_6BQ8_0,
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// GGUF file to load, typically a .gguf file generated by llama.cpp.
    #[arg(long)]
    model: Option<String>,

    /// The model variant to download from Hugging Face when --model is not set.
    #[arg(long, default_value = "lfm2-2.6b-q4_k_m")]
    which: Which,

    /// Repo revision to download from when using --which.
    #[arg(long, default_value = "main")]
    revision: String,

    /// Path to tokenizer.json. Defaults to the same folder as the model or is fetched from Hugging Face.
    #[arg(long)]
    tokenizer: Option<String>,

    /// The initial prompt to feed to the model.
    #[arg(long)]
    prompt: Option<String>,

    /// The number of tokens to sample (including the first token after the prompt).
    #[arg(short = 'n', long, default_value_t = 512)]
    sample_len: usize,

    /// The temperature used to generate samples, use 0 for greedy sampling.
    #[arg(long, default_value_t = 0.8)]
    temperature: f64,

    /// Nucleus sampling probability cutoff.
    #[arg(long)]
    top_p: Option<f64>,

    /// Only sample among the top K samples.
    #[arg(long)]
    top_k: Option<usize>,

    /// The seed to use when generating random samples.
    #[arg(long, default_value_t = 299792458)]
    seed: u64,

    /// Enable tracing (generates a trace-timestamp.json file).
    #[arg(long)]
    tracing: bool,

    /// Process prompt elements separately.
    #[arg(long)]
    split_prompt: bool,

    /// Run on CPU rather than GPU even if a GPU is available.
    #[arg(long)]
    cpu: bool,

    /// Penalty to be applied for repeating tokens, 1. means no penalty.
    #[arg(long, default_value_t = 1.1)]
    repeat_penalty: f32,

    /// The context size to consider for the repeat penalty.
    #[arg(long, default_value_t = 64)]
    repeat_last_n: usize,
}

impl Args {
    fn model_path(&self) -> Result<PathBuf> {
        if let Some(model) = &self.model {
            return Ok(PathBuf::from(model));
        }
        let (repo, filename) = match self.which {
            Which::Lfm2_350MQ4KM => ("LiquidAI/LFM2-350M-GGUF", "LFM2-350M-Q4_K_M.gguf"),
            Which::Lfm2_350MQ8_0 => ("LiquidAI/LFM2-350M-GGUF", "LFM2-350M-Q8_0.gguf"),
            Which::Lfm2_2_6BQ4KM => ("LiquidAI/LFM2-2.6B-GGUF", "LFM2-2.6B-Q4_K_M.gguf"),
            Which::Lfm2_2_6BQ8_0 => ("LiquidAI/LFM2-2.6B-GGUF", "LFM2-2.6B-Q8_0.gguf"),
        };
        let api = hf_hub::api::sync::Api::new()?;
        api.repo(hf_hub::Repo::with_revision(
            repo.to_string(),
            hf_hub::RepoType::Model,
            self.revision.clone(),
        ))
        .get(filename)
        .map_err(Into::into)
    }

    fn tokenizer(&self, model_path: &Path) -> Result<Tokenizer> {
        if let Some(path) = &self.tokenizer {
            return Tokenizer::from_file(path).map_err(anyhow::Error::msg);
        }

        if let Some(dir) = model_path.parent() {
            let candidate = dir.join("tokenizer.json");
            if candidate.exists() {
                return Tokenizer::from_file(candidate).map_err(anyhow::Error::msg);
            }
        }

        let tokenizer_repo = match self.which {
            Which::Lfm2_350MQ4KM | Which::Lfm2_350MQ8_0 => "LiquidAI/LFM2-350M",
            Which::Lfm2_2_6BQ4KM | Which::Lfm2_2_6BQ8_0 => "LiquidAI/LFM2-2.6B",
        };
        let api = hf_hub::api::sync::Api::new()?;
        let tokenizer_path = api
            .repo(hf_hub::Repo::with_revision(
                tokenizer_repo.to_string(),
                hf_hub::RepoType::Model,
                self.revision.clone(),
            ))
            .get("tokenizer.json")?;
        Tokenizer::from_file(tokenizer_path).map_err(anyhow::Error::msg)
    }
}

fn format_size(size_in_bytes: usize) -> String {
    if size_in_bytes < 1_000 {
        format!("{size_in_bytes}B")
    } else if size_in_bytes < 1_000_000 {
        format!("{:.2}KB", size_in_bytes as f64 / 1e3)
    } else if size_in_bytes < 1_000_000_000 {
        format!("{:.2}MB", size_in_bytes as f64 / 1e6)
    } else {
        format!("{:.2}GB", size_in_bytes as f64 / 1e9)
    }
}

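/// Best-effort lookup of an end-of-sequence token id: the tokenizer vocabulary is
/// probed for a handful of common EOS spellings and the first match wins.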
fn guess_eos_id(tokenizer: &Tokenizer) -> Option<u32> {
    let vocab = tokenizer.get_vocab(true);
    let candidates = [
        "</s>",
        "<|im_end|>",
        "<|eot_id|>",
        "<|end|>",
        "<|end_of_text|>",
        "<|endoftext|>",
    ];
    candidates
        .iter()
        .find_map(|token| vocab.get(*token).copied())
}

fn main() -> Result<()> {
    use tracing_chrome::ChromeLayerBuilder;
    use tracing_subscriber::prelude::*;

    let args = Args::parse();
    let _guard = if args.tracing {
        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
        tracing_subscriber::registry().with(chrome_layer).init();
        Some(guard)
    } else {
        None
    };

    println!(
        "avx: {}, neon: {}, simd128: {}, f16c: {}",
        candle::utils::with_avx(),
        candle::utils::with_neon(),
        candle::utils::with_simd128(),
        candle::utils::with_f16c()
    );
    println!(
        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
        args.temperature, args.repeat_penalty, args.repeat_last_n
    );

    let model_path = args.model_path()?;
    let mut file = std::fs::File::open(&model_path)?;
    let start = std::time::Instant::now();
    let device = candle_examples::device(args.cpu)?;

    let gguf = gguf_file::Content::read(&mut file).map_err(|e| e.with_path(model_path.clone()))?;
    let mut total_size_in_bytes = 0;
    for (_, tensor) in gguf.tensor_infos.iter() {
        let elem_count = tensor.shape.elem_count();
        total_size_in_bytes +=
            elem_count * tensor.ggml_dtype.type_size() / tensor.ggml_dtype.block_size();
    }

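    // LFM2 GGUF files may carry the training context window under the
    // `lfm2.context_length` metadata key; when present it is used below to trim
    // the prompt and to stop generation at the window boundary.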
    let context_length = gguf
        .metadata
        .get("lfm2.context_length")
        .and_then(|v| v.to_u32().ok().map(|v| v as usize));

    println!(
        "loaded {:?} tensors ({}) in {:.2}s",
        gguf.tensor_infos.len(),
        format_size(total_size_in_bytes),
        start.elapsed().as_secs_f32()
    );

    let mut model = ModelWeights::from_gguf(gguf, &mut file, &device)?;
    println!("model ready");

    let tokenizer = args.tokenizer(&model_path)?;
    let mut tos = TokenOutputStream::new(tokenizer);
    let mut tokens = tos
        .tokenizer()
        .encode(args.prompt.as_deref().unwrap_or(DEFAULT_PROMPT), true)
        .map_err(anyhow::Error::msg)?
        .get_ids()
        .to_vec();

    if let Some(max_ctx) = context_length {
        if tokens.len() >= max_ctx {
            let trim = tokens.len() - max_ctx + 1;
            tokens.drain(0..trim);
            println!(
                "prompt trimmed to the last {} tokens to fit the context window",
                max_ctx - 1
            );
        }
    }

    let mut all_tokens = tokens.clone();
    let to_sample = args.sample_len.saturating_sub(1);

    let mut logits_processor = {
        let temperature = args.temperature;
        let sampling = if temperature <= 0. {
            Sampling::ArgMax
        } else {
            match (args.top_k, args.top_p) {
                (None, None) => Sampling::All { temperature },
                (Some(k), None) => Sampling::TopK { k, temperature },
                (None, Some(p)) => Sampling::TopP { p, temperature },
                (Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
            }
        };
        LogitsProcessor::from_sampling(args.seed, sampling)
    };

    println!("Starting the inference loop:");
    let prompt_str = args.prompt.as_deref().unwrap_or(DEFAULT_PROMPT);
    print!("{prompt_str}");
    std::io::stdout().flush()?;

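    // Prompt processing: by default the whole prompt goes through a single forward
    // pass; with --split-prompt each token is fed individually, which is slower but
    // can reduce peak memory.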
    let start_prompt_processing = std::time::Instant::now();
    let mut next_token = if !args.split_prompt {
        let input = Tensor::new(tokens.as_slice(), &device)?.unsqueeze(0)?;
        let logits = model.forward(&input, 0)?;
        let logits = logits.squeeze(0)?;
        logits_processor.sample(&logits)?
    } else {
        let mut next_token = 0;
        for (pos, token) in tokens.iter().enumerate() {
            let input = Tensor::new(&[*token], &device)?.unsqueeze(0)?;
            let logits = model.forward(&input, pos)?;
            let logits = logits.squeeze(0)?;
            next_token = logits_processor.sample(&logits)?
        }
        next_token
    };

    let mut index_pos = tokens.len();
    let prompt_dt = start_prompt_processing.elapsed();

    all_tokens.push(next_token);
    if let Some(t) = tos.next_token(next_token)? {
        print!("{t}");
        std::io::stdout().flush()?;
    }

    let eos_token = guess_eos_id(tos.tokenizer());
    let mut sampled = 0;
    let start_post_prompt = std::time::Instant::now();
    for _ in 0..to_sample {
        if let Some(max_ctx) = context_length {
            if index_pos + 1 > max_ctx {
                println!("\n\ncontext window of {max_ctx} reached, stopping generation");
                break;
            }
        }

        let input = Tensor::new(&[next_token], &device)?.unsqueeze(0)?;
        let logits = model.forward(&input, index_pos)?;
        let logits = logits.squeeze(0)?;
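        // Penalize tokens seen in the last `repeat_last_n` positions; a penalty of
        // 1.0 disables the adjustment and keeps the raw logits.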
        let logits = if args.repeat_penalty == 1. {
            logits
        } else {
            let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
            candle_transformers::utils::apply_repeat_penalty(
                &logits,
                args.repeat_penalty,
                &all_tokens[start_at..],
            )?
        };
        next_token = logits_processor.sample(&logits)?;
        index_pos += 1;
        all_tokens.push(next_token);
        if let Some(t) = tos.next_token(next_token)? {
            print!("{t}");
            std::io::stdout().flush()?;
        }
        sampled += 1;
        if let Some(eos) = eos_token {
            if next_token == eos {
                break;
            }
        }
    }

    if let Some(rest) = tos.decode_rest().map_err(candle::Error::msg)? {
        print!("{rest}");
    }
    std::io::stdout().flush()?;

    let dt = start_post_prompt.elapsed();
    println!(
        "\n\n{:4} prompt tokens processed: {:.2} token/s",
        tokens.len(),
        tokens.len() as f64 / prompt_dt.as_secs_f64(),
    );
    println!(
        "{sampled:4} tokens generated: {:.2} token/s",
        sampled as f64 / dt.as_secs_f64(),
    );
    Ok(())
}
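
// A minimal sanity check for the size formatter above; not part of the upstream
// example, just a quick sketch documenting the expected output format.
#[cfg(test)]
mod tests {
    use super::format_size;

    #[test]
    fn format_size_uses_decimal_units() {
        assert_eq!(format_size(999), "999B");
        assert_eq!(format_size(1_500), "1.50KB");
        assert_eq!(format_size(2_600_000), "2.60MB");
        assert_eq!(format_size(2_000_000_000), "2.00GB");
    }
}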