Skip to content

Commit

Permalink
Add SmolVLM model (#65)
Browse files Browse the repository at this point in the history
* Add SmolVLM model
  • Loading branch information
jamjamjon authored Feb 7, 2025
1 parent bdd77a6 commit e234735
Show file tree
Hide file tree
Showing 10 changed files with 489 additions and 2 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ anyhow = { version = "1.0.75" }
regex = { version = "1.5.4" }
rand = { version = "0.8.5" }
chrono = { version = "0.4.30" }
tokenizers = { version = "0.15.2" }
tokenizers = { version = "0.21.0" }
log = { version = "0.4.22" }
indicatif = "0.17.8"
serde_json = "1.0"
Expand Down Expand Up @@ -62,6 +62,6 @@ trt = [ "ort/tensorrt" ]
mps = [ "ort/coreml" ]

[profile.release]
# lto = true
lto = true
strip = true
panic = "abort"
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
| [Florence2](https://arxiv.org/abs/2311.06242) | a Variety of Vision Tasks | [demo](examples/florence2) |||| | |
| [Moondream2](https://github.com/vikhyat/moondream/tree/main) | Open-Set Object Detection<br />Open-Set Keypoints Detection<br />Image Caption<br />Visual Question Answering | [demo](examples/moondream2) |||| | |
| [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Set Object Detection | [demo](examples/owlv2) |||| | |
| [SmolVLM(256M, 500M)](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) | Visual Question Answering | [demo](examples/smolvlm) |||| | |

</details>

Expand Down
8 changes: 8 additions & 0 deletions examples/smolvlm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## Quick Start

```shell
cargo run -r --example smolvlm -- --scale 500m --source "images/green-car.jpg" --prompt "What's in it?"
cargo run -r --example smolvlm -- --scale 500m --source "images/green-car.jpg" --prompt "What color is the car?"
cargo run -r --example smolvlm -- --scale 500m --source "images/slanted-text-number.jpg" --prompt "What are these numbers?"
cargo run -r --example smolvlm -- --scale 256m --source "images/Statue-of-Liberty-Island-New-York-Bay.jpg" --prompt "Can you describe this image?"
```
74 changes: 74 additions & 0 deletions examples/smolvlm/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use anyhow::Result;
use usls::{models::SmolVLM, DataLoader, Options, Scale};

#[derive(argh::FromArgs)]
/// Example
struct Args {
    /// device
    #[argh(option, default = "String::from(\"cpu:0\")")]
    device: String,

    /// source image
    #[argh(option, default = "vec![String::from(\"./assets/bus.jpg\")]")]
    source: Vec<String>,

    /// prompt
    // NOTE: argh renders this doc comment as the --help text for the flag;
    // fixed the "promt" typo so the help output is spelled correctly.
    #[argh(option, default = "String::from(\"Can you describe this image?\")")]
    prompt: String,

    /// scale
    #[argh(option, default = "String::from(\"256m\")")]
    scale: String,
}

fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
.init();
let args: Args = argh::from_env();

// build model
let (options_vision_encoder, options_text_embed, options_decode) =
match args.scale.as_str().try_into()? {
Scale::Million(256.) => (
Options::smolvlm_vision_256m(),
Options::smolvlm_text_embed_256m(),
Options::smolvlm_decoder_256m(),
),
Scale::Million(500.) => (
Options::smolvlm_vision_500m(),
Options::smolvlm_text_embed_500m(),
Options::smolvlm_decoder_500m(),
),
_ => unimplemented!(),
};

let mut model = SmolVLM::new(
options_vision_encoder
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_text_embed
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_decode
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
)?;

// load images
let xs = DataLoader::try_read_batch(&args.source)?;

// run
let ys = model.forward(&xs, &args.prompt)?;

for y in ys.iter() {
if let Some(texts) = y.texts() {
for text in texts {
println!("[User]: {}\n\n[Assistant]:{}", args.prompt, text);
}
}
}

Ok(())
}
2 changes: 2 additions & 0 deletions src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mod rtmo;
mod sam;
mod sapiens;
mod slanet;
mod smolvlm;
mod svtr;
mod trocr;
mod yolo;
Expand All @@ -48,6 +49,7 @@ pub use rtmo::*;
pub use sam::*;
pub use sapiens::*;
pub use slanet::*;
pub use smolvlm::*;
pub use svtr::*;
pub use trocr::*;
pub use yolo::*;
Expand Down
11 changes: 11 additions & 0 deletions src/models/smolvlm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# SmolVLM - small yet mighty Vision Language Model

## Official Repository

The official repository can be found on:
* [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
* [SmolVLM-500M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct)

## Example

Refer to the [example](../../../examples/smolvlm)
58 changes: 58 additions & 0 deletions src/models/smolvlm/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/// Model configuration for `SmolVLM`
impl crate::Options {
    /// Base configuration shared by every SmolVLM component:
    /// batch size 1, model name "smolvlm", and 3 dry-run iterations.
    pub fn smolvlm() -> Self {
        Self::default()
            .with_batch_size(1)
            .with_model_name("smolvlm")
            .with_model_num_dry_run(3)
    }

    /// Vision-side base configuration: normalizes with mean/std 0.5 per
    /// channel and resizes with the lanczos3 filter.
    pub fn smolvlm_vision() -> Self {
        Self::smolvlm()
            .with_model_kind(crate::Kind::Vision)
            .with_image_mean(&[0.5, 0.5, 0.5])
            .with_image_std(&[0.5, 0.5, 0.5])
            .with_resize_filter("lanczos3")
            .with_normalize(true)
    }

    /// Language-side base configuration (token embedder and decoder).
    pub fn smolvlm_text() -> Self {
        Self::smolvlm().with_model_kind(crate::Kind::Language)
    }

    /// Vision encoder for the 256M-parameter variant.
    pub fn smolvlm_vision_256m() -> Self {
        Self::smolvlm_vision()
            .with_model_scale(crate::Scale::Million(256.))
            .with_model_file("256m-vision-encoder.onnx")
    }

    /// Token embedder for the 256M-parameter variant.
    pub fn smolvlm_text_embed_256m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(256.))
            .with_model_file("256m-embed-tokens.onnx")
    }

    /// Merged decoder for the 256M-parameter variant.
    pub fn smolvlm_decoder_256m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(256.))
            .with_model_file("256m-decoder-model-merged.onnx")
    }

    /// Vision encoder for the 500M-parameter variant.
    pub fn smolvlm_vision_500m() -> Self {
        Self::smolvlm_vision()
            .with_model_scale(crate::Scale::Million(500.))
            .with_model_file("500m-vision-encoder.onnx")
    }

    /// Token embedder for the 500M-parameter variant.
    pub fn smolvlm_text_embed_500m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(500.))
            .with_model_file("500m-embed-tokens.onnx")
    }

    /// Merged decoder for the 500M-parameter variant.
    pub fn smolvlm_decoder_500m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(500.))
            .with_model_file("500m-decoder-model-merged.onnx")
    }
}
Loading

0 comments on commit e234735

Please sign in to comment.