Skip to content

Commit

Permalink
Add SmolVLM model (#65)
Browse files Browse the repository at this point in the history
* Add SmolVLM model
  • Loading branch information
jamjamjon authored Feb 7, 2025
1 parent bdd77a6 commit e234735
Show file tree
Hide file tree
Showing 10 changed files with 489 additions and 2 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ anyhow = { version = "1.0.75" }
regex = { version = "1.5.4" }
rand = { version = "0.8.5" }
chrono = { version = "0.4.30" }
tokenizers = { version = "0.15.2" }
tokenizers = { version = "0.21.0" }
log = { version = "0.4.22" }
indicatif = "0.17.8"
serde_json = "1.0"
Expand Down Expand Up @@ -62,6 +62,6 @@ trt = [ "ort/tensorrt" ]
mps = [ "ort/coreml" ]

[profile.release]
# lto = true
lto = true
strip = true
panic = "abort"
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
| [Florence2](https://arxiv.org/abs/2311.06242) | a Variety of Vision Tasks | [demo](examples/florence2) |||| | |
| [Moondream2](https://github.com/vikhyat/moondream/tree/main) | Open-Set Object Detection<br />Open-Set Keypoints Detection<br />Image Caption<br />Visual Question Answering | [demo](examples/moondream2) |||| | |
| [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Set Object Detection | [demo](examples/owlv2) |||| | |
| [SmolVLM(256M, 500M)](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) | Visual Question Answering | [demo](examples/smolvlm) |||| | |

</details>

Expand Down
8 changes: 8 additions & 0 deletions examples/smolvlm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## Quick Start

```shell
cargo run -r --example smolvlm -- --scale 500m --source "images/green-car.jpg" --prompt "What's in it?"
cargo run -r --example smolvlm -- --scale 500m --source "images/green-car.jpg" --prompt "What color is the car?"
cargo run -r --example smolvlm -- --scale 500m --source "images/slanted-text-number.jpg" --prompt "What are these numbers?"
cargo run -r --example smolvlm -- --scale 256m --source "images/Statue-of-Liberty-Island-New-York-Bay.jpg" --prompt "Can you describe this image?"
```
74 changes: 74 additions & 0 deletions examples/smolvlm/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use anyhow::Result;
use usls::{models::SmolVLM, DataLoader, Options, Scale};

#[derive(argh::FromArgs)]
/// Example
struct Args {
    /// device
    #[argh(option, default = "String::from(\"cpu:0\")")]
    device: String,

    /// source image
    #[argh(option, default = "vec![String::from(\"./assets/bus.jpg\")]")]
    source: Vec<String>,

    /// prompt
    // NOTE: argh renders this doc comment as the --help text for the flag;
    // fixed the "promt" typo so the help output is spelled correctly.
    #[argh(option, default = "String::from(\"Can you describe this image?\")")]
    prompt: String,

    /// scale
    #[argh(option, default = "String::from(\"256m\")")]
    scale: String,
}

fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
.init();
let args: Args = argh::from_env();

// build model
let (options_vision_encoder, options_text_embed, options_decode) =
match args.scale.as_str().try_into()? {
Scale::Million(256.) => (
Options::smolvlm_vision_256m(),
Options::smolvlm_text_embed_256m(),
Options::smolvlm_decoder_256m(),
),
Scale::Million(500.) => (
Options::smolvlm_vision_500m(),
Options::smolvlm_text_embed_500m(),
Options::smolvlm_decoder_500m(),
),
_ => unimplemented!(),
};

let mut model = SmolVLM::new(
options_vision_encoder
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_text_embed
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_decode
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
)?;

// load images
let xs = DataLoader::try_read_batch(&args.source)?;

// run
let ys = model.forward(&xs, &args.prompt)?;

for y in ys.iter() {
if let Some(texts) = y.texts() {
for text in texts {
println!("[User]: {}\n\n[Assistant]:{}", args.prompt, text);
}
}
}

Ok(())
}
2 changes: 2 additions & 0 deletions src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mod rtmo;
mod sam;
mod sapiens;
mod slanet;
mod smolvlm;
mod svtr;
mod trocr;
mod yolo;
Expand All @@ -48,6 +49,7 @@ pub use rtmo::*;
pub use sam::*;
pub use sapiens::*;
pub use slanet::*;
pub use smolvlm::*;
pub use svtr::*;
pub use trocr::*;
pub use yolo::*;
Expand Down
11 changes: 11 additions & 0 deletions src/models/smolvlm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# SmolVLM - small yet mighty Vision Language Model

## Official Repository

The official repository can be found on:
* [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
* [SmolVLM-500M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct)

## Example

Refer to the [example](../../../examples/smolvlm)
58 changes: 58 additions & 0 deletions src/models/smolvlm/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/// Model configuration for `SmolVLM`
impl crate::Options {
    /// Base configuration shared by every SmolVLM component:
    /// batch size 1, model name "smolvlm", and 3 dry-run iterations.
    pub fn smolvlm() -> Self {
        Self::default()
            .with_batch_size(1)
            .with_model_name("smolvlm")
            .with_model_num_dry_run(3)
    }

    /// Vision-side base configuration: normalizes with mean/std 0.5 per
    /// channel and resizes with the lanczos3 filter.
    pub fn smolvlm_vision() -> Self {
        Self::smolvlm()
            .with_model_kind(crate::Kind::Vision)
            .with_image_mean(&[0.5, 0.5, 0.5])
            .with_image_std(&[0.5, 0.5, 0.5])
            .with_resize_filter("lanczos3")
            .with_normalize(true)
    }

    /// Language-side base configuration (token embedder and decoder).
    pub fn smolvlm_text() -> Self {
        Self::smolvlm().with_model_kind(crate::Kind::Language)
    }

    /// Vision encoder for the 256M-parameter variant.
    pub fn smolvlm_vision_256m() -> Self {
        Self::smolvlm_vision()
            .with_model_scale(crate::Scale::Million(256.))
            .with_model_file("256m-vision-encoder.onnx")
    }

    /// Token embedder for the 256M-parameter variant.
    pub fn smolvlm_text_embed_256m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(256.))
            .with_model_file("256m-embed-tokens.onnx")
    }

    /// Merged decoder for the 256M-parameter variant.
    pub fn smolvlm_decoder_256m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(256.))
            .with_model_file("256m-decoder-model-merged.onnx")
    }

    /// Vision encoder for the 500M-parameter variant.
    pub fn smolvlm_vision_500m() -> Self {
        Self::smolvlm_vision()
            .with_model_scale(crate::Scale::Million(500.))
            .with_model_file("500m-vision-encoder.onnx")
    }

    /// Token embedder for the 500M-parameter variant.
    pub fn smolvlm_text_embed_500m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(500.))
            .with_model_file("500m-embed-tokens.onnx")
    }

    /// Merged decoder for the 500M-parameter variant.
    pub fn smolvlm_decoder_500m() -> Self {
        Self::smolvlm_text()
            .with_model_scale(crate::Scale::Million(500.))
            .with_model_file("500m-decoder-model-merged.onnx")
    }
}
Loading

0 comments on commit e234735

Please sign in to comment.