-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add SmolVLM model
- Loading branch information
Showing
10 changed files
with
489 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
## Quick Start | ||
|
||
```shell | ||
cargo run -r --example smolvlm -- --scale 500m --source "images/green-car.jpg" --prompt "What's in it?" | ||
cargo run -r --example smolvlm -- --scale 500m --source "images/green-car.jpg" --prompt "What color is the car?" | ||
cargo run -r --example smolvlm -- --scale 500m --source "images/slanted-text-number.jpg" --prompt "What are these numbers?" | ||
cargo run -r --example smolvlm -- --scale 256m --source "images/Statue-of-Liberty-Island-New-York-Bay.jpg" --prompt "Can you describe this image?" | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
use anyhow::Result; | ||
use usls::{models::SmolVLM, DataLoader, Options, Scale}; | ||
|
||
#[derive(argh::FromArgs)] | ||
/// Example | ||
struct Args { | ||
/// device | ||
#[argh(option, default = "String::from(\"cpu:0\")")] | ||
device: String, | ||
|
||
/// source image | ||
#[argh(option, default = "vec![String::from(\"./assets/bus.jpg\")]")] | ||
source: Vec<String>, | ||
|
||
/// promt | ||
#[argh(option, default = "String::from(\"Can you describe this image?\")")] | ||
prompt: String, | ||
|
||
/// scale | ||
#[argh(option, default = "String::from(\"256m\")")] | ||
scale: String, | ||
} | ||
|
||
fn main() -> Result<()> { | ||
tracing_subscriber::fmt() | ||
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) | ||
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339()) | ||
.init(); | ||
let args: Args = argh::from_env(); | ||
|
||
// build model | ||
let (options_vision_encoder, options_text_embed, options_decode) = | ||
match args.scale.as_str().try_into()? { | ||
Scale::Million(256.) => ( | ||
Options::smolvlm_vision_256m(), | ||
Options::smolvlm_text_embed_256m(), | ||
Options::smolvlm_decoder_256m(), | ||
), | ||
Scale::Million(500.) => ( | ||
Options::smolvlm_vision_500m(), | ||
Options::smolvlm_text_embed_500m(), | ||
Options::smolvlm_decoder_500m(), | ||
), | ||
_ => unimplemented!(), | ||
}; | ||
|
||
let mut model = SmolVLM::new( | ||
options_vision_encoder | ||
.with_model_device(args.device.as_str().try_into()?) | ||
.commit()?, | ||
options_text_embed | ||
.with_model_device(args.device.as_str().try_into()?) | ||
.commit()?, | ||
options_decode | ||
.with_model_device(args.device.as_str().try_into()?) | ||
.commit()?, | ||
)?; | ||
|
||
// load images | ||
let xs = DataLoader::try_read_batch(&args.source)?; | ||
|
||
// run | ||
let ys = model.forward(&xs, &args.prompt)?; | ||
|
||
for y in ys.iter() { | ||
if let Some(texts) = y.texts() { | ||
for text in texts { | ||
println!("[User]: {}\n\n[Assistant]:{}", args.prompt, text); | ||
} | ||
} | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# SmolVLM - small yet mighty Vision Language Model | ||
|
||
## Official Repository | ||
|
||
The official model repositories can be found on Hugging Face: | ||
* [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) | ||
* [SmolVLM-500M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct) | ||
|
||
## Example | ||
|
||
Refer to the [example](../../../examples/smolvlm) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/// Model configuration for `SmolVLM` | ||
impl crate::Options { | ||
pub fn smolvlm() -> Self { | ||
Self::default() | ||
.with_batch_size(1) | ||
.with_model_name("smolvlm") | ||
.with_model_num_dry_run(3) | ||
} | ||
|
||
pub fn smolvlm_vision() -> Self { | ||
Self::smolvlm() | ||
.with_model_kind(crate::Kind::Vision) | ||
.with_image_mean(&[0.5, 0.5, 0.5]) | ||
.with_image_std(&[0.5, 0.5, 0.5]) | ||
.with_resize_filter("lanczos3") | ||
.with_normalize(true) | ||
} | ||
|
||
pub fn smolvlm_text() -> Self { | ||
Self::smolvlm().with_model_kind(crate::Kind::Language) | ||
} | ||
|
||
pub fn smolvlm_vision_256m() -> Self { | ||
Self::smolvlm_vision() | ||
.with_model_scale(crate::Scale::Million(256.)) | ||
.with_model_file("256m-vision-encoder.onnx") | ||
} | ||
|
||
pub fn smolvlm_text_embed_256m() -> Self { | ||
Self::smolvlm_text() | ||
.with_model_scale(crate::Scale::Million(256.)) | ||
.with_model_file("256m-embed-tokens.onnx") | ||
} | ||
|
||
pub fn smolvlm_decoder_256m() -> Self { | ||
Self::smolvlm_text() | ||
.with_model_scale(crate::Scale::Million(256.)) | ||
.with_model_file("256m-decoder-model-merged.onnx") | ||
} | ||
|
||
pub fn smolvlm_vision_500m() -> Self { | ||
Self::smolvlm_vision() | ||
.with_model_scale(crate::Scale::Million(500.)) | ||
.with_model_file("500m-vision-encoder.onnx") | ||
} | ||
|
||
pub fn smolvlm_text_embed_500m() -> Self { | ||
Self::smolvlm_text() | ||
.with_model_scale(crate::Scale::Million(500.)) | ||
.with_model_file("500m-embed-tokens.onnx") | ||
} | ||
|
||
pub fn smolvlm_decoder_500m() -> Self { | ||
Self::smolvlm_text() | ||
.with_model_scale(crate::Scale::Million(500.)) | ||
.with_model_file("500m-decoder-model-merged.onnx") | ||
} | ||
} |
Oops, something went wrong.