jamjamjon · jamjamjon · Sep 28, 2024 · Sep 24, 2024 · Sep 25, 2024 · Sep 26, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "usls"
-version = "0.0.15"
+version = "0.0.16"
 edition = "2021"
 description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
 repository = "https://github.com/jamjamjon/usls"
@@ -38,6 +38,7 @@ video-rs = { version = "0.9.0", features = ["ndarray"] }
 natord = "1.0.9"
 tracing = "0.1.40"
 tracing-subscriber = "0.3.18"
+minifb = "0.27.0"
 
 
 [features]

diff --git a/benches/yolo.rs b/benches/yolo.rs
@@ -56,9 +56,8 @@ pub fn benchmark_cuda(c: &mut Criterion, h: isize, w: isize) -> Result<()> {
         .with_cuda(0)
         // .with_cpu()
         .with_dry_run(0)
-        .with_i00((1, 1, 4).into())
-        .with_i02((320, h, 1280).into())
-        .with_i03((320, w, 1280).into())
+        .with_ixx(0, 2, (320, h, 1280).into())
+        .with_ixx(0, 3, (320, w, 1280).into())
         .with_confs(&[0.2, 0.15]);
     let mut model = YOLO::new(options)?;
 

diff --git a/examples/blip/main.rs b/examples/blip/main.rs
@@ -4,19 +4,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // visual
     let options_visual = Options::default()
         .with_model("blip/visual-base.onnx")?
-        .with_i00((1, 1, 4).into())
+        // .with_ixx(0, 2, 384.into())
+        // .with_ixx(0, 3, 384.into())
         .with_profile(false);
 
     // textual
     let options_textual = Options::default()
         .with_model("blip/textual-base.onnx")?
         .with_tokenizer("blip/tokenizer.json")?
-        .with_i00((1, 1, 4).into()) // input_id: batch
-        .with_i01((1, 1, 4).into()) // input_id: seq_len
-        .with_i10((1, 1, 4).into()) // attention_mask: batch
-        .with_i11((1, 1, 4).into()) // attention_mask: seq_len
-        .with_i20((1, 1, 4).into()) // encoder_hidden_states: batch
-        .with_i30((1, 1, 4).into()) // encoder_attention_mask: batch
         .with_profile(false);
 
     // build model

diff --git a/examples/clip/main.rs b/examples/clip/main.rs
@@ -2,17 +2,12 @@ use usls::{models::Clip, DataLoader, Options};
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // visual
-    let options_visual = Options::default()
-        .with_model("clip/visual-base-dyn.onnx")?
-        .with_i00((1, 1, 4).into())
-        .with_profile(false);
+    let options_visual = Options::default().with_model("clip/visual-base-dyn.onnx")?;
 
     // textual
     let options_textual = Options::default()
         .with_model("clip/textual-base-dyn.onnx")?
-        .with_tokenizer("clip/tokenizer.json")?
-        .with_i00((1, 1, 4).into())
-        .with_profile(false);
+        .with_tokenizer("clip/tokenizer.json")?;
 
     // build model
     let mut model = Clip::new(options_visual, options_textual)?;

diff --git a/examples/dataloader/main.rs b/examples/dataloader/main.rs
@@ -1,45 +1,64 @@
-use usls::{models::YOLO, Annotator, DataLoader, Options, Vision, YOLOTask, YOLOVersion};
+use usls::{
+    models::YOLO, Annotator, DataLoader, Device, Options, Viewer, Vision, YOLOTask, YOLOVersion,
+};
 
 fn main() -> anyhow::Result<()> {
     tracing_subscriber::fmt()
         .with_max_level(tracing::Level::ERROR)
         .init();
 
     let options = Options::new()
-        .with_cuda(0)
+        .with_device(Device::Cuda(0))
         .with_model("yolo/v8-m-dyn.onnx")?
         .with_yolo_version(YOLOVersion::V8)
         .with_yolo_task(YOLOTask::Detect)
-        .with_i00((1, 1, 4).into())
-        .with_i02((0, 640, 640).into())
-        .with_i03((0, 640, 640).into())
+        .with_batch(2)
+        .with_ixx(0, 2, (416, 640, 800).into())
+        .with_ixx(0, 3, (416, 640, 800).into())
         .with_confs(&[0.2]);
     let mut model = YOLO::new(options)?;
 
+    // build annotator
+    let annotator = Annotator::new()
+        .with_bboxes_thickness(4)
+        .with_saveout("YOLO-DataLoader");
+
     // build dataloader
     let dl = DataLoader::new(
-        // "images/bus.jpg",  // remote image
+        // "images/bus.jpg", // remote image
         // "../images", // image folder
         // "../demo.mp4",   // local video
         // "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", // remote video
         // "rtsp://admin:xyz@192.168.2.217:554/h265/ch1/",  // rtsp h264 stream
-        "./assets/bus.jpg", // local image
+        // "./assets/bus.jpg", // local image
+        "../7.mp4",
     )?
     .with_batch(1)
     .build()?;
 
-    // build annotator
-    let annotator = Annotator::new()
-        .with_bboxes_thickness(4)
-        .with_saveout("YOLO-DataLoader");
+    let mut viewer = Viewer::new().with_delay(10).with_scale(1.).resizable(true);
 
-    // run
+    // iteration
     for (xs, _) in dl {
-        // std::thread::sleep(std::time::Duration::from_millis(100));
-        let ys = model.forward(&xs, false)?;
-        annotator.annotate(&xs, &ys);
+        // inference & annotate
+        let ys = model.run(&xs)?;
+        let images_plotted = annotator.plot(&xs, &ys, false)?;
+
+        // show image
+        viewer.imshow(&images_plotted)?;
+
+        // check window state and key events
+        if !viewer.is_open() || viewer.is_key_pressed(usls::Key::Escape) {
+            break;
+        }
+
+        // write video
+        viewer.write_batch(&images_plotted)?;
     }
 
+    // finish writing the video
+    viewer.finish_write()?;
+
     // images -> video
     // DataLoader::is2v("runs/YOLO-DataLoader", &["runs", "is2v"], 24)?;
 

diff --git a/examples/db/main.rs b/examples/db/main.rs
@@ -3,9 +3,9 @@ use usls::{models::DB, Annotator, DataLoader, Options};
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // build model
     let options = Options::default()
-        .with_i00((1, 4, 8).into())
-        .with_i02((608, 960, 1280).into())
-        .with_i03((608, 960, 1280).into())
+        .with_ixx(0, 0, (1, 4, 8).into())
+        .with_ixx(0, 2, (608, 960, 1280).into())
+        .with_ixx(0, 3, (608, 960, 1280).into())
         // .with_trt(0)
         .with_confs(&[0.4])
         .with_min_width(5.0)

diff --git a/examples/depth-anything/main.rs b/examples/depth-anything/main.rs
@@ -5,13 +5,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let options = Options::default()
         // .with_model("depth-anything/v1-s-dyn.onnx")?
         .with_model("depth-anything/v2-s.onnx")?
-        .with_i00((1, 1, 8).into())
-        .with_i02((384, 512, 1024).into())
-        .with_i03((384, 512, 1024).into());
+        .with_ixx(0, 2, (384, 512, 1024).into())
+        .with_ixx(0, 3, (384, 512, 1024).into());
     let mut model = DepthAnything::new(options)?;
 
     // load
-    let x = [DataLoader::try_read("images/2.jpg")?];
+    let x = [DataLoader::try_read("images/street.jpg")?];
 
     // run
     let y = model.run(&x)?;

diff --git a/examples/dinov2/main.rs b/examples/dinov2/main.rs
@@ -4,9 +4,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // build model
     let options = Options::default()
         .with_model("dinov2/s-dyn.onnx")?
-        .with_i00((1, 1, 1).into())
-        .with_i02((224, 224, 224).into())
-        .with_i03((224, 224, 224).into());
+        .with_ixx(0, 2, 224.into())
+        .with_ixx(0, 3, 224.into());
     let mut model = Dinov2::new(options)?;
     let x = [DataLoader::try_read("images/bus.jpg")?];
     let y = model.run(&x)?;

diff --git a/examples/florence2/main.rs b/examples/florence2/main.rs
@@ -1,130 +1,35 @@
 use usls::{models::Florence2, Annotator, DataLoader, Options, Task};
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let batch_size = 3;
+
     // vision encoder
     let options_vision_encoder = Options::default()
-        .with_model("florence2/base-vision-encoder.onnx")?
-        .with_i00((1, 2, 4).into())
-        .with_i02((512, 768, 800).into())
-        .with_i03((512, 768, 800).into())
-        .with_profile(false)
-        .with_cuda(0);
+        .with_model("florence2/base-vision-encoder-f16.onnx")?
+        .with_ixx(0, 2, (512, 768, 800).into())
+        .with_ixx(0, 3, 768.into())
+        .with_ixx(0, 0, (1, batch_size as _, 8).into());
 
     // text embed
     let options_text_embed = Options::default()
-        .with_model("florence2/base-embed-tokens.onnx")?
-        .with_i00((1, 2, 4).into())
-        .with_i01((1, 2, 20).into()) // seq_length
+        .with_model("florence2/base-embed-tokens-f16.onnx")?
         .with_tokenizer("florence2/tokenizer.json")?
-        .with_profile(false);
+        .with_batch(batch_size);
 
     // transformer encoder
     let options_encoder = Options::default()
-        .with_model("florence2/base-encoder.onnx")?
-        .with_i00((1, 2, 4).into())
-        .with_i01((1, 2, 300).into()) // encoder_sequence_length
-        .with_i10((1, 2, 4).into())
-        .with_i11((1, 2, 300).into()) // encoder_sequence_length
-        .with_profile(false);
+        .with_model("florence2/base-encoder-f16.onnx")?
+        .with_batch(batch_size);
 
     // transformer decoder
     let options_decoder = Options::default()
-        .with_model("florence2/base-decoder.onnx")?
-        .with_i00((1, 2, 4).into())
-        .with_i01((1, 2, 300).into()) // encoder_sequence_length
-        .with_i10((1, 2, 4).into())
-        .with_i11((1, 2, 300).into()) // encoder_sequence_length
-        .with_i20((1, 2, 4).into())
-        .with_i21((1, 2, 300).into()) // encoder_sequence_length
-        .with_profile(false);
+        .with_model("florence2/base-decoder-f16.onnx")?
+        .with_batch(batch_size);
 
     // transformer decoder merged
     let options_decoder_merged = Options::default()
-        .with_model("florence2/base-decoder-merged.onnx")?
-        // encoder_attention_mask
-        .with_i00((1, 2, 4).into())
-        .with_i01((1, 2, 300).into()) // encoder_sequence_length
-        // encoder_hidden_states
-        .with_i10((1, 2, 4).into())
-        .with_i11((1, 2, 300).into()) // encoder_sequence_length
-        // inputs_embeds
-        .with_i20((1, 2, 4).into())
-        .with_i21((1, 2, 300).into()) // encoder_sequence_length
-        // past_key_values.0.decoder.key
-        .with_i30((1, 2, 4).into())
-        .with_i32_((1, 2, 1).into())
-        // past_key_values.0.decoder.value
-        .with_i40((1, 2, 4).into())
-        .with_i42((1, 2, 1).into())
-        // past_key_values.0.encoder.key
-        .with_i50((1, 2, 4).into())
-        .with_i52((1, 2, 1).into())
-        // past_key_values.0.decoder.value
-        .with_i60((1, 2, 4).into())
-        .with_i62((1, 2, 1).into())
-        // past_key_values.1.decoder.key
-        .with_i70((1, 2, 4).into())
-        .with_i72((1, 2, 1).into())
-        // past_key_values.1.decoder.value
-        .with_i80((1, 2, 4).into())
-        .with_i82((1, 2, 1).into())
-        // past_key_values.1.encoder.key
-        .with_i90((1, 2, 4).into())
-        .with_i92((1, 2, 1).into())
-        // past_key_values.1.decoder.value
-        .with_i100((1, 2, 4).into())
-        .with_i102((1, 2, 1).into())
-        // past_key_values.2.decoder.key
-        .with_i110((1, 2, 4).into())
-        .with_i112((1, 2, 1).into())
-        // past_key_values.2.decoder.value
-        .with_i120((1, 2, 4).into())
-        .with_i122((1, 2, 1).into())
-        // past_key_values.2.encoder.key
-        .with_i130((1, 2, 4).into())
-        .with_i132((1, 2, 1).into())
-        // past_key_values.2.decoder.value
-        .with_i140((1, 2, 4).into())
-        .with_i142((1, 2, 1).into())
-        // past_key_values.3.decoder.key
-        .with_i150((1, 2, 4).into())
-        .with_i152((1, 2, 1).into())
-        // past_key_values.3.decoder.value
-        .with_i160((1, 2, 4).into())
-        .with_i162((1, 2, 1).into())
-        // past_key_values.3.encoder.key
-        .with_i170((1, 2, 4).into())
-        .with_i172((1, 2, 1).into())
-        // past_key_values.3.decoder.value
-        .with_i180((1, 2, 4).into())
-        .with_i182((1, 2, 1).into())
-        // past_key_values.4.decoder.key
-        .with_i190((1, 2, 4).into())
-        .with_i192((1, 2, 1).into())
-        // past_key_values.4.decoder.value
-        .with_i200((1, 2, 4).into())
-        .with_i202((1, 2, 1).into())
-        // past_key_values.4.encoder.key
-        .with_i210((1, 2, 4).into())
-        .with_i212((1, 2, 1).into())
-        // past_key_values.4.decoder.value
-        .with_i220((1, 2, 4).into())
-        .with_i222((1, 2, 1).into())
-        // past_key_values.5.decoder.key
-        .with_i230((1, 2, 4).into())
-        .with_i232((1, 2, 1).into())
-        // past_key_values.5.decoder.value
-        .with_i240((1, 2, 4).into())
-        .with_i242((1, 2, 1).into())
-        // past_key_values.5.encoder.key
-        .with_i250((1, 2, 4).into())
-        .with_i252((1, 2, 1).into())
-        // past_key_values.5.decoder.value
-        .with_i260((1, 2, 4).into())
-        .with_i262((1, 2, 1).into())
-        //use_cache_branch
-        .with_i270((1, 2, 1).into())
-        .with_profile(false);
+        .with_model("florence2/base-decoder-merged-f16.onnx")?
+        .with_batch(batch_size);
 
     // build model
     let mut model = Florence2::new(

diff --git a/examples/grounding-dino/main.rs b/examples/grounding-dino/main.rs
@@ -2,20 +2,20 @@ use usls::{models::GroundingDINO, Annotator, DataLoader, Options};
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     let opts = Options::default()
-        .with_i00((1, 1, 4).into())
-        .with_i02((640, 800, 1200).into())
-        .with_i03((640, 1200, 1200).into())
-        .with_i10((1, 1, 4).into())
-        .with_i11((256, 256, 512).into())
-        .with_i20((1, 1, 4).into())
-        .with_i21((256, 256, 512).into())
-        .with_i30((1, 1, 4).into())
-        .with_i31((256, 256, 512).into())
-        .with_i40((1, 1, 4).into())
-        .with_i41((256, 256, 512).into())
-        .with_i50((1, 1, 4).into())
-        .with_i51((256, 256, 512).into())
-        .with_i52((256, 256, 512).into())
+        .with_ixx(0, 0, (1, 1, 4).into())
+        .with_ixx(0, 2, (640, 800, 1200).into())
+        .with_ixx(0, 3, (640, 1200, 1200).into())
+        // .with_i10((1, 1, 4).into())
+        // .with_i11((256, 256, 512).into())
+        // .with_i20((1, 1, 4).into())
+        // .with_i21((256, 256, 512).into())
+        // .with_i30((1, 1, 4).into())
+        // .with_i31((256, 256, 512).into())
+        // .with_i40((1, 1, 4).into())
+        // .with_i41((256, 256, 512).into())
+        // .with_i50((1, 1, 4).into())
+        // .with_i51((256, 256, 512).into())
+        // .with_i52((256, 256, 512).into())
         .with_model("grounding-dino/swint-ogc-dyn-u8.onnx")? // TODO: current onnx model does not support bs > 1
         // .with_model("grounding-dino/swint-ogc-dyn-f32.onnx")?
         .with_tokenizer("grounding-dino/tokenizer.json")?

diff --git a/examples/modnet/main.rs b/examples/modnet/main.rs
@@ -4,9 +4,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // build model
     let options = Options::default()
         .with_model("modnet/dyn-f32.onnx")?
-        .with_i00((1, 1, 4).into())
-        .with_i02((416, 512, 800).into())
-        .with_i03((416, 512, 800).into());
+        .with_ixx(0, 2, (416, 512, 800).into())
+        .with_ixx(0, 3, (416, 512, 800).into());
     let mut model = MODNet::new(options)?;
 
     // load image

diff --git a/examples/rtmo/main.rs b/examples/rtmo/main.rs
@@ -4,7 +4,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // build model
     let options = Options::default()
         .with_model("rtmo/s-dyn.onnx")?
-        .with_i00((1, 1, 8).into())
         .with_nk(17)
         .with_confs(&[0.3])
         .with_kconfs(&[0.5]);