Skip to content

Commit 2e7af17

Browse files
authored
feat(yoloe): add visual prompt support and fix font loading bug (#152)
1 parent 5cfbb20 commit 2e7af17

File tree

17 files changed

+435
-184
lines changed

17 files changed

+435
-184
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "usls"
33
edition = "2021"
4-
version = "0.1.4"
4+
version = "0.1.5"
55
rust-version = "1.85"
66
description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
77
repository = "https://github.com/jamjamjon/usls"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ usls = "latest-version"
9393
| [SAM-HQ](https://github.com/SysCV/sam-hq) | Segment Anything | [demo](examples/sam) |
9494
| [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) | Instance Segmentation | [demo](examples/yolo) |
9595
| [YOLO-World](https://github.com/AILab-CVC/YOLO-World) | Open-Set Detection With Language | [demo](examples/yolo) |
96-
| [YOLOE](https://github.com/THU-MIG/yoloe) | Open-Set Detection And Segmentation | [demo-prompt-free](examples/yoloe)<br />[demo-text-prompt](examples/yoloe-text-prompt) |
96+
| [YOLOE](https://github.com/THU-MIG/yoloe) | Open-Set Detection And Segmentation | [demo-prompt-free](examples/yoloe-prompt-free)<br />[demo-prompt (visual & textual)](examples/yoloe-prompt) |
9797
| [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) |
9898
| [CLIP](https://github.com/openai/CLIP) | Vision-Language Embedding | [demo](examples/clip) |
9999
| [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1) | Vision-Language Embedding | [demo](examples/clip) |
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
## Quick Start
2+
3+
```shell
4+
cargo run -r -F cuda --example yoloe-prompt-free -- --device cuda
5+
```
6+
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ fn main() -> Result<()> {
4949
annotator.annotate(x, y)?.save(format!(
5050
"{}.jpg",
5151
usls::Dir::Current
52-
.base_dir_with_subs(&["runs", model.spec()])?
52+
.base_dir_with_subs(&["runs", "YOLOE-prompt-free", model.spec()])?
5353
.join(usls::timestamp(None))
5454
.display(),
5555
))?;

examples/yoloe-prompt/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
## Quick Start
2+
3+
```shell
4+
# Textual prompt (CPU)
5+
cargo run -r --example yoloe-prompt -- --source ./assets/bus.jpg --visual false
6+
7+
# Visual prompt (TensorRT)
8+
cargo run -r --example yoloe-prompt -F tensorrt -- --source ./assets/bus.jpg --visual true --device tensorrt
9+
```
10+
Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use anyhow::Result;
2-
use usls::{models::YOLO, Annotator, Config, DataLoader, Style};
2+
use usls::{models::YOLO, Annotator, Config, DataLoader, Hbb, Style};
33

44
#[derive(argh::FromArgs)]
55
/// Example
@@ -9,7 +9,7 @@ struct Args {
99
source: String,
1010

1111
/// dtype
12-
#[argh(option, default = "String::from(\"fp16\")")]
12+
#[argh(option, default = "String::from(\"fp32\")")]
1313
dtype: String,
1414

1515
/// device
@@ -33,6 +33,10 @@ struct Args {
3333
/// batch size
3434
#[argh(option, default = "1")]
3535
batch_size: usize,
36+
37+
/// visual or textual
38+
#[argh(option, default = "true")]
39+
visual: bool,
3640
}
3741

3842
fn main() -> Result<()> {
@@ -43,14 +47,32 @@ fn main() -> Result<()> {
4347
let args: Args = argh::from_env();
4448

4549
// config
46-
let config = Config::yoloe_v8m_seg_tp()
47-
.with_batch_size_all_min_opt_max(1, args.batch_size, 8)
48-
.with_model_dtype(args.dtype.as_str().parse()?)
49-
.with_textual_dtype("fp16".parse()?) // Use FP32 when TensorRT is enabled
50-
.with_device_all(args.device.as_str().parse()?)
51-
.commit()?;
50+
let config = if args.visual {
51+
Config::yoloe_11m_seg_vp()
52+
} else {
53+
Config::yoloe_v8m_seg_tp().with_textual_encoder_dtype("fp16".parse()?) // Use FP32 when TensorRT is enabled
54+
}
55+
.with_batch_size_all_min_opt_max(1, args.batch_size, 8)
56+
.with_model_dtype(args.dtype.as_str().parse()?)
57+
.with_device_all(args.device.as_str().parse()?)
58+
.with_class_confs(&[0.25])
59+
.commit()?;
5260
let mut model = YOLO::new(config)?;
5361

62+
// encode visual or textual
63+
let embedding = if args.visual {
64+
let prompt_image = DataLoader::try_read_one("./assets/bus.jpg")?;
65+
model.encode_visual_prompt(
66+
prompt_image,
67+
&[
68+
Hbb::from_xyxy(221.52, 405.8, 344.98, 857.54).with_name("person"),
69+
// Hbb::from_xyxy(120., 425., 160., 445.).with_name("glasses"), // TODO
70+
],
71+
)?
72+
} else {
73+
model.encode_class_names(&args.labels.iter().map(|x| x.as_str()).collect::<Vec<_>>())?
74+
};
75+
5476
// build dataloader
5577
let dl = DataLoader::new(&args.source)?
5678
.with_batch(model.batch() as _)
@@ -61,14 +83,9 @@ fn main() -> Result<()> {
6183
.with_hbb_style(Style::hbb().with_draw_fill(true))
6284
.with_mask_style(Style::mask().with_draw_mask_polygon_largest(true));
6385

64-
// encode text prompts
65-
let text_embeddings =
66-
model.encode_class_names(&args.labels.iter().map(|x| x.as_str()).collect::<Vec<_>>())?;
67-
6886
// run & annotate
6987
for xs in &dl {
70-
// infer with text embeddings
71-
let ys = model.forward_with_te(&xs, &text_embeddings)?;
88+
let ys = model.forward_with_embedding(&xs, &embedding)?;
7289
println!("ys: {:?}", ys);
7390

7491
for (x, y) in xs.iter().zip(ys.iter()) {
@@ -78,7 +95,7 @@ fn main() -> Result<()> {
7895
annotator.annotate(x, y)?.save(format!(
7996
"{}.jpg",
8097
usls::Dir::Current
81-
.base_dir_with_subs(&["runs", model.spec()])?
98+
.base_dir_with_subs(&["runs", "YOLOE-prompt", model.spec()])?
8299
.join(usls::timestamp(None))
83100
.display(),
84101
))?;

examples/yoloe-text-prompt/README.md

Lines changed: 0 additions & 6 deletions
This file was deleted.

examples/yoloe/README.md

Lines changed: 0 additions & 6 deletions
This file was deleted.

src/models/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ mod svtr;
4242
mod swin2sr;
4343
mod trocr;
4444
mod yolo;
45+
mod yoloe;
4546
mod yolop;
4647

4748
pub use blip::*;

src/models/yolo/config.rs

Lines changed: 1 addition & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use crate::{
22
models::YOLOPredsFormat, Config, ResizeMode, Scale, Task, NAMES_COCO_80,
3-
NAMES_COCO_KEYPOINTS_17, NAMES_DOTA_V1_15, NAMES_IMAGENET_1K, NAMES_YOLOE_4585,
4-
NAMES_YOLO_DOCLAYOUT_10,
3+
NAMES_COCO_KEYPOINTS_17, NAMES_DOTA_V1_15, NAMES_IMAGENET_1K, NAMES_YOLO_DOCLAYOUT_10,
54
};
65

76
impl Config {
@@ -90,64 +89,6 @@ impl Config {
9089
.with_model_file("doclayout-docstructbench.onnx") // TODO: batch_size > 1
9190
}
9291

93-
/// Creates a base YOLOE configuration with 4585 classes.
94-
///
95-
/// Configures the model for instance segmentation with a large class vocabulary.
96-
pub fn yoloe() -> Self {
97-
Self::yolo()
98-
.with_task(Task::InstanceSegmentation)
99-
.with_class_names(&NAMES_YOLOE_4585)
100-
}
101-
/// Creates a configuration for YOLOE-v8s segmentation model.
102-
/// Uses the small variant of YOLOv8 architecture.
103-
pub fn yoloe_v8s_seg_pf() -> Self {
104-
Self::yoloe()
105-
.with_version(8.into())
106-
.with_scale(Scale::S)
107-
.with_model_file("yoloe-v8s-seg-pf.onnx")
108-
}
109-
110-
/// Creates a configuration for YOLOE-v8m segmentation model.
111-
/// Uses the medium variant of YOLOv8 architecture.
112-
pub fn yoloe_v8m_seg_pf() -> Self {
113-
Self::yoloe()
114-
.with_version(8.into())
115-
.with_scale(Scale::M)
116-
.with_model_file("yoloe-v8m-seg-pf.onnx")
117-
}
118-
119-
/// Creates a configuration for YOLOE-v8l segmentation model.
120-
/// Uses the large variant of YOLOv8 architecture.
121-
pub fn yoloe_v8l_seg_pf() -> Self {
122-
Self::yoloe()
123-
.with_version(8.into())
124-
.with_scale(Scale::L)
125-
.with_model_file("yoloe-v8l-seg-pf.onnx")
126-
}
127-
128-
/// Creates a configuration for YOLOE-11s segmentation model.
129-
/// Uses the small variant of YOLOv11 architecture.
130-
pub fn yoloe_11s_seg_pf() -> Self {
131-
Self::yoloe()
132-
.with_version(11.into())
133-
.with_scale(Scale::S)
134-
.with_model_file("yoloe-11s-seg-pf.onnx")
135-
}
136-
137-
pub fn yoloe_11m_seg_pf() -> Self {
138-
Self::yoloe()
139-
.with_version(11.into())
140-
.with_scale(Scale::M)
141-
.with_model_file("yoloe-11m-seg-pf.onnx")
142-
}
143-
144-
pub fn yoloe_11l_seg_pf() -> Self {
145-
Self::yoloe()
146-
.with_version(11.into())
147-
.with_scale(Scale::L)
148-
.with_model_file("yoloe-11l-seg-pf.onnx")
149-
}
150-
15192
pub fn fastsam_s() -> Self {
15293
Self::yolo_segment()
15394
.with_class_names(&["object"])
@@ -177,60 +118,4 @@ impl Config {
177118
.with_scale(Scale::X)
178119
.with_model_file("rtdetr-x.onnx")
179120
}
180-
181-
fn yoloe_seg_tp() -> Self {
182-
Self::yolo()
183-
.with_batch_size_all(1)
184-
.with_nc(80)
185-
.with_model_ixx(1, 1, (1, 80, 300).into()) // max_text_classes
186-
.with_task(Task::InstanceSegmentation)
187-
.with_textual_file("mobileclip/blt-textual.onnx")
188-
.with_model_max_length(77)
189-
.with_textual_ixx(0, 1, 77.into())
190-
.with_tokenizer_file("clip/tokenizer.json")
191-
.with_tokenizer_config_file("clip/tokenizer_config.json")
192-
.with_special_tokens_map_file("clip/special_tokens_map.json")
193-
}
194-
195-
pub fn yoloe_v8s_seg_tp() -> Self {
196-
Self::yoloe_seg_tp()
197-
.with_version(8.into())
198-
.with_scale(Scale::S)
199-
.with_model_file("yoloe-v8s-seg-tp.onnx")
200-
}
201-
202-
pub fn yoloe_v8m_seg_tp() -> Self {
203-
Self::yoloe_seg_tp()
204-
.with_version(8.into())
205-
.with_scale(Scale::M)
206-
.with_model_file("yoloe-v8m-seg-tp.onnx")
207-
}
208-
209-
pub fn yoloe_v8l_seg_tp() -> Self {
210-
Self::yoloe_seg_tp()
211-
.with_version(8.into())
212-
.with_scale(Scale::L)
213-
.with_model_file("yoloe-v8l-seg-tp.onnx")
214-
}
215-
216-
pub fn yoloe_11s_seg_tp() -> Self {
217-
Self::yoloe_seg_tp()
218-
.with_version(11.into())
219-
.with_scale(Scale::S)
220-
.with_model_file("yoloe-11s-seg-tp.onnx")
221-
}
222-
223-
pub fn yoloe_11m_seg_tp() -> Self {
224-
Self::yoloe_seg_tp()
225-
.with_version(11.into())
226-
.with_scale(Scale::M)
227-
.with_model_file("yoloe-11m-seg-tp.onnx")
228-
}
229-
230-
pub fn yoloe_11l_seg_tp() -> Self {
231-
Self::yoloe_seg_tp()
232-
.with_version(11.into())
233-
.with_scale(Scale::L)
234-
.with_model_file("yoloe-11l-seg-tp.onnx")
235-
}
236121
}

0 commit comments

Comments
 (0)