version: v20260503_p10_dinov3_small_re_v6_camaug · 訓練日 2026-05-03 · backbone vit_small_patch16_dinov3 @ 1280×720
arch: ViT-S patch16 + RoIAlign + MLP 2-cls,外擴 1.0 / 0.2 / 1.5(X / Y_top / Y_bot)
| 版本 | data | test_AP | F1 | P | R | TP | FP | FN | TN | best ep | train s |
|---|---|---|---|---|---|---|---|---|---|---|---|
| v1 | 178 task baseline | 0.9167 | 0.8449 | 0.854 | 0.836 | 1414 | 241 | 278 | 2627 | 4 | 2401 |
| v2 | +14 hard-neg (192) | 0.8755 | 0.8236 | 0.798 | 0.851 | 1473 | 374 | 257 | 2548 | 10 | 2736 |
| v4 | 226 task baseline | 0.8651 | 0.8102 | 0.780 | 0.843 | 1528 | 432 | 284 | 2531 | 8 | 2612 |
| v6 ⭐ | 226 + rotation+blur ⭐ | 0.8884 | 0.8267 | 0.839 | 0.815 | 1476 | 283 | 336 | 2680 | 12 | 3300 |
v1 仍持有最高 AP(test set 規模 4560 相對小,且 178 task 純度高),但 FP 241 是在小 test 上算的;v6 在 226 task / 4775 row 大 test 上達 FP 283 已逼近 v1。
| | v4 (no camaug) | v6 (+rot+blur) | Δ |
|---|---|---|---|
| TP | 1528 | 1476 | -52 |
| FP | 432 | 283 | -149 |
| FN | 284 | 336 | +52 |
| TN | 2531 | 2680 | +149 |
| ep | train_loss | val_AP | val_F1 | val_AP bar |
|---|---|---|---|---|
| 1 | 0.3885 | 0.8681 | 0.7884 | 0.868 |
| 2 | 0.2949 | 0.8382 | 0.7603 | 0.838 |
| 3 | 0.1857 | 0.8613 | 0.7740 | 0.861 |
| 4 | 0.1516 | 0.8805 | 0.8271 | 0.881 |
| 5 | 0.1254 | 0.8600 | 0.7852 | 0.860 |
| 6 | 0.1086 | 0.8806 | 0.8432 | 0.881 |
| 7 | 0.0986 | 0.8573 | 0.8110 | 0.857 |
| 8 | 0.0805 | 0.8627 | 0.8299 | 0.863 |
| 9 | 0.0771 | 0.7951 | 0.7686 | 0.795 |
| 10 | 0.0641 | 0.8848 | 0.8356 | 0.885 |
| 11 | 0.0520 | 0.8711 | 0.8178 | 0.871 |
| 12 ⭐ | 0.0548 | 0.8939 | 0.8427 | 0.894 |
| 13 | 0.0432 | 0.8578 | 0.8126 | 0.858 |
| 14 | 0.0372 | 0.8489 | 0.8221 | 0.849 |
| 15 | 0.0286 | 0.8330 | 0.7970 | 0.833 |
| 16 | 0.0222 | 0.8553 | 0.8298 | 0.855 |
| 17 | 0.0273 | 0.8504 | 0.8278 | 0.850 |
| 18 | 0.0206 | 0.8687 | 0.8285 | 0.869 |
| 19 | 0.0144 | 0.8550 | 0.8310 | 0.855 |
| 20 | 0.0131 | 0.8537 | 0.8126 | 0.854 |
best_epoch=12, val_AP=0.8939(patience=8 在 ep20 觸發 early stop)
| backbone | params | YOLO ms | ViT+ROI ms | total | FPS |
|---|---|---|---|---|---|
| v6 ViT-S DINOv3 | 22.5M | 47 | 82 | 129 | 7.7 |
| v5 ViT-Tiny (對照) | 6.6M | 49 | 7.6 | 56 | 17.8 |
v6 跟 v4 同 backbone,速度一致(差異 <2ms in noise)。ViT-Tiny 速度 2.3× 但 FP 707(v6 的 2.5×),不適合主版。
| 檔案 | 大小 | 用途 | 下載 |
|---|---|---|---|
safety_rope_v20260503_v6_camaug/best.pt | 86 MB | fp32 完整 ckpt(訓練/評估) | R2 link |
safety_rope_v20260503_v6_camaug/best_fp16.pt | 43 MB | fp16 inference ckpt(推論部署用,下載快一半) | R2 link |
safety_rope_v20260503_v6_camaug/summary.json | 5 KB | 訓練 metadata(hyperparams、test metrics、history) | R2 link |
person_yolo11n_v20260501/best.pt | 5.5 MB | YOLO person detector(pipeline 第一階段必備) | R2 link |
完整 pipeline:RTSP/影片 → YOLO 偵 person → 對每個 bbox 外擴 1.0/0.2/1.5 → ViT RoIAlign → 2-cls prob (correct vs wrong)。Apple MPS / CUDA 都支援。
pip install torch torchvision timm ultralytics opencv-python pillow numpy
# 推論部署用 fp16(推薦,43MB)
curl -L -o best_fp16.pt \
https://pub-478929a98a5c440cb22c2241c0bde314.r2.dev/safety_rope_v20260503_v6_camaug/best_fp16.pt
# YOLO person detector
curl -L -o person_yolo11n_v20260501.pt \
https://pub-478929a98a5c440cb22c2241c0bde314.r2.dev/person_yolo11n_v20260501/best.pt
import torch, torch.nn as nn, timm
import torchvision.ops as tvops
class SafetyRopeModel(nn.Module):
    """DINOv3 ViT-S/16 backbone + RoIAlign + 2-class MLP head.

    The full 1280x720 frame goes through the backbone exactly once; all N
    person boxes then share that feature map via RoIAlign, so inference cost
    is near-constant in the number of persons.
    """

    def __init__(self, backbone_name="vit_small_patch16_dinov3",
                 img_w=1280, img_h=720, patch=16, n_special=5):
        super().__init__()
        # Headless backbone: no classifier, no pooling -- raw tokens out.
        self.backbone = timm.create_model(backbone_name, pretrained=False,
                                          num_classes=0, global_pool="")
        self.feat_ch = self.backbone.embed_dim  # 384 for ViT-S
        self.grid_h, self.grid_w = img_h // patch, img_w // patch  # 45, 80
        self.n_special = n_special  # DINOv3 = 1 CLS + 4 register tokens
        # spatial_scale maps ROI pixel coordinates onto the patch-token grid.
        self.roi_align = tvops.RoIAlign(output_size=(7, 7),
                                        spatial_scale=1.0/patch, sampling_ratio=2)
        self.head = nn.Sequential(
            nn.Conv2d(self.feat_ch, 256, 3, padding=1), nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(),
            nn.Dropout(0.3), nn.Linear(256, 2),
        )

    def forward(self, image, rois):
        """image: [1, 3, 720, 1280] normalized; rois: [N, 5] rows of
        (batch_idx, x1, y1, x2, y2) in resized-image coordinates.
        Returns [N, 2] logits."""
        tokens = self.backbone.forward_features(image)
        n_patches = self.grid_h * self.grid_w
        # Strip special tokens so only the H*W patch tokens remain.
        if tokens.shape[1] == n_patches + self.n_special:
            tokens = tokens[:, self.n_special:]  # DINOv3: CLS + 4 REG
        elif tokens.shape[1] == n_patches + 1:
            tokens = tokens[:, 1:]               # CLS-only fallback
        batch, _, dim = tokens.shape
        # [B, H*W, D] -> [B, D, H, W] feature map for RoIAlign.
        fmap = tokens.transpose(1, 2).reshape(batch, dim, self.grid_h, self.grid_w)
        return self.head(self.roi_align(fmap, rois))
import cv2, numpy as np, torch
from ultralytics import YOLO
# Pick the best available accelerator: CUDA > Apple MPS > CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
# ImageNet normalization stats (matches timm ViT pretraining).
MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)
# 1. Load checkpoint (fp16 is fine too; weights are cast to fp32 below).
# NOTE(review): weights_only=False unpickles arbitrary objects -- only load
# checkpoints from the trusted R2 bucket, never from untrusted sources.
ck = torch.load("best_fp16.pt", map_location=DEVICE, weights_only=False)
IMG_W, IMG_H = ck["img_size"] # 1280, 720
EXPAND_X, EXPAND_YT, EXPAND_YB = ck["expand_x"], ck["expand_y_top"], ck["expand_y_bot"] # 1.0, 0.2, 1.5
THR = float(ck["thr"]) # 0.432 (v6 default, tunable)
LABELS = ck.get("labels", ["wrong", "correct"])
model = SafetyRopeModel(ck["backbone_name"], IMG_W, IMG_H).to(DEVICE).eval()
# fp16 -> fp32 for forward stability
model.load_state_dict({k: v.float() if v.dtype == torch.float16 else v
                       for k, v in ck["model_state"].items()})
yolo = YOLO("person_yolo11n_v20260501.pt") # download person_yolo11n_v20260501/best.pt from R2 and rename
@torch.no_grad()
def infer(frame_bgr, conf=0.35):
    """Detect persons in a BGR frame and classify each one's safety-rope state.

    Returns a list of dicts, one per detected person:
    {"bbox": [x1, y1, x2, y2], "prob": float, "pred": "correct"|"wrong"},
    with bbox in the original frame's pixel coordinates.
    """
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    src_h, src_w = frame_rgb.shape[:2]
    # Stage 1: YOLO person detection.
    # NOTE(review): ultralytics treats a raw numpy array as BGR, so passing RGB
    # here swaps channels for the detector -- confirm this matches training.
    det = yolo(frame_rgb, verbose=False, imgsz=640, conf=conf)[0]
    if det.boxes is None or len(det.boxes) == 0:
        return []
    persons = det.boxes.xyxy.cpu().numpy()
    # Stage 2: resize the whole frame and normalize for the ViT.
    resized = cv2.resize(frame_rgb, (IMG_W, IMG_H))
    normed = (resized.astype(np.float32) / 255.0 - MEAN) / STD
    inp = torch.from_numpy(normed.transpose(2, 0, 1)).unsqueeze(0).float().to(DEVICE)
    # Stage 3: expand every bbox (X / Y_top / Y_bot), clamp to the frame,
    # then scale into 1280x720 coordinates. Leading 0.0 = batch index.
    scale_x, scale_y = IMG_W / src_w, IMG_H / src_h
    boxes = []
    for bx1, by1, bx2, by2 in persons:
        box_w, box_h = bx2 - bx1, by2 - by1
        left = max(0, bx1 - box_w * EXPAND_X)
        top = max(0, by1 - box_h * EXPAND_YT)
        right = min(src_w, bx2 + box_w * EXPAND_X)
        bottom = min(src_h, by2 + box_h * EXPAND_YB)
        boxes.append([0.0, left * scale_x, top * scale_y,
                      right * scale_x, bottom * scale_y])
    roi_tensor = torch.tensor(boxes, dtype=torch.float32).to(DEVICE)
    # Stage 4: one ViT forward pass covers all N ROIs.
    logits = model(inp, roi_tensor)
    probs = torch.softmax(logits, dim=-1)[:, 1].float().cpu().numpy()  # class 1 = correct
    # Stage 5: assemble per-person results.
    return [
        {
            "bbox": [float(bx1), float(by1), float(bx2), float(by2)],
            "prob": float(p),
            "pred": LABELS[1] if p >= THR else LABELS[0],  # "correct" or "wrong"
        }
        for (bx1, by1, bx2, by2), p in zip(persons, probs)
    ]
# === Usage ===
# cv2.imread returns a BGR array (or None if the file is missing).
frame = cv2.imread("sample.jpg")
results = infer(frame)
for r in results:
    print(f" bbox={r['bbox']} prob={r['prob']:.3f} → {r['pred']}")
thr=0.432 是 val set 上 best F1 點。場域要再降誤報,調到 0.5~0.6(會犧牲 recall)。要再敏感調到 0.3。Raw artifacts: 5090-2:~/runs_new/safety_rope_v20260503_p10_dinov3_small_re_v6_camaug/
對照 audit (v4 e2e): safety_rope_v4_audit_e2e.html(顯示 YOLO 假框 338 / 漏框 3314 等場域真實狀況)
R2 bucket: rai-models (public read) · 帳號 rai.mobile.studio@gmail.com