Hello @afa1414 ,
Have you downgraded the PyTorch version?
@Louis-Cheng-Liu
Thank you for the good feedback. After double-checking, I found that I was using PyTorch 2.4.1. I downloaded version 1.10.1 and attempted the conversion again, but the output of the ONNX model still looks the same. Additionally, when converting with aml_npu_sdk, I encountered the following error: ValueError: operands could not be broadcast together with shapes (1,0,160,160) (1,16,160,160).
[68541] Failed to execute script pegasus
Traceback (most recent call last):
File "pegasus.py", line 131, in <module>
File "pegasus.py", line 112, in main
File "acuitylib/app/importer/commands.py", line 245, in execute
File "acuitylib/vsi_nn.py", line 171, in load_onnx
File "acuitylib/app/importer/import_onnx.py", line 123, in run
File "acuitylib/converter/onnx/convert_onnx.py", line 61, in __init__
File "acuitylib/converter/onnx/convert_onnx.py", line 761, in _shape_inference
File "acuitylib/onnx_ir/onnx_numpy_backend/shape_inference.py", line 65, in infer_shape
File "acuitylib/onnx_ir/onnx_numpy_backend/smart_graph_engine.py", line 70, in smart_onnx_scanner
File "acuitylib/onnx_ir/onnx_numpy_backend/smart_node.py", line 48, in calc_and_assign_smart_info
File "acuitylib/onnx_ir/onnx_numpy_backend/smart_toolkit.py", line 636, in multi_direction_broadcast_shape
ValueError: operands could not be broadcast together with shapes (1,0,160,160) (1,16,160,160)
Lastly, is running a pose model on the Khadas VIM3 board inefficient? I can't find any examples of pose models running on the Khadas VIM3 board anywhere online.
Hello @afa1414 ,
I tried modifying the pose output and then converting to ONNX. Here is my modification for your reference.
class Detect(nn.Module):
"""YOLOv8 Detect head for detection models."""
dynamic = False # force grid reconstruction
export = False # export mode
shape = None
anchors = torch.empty(0) # init
strides = torch.empty(0) # init
def __init__(self, nc=80, ch=()): # detection layer
super().__init__()
self.nc = nc # number of classes
self.nl = len(ch) # number of detection layers
self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
self.no = nc + self.reg_max * 4 # number of outputs per anchor
self.stride = torch.zeros(self.nl) # strides computed during build
c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc) # channels
self.cv2 = nn.ModuleList(
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
def forward(self, x):
"""Concatenates and returns predicted bounding boxes and class probabilities."""
# if torch.onnx.is_in_onnx_export():
return self.forward_export(x)
shape = x[0].shape # BCHW
for i in range(self.nl):
x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
if self.training:
# print("######################")
# print(self.dfl)
return x
elif self.dynamic or self.shape != shape:
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
self.shape = shape
x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
box = x_cat[:, :self.reg_max * 4]
cls = x_cat[:, self.reg_max * 4:]
else:
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
y = torch.cat((dbox, cls.sigmoid()), 1)
return y if self.export else (y, x)
def forward_export(self, x):
results = []
for i in range(self.nl):
dfl = self.cv2[i](x[i]).contiguous()
cls = self.cv3[i](x[i]).contiguous()
# results.append(torch.cat([cls, dfl], 1).permute(0, 2, 3, 1))
results.append(torch.cat([cls, dfl], 1))
return tuple(results)
class Pose(Detect):
"""YOLOv8 Pose head for keypoints models."""
def __init__(self, nc=80, kpt_shape=(17, 3), ch=()):
"""Initialize YOLO network with default parameters and Convolutional Layers."""
super().__init__(nc, ch)
self.kpt_shape = kpt_shape # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
self.nk = kpt_shape[0] * kpt_shape[1] # number of keypoints total
self.detect = Detect.forward
c4 = max(ch[0] // 4, self.nk)
self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch)
def forward(self, x):
"""Perform forward pass through YOLO model and return predictions."""
bs = x[0].shape[0] # batch size
# kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w)
kpt = [self.cv4[i](x[i]) for i in range(self.nl)]
x = self.detect(self, x)
result = []
for i in range(self.nl):
result.append(torch.cat([x[i], kpt[i]], 1))
return result
if self.training:
return x, kpt
pred_kpt = self.kpts_decode(kpt)
return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt))
Then I converted the nb model successfully.
You can refer to the inference time of the YOLOv8n demo; the inference time of the pose model is not much different from the detection model.
@Louis-Cheng-Liu
Thank you, but do you know where the 116 comes from?
Hello @afa1414 ,
I have explained this before.
116 = 16×4 + 1 + 17×3
16×4 is the box information. Postprocessing decodes it into the four sides of the box (64→4).
1 is the box confidence. Postprocessing normalizes it between 0 and 1.
17×3 is the body keypoints. Postprocessing maps them to positions on the input image.
@Louis-Cheng-Liu
Thank you for your response. I tried writing the code this way, but it didn't work well. Do I necessarily have to go through a process like the yolov3_post_process function used for the YOLOv3 model's output?
import numpy as np
import cv2 as cv
import time
from ksnn.api import KSNN
from ksnn.types import *
# Constants
GRID0, GRID1, GRID2 = 80, 40, 20 # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17 # Number of keypoints
IMG_SIZE = 640 # Input image size for the model
THRESHOLD = 0.1 # Confidence threshold for keypoints (lowered for testing)
# COCO format keypoint connections
SKELETON = [
[0, 1], [0, 2], [1, 3], [2, 4], # Face
[5, 6], # Shoulders
[5, 7], [7, 9], [6, 8], [8, 10], # Arms
[11, 12], # Hips
[11, 13], [13, 15], [12, 14], [14, 16] # Legs
]
def preprocess_image(image_path):
"""
Preprocess the input image according to the model requirements.
"""
orig_img = cv.imread(image_path)
img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)
img = img / 255.0 # Normalize the image
img = img.transpose(2, 0, 1) # Convert to NCHW format
return orig_img, img
def postprocess_output(outputs, orig_img):
"""
Post-process the model output to extract keypoints and draw the skeleton on the image.
"""
# Reshape each output array to the correct size
input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0)) # 1x116x80x80
input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1)) # 1x116x40x40
input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2)) # 1x116x20x20
# Transform data and add debugging output
input_data = [
np.transpose(input0_data, (2, 3, 0, 1)), # (80, 80, 1, 116)
np.transpose(input1_data, (2, 3, 0, 1)), # (40, 40, 1, 116)
np.transpose(input2_data, (2, 3, 0, 1)) # (20, 20, 1, 116)
]
keypoints = []
# Post-process keypoints and visualize
for i in range(3):
for grid_y in range(input_data[i].shape[0]):
for grid_x in range(input_data[i].shape[1]):
# Extract keypoint information (last 51 elements of 116)
kpts = input_data[i][grid_y, grid_x, 0, 65:116].reshape((-1, NUM_POINTS, 3))
# Debug: Verify keypoint data
for pt in kpts:
for idx, (x, y, conf) in enumerate(pt):
if conf > THRESHOLD:
x = int(x * orig_img.shape[1])
y = int(y * orig_img.shape[0])
if 0 <= x < orig_img.shape[1] and 0 <= y < orig_img.shape[0]:
keypoints.append((x, y, conf))
cv.circle(orig_img, (x, y), 3, (0, 255, 0), -1)
# Draw skeleton
for joint in SKELETON:
pt1 = keypoints[joint[0]]
pt2 = keypoints[joint[1]]
if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD: # Check confidence
cv.line(orig_img, (pt1[0], pt1[1]), (pt2[0], pt2[1]), (0, 255, 0), 2)
return orig_img
def run_model(library_path, model_path, image_path):
"""
Run the model on the VIM3 board and draw the skeleton on the image.
"""
# Initialize KSNN API
model = KSNN('VIM3')
model.nn_init(library=library_path, model=model_path, level=0)
# Preprocess input image
orig_img, img = preprocess_image(image_path)
# Inference
img = [img] # Wrap the image in a list
outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
# Post-process and draw the skeleton
result_img = postprocess_output(outputs, orig_img)
# Save and display the result
cv.imwrite("result_pose.jpg", result_img)
cv.imshow("Pose Estimation", result_img)
cv.waitKey(0)
cv.destroyAllWindows()
if __name__ == '__main__':
# Set paths
library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so" # Path to C static library
model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb" # Path to compiled model file
image_path = "/home/khadas/test/Screenshot.png" # Path to input image
run_model(library_path, model_path, image_path)
Hello @afa1414 ,
First, you forgot to decode the keypoints.
Second, although you only need the keypoint information, you still need to decode the boxes, because NMS is done on the boxes. For box decoding you can refer to this.
There was a mistake in my explanation: the box location and the box confidence are reversed.
116=1+16×4+17×3
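For illustration only, a minimal sketch of slicing one output scale according to this layout might look like the following; it assumes the output has already been reshaped to (116, H, W), and the function name is made up here.

import numpy as np

# Illustrative sketch, assuming 'out' is one scale's output already reshaped
# to (116, H, W); only the channel order described above is used.
def split_channels(out):
    conf = out[0:1, :, :]      # 1 channel  : box confidence (sigmoid applied later)
    box_dfl = out[1:65, :, :]  # 64 channels: 4 sides x 16 DFL bins (decoded later)
    kpts = out[65:116, :, :]   # 51 channels: 17 keypoints x (x, y, visibility)
    return conf, box_dfl, kpts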
@Louis-Cheng-Liu
Thank you for your response. I have made additional changes to the code as you suggested, but the output coordinates are clustered in one place. Could there be an issue with the model or the code?
import numpy as np
import cv2 as cv
import time
from ksnn.api import KSNN
from ksnn.types import *
# Constants
GRID0, GRID1, GRID2 = 80, 40, 20 # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17 # Number of keypoints
IMG_SIZE = 640 # Input image size for the model
THRESHOLD = 0.1 # Confidence threshold for keypoints (lowered for testing)
# COCO format keypoint connections
SKELETON = [
[0, 1], [0, 2], [1, 3], [2, 4], # Face
[5, 6], # Shoulders
[5, 7], [7, 9], [6, 8], [8, 10], # Arms
[11, 12], # Hips
[11, 13], [13, 15], [12, 14], [14, 16] # Legs
]
def preprocess_image(image_path):
orig_img = cv.imread(image_path)
img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)
img = img / 255.0 # Normalize the image
img = img.transpose(2, 0, 1) # Convert to NCHW format
return orig_img, img
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def decode_boxes_and_keypoints(outputs, orig_img):
# Reshape each output array to the correct format
input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0)) # 1x116x80x80
input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1)) # 1x116x40x40
input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2)) # 1x116x20x20
input_data = [
np.transpose(input0_data, (2, 3, 0, 1)), # (80, 80, 1, 116)
np.transpose(input1_data, (2, 3, 0, 1)), # (40, 40, 1, 116)
np.transpose(input2_data, (2, 3, 0, 1)) # (20, 20, 1, 116)
]
boxes = []
keypoints = []
for data in input_data:
for grid_y in range(data.shape[0]):
for grid_x in range(data.shape[1]):
# Decode bounding boxes
box_conf = sigmoid(data[grid_y, grid_x, 0, 0]) # The first element is the bounding box confidence
bbox = data[grid_y, grid_x, 0, 1:65].reshape((16, 4)) # Bounding box coordinates
for (x, y, w, h) in bbox:
x = sigmoid(x) * IMG_SIZE
y = sigmoid(y) * IMG_SIZE
w = np.exp(w) * IMG_SIZE
h = np.exp(h) * IMG_SIZE
boxes.append([x, y, w, h, box_conf])
# Decode keypoints
kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3)) # Keypoint coordinates
for x, y, conf in kpts:
x = sigmoid(x) * orig_img.shape[1] # Adjust to the original image size
y = sigmoid(y) * orig_img.shape[0] # Adjust to the original image size
conf = sigmoid(conf)
keypoints.append((x, y, conf))
return boxes, keypoints
def nms(boxes, threshold):
# Apply Non-Maximum Suppression (NMS) to filter boxes
boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
final_boxes = []
while boxes:
best_box = boxes.pop(0)
final_boxes.append(best_box)
boxes = [box for box in boxes if iou(best_box, box) < threshold]
return final_boxes
def iou(box1, box2):
# Compute Intersection over Union (IoU) between two boxes
x1, y1, w1, h1 = box1[:4]
x2, y2, w2, h2 = box2[:4]
inter_x1 = max(x1, x2)
inter_y1 = max(y1, y2)
inter_x2 = min(x1 + w1, x2 + w2)
inter_y2 = min(y1 + h1, y2 + h2)
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
box1_area = w1 * h1
box2_area = w2 * h2
iou_value = inter_area / (box1_area + box2_area - inter_area)
return iou_value
def postprocess_output(outputs, orig_img):
boxes, keypoints = decode_boxes_and_keypoints(outputs, orig_img) # Pass orig_img as an argument
boxes = nms(boxes, 0.5) # Apply NMS with a threshold
# Draw keypoints
for x, y, conf in keypoints:
if conf > THRESHOLD:
cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)
# Draw skeleton
for joint in SKELETON:
pt1 = keypoints[joint[0]]
pt2 = keypoints[joint[1]]
if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:
cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)
return orig_img
def run_model(library_path, model_path, image_path):
model = KSNN('VIM3')
model.nn_init(library=library_path, model=model_path, level=0)
orig_img, img = preprocess_image(image_path)
img = [img]
outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
result_img = postprocess_output(outputs, orig_img)
cv.imwrite("result_pose.jpg", result_img)
cv.imshow("Pose Estimation", result_img)
cv.waitKey(0)
cv.destroyAllWindows()
if __name__ == '__main__':
library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"
model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"
image_path = "/home/khadas/test/Screenshot.png"
run_model(library_path, model_path, image_path)
Hello @afa1414 ,
Both decodings are wrong.
For box decoding, if you cannot understand the official code, you can refer to the KSNN YOLOv8n demo code. They are the same.
For keypoint decoding, only conf needs a sigmoid. x and y need the following operation:
x = (2 * x + grid_x) * stride_x / input.shape[1] * orig_img.shape[1]
y = (2 * y + grid_y) * stride_y / input.shape[0] * orig_img.shape[0]
grid_x and grid_y are the location in the feature map. The stride is the input size divided by the feature map size.
stride_x = [640 / 80, 640 / 40, 640 / 20]
stride_y = [640 / 80, 640 / 40, 640 / 20]
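As an illustration of the formula above, a sketch of the keypoint decode for one grid cell could look like this; the function and argument names are made up, and raw_kpts is assumed to be the (17, 3) keypoint slice of that cell.

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Illustrative sketch of the keypoint decode above for a single grid cell.
# raw_kpts: (17, 3) slice for that cell; grid_x/grid_y: cell indices;
# stride: 8, 16 or 32; input_size: 640; orig_w/orig_h: original image size.
def decode_kpts(raw_kpts, grid_x, grid_y, stride, input_size, orig_w, orig_h):
    decoded = []
    for kx, ky, kconf in raw_kpts:
        x = (2 * kx + grid_x) * stride / input_size * orig_w
        y = (2 * ky + grid_y) * stride / input_size * orig_h
        decoded.append((x, y, sigmoid(kconf)))  # only conf goes through sigmoid
    return decoded

Note that grid_x and grid_y are added before multiplying by the stride, and only the confidence goes through a sigmoid.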
import numpy as np
import cv2 as cv
import time
from ksnn.api import KSNN
from ksnn.types import *
# Constants
GRID0, GRID1, GRID2 = 80, 40, 20 # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17 # Number of keypoints
IMG_SIZE = 640 # Input image size for the model
THRESHOLD = 0.1 # Confidence threshold for keypoints (lowered for testing)
# COCO format keypoint connections
SKELETON = [
[0, 1], [0, 2], [1, 3], [2, 4], # Face
[5, 6], # Shoulders
[5, 7], [7, 9], [6, 8], [8, 10], # Arms
[11, 12], # Hips
[11, 13], [13, 15], [12, 14], [14, 16] # Legs
]
def preprocess_image(image_path):
print("Preprocessing image...")
orig_img = cv.imread(image_path)
if orig_img is None:
print("Error: Failed to load image.")
raise ValueError("Failed to load image. Please check the file path.")
print("Original image loaded successfully.")
# Resize image to 640x640 and normalize
img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32) # Use float32 format
print("Image resized to 640x640 and converted to float32.")
img = img / 255.0 # Normalize the image to range [0, 1]
print("Image normalized.")
# Convert to RGB format for KSNN
img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
print("Image converted to RGB format.")
# Convert to NHWC format and ensure memory is contiguous
img = np.ascontiguousarray(img)
print("Image memory is contiguous.")
# Create a list for KSNN input
img_list = [img]
print("Image preprocessing completed.")
return orig_img, img_list
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def decode_boxes_and_keypoints(outputs, orig_img):
# Reshape each output array to the correct format
input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))
input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))
input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))
input_data = [
np.transpose(input0_data, (2, 3, 0, 1)),
np.transpose(input1_data, (2, 3, 0, 1)),
np.transpose(input2_data, (2, 3, 0, 1))
]
boxes = []
keypoints = []
strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2]
for idx, data in enumerate(input_data):
stride_x, stride_y = strides[idx], strides[idx]
for grid_y in range(data.shape[0]):
for grid_x in range(data.shape[1]):
# Decode bounding boxes
box_conf = sigmoid(data[grid_y, grid_x, 0, 0])
bbox = data[grid_y, grid_x, 0, 1:65].reshape((16, 4))
for (x, y, w, h) in bbox:
x = (2 * sigmoid(x) + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]
y = (2 * sigmoid(y) + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]
w = np.exp(w) * stride_x
h = np.exp(h) * stride_y
boxes.append([x, y, w, h, box_conf])
# Decode keypoints
kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3))
for kp_x, kp_y, conf in kpts:
kp_x = (2 * kp_x + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]
kp_y = (2 * kp_y + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]
conf = sigmoid(conf)
keypoints.append((kp_x, kp_y, conf))
return boxes, keypoints
def nms(boxes, threshold):
# Apply Non-Maximum Suppression (NMS) to filter boxes
boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
final_boxes = []
while boxes:
best_box = boxes.pop(0)
final_boxes.append(best_box)
boxes = [box for box in boxes if iou(best_box, box) < threshold]
return final_boxes
def iou(box1, box2):
# Compute Intersection over Union (IoU) between two boxes
x1, y1, w1, h1 = box1[:4]
x2, y2, w2, h2 = box2[:4]
inter_x1 = max(x1, x2)
inter_y1 = max(y1, y2)
inter_x2 = min(x1 + w1, x2 + w2)
inter_y2 = min(y1 + h1, y2 + h2)
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
box1_area = w1 * h1
box2_area = w2 * h2
iou_value = inter_area / (box1_area + box2_area - inter_area)
return iou_value
def postprocess_output(outputs, orig_img):
boxes, keypoints = decode_boxes_and_keypoints(outputs, orig_img) # Pass orig_img as an argument
boxes = nms(boxes, 0.5) # Apply NMS with a threshold
# Draw keypoints
for x, y, conf in keypoints:
if conf > THRESHOLD:
cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)
# Draw skeleton
for joint in SKELETON:
pt1 = keypoints[joint[0]]
pt2 = keypoints[joint[1]]
if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:
cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)
return orig_img
def run_model(library_path, model_path, image_path):
print("Initializing KSNN model...")
model = KSNN('VIM3')
model.nn_init(library=library_path, model=model_path, level=0)
print("Model initialized successfully.")
# Add a delay
time.sleep(1)
try:
orig_img, img_list = preprocess_image(image_path)
except ValueError as e:
print(str(e))
return
print("Image preprocessed successfully.")
# Debugging message to check the format and content of the input image
print(f"Input image shape: {img_list[0].shape}")
print(f"Input image dtype: {img_list[0].dtype}")
print(f"Input image data (sample):\n{img_list[0][0:5, 0:5, :]}") # Print a sample of the data
try:
print("Setting inputs...")
model.nn_set_inputs(img_list, platform='ONNX', reorder='2 1 0', tensor=1)
print("Inputs set successfully.")
print("Running inference...")
outputs = model.nn_inference(output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
print("Inference completed.")
except Exception as e:
print(f"Inference failed: {e}")
return
try:
result_img = postprocess_output(outputs, orig_img)
print("Post-processing completed.")
except Exception as e:
print(f"Post-processing failed: {e}")
return
try:
print("Saving and displaying results...")
cv.imwrite("result_pose.jpg", result_img)
cv.imshow("Pose Estimation", result_img)
cv.waitKey(0)
cv.destroyAllWindows()
print("Results saved and displayed successfully.")
except Exception as e:
print(f"Failed to save or display results: {e}")
if __name__ == '__main__':
library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"
model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"
image_path = "/home/khadas/test/Screenshot.png"
run_model(library_path, model_path, image_path)
@Louis-Cheng-Liu
Sorry to ask, but could you check it just one more time?
Hello @afa1414 ,
You have misunderstood the box decoding.
For box decoding, the first step is a softmax.
def softmax(x, axis=0):
x = np.exp(x)
return x / x.sum(axis=axis, keepdims=True)
box_0 = softmax(input[..., NUM_CLS: NUM_CLS + 16], -1)
box_1 = softmax(input[..., NUM_CLS + 16:NUM_CLS + 32], -1)
box_2 = softmax(input[..., NUM_CLS + 32:NUM_CLS + 48], -1)
box_3 = softmax(input[..., NUM_CLS + 48:NUM_CLS + 64], -1)
Second, multiply by a constant matrix.
constant_matrix = np.array([[0, 1, 2, 3,
                             4, 5, 6, 7,
                             8, 9, 10, 11,
                             12, 13, 14, 15]]).T
result[i, j, :, 0] = np.dot(box_0[i, j], constant_matrix)
result[i, j, :, 1] = np.dot(box_1[i, j], constant_matrix)
result[i, j, :, 2] = np.dot(box_2[i, j], constant_matrix)
result[i, j, :, 3] = np.dot(box_3[i, j], constant_matrix)
Finally, a simple calculation.
result[..., 0] = (0.5 - result[..., 0] + grid_w) / stride_x # xmin
result[..., 1] = (0.5 - result[..., 1] + grid_h) / stride_y # ymin
result[..., 2] = (0.5 + result[..., 2] + grid_w) / stride_x # xmax
result[..., 3] = (0.5 + result[..., 3] + grid_h) / stride_y # ymax
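For illustration, the first two steps can be sketched for one grid cell as below; raw_box is assumed to be that cell's 64 box values (channels NUM_CLS to NUM_CLS + 64), and the helper names are made up.

import numpy as np

def softmax(x, axis=-1):
    x = np.exp(x)
    return x / x.sum(axis=axis, keepdims=True)

# Illustrative sketch of the softmax + constant-matrix steps for one grid cell.
# raw_box is assumed to be that cell's 64 box values (4 sides x 16 DFL bins).
PROJ = np.arange(16, dtype=np.float32)  # the constant matrix [0, 1, ..., 15]

def dfl_distances(raw_box):
    bins = softmax(raw_box.reshape(4, 16), -1)  # one 16-bin distribution per side
    return bins @ PROJ                          # expected distance per side, shape (4,)

The four distances then go through the three formulas above to get xmin, ymin, xmax and ymax for that cell.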
import numpy as np
import cv2 as cv
from ksnn.api import KSNN
from ksnn.types import output_format
# Constants
GRID0, GRID1, GRID2 = 80, 40, 20 # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17 # Number of keypoints
IMG_SIZE = 640 # Input image size for the model
THRESHOLD = 0.7 # Confidence threshold for keypoints
NUM_CLS = 80 # Set to match the number of classes in the model
# COCO format keypoint connections
SKELETON = [
[0, 1], [0, 2], [1, 3], [2, 4], # Face
[5, 6], # Shoulders
[5, 7], [7, 9], [6, 8], [8, 10], # Arms
[11, 12], # Hips
[11, 13], [13, 15], [12, 14], [14, 16] # Legs
]
def preprocess_image(image_path):
orig_img = cv.imread(image_path)
img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)
img = img / 255.0 # Normalize the image
img = img.transpose(2, 0, 1) # Convert to NCHW format
print(f'Original image shape: {orig_img.shape}')
print(f'Processed image shape: {img.shape}, dtype: {img.dtype}')
return orig_img, img
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def softmax(x, axis=-1):
x = np.exp(x)
return x / x.sum(axis=axis, keepdims=True)
def decode_boxes_and_keypoints(outputs, orig_img):
# Reshape each output array into the correct format
input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0)) # 1x116x80x80
input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1)) # 1x116x40x40
input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2)) # 1x116x20x20
input_data = [
np.transpose(input0_data, (2, 3, 0, 1)), # (80, 80, 1, 116)
np.transpose(input1_data, (2, 3, 0, 1)), # (40, 40, 1, 116)
np.transpose(input2_data, (2, 3, 0, 1)) # (20, 20, 1, 116)
]
strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2] # Stride for each grid size
boxes = []
keypoints = []
for idx, data in enumerate(input_data):
stride_x = stride_y = strides[idx]
for grid_y in range(data.shape[0]):
for grid_x in range(data.shape[1]):
# Bounding box decoding - softmax calculation
box_conf = sigmoid(data[grid_y, grid_x, 0, 0]) # The first element is the bounding box confidence
if box_conf < THRESHOLD: # Confidence filter
continue
# Compute the bounding box predictions using softmax
box_0 = softmax(data[grid_y, grid_x, 0, NUM_CLS: NUM_CLS + 16], -1)
box_1 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 16:NUM_CLS + 32], -1)
box_2 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 32:NUM_CLS + 48], -1)
box_3 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 48:NUM_CLS + 64], -1)
# Multiply by the constant matrix
constant_matrix = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]).T
result_xmin = np.dot(box_0, constant_matrix)
result_ymin = np.dot(box_1, constant_matrix)
result_xmax = np.dot(box_2, constant_matrix)
result_ymax = np.dot(box_3, constant_matrix)
# Compute the bounding box coordinates
xmin = (0.5 - result_xmin + grid_x) / stride_x
ymin = (0.5 - result_ymin + grid_y) / stride_y
xmax = (0.5 + result_xmax + grid_x) / stride_x
ymax = (0.5 + result_ymax + grid_y) / stride_y
boxes.append([xmin, ymin, xmax, ymax, box_conf])
# Decode keypoints
kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3))
for kp_x, kp_y, conf in kpts:
kp_x = (2 * kp_x + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]
kp_y = (2 * kp_y + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]
conf = sigmoid(conf)
keypoints.append((kp_x, kp_y, conf))
return boxes, keypoints
def nms(boxes, threshold):
# Apply Non-Maximum Suppression (NMS) to filter boxes
boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
final_boxes = []
while boxes:
best_box = boxes.pop(0)
final_boxes.append(best_box)
boxes = [box for box in boxes if iou(best_box, box) < threshold]
return final_boxes
def iou(box1, box2):
# Compute Intersection over Union (IoU) between two boxes
x1, y1, w1, h1 = box1[:4]
x2, y2, w2, h2 = box2[:4]
inter_x1 = max(x1, x2)
inter_y1 = max(y1, y2)
inter_x2 = min(x1 + w1, x2 + w2)
inter_y2 = min(y1 + h1, y2 + h2)
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
box1_area = w1 * h1
box2_area = w2 * h2
iou_value = inter_area / (box1_area + box2_area - inter_area)
return iou_value
def postprocess_output(outputs, orig_img):
boxes, keypoints = decode_boxes_and_keypoints(outputs, orig_img) # Pass orig_img as an argument
boxes = nms(boxes, 0.5) # Apply NMS with a threshold
# Draw keypoints
for x, y, conf in keypoints:
if conf > THRESHOLD:
cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)
# Draw skeleton
for joint in SKELETON:
if joint[0] < len(keypoints) and joint[1] < len(keypoints):
pt1 = keypoints[joint[0]]
pt2 = keypoints[joint[1]]
if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:
cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)
return orig_img
def run_model(library_path, model_path, image_path):
model = KSNN('VIM3')
model.nn_init(library=library_path, model=model_path, level=1)
orig_img, img = preprocess_image(image_path)
# Expand the image array to match the batch size
img = np.expand_dims(img, axis=0) # (1, 3, 640, 640)
# Call nn_inference
outputs = model.nn_inference(img, platform='ONNX', reorder='0 1 2', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
result_img = postprocess_output(outputs, orig_img)
cv.imwrite("result_pose.jpg", result_img)
cv.imshow("Pose Estimation", result_img)
cv.waitKey(0)
cv.destroyAllWindows()
if __name__ == '__main__':
library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"
model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"
image_path = "/home/khadas/test/Screenshot.png"
run_model(library_path, model_path, image_path)
@Louis-Cheng-Liu
Will it work this way? But this code results in the error: ‘resize input pictures error !!! Set nn inputs error !!!’. Also, is there any issue with the keypoint extraction process?
Hello @afa1414 ,
nn_inference needs list data, and KSNN will expand the image dim itself.
-img = np.expand_dims(img, axis=0) # (1, 3, 640, 640)
+img = [img] # (1, 3, 640, 640)
There are three errors.
First, you forgot to modify NUM_CLS.
-NUM_CLS = 80
+NUM_CLS = 1
Second, the ONNX model needs the '2 1 0' reorder.
-outputs = model.nn_inference(img, platform='ONNX', reorder='0 1 2', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
+outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
Lastly, the keypoint information and the box information are associated: each box has its own keypoints. If a box is discarded by NMS, its keypoints also need to be discarded, so you should not separate them.
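As a rough sketch of that idea (the names are illustrative, and iou is assumed to take two boxes in the same [xmin, ymin, xmax, ymax, conf] format), NMS can be run over (box, keypoints) pairs so that a suppressed box drops its keypoints with it:

# Illustrative sketch: NMS over (box, keypoints) pairs so that a suppressed box
# also drops its keypoints. 'iou' is assumed to compare two
# [xmin, ymin, xmax, ymax, conf] boxes.
def nms_pairs(pairs, iou_threshold, iou):
    pairs = sorted(pairs, key=lambda p: p[0][4], reverse=True)  # sort by box confidence
    kept = []
    while pairs:
        best = pairs.pop(0)
        kept.append(best)
        pairs = [p for p in pairs if iou(best[0], p[0]) < iou_threshold]
    return kept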
@Louis-Cheng-Liu
I made the modifications, but the same error still occurs. In other posts it was solved by updating KSNN, but my Khadas VIM3 board has kernel version 4.9. Should I use ksnn 1.4?
And here is the code:
import numpy as np
import cv2 as cv
from ksnn.api import KSNN
from ksnn.types import output_format
# Constants
GRID0, GRID1, GRID2 = 80, 40, 20 # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17 # Number of keypoints
IMG_SIZE = 640 # Input image size for the model
THRESHOLD = 0.5 # Confidence threshold for keypoints
NUM_CLS = 1 # Adjust the number of classes for the model
# COCO format keypoint connections (Skeleton structure)
SKELETON = [
[0, 1], [0, 2], [1, 3], [2, 4], # Face connections
[5, 6], # Shoulders connection
[5, 7], [7, 9], [6, 8], [8, 10], # Arm connections
[11, 12], # Hip connection
[11, 13], [13, 15], [12, 14], [14, 16] # Leg connections
]
# Function to preprocess the input image
def preprocess_image(image_path):
orig_img = cv.imread(image_path) # Read the original image
img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32) # Resize and change data type
img = img / 255.0 # Normalize the image (scale between 0 and 1)
img = img.transpose(2, 0, 1) # Convert the image to NCHW format (Channel-first)
print(f'Original image shape: {orig_img.shape}')
print(f'Processed image shape: {img.shape}, dtype: {img.dtype}')
return orig_img, img # Return the original and preprocessed images
# Sigmoid function (to compress values between 0 and 1)
def sigmoid(x):
return 1 / (1 + np.exp(-x))
# Softmax function (for calculating class probabilities)
def softmax(x, axis=-1):
x = np.exp(x)
return x / x.sum(axis=axis, keepdims=True)
# Function to decode the boxes and keypoints from the model output
def decode_boxes_and_keypoints(outputs, orig_img):
# Reshape each output array into the correct format
input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))
input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))
input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))
input_data = [
np.transpose(input0_data, (2, 3, 0, 1)), # First output
np.transpose(input1_data, (2, 3, 0, 1)), # Second output
np.transpose(input2_data, (2, 3, 0, 1)) # Third output
]
strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2] # Strides corresponding to each grid size
box_keypoints_pairs = [] # List to store box and keypoint pairs
# Iterate over each grid cell to decode boxes and keypoints
for idx, data in enumerate(input_data):
stride_x = stride_y = strides[idx]
for grid_y in range(data.shape[0]):
for grid_x in range(data.shape[1]):
box_conf = sigmoid(data[grid_y, grid_x, 0, 0]) # Confidence score for the box
if box_conf < THRESHOLD:
continue
# Calculate the bounding box coordinates
box_0 = softmax(data[grid_y, grid_x, 0, NUM_CLS: NUM_CLS + 16], -1)
box_1 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 16:NUM_CLS + 32], -1)
box_2 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 32:NUM_CLS + 48], -1)
box_3 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 48:NUM_CLS + 64], -1)
constant_matrix = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]).T
result_xmin = np.dot(box_0, constant_matrix)
result_ymin = np.dot(box_1, constant_matrix)
result_xmax = np.dot(box_2, constant_matrix)
result_ymax = np.dot(box_3, constant_matrix)
xmin = (0.5 - result_xmin + grid_x) / stride_x
ymin = (0.5 - result_ymin + grid_y) / stride_y
xmax = (0.5 + result_xmax + grid_x) / stride_x
ymax = (0.5 + result_ymax + grid_y) / stride_y
box = [xmin, ymin, xmax, ymax, box_conf] # Save the bounding box coordinates
# Decode the keypoints
kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3)) # Extract keypoint data
keypoints = []
for kp_x, kp_y, conf in kpts:
kp_x = (2 * kp_x + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1] # Transform x coordinate
kp_y = (2 * kp_y + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0] # Transform y coordinate
conf = sigmoid(conf) # Apply confidence value
keypoints.append((kp_x, kp_y, conf)) # Save keypoint
# Save the box and keypoint pair
box_keypoints_pairs.append((box, keypoints))
return box_keypoints_pairs # Return box and keypoint pairs
# Non-Maximum Suppression (NMS) function to filter out overlapping boxes
def nms(box_keypoints_pairs, threshold):
box_keypoints_pairs = sorted(box_keypoints_pairs, key=lambda x: x[0][4], reverse=True) # Sort by box confidence score
final_pairs = []
while box_keypoints_pairs:
best_pair = box_keypoints_pairs.pop(0) # Select the box with the highest confidence
final_pairs.append(best_pair)
box_keypoints_pairs = [pair for pair in box_keypoints_pairs if iou(best_pair[0], pair[0]) < threshold] # Remove overlapping boxes
return final_pairs # Return the final selected boxes and keypoints
# Function to compute the Intersection over Union (IoU) between two boxes
def iou(box1, box2):
x1, y1, w1, h1 = box1[:4]
x2, y2, w2, h2 = box2[:4]
inter_x1 = max(x1, x2) # x coordinate of the top-left corner of the intersection
inter_y1 = max(y1, y2) # y coordinate of the top-left corner of the intersection
inter_x2 = min(x1 + w1, x2 + w2) # x coordinate of the bottom-right corner of the intersection
inter_y2 = min(y1 + h1, y2 + h2) # y coordinate of the bottom-right corner of the intersection
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) # Calculate the intersection area
box1_area = w1 * h1 # Area of the first box
box2_area = w2 * h2 # Area of the second box
iou_value = inter_area / (box1_area + box2_area - inter_area) # Calculate IoU value
return iou_value # Return the IoU value
# Post-processing function to draw boxes, keypoints, and skeleton
def postprocess_output(outputs, orig_img):
box_keypoints_pairs = decode_boxes_and_keypoints(outputs, orig_img)
final_pairs = nms(box_keypoints_pairs, 0.5) # Apply NMS
# Draw keypoints and skeleton
for box, keypoints in final_pairs:
# Draw keypoints
for x, y, conf in keypoints:
if conf > THRESHOLD: # Only draw keypoints with confidence greater than the threshold
cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)
# Draw skeleton
for joint in SKELETON:
pt1 = keypoints[joint[0]]
pt2 = keypoints[joint[1]]
if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD: # Only connect points with high confidence
cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)
return orig_img # Return the processed image
# Function to run the model
def run_model(library_path, model_path, image_path):
model = KSNN('VIM3')
model.nn_init(library=library_path, model=model_path, level=1) # Initialize the model
orig_img, img = preprocess_image(image_path) # Preprocess the image
img = [img] # Prepare input image array
# Run inference with nn_inference
outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
result_img = postprocess_output(outputs, orig_img) # Post-process the inference results
output_path = "/home/khadas/test/result_pose.jpg" # Path to save the result image
cv.imwrite(output_path, result_img) # Save the result image
print(f"Pose estimation result saved to {output_path}")
# Main function
if __name__ == '__main__':
library_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/libnn_yolov8n_pose_onnx.so" # Path to the library
model_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/yolov8n_pose_onnx.nb" # Path to the model
image_path = "/home/khadas/test/Screenshot.png" # Path to the input image
run_model(library_path, model_path, image_path) # Run the model
I made the changes as you suggested and tried running it on ksnn-1.4, but the skeleton was not drawn.
When I look at the confidence scores of the predicted results, they are too low. Could there be a problem with the model? The values generally range from around 0.4 to the upper 0.5 range.
Right now I'm also experimenting with code that outputs only the bounding box for the person, but there is an issue with that output and the image as well.
import numpy as np
import cv2 as cv
from ksnn.api import KSNN
from ksnn.types import output_format
# Define constants
GRID0, GRID1, GRID2 = 80, 40, 20 # Grid sizes for each output
LISTSIZE = 116
IMG_SIZE = 640 # Input image size for the model
THRESHOLD = 0.5 # Confidence threshold for bounding boxes
NUM_CLS = 1 # Number of classes in the model (person)
# Sigmoid function (compresses values between 0 and 1)
def sigmoid(x):
return 1 / (1 + np.exp(-x))
# Softmax function (used to calculate class probabilities)
def softmax(x, axis=-1):
x = np.exp(x)
return x / x.sum(axis=axis, keepdims=True)
# Function to decode boxes from the model output
def decode_boxes(outputs):
# Reshape each output array into the correct format
input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))
input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))
input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))
input_data = [
np.transpose(input0_data, (2, 3, 0, 1)), # First output
np.transpose(input1_data, (2, 3, 0, 1)), # Second output
np.transpose(input2_data, (2, 3, 0, 1)) # Third output
]
strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2] # Strides for each grid size
boxes = [] # List to store boxes
# Iterate over each grid cell and decode the boxes
for idx, data in enumerate(input_data):
stride_x = stride_y = strides[idx]
for grid_y in range(data.shape[0]):
for grid_x in range(data.shape[1]):
box_conf = sigmoid(data[grid_y, grid_x, 0, 0]) # Box confidence score
if box_conf < THRESHOLD:
continue
# Calculate bounding box coordinates
box_0 = softmax(data[grid_y, grid_x, 0, NUM_CLS: NUM_CLS + 16], -1)
box_1 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 16:NUM_CLS + 32], -1)
box_2 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 32:NUM_CLS + 48], -1)
box_3 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 48:NUM_CLS + 64], -1)
constant_matrix = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]).T
result_xmin = np.dot(box_0, constant_matrix)
result_ymin = np.dot(box_1, constant_matrix)
result_xmax = np.dot(box_2, constant_matrix)
result_ymax = np.dot(box_3, constant_matrix)
# Convert numpy arrays to scalar values using .item()
xmin = (0.5 - result_xmin.item() + grid_x) / stride_x
ymin = (0.5 - result_ymin.item() + grid_y) / stride_y
xmax = (0.5 + result_xmax.item() + grid_x) / stride_x
ymax = (0.5 + result_ymax.item() + grid_y) / stride_y
# Check if the values are in the correct format before adding them to the list
if isinstance(xmin, (int, float)) and isinstance(ymin, (int, float)) and isinstance(xmax, (int, float)) and isinstance(ymax, (int, float)) and isinstance(box_conf, (int, float)):
box = [xmin, ymin, xmax, ymax, box_conf] # Store bounding box coordinates
boxes.append(box)
else:
print(f"Invalid box detected: {xmin}, {ymin}, {xmax}, {ymax}, {box_conf}")
return boxes # Return the boxes
# Non-Maximum Suppression (NMS) function to filter overlapping boxes
def nms(boxes, threshold):
if len(boxes) == 0:
return []
# Convert boxes list to a numpy array
boxes = np.array(boxes)
# Split the box coordinates
x1 = boxes[:, 0] # xmin
y1 = boxes[:, 1] # ymin
x2 = boxes[:, 2] # xmax
y2 = boxes[:, 3] # ymax
scores = boxes[:, 4] # Confidence scores
# Calculate the area of each box
areas = (x2 - x1) * (y2 - y1)
# Sort the boxes by confidence score in descending order
order = scores.argsort()[::-1]
keep = [] # List to store the indices of the boxes to keep
while order.size > 0:
i = order[0] # Index of the box with the highest confidence score
keep.append(i)
# Calculate the intersection area with the remaining boxes
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 0.00001)
h = np.maximum(0.0, yy2 - yy1 + 0.00001)
inter = w * h # Calculate the intersection area
# Calculate the IoU
iou = inter / (areas[i] + areas[order[1:]] - inter)
# Keep only the boxes with IoU below the threshold
inds = np.where(iou <= threshold)[0]
order = order[inds + 1]
# Return the final selected boxes
return boxes[keep].tolist()
# Function to calculate the Intersection over Union (IoU) between two boxes
def iou(box1, box2):
x1, y1, x2, y2 = box1[:4]
x1b, y1b, x2b, y2b = box2[:4]
inter_x1 = max(x1, x1b) # Top-left x-coordinate of the intersection area
inter_y1 = max(y1, y1b) # Top-left y-coordinate of the intersection area
inter_x2 = min(x2, x2b) # Bottom-right x-coordinate of the intersection area
inter_y2 = min(y2, y2b) # Bottom-right y-coordinate of the intersection area
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) # Calculate the intersection area
box1_area = (x2 - x1) * (y2 - y1) # Area of the first box
box2_area = (x2b - x1b) * (y2b - y1b) # Area of the second box
iou_value = inter_area / (box1_area + box2_area - inter_area) # Calculate the IoU value
return iou_value # Return the IoU value
# Post-processing function to draw the boxes on the image
def postprocess_output(outputs, orig_img):
boxes = decode_boxes(outputs)
final_boxes = nms(boxes, 0.5) # Apply NMS
# Draw the boxes
for box in final_boxes:
x1, y1, x2, y2, score = box
x1 = int(x1 * orig_img.shape[1])
y1 = int(y1 * orig_img.shape[0])
x2 = int(x2 * orig_img.shape[1])
y2 = int(y2 * orig_img.shape[0])
# Draw the box on the image
cv.rectangle(orig_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv.putText(orig_img, f'Person {score:.2f}', (x1, y1 - 10), cv.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
return orig_img # Return the processed image
# Function to run the model
def run_model(library_path, model_path, image_path):
model = KSNN('VIM3')
model.nn_init(library=library_path, model=model_path, level=1) # Initialize the model
orig_img = cv.imread(image_path) # Read the image
img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32) # Resize and normalize the image
img = img / 255.0
img = img.transpose(2, 0, 1) # Convert to NCHW format
img = [img] # Prepare the input image array
# Run inference using nn_inference
outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
result_img = postprocess_output(outputs, orig_img) # Post-process the inference results
output_path = "/home/khadas/test/result_boxes.jpg" # Path to save the result image
cv.imwrite(output_path, result_img) # Save the result image
print(f"Person detection result saved to {output_path}")
# Main function
if __name__ == '__main__':
library_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/libnn_yolov8n_pose_onnx.so" # Path to the library
model_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/yolov8n_pose_onnx.nb" # Path to the model
image_path = "/home/khadas/test/Screenshot.png" # Path to the input image
run_model(library_path, model_path, image_path) # Run the model
I would appreciate your response. @Louis-Cheng-Liu, @Frank
Hello @afa1414
@Louis-Cheng-Liu is out of office, and he will help you once he comes back.
ok thank you @numbqq
Sorry, but when can I expect to receive a reply? @numbqq
Hello @afa1414
Sorry, it should be next week.
ok thank you @numbqq