UnboundLocalError

Hello @afa1414 ,

Have you downgraded the PyTorch version?

@Louis-Cheng-Liu
Thank you for the good feedback. After double-checking, I found that I was using PyTorch 2.4.1. I installed version 1.10.1 and attempted the conversion again, but the ONNX model output still looks the same. Additionally, when converting with aml_npu_sdk, I encountered the following error: ValueError: operands could not be broadcast together with shapes (1,0,160,160) (1,16,160,160).

[68541] Failed to execute script pegasus
Traceback (most recent call last):
  File "pegasus.py", line 131, in <module>
  File "pegasus.py", line 112, in main
  File "acuitylib/app/importer/commands.py", line 245, in execute
  File "acuitylib/vsi_nn.py", line 171, in load_onnx
  File "acuitylib/app/importer/import_onnx.py", line 123, in run
  File "acuitylib/converter/onnx/convert_onnx.py", line 61, in __init__
  File "acuitylib/converter/onnx/convert_onnx.py", line 761, in _shape_inference
  File "acuitylib/onnx_ir/onnx_numpy_backend/shape_inference.py", line 65, in infer_shape
  File "acuitylib/onnx_ir/onnx_numpy_backend/smart_graph_engine.py", line 70, in smart_onnx_scanner
  File "acuitylib/onnx_ir/onnx_numpy_backend/smart_node.py", line 48, in calc_and_assign_smart_info
  File "acuitylib/onnx_ir/onnx_numpy_backend/smart_toolkit.py", line 636, in multi_direction_broadcast_shape
ValueError: operands could not be broadcast together with shapes (1,0,160,160) (1,16,160,160) 

Lastly, is running a pose model on the Khadas VIM3 board inefficient? I can’t seem to find any pose models attempted on the Khadas VIM3 board on the internet.

Hello @afa1414 ,

I tried modifying the Pose output and then converting to ONNX. Here is my modification for your reference.

class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""
    dynamic = False  # force grid reconstruction
    export = False  # export mode
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc)  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        # if torch.onnx.is_in_onnx_export():
        # Modified: always take the export branch so the exported ONNX graph keeps
        # the raw per-level outputs instead of the decoded predictions.
        return self.forward_export(x)
        
        shape = x[0].shape  # BCHW
        for i in range(self.nl):
            
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:
            # print("######################")
            # print(self.dfl)
            return x
        elif self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'):  # avoid TF FlexSplitV ops
            box = x_cat[:, :self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4:]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
        dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
        y = torch.cat((dbox, cls.sigmoid()), 1)
        return y if self.export else (y, x)

    def forward_export(self, x):
        results = []
        for i in range(self.nl):
            dfl = self.cv2[i](x[i]).contiguous()
            cls = self.cv3[i](x[i]).contiguous()
            # results.append(torch.cat([cls, dfl], 1).permute(0, 2, 3, 1))
            results.append(torch.cat([cls, dfl], 1))
        return tuple(results)

class Pose(Detect):
    """YOLOv8 Pose head for keypoints models."""

    def __init__(self, nc=80, kpt_shape=(17, 3), ch=()):
        """Initialize YOLO network with default parameters and Convolutional Layers."""
        super().__init__(nc, ch)
        self.kpt_shape = kpt_shape  # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
        self.nk = kpt_shape[0] * kpt_shape[1]  # number of keypoints total
        self.detect = Detect.forward

        c4 = max(ch[0] // 4, self.nk)
        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch)

    def forward(self, x):
        """Perform forward pass through YOLO model and return predictions."""
        bs = x[0].shape[0]  # batch size
        # kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1)  # (bs, 17*3, h*w)
        kpt = [self.cv4[i](x[i]) for i in range(self.nl)]
        x = self.detect(self, x)
        # Modified: return the three per-level maps with keypoints concatenated (116 channels each)
        result = []
        for i in range(self.nl):
            result.append(torch.cat([x[i], kpt[i]], 1))
        return result
        if self.training:
            return x, kpt
        pred_kpt = self.kpts_decode(kpt)
        return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt))
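
If you want to sanity-check the modified head before converting, a quick sketch like the following can confirm the three 116-channel outputs. This is my own illustration, not part of the official steps; it assumes the modified Detect/Pose classes above have been patched into the ultralytics source, and the shapes in the comment are what I would expect for a 640x640 yolov8n-pose model.

import torch
from ultralytics import YOLO

# Hypothetical sanity check of the modified head's raw outputs
net = YOLO('yolov8n-pose.pt').model.eval()
with torch.no_grad():
    outs = net(torch.zeros(1, 3, 640, 640))
for o in outs:
    print(o.shape)  # expected: (1, 116, 80, 80), (1, 116, 40, 40), (1, 116, 20, 20)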

Then the nb model converts successfully.

You can refer to the inference time of the YOLOv8n demo. The inference time of the pose model is not much different from that of the detection model.

@Louis-Cheng-Liu
Thank you, but do you know why 116 comes up?

Hello @afa1414 ,

I explained this earlier.

116 = 16×4 + 1 + 17×3

16×4 is the box information. Postprocessing converts these 64 values into the four sides of the box (64 → 4).
1 is the box confidence. Postprocessing normalizes it to between 0 and 1.
17×3 is the body keypoints. Postprocessing maps them to positions in the input image.

@Louis-Cheng-Liu
Thank you for your response. I tried writing the code this way, but it didn’t work well. Do I necessarily have to go through a process like the yolov3_post_process function used for the YOLOv3 model's output?

import numpy as np
import cv2 as cv
import time
from ksnn.api import KSNN
from ksnn.types import *

# Constants
GRID0, GRID1, GRID2 = 80, 40, 20  # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17  # Number of keypoints
IMG_SIZE = 640  # Input image size for the model
THRESHOLD = 0.1  # Confidence threshold for keypoints (lowered for testing)

# COCO format keypoint connections
SKELETON = [
    [0, 1], [0, 2], [1, 3], [2, 4],       # Face
    [5, 6],                               # Shoulders
    [5, 7], [7, 9], [6, 8], [8, 10],      # Arms
    [11, 12],                             # Hips
    [11, 13], [13, 15], [12, 14], [14, 16] # Legs
]

def preprocess_image(image_path):
    """
    Preprocess the input image according to the model requirements.
    """
    orig_img = cv.imread(image_path)
    img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)
    img = img / 255.0  # Normalize the image
    img = img.transpose(2, 0, 1)  # Convert to NCHW format
    return orig_img, img

def postprocess_output(outputs, orig_img):
    """
    Post-process the model output to extract keypoints and draw the skeleton on the image.
    """
    # Reshape each output array to the correct size
    input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))  # 1x116x80x80
    input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))  # 1x116x40x40
    input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))  # 1x116x20x20

    # Transform data and add debugging output
    input_data = [
        np.transpose(input0_data, (2, 3, 0, 1)),  # (80, 80, 1, 116)
        np.transpose(input1_data, (2, 3, 0, 1)),  # (40, 40, 1, 116)
        np.transpose(input2_data, (2, 3, 0, 1))   # (20, 20, 1, 116)
    ]

    keypoints = []

    # Post-process keypoints and visualize
    for i in range(3):
        for grid_y in range(input_data[i].shape[0]):
            for grid_x in range(input_data[i].shape[1]):
                # Extract keypoint information (last 51 elements of 116)
                kpts = input_data[i][grid_y, grid_x, 0, 65:116].reshape((-1, NUM_POINTS, 3))

                # Debug: Verify keypoint data
                for pt in kpts:
                    for idx, (x, y, conf) in enumerate(pt):
                        if conf > THRESHOLD:
                            x = int(x * orig_img.shape[1])
                            y = int(y * orig_img.shape[0])
                            if 0 <= x < orig_img.shape[1] and 0 <= y < orig_img.shape[0]:
                                keypoints.append((x, y, conf))
                                cv.circle(orig_img, (x, y), 3, (0, 255, 0), -1)

    # Draw skeleton
    for joint in SKELETON:
        pt1 = keypoints[joint[0]]
        pt2 = keypoints[joint[1]]
        if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:  # Check confidence
            cv.line(orig_img, (pt1[0], pt1[1]), (pt2[0], pt2[1]), (0, 255, 0), 2)

    return orig_img

def run_model(library_path, model_path, image_path):
    """
    Run the model on the VIM3 board and draw the skeleton on the image.
    """
    # Initialize KSNN API
    model = KSNN('VIM3')
    model.nn_init(library=library_path, model=model_path, level=0)

    # Preprocess input image
    orig_img, img = preprocess_image(image_path)

    # Inference
    img = [img]  # Wrap the image in a list
    outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

    # Post-process and draw the skeleton
    result_img = postprocess_output(outputs, orig_img)

    # Save and display the result
    cv.imwrite("result_pose.jpg", result_img)
    cv.imshow("Pose Estimation", result_img)
    cv.waitKey(0)
    cv.destroyAllWindows()

if __name__ == '__main__':
    # Set paths
    library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"  # Path to C static library
    model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"  # Path to compiled model file
    image_path = "/home/khadas/test/Screenshot.png"  # Path to input image

    run_model(library_path, model_path, image_path)

Hello @afa1414 ,

First, you forgot to decode the keypoints.

Second, even though you only need the keypoint information, you still need to decode the boxes, because NMS is done on the boxes. For box decoding you can refer to this.

There was a mistake in my earlier explanation: the box values and the box confidence are in the opposite order.
116 = 1 + 16×4 + 17×3
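
To make the layout concrete, here is a minimal sketch (my own illustration, not the demo code) of how a single grid cell's 116 values split up under this corrected order:

import numpy as np

def split_cell(cell):
    # cell: 1-D array of length 116 for one grid location (assumed layout 1 + 16*4 + 17*3)
    conf = cell[0]                       # box confidence (sigmoid applied during postprocessing)
    box_dfl = cell[1:65].reshape(4, 16)  # 4 sides x 16 DFL bins, decoded with softmax + expectation
    kpts = cell[65:116].reshape(17, 3)   # 17 keypoints x (x, y, visibility score)
    return conf, box_dfl, kpts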

@Louis-Cheng-Liu
Thank you for your response. I have made additional changes to the code as you suggested, but the output coordinates are clustered in one place. Could there be an issue with the model or the code?

import numpy as np
import cv2 as cv
import time
from ksnn.api import KSNN
from ksnn.types import *

# Constants
GRID0, GRID1, GRID2 = 80, 40, 20  # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17  # Number of keypoints
IMG_SIZE = 640  # Input image size for the model
THRESHOLD = 0.1  # Confidence threshold for keypoints (lowered for testing)

# COCO format keypoint connections
SKELETON = [
    [0, 1], [0, 2], [1, 3], [2, 4],       # Face
    [5, 6],                               # Shoulders
    [5, 7], [7, 9], [6, 8], [8, 10],      # Arms
    [11, 12],                             # Hips
    [11, 13], [13, 15], [12, 14], [14, 16] # Legs
]

def preprocess_image(image_path):
    orig_img = cv.imread(image_path)
    img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)
    img = img / 255.0  # Normalize the image
    img = img.transpose(2, 0, 1)  # Convert to NCHW format
    return orig_img, img

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def decode_boxes_and_keypoints(outputs, orig_img):
    # Reshape each output array to the correct format
    input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))  # 1x116x80x80
    input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))  # 1x116x40x40
    input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))  # 1x116x20x20

    input_data = [
        np.transpose(input0_data, (2, 3, 0, 1)),  # (80, 80, 1, 116)
        np.transpose(input1_data, (2, 3, 0, 1)),  # (40, 40, 1, 116)
        np.transpose(input2_data, (2, 3, 0, 1))   # (20, 20, 1, 116)
    ]

    boxes = []
    keypoints = []

    for data in input_data:
        for grid_y in range(data.shape[0]):
            for grid_x in range(data.shape[1]):
                # Decode bounding boxes
                box_conf = sigmoid(data[grid_y, grid_x, 0, 0])  # The first element is the bounding box confidence
                bbox = data[grid_y, grid_x, 0, 1:65].reshape((16, 4))  # Bounding box coordinates
                
                for (x, y, w, h) in bbox:
                    x = sigmoid(x) * IMG_SIZE
                    y = sigmoid(y) * IMG_SIZE
                    w = np.exp(w) * IMG_SIZE
                    h = np.exp(h) * IMG_SIZE
                    boxes.append([x, y, w, h, box_conf])
                
                # Decode keypoints
                kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3))  # Keypoint coordinates
                for x, y, conf in kpts:
                    x = sigmoid(x) * orig_img.shape[1]  # Adjust to the original image size
                    y = sigmoid(y) * orig_img.shape[0]  # Adjust to the original image size
                    conf = sigmoid(conf)
                    keypoints.append((x, y, conf))
    
    return boxes, keypoints


def nms(boxes, threshold):
    # Apply Non-Maximum Suppression (NMS) to filter boxes
    boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
    final_boxes = []
    
    while boxes:
        best_box = boxes.pop(0)
        final_boxes.append(best_box)
        boxes = [box for box in boxes if iou(best_box, box) < threshold]
    
    return final_boxes

def iou(box1, box2):
    # Compute Intersection over Union (IoU) between two boxes
    x1, y1, w1, h1 = box1[:4]
    x2, y2, w2, h2 = box2[:4]

    inter_x1 = max(x1, x2)
    inter_y1 = max(y1, y2)
    inter_x2 = min(x1 + w1, x2 + w2)
    inter_y2 = min(y1 + h1, y2 + h2)

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    box1_area = w1 * h1
    box2_area = w2 * h2

    iou_value = inter_area / (box1_area + box2_area - inter_area)
    return iou_value

def postprocess_output(outputs, orig_img):
    boxes, keypoints = decode_boxes_and_keypoints(outputs, orig_img)  # Pass orig_img as an argument
    boxes = nms(boxes, 0.5)  # Apply NMS with a threshold

    # Draw keypoints
    for x, y, conf in keypoints:
        if conf > THRESHOLD:
            cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)

    # Draw skeleton
    for joint in SKELETON:
        pt1 = keypoints[joint[0]]
        pt2 = keypoints[joint[1]]
        if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:
            cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)

    return orig_img


def run_model(library_path, model_path, image_path):
    model = KSNN('VIM3')
    model.nn_init(library=library_path, model=model_path, level=0)

    orig_img, img = preprocess_image(image_path)

    img = [img]
    outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

    result_img = postprocess_output(outputs, orig_img)

    cv.imwrite("result_pose.jpg", result_img)
    cv.imshow("Pose Estimation", result_img)
    cv.waitKey(0)
    cv.destroyAllWindows()

if __name__ == '__main__':
    library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"
    model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"
    image_path = "/home/khadas/test/Screenshot.png"

    run_model(library_path, model_path, image_path)

Hello @afa1414 ,

Both decodings are wrong.

For box decoding, if you cannot understand the official code, you can refer to the KSNN YOLOv8n demo code. They are the same.


ksnn/examples/yolov8n/yolov8n-picture.py at master · khadas/ksnn (github.com)

For keypoint decoding, only conf needs sigmoid. x and y need the following operation.

x = (2 * x + grid_x) * stride_x / input.shape[1] * orig_img.shape[1]
y = (2 * y + grid_y) * stride_y / input.shape[0] * orig_img.shape[0]

grid_x and grid_y are the location in the feature map. stride is the input size divided by the feature map size.

stride_x = [640 / 80, 640 / 40, 640 / 20]
stride_y = [640 / 80, 640 / 40, 640 / 20]
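
Vectorized over a whole feature map, that keypoint decode might look like this minimal sketch (my own illustration of the formula above, not the demo code; orig_w and orig_h are placeholder image dimensions):

import numpy as np

def decode_keypoints(feat, stride, input_size=640, orig_w=1920, orig_h=1080):
    # feat: (H, W, 116) output of one scale; keypoints assumed in channels 65:116
    h, w = feat.shape[:2]
    kpts = feat[..., 65:116].reshape(h, w, 17, 3)
    grid_x, grid_y = np.meshgrid(np.arange(w), np.arange(h))  # feature-map coordinates
    x = (2 * kpts[..., 0] + grid_x[..., None]) * stride / input_size * orig_w
    y = (2 * kpts[..., 1] + grid_y[..., None]) * stride / input_size * orig_h
    conf = 1 / (1 + np.exp(-kpts[..., 2]))  # only the visibility score gets sigmoid
    return x, y, conf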
import numpy as np
import cv2 as cv
import time
from ksnn.api import KSNN
from ksnn.types import *

# Constants
GRID0, GRID1, GRID2 = 80, 40, 20  # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17  # Number of keypoints
IMG_SIZE = 640  # Input image size for the model
THRESHOLD = 0.1  # Confidence threshold for keypoints (lowered for testing)

# COCO format keypoint connections
SKELETON = [
    [0, 1], [0, 2], [1, 3], [2, 4],       # Face
    [5, 6],                               # Shoulders
    [5, 7], [7, 9], [6, 8], [8, 10],      # Arms
    [11, 12],                             # Hips
    [11, 13], [13, 15], [12, 14], [14, 16] # Legs
]

def preprocess_image(image_path):
    print("Preprocessing image...")
    orig_img = cv.imread(image_path)
    if orig_img is None:
        print("Error: Failed to load image.")
        raise ValueError("Failed to load image. Please check the file path.")
    print("Original image loaded successfully.")

    # Resize image to 640x640 and normalize
    img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)  # Use float32 format
    print("Image resized to 640x640 and converted to float32.")

    img = img / 255.0  # Normalize the image to range [0, 1]
    print("Image normalized.")

    # Convert to RGB format for KSNN
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    print("Image converted to RGB format.")

    # Ensure the array memory is contiguous (the image stays in HWC layout here)
    img = np.ascontiguousarray(img)
    print("Image memory is contiguous.")

    # Create a list for KSNN input
    img_list = [img]
    print("Image preprocessing completed.")

    return orig_img, img_list


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def decode_boxes_and_keypoints(outputs, orig_img):
    # Reshape each output array to the correct format
    input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))
    input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))
    input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))

    input_data = [
        np.transpose(input0_data, (2, 3, 0, 1)), 
        np.transpose(input1_data, (2, 3, 0, 1)), 
        np.transpose(input2_data, (2, 3, 0, 1))  
    ]

    boxes = []
    keypoints = []

    strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2]

    for idx, data in enumerate(input_data):
        stride_x, stride_y = strides[idx], strides[idx]

        for grid_y in range(data.shape[0]):
            for grid_x in range(data.shape[1]):
                # Decode bounding boxes
                box_conf = sigmoid(data[grid_y, grid_x, 0, 0])  
                bbox = data[grid_y, grid_x, 0, 1:65].reshape((16, 4))  
                
                for (x, y, w, h) in bbox:
                    x = (2 * sigmoid(x) + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]
                    y = (2 * sigmoid(y) + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]
                    w = np.exp(w) * stride_x
                    h = np.exp(h) * stride_y
                    boxes.append([x, y, w, h, box_conf])
                
                # Decode keypoints
                kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3)) 
                for kp_x, kp_y, conf in kpts:
                    kp_x = (2 * kp_x + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]
                    kp_y = (2 * kp_y + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]
                    conf = sigmoid(conf)
                    keypoints.append((kp_x, kp_y, conf))
    
    return boxes, keypoints

def nms(boxes, threshold):
    # Apply Non-Maximum Suppression (NMS) to filter boxes
    boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
    final_boxes = []
    
    while boxes:
        best_box = boxes.pop(0)
        final_boxes.append(best_box)
        boxes = [box for box in boxes if iou(best_box, box) < threshold]
    
    return final_boxes

def iou(box1, box2):
    # Compute Intersection over Union (IoU) between two boxes
    x1, y1, w1, h1 = box1[:4]
    x2, y2, w2, h2 = box2[:4]

    inter_x1 = max(x1, x2)
    inter_y1 = max(y1, y2)
    inter_x2 = min(x1 + w1, x2 + w2)
    inter_y2 = min(y1 + h1, y2 + h2)

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    box1_area = w1 * h1
    box2_area = w2 * h2

    iou_value = inter_area / (box1_area + box2_area - inter_area)
    return iou_value

def postprocess_output(outputs, orig_img):
    boxes, keypoints = decode_boxes_and_keypoints(outputs, orig_img)  # Pass orig_img as an argument
    boxes = nms(boxes, 0.5)  # Apply NMS with a threshold

    # Draw keypoints
    for x, y, conf in keypoints:
        if conf > THRESHOLD:
            cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)

    # Draw skeleton
    for joint in SKELETON:
        pt1 = keypoints[joint[0]]
        pt2 = keypoints[joint[1]]
        if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:
            cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)

    return orig_img

def run_model(library_path, model_path, image_path):
    print("Initializing KSNN model...")
    model = KSNN('VIM3')
    model.nn_init(library=library_path, model=model_path, level=0)
    print("Model initialized successfully.")
    
    # Add a delay
    time.sleep(1)

    try:
        orig_img, img_list = preprocess_image(image_path)
    except ValueError as e:
        print(str(e))
        return

    print("Image preprocessed successfully.")

    # Debugging message to check the format and content of the input image
    print(f"Input image shape: {img_list[0].shape}")
    print(f"Input image dtype: {img_list[0].dtype}")
    print(f"Input image data (sample):\n{img_list[0][0:5, 0:5, :]}")  # Print a sample of the data

    try:
        print("Setting inputs...")
        model.nn_set_inputs(img_list, platform='ONNX', reorder='2 1 0', tensor=1)
        print("Inputs set successfully.")
        
        print("Running inference...")
        outputs = model.nn_inference(output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
        print("Inference completed.")
    except Exception as e:
        print(f"Inference failed: {e}")
        return

    try:
        result_img = postprocess_output(outputs, orig_img)
        print("Post-processing completed.")
    except Exception as e:
        print(f"Post-processing failed: {e}")
        return

    try:
        print("Saving and displaying results...")
        cv.imwrite("result_pose.jpg", result_img)
        cv.imshow("Pose Estimation", result_img)
        cv.waitKey(0)
        cv.destroyAllWindows()
        print("Results saved and displayed successfully.")
    except Exception as e:
        print(f"Failed to save or display results: {e}")

if __name__ == '__main__':
    library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"
    model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"
    image_path = "/home/khadas/test/Screenshot.png"

    run_model(library_path, model_path, image_path)

@Louis-Cheng-Liu
Sorry to ask, but could you check it just one more time?

Hello @afa1414 ,

You misunderstood the box decoding.

For box decoding, the first step is softmax.

def softmax(x, axis=0):
    x = np.exp(x)
    return x / x.sum(axis=axis, keepdims=True)

box_0 = softmax(input[..., NUM_CLS: NUM_CLS + 16], -1)
box_1 = softmax(input[..., NUM_CLS + 16:NUM_CLS + 32], -1)
box_2 = softmax(input[..., NUM_CLS + 32:NUM_CLS + 48], -1)
box_3 = softmax(input[..., NUM_CLS + 48:NUM_CLS + 64], -1)

Second, multiply by a constant matrix.

constant_matrix = np.array([[0,  1,  2,  3,
                             4,  5,  6,  7,
                             8,  9,  10, 11,
                             12, 13, 14, 15]]).T

result[i, j, :, 0] = np.dot(box_0[i, j], constant_matrix)
result[i, j, :, 1] = np.dot(box_1[i, j], constant_matrix)
result[i, j, :, 2] = np.dot(box_2[i, j], constant_matrix)
result[i, j, :, 3] = np.dot(box_3[i, j], constant_matrix)

Finally, a simple calculation.

result[..., 0] = (0.5 - result[..., 0] + grid_w) / stride_x # xmin
result[..., 1] = (0.5 - result[..., 1] + grid_h) / stride_y # ymin
result[..., 2] = (0.5 + result[..., 2] + grid_w) / stride_x # xmax
result[..., 3] = (0.5 + result[..., 3] + grid_h) / stride_y # ymax
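
Putting the three steps together for a single grid cell, a minimal sketch (my own assembly of the formulas above, not the demo code) would be:

import numpy as np

def softmax(x, axis=-1):
    x = np.exp(x - x.max(axis=axis, keepdims=True))  # subtract max for numerical stability
    return x / x.sum(axis=axis, keepdims=True)

def decode_box(cell, grid_w, grid_h, stride_x, stride_y, num_cls=1):
    # cell: 1-D array for one grid location; DFL values assumed in channels num_cls:num_cls+64
    bins = np.arange(16)                             # the constant matrix as a vector
    dfl = cell[num_cls:num_cls + 64].reshape(4, 16)  # distances to left, top, right, bottom
    dist = softmax(dfl, axis=-1) @ bins              # expectation over the 16 bins
    xmin = (0.5 - dist[0] + grid_w) / stride_x
    ymin = (0.5 - dist[1] + grid_h) / stride_y
    xmax = (0.5 + dist[2] + grid_w) / stride_x
    ymax = (0.5 + dist[3] + grid_h) / stride_y
    return xmin, ymin, xmax, ymax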
import numpy as np
import cv2 as cv
from ksnn.api import KSNN
from ksnn.types import output_format

# Constants
GRID0, GRID1, GRID2 = 80, 40, 20  # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17  # Number of keypoints
IMG_SIZE = 640  # Input image size for the model
THRESHOLD = 0.7  # Confidence threshold for keypoints
NUM_CLS = 80  # 모델의 클래스 수를 맞게 설정

# COCO format keypoint connections
SKELETON = [
    [0, 1], [0, 2], [1, 3], [2, 4],       # Face
    [5, 6],                               # Shoulders
    [5, 7], [7, 9], [6, 8], [8, 10],      # Arms
    [11, 12],                             # Hips
    [11, 13], [13, 15], [12, 14], [14, 16] # Legs
]

def preprocess_image(image_path):
    orig_img = cv.imread(image_path)
    img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)
    img = img / 255.0  # Normalize the image
    img = img.transpose(2, 0, 1)  # Convert to NCHW format
    
    print(f'Original image shape: {orig_img.shape}')
    print(f'Processed image shape: {img.shape}, dtype: {img.dtype}')
    return orig_img, img


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x, axis=-1):
    x = np.exp(x)
    return x / x.sum(axis=axis, keepdims=True)

def decode_boxes_and_keypoints(outputs, orig_img):
    # Reshape each output array into the correct format
    input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))  # 1x116x80x80
    input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))  # 1x116x40x40
    input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))  # 1x116x20x20

    input_data = [
        np.transpose(input0_data, (2, 3, 0, 1)),  # (80, 80, 1, 116)
        np.transpose(input1_data, (2, 3, 0, 1)),  # (40, 40, 1, 116)
        np.transpose(input2_data, (2, 3, 0, 1))   # (20, 20, 1, 116)
    ]

    strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2]  # Stride for each grid size
    boxes = []
    keypoints = []

    for idx, data in enumerate(input_data):
        stride_x = stride_y = strides[idx]
        for grid_y in range(data.shape[0]):
            for grid_x in range(data.shape[1]):
                # Bounding box decoding - softmax computation
                box_conf = sigmoid(data[grid_y, grid_x, 0, 0])  # The first element is the box confidence
                if box_conf < THRESHOLD:  # Confidence filter
                    continue

                # Compute the bounding box predictions using softmax
                box_0 = softmax(data[grid_y, grid_x, 0, NUM_CLS: NUM_CLS + 16], -1)
                box_1 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 16:NUM_CLS + 32], -1)
                box_2 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 32:NUM_CLS + 48], -1)
                box_3 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 48:NUM_CLS + 64], -1)

                # Multiply by the constant matrix
                constant_matrix = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]).T
                result_xmin = np.dot(box_0, constant_matrix)
                result_ymin = np.dot(box_1, constant_matrix)
                result_xmax = np.dot(box_2, constant_matrix)
                result_ymax = np.dot(box_3, constant_matrix)

                # Compute the bounding box coordinates
                xmin = (0.5 - result_xmin + grid_x) / stride_x
                ymin = (0.5 - result_ymin + grid_y) / stride_y
                xmax = (0.5 + result_xmax + grid_x) / stride_x
                ymax = (0.5 + result_ymax + grid_y) / stride_y
                boxes.append([xmin, ymin, xmax, ymax, box_conf])

                # Decode keypoints
                kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3))
                for kp_x, kp_y, conf in kpts:
                    kp_x = (2 * kp_x + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]
                    kp_y = (2 * kp_y + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]
                    conf = sigmoid(conf)
                    keypoints.append((kp_x, kp_y, conf))

    return boxes, keypoints

def nms(boxes, threshold):
    # Apply Non-Maximum Suppression (NMS) to filter boxes
    boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
    final_boxes = []
    
    while boxes:
        best_box = boxes.pop(0)
        final_boxes.append(best_box)
        boxes = [box for box in boxes if iou(best_box, box) < threshold]
    
    return final_boxes

def iou(box1, box2):
    # Compute Intersection over Union (IoU) between two boxes
    x1, y1, w1, h1 = box1[:4]
    x2, y2, w2, h2 = box2[:4]

    inter_x1 = max(x1, x2)
    inter_y1 = max(y1, y2)
    inter_x2 = min(x1 + w1, x2 + w2)
    inter_y2 = min(y1 + h1, y2 + h2)

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    box1_area = w1 * h1
    box2_area = w2 * h2

    iou_value = inter_area / (box1_area + box2_area - inter_area)
    return iou_value

def postprocess_output(outputs, orig_img):
    boxes, keypoints = decode_boxes_and_keypoints(outputs, orig_img)  # Pass orig_img as an argument
    boxes = nms(boxes, 0.5)  # Apply NMS with a threshold

    # Draw keypoints
    for x, y, conf in keypoints:
        if conf > THRESHOLD:
            cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)

    # Draw skeleton
    for joint in SKELETON:
        if joint[0] < len(keypoints) and joint[1] < len(keypoints):
            pt1 = keypoints[joint[0]]
            pt2 = keypoints[joint[1]]
            if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:
                cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)

    return orig_img

def run_model(library_path, model_path, image_path):
    model = KSNN('VIM3')
    model.nn_init(library=library_path, model=model_path, level=1)

    orig_img, img = preprocess_image(image_path)

    # Expand the image array to match the batch size
    img = np.expand_dims(img, axis=0)  # (1, 3, 640, 640)

    # Call nn_inference
    outputs = model.nn_inference(img, platform='ONNX', reorder='0 1 2', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

    result_img = postprocess_output(outputs, orig_img)

    cv.imwrite("result_pose.jpg", result_img)
    cv.imshow("Pose Estimation", result_img)
    cv.waitKey(0)
    cv.destroyAllWindows()

if __name__ == '__main__':
    library_path = "/home/khadas/test/yolov8n_pose_onnx/libnn_yolov8n_pose_onnx.so"
    model_path = "/home/khadas/test/yolov8n_pose_onnx/yolov8n_pose_onnx.nb"
    image_path = "/home/khadas/test/Screenshot.png"

    run_model(library_path, model_path, image_path)

@Louis-Cheng-Liu
Will it work this way? But this code results in the error: ‘resize input pictures error !!! Set nn inputs error !!!’. Also, is there any issue with the keypoint extraction process?

Hello @afa1414 ,

nn_inference needs list data, and KSNN will expand the image dimension itself.

-img = np.expand_dims(img, axis=0)  # (1, 3, 640, 640)
+img = [img]  # (1, 3, 640, 640)

Three errors.
First, you forgot to modify NUM_CLS.

-NUM_CLS = 80
+NUM_CLS = 1

Second, the ONNX model needs the '2 1 0' reorder.

-outputs = model.nn_inference(img, platform='ONNX', reorder='0 1 2', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)
+outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

Finally, keypoint information and box information are associated: each box has its own keypoints. When a box is discarded by NMS, its keypoints also need to be discarded, so you should not separate them.
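
For example, a minimal sketch of keeping each box's keypoints attached through NMS (a hypothetical helper, not from the demo) could look like this:

def nms_pairs(pairs, iou_fn, iou_threshold=0.5):
    # pairs: list of (box, keypoints), where box = [xmin, ymin, xmax, ymax, conf]
    pairs = sorted(pairs, key=lambda p: p[0][4], reverse=True)  # highest confidence first
    kept = []
    while pairs:
        best = pairs.pop(0)
        kept.append(best)
        # drop every remaining pair whose box overlaps the kept box too much
        pairs = [p for p in pairs if iou_fn(best[0], p[0]) < iou_threshold]
    return kept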

@Louis-Cheng-Liu
I made the modifications, but the same error still occurs. In other posts, it was solved by updating KSNN, but my Khadas VIM3 board has kernel version 4.9. Should I use KSNN 1.4?
And here is the updated code:

import numpy as np
import cv2 as cv
from ksnn.api import KSNN
from ksnn.types import output_format

# Constants
GRID0, GRID1, GRID2 = 80, 40, 20  # Grid sizes for each output
LISTSIZE = 116
NUM_POINTS = 17  # Number of keypoints
IMG_SIZE = 640  # Input image size for the model
THRESHOLD = 0.5  # Confidence threshold for keypoints
NUM_CLS = 1  # Adjust the number of classes for the model

# COCO format keypoint connections (Skeleton structure)
SKELETON = [
    [0, 1], [0, 2], [1, 3], [2, 4],       # Face connections
    [5, 6],                               # Shoulders connection
    [5, 7], [7, 9], [6, 8], [8, 10],      # Arm connections
    [11, 12],                             # Hip connection
    [11, 13], [13, 15], [12, 14], [14, 16] # Leg connections
]

# Function to preprocess the input image
def preprocess_image(image_path):
    orig_img = cv.imread(image_path)  # Read the original image
    img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)  # Resize and change data type
    img = img / 255.0  # Normalize the image (scale between 0 and 1)
    img = img.transpose(2, 0, 1)  # Convert the image to NCHW format (Channel-first)

    print(f'Original image shape: {orig_img.shape}')
    print(f'Processed image shape: {img.shape}, dtype: {img.dtype}')
    return orig_img, img  # Return the original and preprocessed images

# Sigmoid function (to compress values between 0 and 1)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Softmax function (for calculating class probabilities)
def softmax(x, axis=-1):
    x = np.exp(x)
    return x / x.sum(axis=axis, keepdims=True)

# Function to decode the boxes and keypoints from the model output
def decode_boxes_and_keypoints(outputs, orig_img):
    # Reshape each output array into the correct format
    input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))
    input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))
    input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))

    input_data = [
        np.transpose(input0_data, (2, 3, 0, 1)),  # First output
        np.transpose(input1_data, (2, 3, 0, 1)),  # Second output
        np.transpose(input2_data, (2, 3, 0, 1))   # Third output
    ]

    strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2]  # Strides corresponding to each grid size
    box_keypoints_pairs = []  # List to store box and keypoint pairs

    # Iterate over each grid cell to decode boxes and keypoints
    for idx, data in enumerate(input_data):
        stride_x = stride_y = strides[idx]
        for grid_y in range(data.shape[0]):
            for grid_x in range(data.shape[1]):
                box_conf = sigmoid(data[grid_y, grid_x, 0, 0])  # Confidence score for the box
                if box_conf < THRESHOLD:
                    continue

                # Calculate the bounding box coordinates
                box_0 = softmax(data[grid_y, grid_x, 0, NUM_CLS: NUM_CLS + 16], -1)
                box_1 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 16:NUM_CLS + 32], -1)
                box_2 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 32:NUM_CLS + 48], -1)
                box_3 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 48:NUM_CLS + 64], -1)

                constant_matrix = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]).T
                result_xmin = np.dot(box_0, constant_matrix)
                result_ymin = np.dot(box_1, constant_matrix)
                result_xmax = np.dot(box_2, constant_matrix)
                result_ymax = np.dot(box_3, constant_matrix)

                xmin = (0.5 - result_xmin + grid_x) / stride_x
                ymin = (0.5 - result_ymin + grid_y) / stride_y
                xmax = (0.5 + result_xmax + grid_x) / stride_x
                ymax = (0.5 + result_ymax + grid_y) / stride_y
                box = [xmin, ymin, xmax, ymax, box_conf]  # Save the bounding box coordinates

                # Decode the keypoints
                kpts = data[grid_y, grid_x, 0, 65:116].reshape((17, 3))  # Extract keypoint data
                keypoints = []
                for kp_x, kp_y, conf in kpts:
                    kp_x = (2 * kp_x + grid_x) * stride_x / IMG_SIZE * orig_img.shape[1]  # Transform x coordinate
                    kp_y = (2 * kp_y + grid_y) * stride_y / IMG_SIZE * orig_img.shape[0]  # Transform y coordinate
                    conf = sigmoid(conf)  # Apply confidence value
                    keypoints.append((kp_x, kp_y, conf))  # Save keypoint

                # Save the box and keypoint pair
                box_keypoints_pairs.append((box, keypoints))

    return box_keypoints_pairs  # Return box and keypoint pairs

# Non-Maximum Suppression (NMS) function to filter out overlapping boxes
def nms(box_keypoints_pairs, threshold):
    box_keypoints_pairs = sorted(box_keypoints_pairs, key=lambda x: x[0][4], reverse=True)  # Sort by box confidence score
    final_pairs = []
    
    while box_keypoints_pairs:
        best_pair = box_keypoints_pairs.pop(0)  # Select the box with the highest confidence
        final_pairs.append(best_pair)
        box_keypoints_pairs = [pair for pair in box_keypoints_pairs if iou(best_pair[0], pair[0]) < threshold]  # Remove overlapping boxes
    
    return final_pairs  # Return the final selected boxes and keypoints

# Function to compute the Intersection over Union (IoU) between two boxes
def iou(box1, box2):
    x1, y1, w1, h1 = box1[:4]
    x2, y2, w2, h2 = box2[:4]

    inter_x1 = max(x1, x2)  # x coordinate of the top-left corner of the intersection
    inter_y1 = max(y1, y2)  # y coordinate of the top-left corner of the intersection
    inter_x2 = min(x1 + w1, x2 + w2)  # x coordinate of the bottom-right corner of the intersection
    inter_y2 = min(y1 + h1, y2 + h2)  # y coordinate of the bottom-right corner of the intersection

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)  # Calculate the intersection area
    box1_area = w1 * h1  # Area of the first box
    box2_area = w2 * h2  # Area of the second box

    iou_value = inter_area / (box1_area + box2_area - inter_area)  # Calculate IoU value
    return iou_value  # Return the IoU value

# Post-processing function to draw boxes, keypoints, and skeleton
def postprocess_output(outputs, orig_img):
    box_keypoints_pairs = decode_boxes_and_keypoints(outputs, orig_img)
    final_pairs = nms(box_keypoints_pairs, 0.5)  # Apply NMS

    # Draw keypoints and skeleton
    for box, keypoints in final_pairs:
        # Draw keypoints
        for x, y, conf in keypoints:
            if conf > THRESHOLD:  # Only draw keypoints with confidence greater than the threshold
                cv.circle(orig_img, (int(x), int(y)), 3, (0, 255, 0), -1)

        # Draw skeleton
        for joint in SKELETON:
            pt1 = keypoints[joint[0]]
            pt2 = keypoints[joint[1]]
            if pt1[2] > THRESHOLD and pt2[2] > THRESHOLD:  # Only connect points with high confidence
                cv.line(orig_img, (int(pt1[0]), int(pt1[1])), (int(pt2[0]), int(pt2[1])), (0, 255, 0), 2)

    return orig_img  # Return the processed image

# Function to run the model
def run_model(library_path, model_path, image_path):
    model = KSNN('VIM3')
    model.nn_init(library=library_path, model=model_path, level=1)  # Initialize the model

    orig_img, img = preprocess_image(image_path)  # Preprocess the image

    img = [img]  # Prepare input image array

    # Run inference with nn_inference
    outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

    result_img = postprocess_output(outputs, orig_img)  # Post-process the inference results
    output_path = "/home/khadas/test/result_pose.jpg"  # Path to save the result image
    cv.imwrite(output_path, result_img)  # Save the result image
    print(f"Pose estimation result saved to {output_path}")

# Main function
if __name__ == '__main__':
    library_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/libnn_yolov8n_pose_onnx.so"  # Path to the library
    model_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/yolov8n_pose_onnx.nb"  # Path to the model
    image_path = "/home/khadas/test/Screenshot.png"  # Path to the input image

    run_model(library_path, model_path, image_path)  # Run the model

I made the changes as you suggested and tried running it on ksnn-1.4, but the skeleton was not drawn.
When I look at the confidence scores of the predicted results, they are too low. Could there be a problem with the model? The values generally range from around 0.4 to the upper 0.5 range.

And right now, I’m experimenting with the code that outputs only the bounding box for the person, but there is an issue with the output along with the image.

import numpy as np
import cv2 as cv
from ksnn.api import KSNN
from ksnn.types import output_format

# Define constants
GRID0, GRID1, GRID2 = 80, 40, 20  # Grid sizes for each output
LISTSIZE = 116
IMG_SIZE = 640  # Input image size for the model
THRESHOLD = 0.5  # Confidence threshold for bounding boxes
NUM_CLS = 1  # Number of classes in the model (person)

# Sigmoid function (compresses values between 0 and 1)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Softmax function (used to calculate class probabilities)
def softmax(x, axis=-1):
    x = np.exp(x)
    return x / x.sum(axis=axis, keepdims=True)

# Function to decode boxes from the model output
def decode_boxes(outputs):
    # Reshape each output array into the correct format
    input0_data = np.array(outputs[0]).reshape((1, LISTSIZE, GRID0, GRID0))
    input1_data = np.array(outputs[1]).reshape((1, LISTSIZE, GRID1, GRID1))
    input2_data = np.array(outputs[2]).reshape((1, LISTSIZE, GRID2, GRID2))

    input_data = [
        np.transpose(input0_data, (2, 3, 0, 1)),  # First output
        np.transpose(input1_data, (2, 3, 0, 1)),  # Second output
        np.transpose(input2_data, (2, 3, 0, 1))   # Third output
    ]

    strides = [IMG_SIZE / GRID0, IMG_SIZE / GRID1, IMG_SIZE / GRID2]  # Strides for each grid size
    boxes = []  # List to store boxes

    # Iterate over each grid cell and decode the boxes
    for idx, data in enumerate(input_data):
        stride_x = stride_y = strides[idx]
        for grid_y in range(data.shape[0]):
            for grid_x in range(data.shape[1]):
                box_conf = sigmoid(data[grid_y, grid_x, 0, 0])  # Box confidence score
                if box_conf < THRESHOLD:
                    continue

                # Calculate bounding box coordinates
                box_0 = softmax(data[grid_y, grid_x, 0, NUM_CLS: NUM_CLS + 16], -1)
                box_1 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 16:NUM_CLS + 32], -1)
                box_2 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 32:NUM_CLS + 48], -1)
                box_3 = softmax(data[grid_y, grid_x, 0, NUM_CLS + 48:NUM_CLS + 64], -1)

                constant_matrix = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]).T
                result_xmin = np.dot(box_0, constant_matrix)
                result_ymin = np.dot(box_1, constant_matrix)
                result_xmax = np.dot(box_2, constant_matrix)
                result_ymax = np.dot(box_3, constant_matrix)

                # Convert numpy arrays to scalar values using .item()
                xmin = (0.5 - result_xmin.item() + grid_x) / stride_x
                ymin = (0.5 - result_ymin.item() + grid_y) / stride_y
                xmax = (0.5 + result_xmax.item() + grid_x) / stride_x
                ymax = (0.5 + result_ymax.item() + grid_y) / stride_y

                # Check if the values are in the correct format before adding them to the list
                if isinstance(xmin, (int, float)) and isinstance(ymin, (int, float)) and isinstance(xmax, (int, float)) and isinstance(ymax, (int, float)) and isinstance(box_conf, (int, float)):
                    box = [xmin, ymin, xmax, ymax, box_conf]  # Store bounding box coordinates
                    boxes.append(box)
                else:
                    print(f"Invalid box detected: {xmin}, {ymin}, {xmax}, {ymax}, {box_conf}")

    return boxes  # Return the boxes


# Non-Maximum Suppression (NMS) function to filter overlapping boxes
def nms(boxes, threshold):
    if len(boxes) == 0:
        return []

    # Convert boxes list to a numpy array
    boxes = np.array(boxes)

    # Split the box coordinates
    x1 = boxes[:, 0]  # xmin
    y1 = boxes[:, 1]  # ymin
    x2 = boxes[:, 2]  # xmax
    y2 = boxes[:, 3]  # ymax
    scores = boxes[:, 4]  # Confidence scores

    # Calculate the area of each box
    areas = (x2 - x1) * (y2 - y1)

    # Sort the boxes by confidence score in descending order
    order = scores.argsort()[::-1]

    keep = []  # List to store the indices of the boxes to keep
    while order.size > 0:
        i = order[0]  # Index of the box with the highest confidence score
        keep.append(i)

        # Calculate the intersection area with the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 0.00001)
        h = np.maximum(0.0, yy2 - yy1 + 0.00001)
        inter = w * h  # Calculate the intersection area

        # Calculate the IoU
        iou = inter / (areas[i] + areas[order[1:]] - inter)

        # Keep only the boxes with IoU below the threshold
        inds = np.where(iou <= threshold)[0]
        order = order[inds + 1]

    # Return the final selected boxes
    return boxes[keep].tolist()


# Function to calculate the Intersection over Union (IoU) between two boxes
def iou(box1, box2):
    x1, y1, x2, y2 = box1[:4]
    x1b, y1b, x2b, y2b = box2[:4]

    inter_x1 = max(x1, x1b)  # Top-left x-coordinate of the intersection area
    inter_y1 = max(y1, y1b)  # Top-left y-coordinate of the intersection area
    inter_x2 = min(x2, x2b)  # Bottom-right x-coordinate of the intersection area
    inter_y2 = min(y2, y2b)  # Bottom-right y-coordinate of the intersection area

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)  # Calculate the intersection area
    box1_area = (x2 - x1) * (y2 - y1)  # Area of the first box
    box2_area = (x2b - x1b) * (y2b - y1b)  # Area of the second box

    iou_value = inter_area / (box1_area + box2_area - inter_area)  # Calculate the IoU value
    return iou_value  # Return the IoU value

# Post-processing function to draw the boxes on the image
def postprocess_output(outputs, orig_img):
    boxes = decode_boxes(outputs)
    final_boxes = nms(boxes, 0.5)  # Apply NMS

    # Draw the boxes
    for box in final_boxes:
        x1, y1, x2, y2, score = box
        x1 = int(x1 * orig_img.shape[1])
        y1 = int(y1 * orig_img.shape[0])
        x2 = int(x2 * orig_img.shape[1])
        y2 = int(y2 * orig_img.shape[0])

        # Draw the box on the image
        cv.rectangle(orig_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv.putText(orig_img, f'Person {score:.2f}', (x1, y1 - 10), cv.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    return orig_img  # Return the processed image

# Function to run the model
def run_model(library_path, model_path, image_path):
    model = KSNN('VIM3')
    model.nn_init(library=library_path, model=model_path, level=1)  # Initialize the model

    orig_img = cv.imread(image_path)  # Read the image
    img = cv.resize(orig_img, (IMG_SIZE, IMG_SIZE)).astype(np.float32)  # Resize and normalize the image
    img = img / 255.0
    img = img.transpose(2, 0, 1)  # Convert to NCHW format
    img = [img]  # Prepare the input image array

    # Run inference using nn_inference
    outputs = model.nn_inference(img, platform='ONNX', reorder='2 1 0', input_tensor=1, output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

    result_img = postprocess_output(outputs, orig_img)  # Post-process the inference results
    output_path = "/home/khadas/test/result_boxes.jpg"  # Path to save the result image
    cv.imwrite(output_path, result_img)  # Save the result image
    print(f"Person detection result saved to {output_path}")

# Main function
if __name__ == '__main__':
    library_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/libnn_yolov8n_pose_onnx.so"  # Path to the library
    model_path = "/home/khadas/test/yolov8n_pose_onnx_uint8/yolov8n_pose_onnx.nb"  # Path to the model
    image_path = "/home/khadas/test/Screenshot.png"  # Path to the input image

    run_model(library_path, model_path, image_path)  # Run the model


I would appreciate your response. @Louis-Cheng-Liu, @Frank

Hello @afa1414

@Louis-Cheng-Liu is out of office, and he will help you once he comes back.

ok thank you @numbqq

Sorry, but when can I expect to receive a reply? @numbqq

Hello @afa1414

Sorry, it should be next week.

ok thank you @numbqq