Using OpenCV with OpenCL

What system do you use? Android, Ubuntu, OOWOW or other?

Ubuntu22.04

What version of the system do you use? Khadas official image, homebrew, or other?

Khadas official Ubuntu

Please describe your issue below:

kernel.cl
// Element-wise vector addition: c[i] = a[i] + b[i] for every i < n.
//   a, b : input vectors (read-only)
//   c    : output vector; the kernel stores into it, so it must not be const
//   n    : number of elements; work-items whose index is >= n do nothing
__kernel void vector_add(__global const float *a, __global const float *b, __global float *c, const unsigned int n) {
    // get_global_id() returns size_t; keeping the index unsigned avoids a
    // signed/unsigned comparison against n.
    size_t id = get_global_id(0);
    if (id < n) {
        c[id] = a[id] + b[id];
    }
}

test.py
import cv2
import numpy as np
import pyopencl as cl
import time

Vector operations work fine, but when I process images, I get all zeros.
# Path of the OpenCL C source file that is compiled at start-up.
KERNEL_SOURCE = "kernel.cl"

with open(KERNEL_SOURCE, "r") as source_file:
    kernel_str = source_file.read()


# Use the first device of the first platform reported by the OpenCL runtime.
platforms = cl.get_platforms()
if not platforms:
    print("No OpenCL platforms found.")
    raise SystemExit(1)

platform = platforms[0]


devices = platform.get_devices(cl.device_type.ALL)
if not devices:
    print("No OpenCL devices found.")
    raise SystemExit(1)

device = devices[0]
print(f"Using device: {device.name}")


context = cl.Context([device])
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)


# Create the program object *before* the try block: the except branch below
# queries it for the build log, so the name must already be bound when
# build() raises.
program = cl.Program(context, kernel_str)
try:
    program.build()
except cl.RuntimeError as e:
    print("Error during kernel build:", e)
    print("Build log:\n", program.get_build_info(device, cl.program_build_info.LOG))
    raise SystemExit(1)

# Kernel entry point used by the capture loop.
copy_image = program.copy_image


# GStreamer pipeline description: read /dev/video0 through v4l2src, convert
# the stream to BGRx and hand the frames to OpenCV via appsink.
gst_pipeline = (
    'v4l2src device=/dev/video0 ! '
    'videoconvert ! '
    'video/x-raw,format=BGRx ! '
    'appsink'
)

# Create the OpenCV VideoCapture object
cap = cv2.VideoCapture(gst_pipeline, cv2.CAP_GSTREAMER)

if not cap.isOpened():
    print("Error: Unable to open video source.")
    raise SystemExit(1)

# Capture loop: upload every camera frame to the device, run copy_image on
# it, read the result back and show the source and the copy in two windows.
# Leaves the loop when a frame cannot be read or when 'q' is pressed.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame")
            break

        print("Frame captured")

        height, width, *_ = frame.shape
        print(f"Frame size: {width}x{height}")

        # Both images are created as 4-channel, 8-bit unsigned integer images.
        mf = cl.mem_flags
        image_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8)

        frame_bgra = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        print("Converted frame to BGRA")

        # The image shape is passed as (width, height).
        extent = (width, height)
        upload_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
        source_image = cl.Image(context, upload_flags, image_format, shape=extent, hostbuf=frame_bgra)
        target_image = cl.Image(context, mf.WRITE_ONLY, image_format, shape=extent)

        # One work-item per pixel.
        copy_image.set_args(source_image, target_image)
        cl.enqueue_nd_range_kernel(queue, copy_image, extent, None)
        queue.finish()
        print("Kernel executed")

        # Read the whole destination image back into host memory.
        copied = np.empty_like(frame_bgra)
        cl.enqueue_copy(queue, copied, target_image, origin=(0, 0), region=extent)
        queue.finish()
        print("Result copied from device")

        copied_bgr = cv2.cvtColor(copied, cv2.COLOR_BGRA2BGR)
        print("Converted result to BGR")

        cv2.imshow('Original', frame)
        cv2.imshow('Copied', copied_bgr)

        time.sleep(0.033)

        pressed = cv2.waitKey(1) & 0xFF
        if pressed == ord('q'):
            break

except KeyboardInterrupt:
    print("KeyboardInterrupt detected, exiting gracefully.")

finally:
    # Always give the camera back and close the preview windows.
    cap.release()
    cv2.destroyAllWindows()

Vector operations work fine, but when I process images, I get all zeros.
Do you know why?

$ sudo mv /usr/lib/libOpenCL.so /usr/lib/libOpenCL.so.old
$ sudo ln -s /usr/lib/aarch64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so

I tried this as well, installed the OpenCL packages, and confirmed with clinfo that the GPU is detected.

Post a console log of the issue below:


**Delete this line and post the log here.

Translated with DeepL Translate: The world's most accurate translator (free version)

Hello @Electr1

Can you help to check this issue?

1 Like
Platform: ARM Platform
  Device: Mali-G52 r1p0 - Type: ALL | GPU
Judging from the OpenCL test below, the kernel computation itself works correctly.

Kernel Functions 
----------------------------------------------
// Adds two float vectors element by element: c[i] = a[i] + b[i].
// Work-items whose global index is not below n return without writing.
__kernel void vector_add(__global const float *a,
                         __global const float *b,
                         __global float *c,
                         const unsigned int n)
{
    const int gid = get_global_id(0);
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}

testcode


import pyopencl as cl

import numpy as np


# Minimal PyOpenCL sanity check: add two random float32 vectors on the device
# and compare the result with the same sum computed by NumPy.
KERNEL_SOURCE = "vector_add.cl"

with open(KERNEL_SOURCE, "r") as source_file:
    kernel_str = source_file.read()

# Use the first device of the first platform reported by the OpenCL runtime.
platforms = cl.get_platforms()
if not platforms:
    print("No OpenCL platforms found.")
    raise SystemExit(1)

platform = platforms[0]

devices = platform.get_devices(cl.device_type.ALL)
if not devices:
    print("No OpenCL devices found.")
    raise SystemExit(1)

device = devices[0]
print(f"Using device: {device.name}")

context = cl.Context([device])
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)

program = cl.Program(context, kernel_str).build()
vector_add = program.vector_add

# Host-side input vectors and an output array of the same shape and dtype.
N = 1024
a = np.random.rand(N).astype(np.float32)
b = np.random.rand(N).astype(np.float32)
c = np.empty_like(a)

# Device buffers: the inputs are copied from the host arrays, the output is
# only allocated (c.nbytes bytes).
mf = cl.mem_flags
a_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(context, mf.WRITE_ONLY, c.nbytes)

vector_add.set_args(a_buf, b_buf, c_buf, np.uint32(N))

# One work-item per element, then read the result back into c.
cl.enqueue_nd_range_kernel(queue, vector_add, (N,), None)
cl.enqueue_copy(queue, c, c_buf)
queue.finish()

print("Input A:", a)
print("Input B:", b)
print("Output C:", c)
print("Expected C:", a + b)
print("Difference:", np.linalg.norm(c - (a + b)))
______________________________________________

result
Using device: Mali-G52 r1p0
Input A: [0.7967211  0.9415344  0.85789233 ... 0.4951092  0.6097247  0.6960514 ]
Input B: [0.30978295 0.23329993 0.31276643 ... 0.8768382  0.34669176 0.9608981 ]
Output C: [1.1065041 1.1748344 1.1706588 ... 1.3719474 0.9564165 1.6569495]
Expected C: [1.1065041 1.1748344 1.1706588 ... 1.3719474 0.9564165 1.6569495]
Difference: 0.0

But when I pass it as an image, whether it's gray scale or just a copy, all values come out as 0.

// Copies every pixel of `src` to the same coordinate in `dst`; each work-item
// handles the pixel addressed by its two-dimensional global id.
//
// NOTE(review): read_imagef()/write_imagef() are the floating-point image
// accessors. The host script earlier in this thread creates both images with
// cl.channel_type.UNSIGNED_INT8; for that channel type the OpenCL C
// specification expects read_imageui()/write_imageui(), and the values
// produced by the float accessors are undefined. The replies below identify
// this mismatch as the cause of the all-zero output.
__kernel void copy_image(
    __read_only image2d_t src,
    __write_only image2d_t dst)
{
    // (x, y) of the pixel handled by this work-item.
    int2 coord = (int2)(get_global_id(0), get_global_id(1));
    float4 pixel = read_imagef(src, coord);
    write_imagef(dst, coord, pixel);
}

Before Kernel Traversal


[[164 168 147]
[166 170 151]
[167 170 154]
…
[ 54 38 38]
[ 52 36 37]
[ 46 30 31]]


After passing


[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
…
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]]

[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]


Hi @tyjk456789

I will check this issue and follow up for you.

@tyjk456789 it looks like you may have a data-type mismatch: your kernel treats pixel values as floats, as seen from its use of read_imagef(), but your camera data is np.uint8.

Please verify that your kernel is right; it doesn't seem to align with your host code.

You either need to change the kernel to operate on the uchar data type (the equivalent of np.uint8), or alternatively convert the integers to normalized floating-point values (value * 1 / 255).
This involves some processing on the cpu side, but it will allow you to do faster processing on the GPU side.

thanks!
I applied it as you suggested and it works!!! I'll post my code in case anyone else runs into the same problem.

import cv2
import numpy as np
import pyopencl as cl
import time

# OpenCL C source of the edge-detection kernel (compiled at run time).
KERNEL_SOURCE = """
// Unnormalised integer coordinates with nearest-neighbour lookup.  Coordinates
// outside the image are clamped to the closest edge pixel, so the filter
// windows stay well defined along the border without creating a dark frame
// that would itself be detected as an edge.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Luma of the pixel at `pos`.  The x, y and z channels are weighted as red,
// green and blue respectively.
float luma_at(__read_only image2d_t src, int2 pos)
{
    uint4 p = read_imageui(src, sampler, pos);
    return 0.299f * p.x + 0.587f * p.y + 0.114f * p.z;
}

// 3x3 Sobel gradient of the luma at `pos`.
// Returns (gx, gy): gx grows with increasing x (to the right), gy grows with
// increasing y (downwards, as in image coordinates).
float2 sobel_at(__read_only image2d_t src, int2 pos)
{
    float tl = luma_at(src, pos + (int2)(-1, -1));
    float tc = luma_at(src, pos + (int2)( 0, -1));
    float tr = luma_at(src, pos + (int2)( 1, -1));
    float ml = luma_at(src, pos + (int2)(-1,  0));
    float mr = luma_at(src, pos + (int2)( 1,  0));
    float bl = luma_at(src, pos + (int2)(-1,  1));
    float bc = luma_at(src, pos + (int2)( 0,  1));
    float br = luma_at(src, pos + (int2)( 1,  1));

    float gx = (tr + 2.0f * mr + br) - (tl + 2.0f * ml + bl);
    float gy = (bl + 2.0f * bc + br) - (tl + 2.0f * tc + tr);
    return (float2)(gx, gy);
}

// Single-pass edge detector.  One work-item processes one pixel.
//   src             : input image, read with read_imageui (unsigned 8-bit data)
//   dst             : output image; strong edges are written as 255, weak
//                     edges as 127, everything else as 0 (alpha is copied)
//   gaussian_kernel : kernel_size * kernel_size blur weights, row-major
//   kernel_size     : side length of the blur window
//   low_threshold   : gradient magnitude at which a pixel counts as weak edge
//   high_threshold  : gradient magnitude at which a pixel counts as strong edge
__kernel void grayscale_and_canny(
    __read_only image2d_t src,
    __write_only image2d_t dst,
    __global float* gaussian_kernel,
    int kernel_size,
    float low_threshold,
    float high_threshold)
{
    int2 coord = (int2)(get_global_id(0), get_global_id(1));

    // Ignore work-items that fall outside the image (e.g. a padded launch).
    if (coord.x >= get_image_width(src) || coord.y >= get_image_height(src)) {
        return;
    }

    uint4 pixel = read_imageui(src, sampler, coord);

    // Gaussian blur of the luma around this pixel.
    // NOTE(review): `blurred` is not consumed by the stages below.  Running
    // Sobel on smoothed data needs the blurred value of every neighbour, i.e.
    // a separate blur pass with an intermediate image, which would change
    // this kernel's interface.
    float sum = 0.0f;
    int half_size = kernel_size / 2;
    for (int i = -half_size; i <= half_size; ++i) {
        for (int j = -half_size; j <= half_size; ++j) {
            float intensity = luma_at(src, coord + (int2)(i, j));
            sum += intensity * gaussian_kernel[(i + half_size) * kernel_size + (j + half_size)];
        }
    }
    float blurred = sum;

    // Sobel gradient at this pixel.
    float2 gradient = sobel_at(src, coord);
    float gradient_magnitude = hypot(gradient.x, gradient.y);

    // Gradient direction folded into [0, 180) degrees.
    float angle = atan2(gradient.y, gradient.x) * 180.0f / 3.14159265f;
    if (angle < 0.0f) {
        angle += 180.0f;
    }

    // Quantise the direction to one of four neighbour steps.  y grows
    // downwards, so 45 degrees points to the lower-right neighbour.
    int2 step;
    if (angle < 22.5f || angle >= 157.5f) {
        step = (int2)(1, 0);
    } else if (angle < 67.5f) {
        step = (int2)(1, 1);
    } else if (angle < 112.5f) {
        step = (int2)(0, 1);
    } else {
        step = (int2)(-1, 1);
    }

    // Non-maximum suppression: keep the pixel only if its gradient magnitude
    // is not smaller than that of both neighbours along the gradient.
    float2 ahead = sobel_at(src, coord + step);
    float2 behind = sobel_at(src, coord - step);
    float q = hypot(ahead.x, ahead.y);
    float r = hypot(behind.x, behind.y);

    float edge = 0.0f;
    if (gradient_magnitude >= q && gradient_magnitude >= r) {
        edge = gradient_magnitude;
    }

    // Double threshold (weak edges are only marked, not linked to strong ones).
    uint level = 0u;
    if (edge >= high_threshold) {
        level = 255u;
    } else if (edge >= low_threshold) {
        level = 127u;
    }

    // The image holds unsigned integer data, so the unsigned writer is used.
    write_imageui(dst, coord, (uint4)(level, level, level, pixel.w));
}
"""

# Select the OpenCL platform and device: the first device of the first
# platform reported by the runtime.
platforms = cl.get_platforms()
if not platforms:
    print("No OpenCL platforms found.")
    raise SystemExit(1)

platform = platforms[0]
devices = platform.get_devices(cl.device_type.ALL)
if not devices:
    print("No OpenCL devices found.")
    raise SystemExit(1)

device = devices[0]
print(f"Using device: {device.name}")

# Create the OpenCL context and command queue
context = cl.Context([device])
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)

# Build the OpenCL program.  The program object is created *before* the try
# block: the except branch queries it for the build log, so the name must
# already be bound when build() raises.
program = cl.Program(context, KERNEL_SOURCE)
try:
    program.build()
except cl.RuntimeError as e:
    print("Error during kernel build:", e)
    print("Build log:\n", program.get_build_info(device, cl.program_build_info.LOG))
    raise SystemExit(1)

# Kernel entry point used by the capture loop.
grayscale_and_canny = program.grayscale_and_canny

# Enable OpenCL inside OpenCV and print the value OpenCV reports back.
# The message text was stored with a broken encoding; it is restored to the
# original Korean ("OpenCL in use").
cv2.ocl.setUseOpenCL(True)
print(f"OpenCL 사용 여부: {cv2.ocl.useOpenCL()}")

# GStreamer pipeline description: read /dev/video0 through v4l2src, convert
# the stream to BGRx and hand the frames to OpenCV via appsink.
gst_pipeline = (
    'v4l2src device=/dev/video0 ! '
    'videoconvert ! '
    'video/x-raw,format=BGRx ! '
    'appsink'
)

# Create the OpenCV VideoCapture object
cap = cv2.VideoCapture(gst_pipeline, cv2.CAP_GSTREAMER)

if not cap.isOpened():
    print("Error: Unable to open video source.")
    raise SystemExit(1)

# Build the Gaussian blur filter
def gaussian_kernel(size, sigma=1.0):
    """Return a normalised ``size`` x ``size`` Gaussian weight matrix.

    The Gaussian is centred on the middle of the grid, ``(size - 1) / 2`` in
    both directions, and the weights are scaled so that they sum to 1.

    Args:
        size: Number of rows and columns of the matrix.
        sigma: Standard deviation of the Gaussian, in grid cells.

    Returns:
        A two-dimensional float array of shape ``(size, size)``.
    """
    center = (size - 1) / 2
    row_idx, col_idx = np.indices((size, size), dtype=float)
    squared_dist = (row_idx - center) ** 2 + (col_idx - center) ** 2
    amplitude = 1 / (2 * np.pi * sigma**2)
    weights = amplitude * np.exp(-squared_dist / (2 * sigma**2))
    return weights / np.sum(weights)

# Side length of the Gaussian blur window; the same value is handed to the
# kernel as its kernel_size argument.
GAUSSIAN_SIZE = 5
gaussian_k = gaussian_kernel(GAUSSIAN_SIZE).astype(np.float32)

# Capture loop: upload every camera frame, run grayscale_and_canny on it,
# read the result back, display it and print timing figures.  Leaves the loop
# when a frame cannot be read or when 'q' is pressed.
try:
    # Nothing below depends on the current frame, so it is created once.
    mf = cl.mem_flags
    image_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8)

    # Copy the Gaussian weights to device memory
    gaussian_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=gaussian_k)

    frame_count = 0
    start_time = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame")
            break

        frame_count += 1

        print("Frame captured")

        height, width = frame.shape[:2]
        print(f"Frame size: {width}x{height}")

        # The images are declared with channel order RGBA and the kernel
        # weights x/y/z as red/green/blue, so the host data must be laid out
        # as RGBA too.  Uploading BGRA bytes would apply the red weight to
        # the blue channel and vice versa.
        frame_rgba = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)
        print("Converted frame to RGBA")

        print("Frame data (RGBA):", frame_rgba)

        # The image shape is passed as (width, height).
        src_buf = cl.Image(context, mf.READ_ONLY | mf.COPY_HOST_PTR, image_format, shape=(width, height), hostbuf=frame_rgba)
        dst_buf = cl.Image(context, mf.WRITE_ONLY, image_format, shape=(width, height))

        # Set the kernel arguments (low threshold 50, high threshold 150)
        grayscale_and_canny.set_args(src_buf, dst_buf, gaussian_buf, np.int32(GAUSSIAN_SIZE), np.float32(50.0), np.float32(150.0))

        # Start of the kernel timing (wall clock around enqueue + finish)
        kernel_start_time = time.time()

        cl.enqueue_nd_range_kernel(queue, grayscale_and_canny, (width, height), None)
        queue.finish()

        # End of the kernel timing
        kernel_end_time = time.time()

        print("Kernel executed")

        result = np.empty_like(frame_rgba)
        cl.enqueue_copy(queue, result, dst_buf, origin=(0, 0), region=(width, height))
        queue.finish()
        print("Result copied from device")

        print("Result data (RGBA):", result)

        result_bgr = cv2.cvtColor(result, cv2.COLOR_RGBA2BGR)
        print("Converted result to BGR")

        cv2.imshow('Original', frame)
        cv2.imshow('Grayscale + Canny', result_bgr)

        # Print the average FPS since start and this frame's kernel time
        current_time = time.time()
        elapsed_time = current_time - start_time
        fps = frame_count / elapsed_time
        kernel_execution_time = kernel_end_time - kernel_start_time

        print(f"Current FPS: {fps:.2f}")
        print(f"Kernel execution time: {kernel_execution_time:.6f} seconds per frame")

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    print("KeyboardInterrupt detected, exiting gracefully.")

finally:
    # Always give the camera back and close the preview windows.
    cap.release()
    cv2.destroyAllWindows()
1 Like

Hello @tyjk456789

Please post code with Code Blocks.

Your code here.

This will help for you. Basic Syntax | Markdown Guide