Using OpenCV with OpenCL

tyjk456789 · May 16, 2024, 9:15am

What system do you use? Android, Ubuntu, OOWOW or other?

Ubuntu22.04

What version of the system do you use? Khadas official image, homebrew, or other?

Khadas official Ubuntu

Please describe your issue below:

kernel.cl
/* Element-wise sum of two float vectors: c[i] = a[i] + b[i] for every i < n.
 * a and b are read-only inputs; c is the output and therefore must not be
 * const-qualified. */
__kernel void vector_add(__global const float *a, __global const float *b, __global float *c, const unsigned int n) {
    /* Index of this work-item along dimension 0; one element per work-item. */
    int id = get_global_id(0);
    /* Work-items whose index is not below n have nothing to write. */
    if (id < n) {
        c[id] = a[id] + b[id];
    }
}

test.py
import cv2
import numpy as np
import pyopencl as cl
import time

Vector operations work fine, but when I process images, I get all zeros.
# Path of the OpenCL C source file that is compiled at start-up.
KERNEL_SOURCE = "kernel.cl"

with open(KERNEL_SOURCE, "r") as source_file:
    kernel_str = source_file.read()


# Use the first device of the first platform the OpenCL runtime reports.
platforms = cl.get_platforms()
if not platforms:
    print("No OpenCL platforms found.")
    raise SystemExit(1)

platform = platforms[0]


devices = platform.get_devices(cl.device_type.ALL)
if not devices:
    print("No OpenCL devices found.")
    raise SystemExit(1)

device = devices[0]
print(f"Using device: {device.name}")


context = cl.Context([device])
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)


# Create the Program object before entering the try block: the handler below
# asks it for the build log, so the name must already be bound when build()
# raises. Building inside the constructor expression would leave `program`
# undefined and turn a build error into a NameError.
program = cl.Program(context, kernel_str)
try:
    program.build()
except cl.RuntimeError as e:
    print("Error during kernel build:", e)
    print("Build log:\n", program.get_build_info(device, cl.program_build_info.LOG))
    raise SystemExit(1)

# Kernel entry point looked up by name in the compiled program.
copy_image = program.copy_image


# GStreamer pipeline description handed to OpenCV's VideoCapture.
gst_pipeline = (
    'v4l2src device=/dev/video0 ! '
    'videoconvert ! '
    'video/x-raw,format=BGRx ! '
    'appsink'
)

# Create the OpenCV VideoCapture object

cap = cv2.VideoCapture(gst_pipeline, cv2.CAP_GSTREAMER)

if not cap.isOpened():
    print("Error: Unable to open video source.")
    exit(1)

try:
    # Capture -> upload -> run copy kernel -> download -> display, until the
    # stream ends or the user presses 'q'.
    while True:
        grabbed, frame = cap.read()
        if not grabbed:
            print("Failed to capture frame")
            break

        print("Frame captured")

        # frame.shape starts with (rows, cols); the OpenCL image shape and the
        # kernel's global size are both given as (width, height).
        height, width = frame.shape[:2]
        image_shape = (width, height)
        print(f"Frame size: {width}x{height}")

        # Four channels of 8-bit unsigned integers per pixel.
        image_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8)
        mf = cl.mem_flags

        frame_bgra = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        print("Converted frame to BGRA")

        # Source image is initialised from the host frame; destination is
        # allocated empty for the kernel to fill.
        src_buf = cl.Image(
            context,
            mf.READ_ONLY | mf.COPY_HOST_PTR,
            image_format,
            shape=image_shape,
            hostbuf=frame_bgra,
        )
        dst_buf = cl.Image(context, mf.WRITE_ONLY, image_format, shape=image_shape)

        # One work-item per pixel.
        copy_image.set_args(src_buf, dst_buf)
        cl.enqueue_nd_range_kernel(queue, copy_image, image_shape, None)
        queue.finish()
        print("Kernel executed")

        # Read the whole destination image back into a host array of the
        # same shape and dtype as the uploaded frame.
        result = np.empty_like(frame_bgra)
        cl.enqueue_copy(queue, result, dst_buf, origin=(0, 0), region=image_shape)
        queue.finish()
        print("Result copied from device")

        result_bgr = cv2.cvtColor(result, cv2.COLOR_BGRA2BGR)
        print("Converted result to BGR")

        cv2.imshow('Original', frame)
        cv2.imshow('Copied', result_bgr)

        time.sleep(0.033)

        pressed = cv2.waitKey(1) & 0xFF
        if pressed == ord('q'):
            break

except KeyboardInterrupt:
    print("KeyboardInterrupt detected, exiting gracefully.")

finally:
    # Always give the camera back and close the preview windows.
    cap.release()
    cv2.destroyAllWindows()

Vector operations work fine, but when I process images, I get all zeros.
Do you know why?

$ sudo mv /usr/lib/libOpenCL.so /usr/lib/libOpenCL.so.old
$ sudo ln -s /usr/lib/aarch64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so

I tried this as well, installed the OpenCL packages, and confirmed that the GPU is detected by clinfo.

Post a console log of the issue below:


**Delete this line and post the log here.

Translated with DeepL Translate: The world's most accurate translator (free version)

numbqq · May 16, 2024, 9:22am

Hello @Electr1

Can you help to check this issue?

tyjk456789 · May 16, 2024, 9:35am

Platform: ARM Platform
  Device: Mali-G52 r1p0 - Type: ALL | GPU
Judging from the OpenCL test below, the kernel computation itself works fine.

Kernel Functions 
----------------------------------------------
/* vector_add: stores a[i] + b[i] into c[i] for every index i below n. */
__kernel void vector_add(__global const float *a,
                         __global const float *b,
                         __global float *c,
                         const unsigned int n)
{
    /* Each work-item handles the element at its dimension-0 global id. */
    int gid = get_global_id(0);

    /* Indices that are not below n have no element to process. */
    if (!(gid < n)) {
        return;
    }

    c[gid] = a[gid] + b[gid];
}

testcode

import pyopencl as cl

import numpy as np


# Path of the OpenCL C file that contains the vector_add kernel.
KERNEL_SOURCE = "vector_add.cl"

with open(KERNEL_SOURCE, "r") as source_file:
    kernel_str = source_file.read()

# Use the first device of the first platform the OpenCL runtime reports.
platforms = cl.get_platforms()
if not platforms:
    print("No OpenCL platforms found.")
    raise SystemExit(1)

platform = platforms[0]

devices = platform.get_devices(cl.device_type.ALL)
if not devices:
    print("No OpenCL devices found.")
    raise SystemExit(1)

device = devices[0]
print(f"Using device: {device.name}")

context = cl.Context([device])
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)

program = cl.Program(context, kernel_str).build()
vector_add = program.vector_add

# Two random float32 input vectors and an uninitialised output of the same
# shape and dtype.
N = 1024
a = np.random.rand(N).astype(np.float32)
b = np.random.rand(N).astype(np.float32)
c = np.empty_like(a)

# Inputs are copied to the device at creation; the output buffer is only
# sized, not initialised.
mf = cl.mem_flags
a_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(context, mf.WRITE_ONLY, c.nbytes)

# The kernel's last parameter is `const unsigned int n`, hence np.uint32.
vector_add.set_args(a_buf, b_buf, c_buf, np.uint32(N))

# One work-item per element, then read the result back into `c`.
cl.enqueue_nd_range_kernel(queue, vector_add, (N,), None)
cl.enqueue_copy(queue, c, c_buf)
queue.finish()

# Compare the device result with the same sum computed by NumPy on the host.
print("Input A:", a)
print("Input B:", b)
print("Output C:", c)
print("Expected C:", a + b)
print("Difference:", np.linalg.norm(c - (a + b)))
______________________________________________

result
Using device: Mali-G52 r1p0
Input A: [0.7967211  0.9415344  0.85789233 ... 0.4951092  0.6097247  0.6960514 ]
Input B: [0.30978295 0.23329993 0.31276643 ... 0.8768382  0.34669176 0.9608981 ]
Output C: [1.1065041 1.1748344 1.1706588 ... 1.3719474 0.9564165 1.6569495]
Expected C: [1.1065041 1.1748344 1.1706588 ... 1.3719474 0.9564165 1.6569495]
Difference: 0.0

But when I pass it as an image, whether it's gray scale or just a copy, all values come out as 0.

/*
 * copy_image: each work-item reads the pixel of `src` at its 2-D global id
 * and writes that pixel to the same coordinate of `dst`.
 *
 * NOTE(review): the host script in this thread creates both images with
 * channel type UNSIGNED_INT8, while this kernel uses the float accessors
 * read_imagef / write_imagef. That mismatch is presumably why the copy comes
 * back as all zeros; read_imageui / write_imageui with a uint4 pixel would
 * match the host-side image format.
 */
__kernel void copy_image(
    __read_only image2d_t src,
    __write_only image2d_t dst)
{
    // (x, y) pixel coordinate handled by this work-item.
    int2 coord = (int2)(get_global_id(0), get_global_id(1));
    // Sampler-less read of a single pixel as four floats.
    float4 pixel = read_imagef(src, coord);
    write_imagef(dst, coord, pixel);
}

Before Kernel Traversal

[[164 168 147]
[166 170 151]
[167 170 154]
…
[ 54 38 38]
[ 52 36 37]
[ 46 30 31]]

After passing

[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
…
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]]

[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]

Electr1 · May 17, 2024, 2:25am

Hi @tyjk456789

I will check this issue and follow up for you.

Electr1 · May 17, 2024, 9:12am

@tyjk456789 it seems you may have a data type mismatch: you are using float for pixel values in your kernel, as seen by the usage of read_imagef(), but your camera data is np.uint8.

Please verify if your kernel is right, it doesn’t seem to align with your code.

You either need to change the kernel to operate on the uchar data type (the equivalent of np.uint8), or alternatively cast the integers to normalized floating-point values (value * 1 / 255).
This involves some processing on the cpu side, but it will allow you to do faster processing on the GPU side.

tyjk456789 · May 17, 2024, 10:36am

thanks!
I applied it as you suggested and it works!!! I’ll post my code just in case anyone else runs into the same problem

import cv2
import numpy as np
import pyopencl as cl
import time

# OpenCL kernel source code.
#
# The host side of this script uploads OpenCV BGRA frames into an image that
# is declared as RGBA / UNSIGNED_INT8. Inside the kernel that means .x is
# blue, .y is green, .z is red, and pixels must be accessed with the
# unsigned-integer image functions.
KERNEL_SOURCE = """
// Unnormalised integer coordinates, nearest-neighbour lookup. Reads that fall
// outside the image (the filter windows at the border) are clamped to the
// nearest edge pixel instead of being left undefined.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Luma of a pixel whose channels are laid out as B, G, R, A.
float luma_bgra(uint4 p)
{
    return 0.299f * (float)p.z + 0.587f * (float)p.y + 0.114f * (float)p.x;
}

// Single-pass grayscale + edge detector. One work-item per pixel.
//   src / dst        4-channel unsigned 8-bit images of the same size
//   gaussian_kernel  kernel_size * kernel_size blur weights, row-major
//   low_threshold    gradient magnitude at which a pixel is marked 127
//   high_threshold   gradient magnitude at which a pixel is marked 255
__kernel void grayscale_and_canny(
    __read_only image2d_t src,
    __write_only image2d_t dst,
    __global float* gaussian_kernel,
    int kernel_size,
    float low_threshold,
    float high_threshold)
{
    int2 coord = (int2)(get_global_id(0), get_global_id(1));
    uint4 pixel = read_imageui(src, sampler, coord);

    // Gaussian Blur
    // NOTE(review): `blurred` is computed here but the Sobel stage below still
    // samples the unblurred image. Feeding the blur into the gradient needs a
    // separate pass (or an intermediate image) and is left for a follow-up.
    float sum = 0.0f;
    int half_size = kernel_size / 2;
    for (int i = -half_size; i <= half_size; ++i) {
        for (int j = -half_size; j <= half_size; ++j) {
            int2 offset = (int2)(i, j);
            float intensity = luma_bgra(read_imageui(src, sampler, coord + offset));
            sum += intensity * gaussian_kernel[(i + half_size) * kernel_size + (j + half_size)];
        }
    }
    float blurred = sum;

    // Sobel Filter
    float Gx = 0.0f;
    float Gy = 0.0f;
    float sobel_x[3][3] = {
        {-1.0f, 0.0f, 1.0f},
        {-2.0f, 0.0f, 2.0f},
        {-1.0f, 0.0f, 1.0f}
    };
    float sobel_y[3][3] = {
        {-1.0f, -2.0f, -1.0f},
        { 0.0f,  0.0f,  0.0f},
        { 1.0f,  2.0f,  1.0f}
    };
    for (int i = -1; i <= 1; ++i) {
        for (int j = -1; j <= 1; ++j) {
            // i is the x offset and j the y offset; the tables are written
            // row = y, column = x, so they are indexed [j + 1][i + 1].
            int2 offset = (int2)(i, j);
            float intensity = luma_bgra(read_imageui(src, sampler, coord + offset));
            Gx += intensity * sobel_x[j + 1][i + 1];
            Gy += intensity * sobel_y[j + 1][i + 1];
        }
    }
    float gradient_magnitude = sqrt(Gx * Gx + Gy * Gy);
    float gradient_direction = atan2(Gy, Gx);

    // Non-Maximum Suppression
    // NOTE(review): q and r are raw first-channel values of the neighbouring
    // pixels, not their gradient magnitudes, so this is only an approximation
    // of non-maximum suppression. A faithful version needs the gradient image
    // from a previous pass; the neighbour choice per angle should be
    // re-checked at the same time.
    float edge = 0.0f;
    if (gradient_direction < 0.0f) {
        gradient_direction += 3.14159265f;
    }
    // Quantise the direction (now in 0..180 degrees) to 0 / 45 / 90 / 135.
    float angle = gradient_direction * 180.0f / 3.14159265f;
    angle = angle < 22.5f || angle > 157.5f ? 0.0f :
            (angle < 67.5f ? 45.0f :
            (angle < 112.5f ? 90.0f : 135.0f));

    float q = 255.0f;
    float r = 255.0f;
    if (angle == 0.0f) {
        q = read_imageui(src, sampler, coord + (int2)(1, 0)).x;
        r = read_imageui(src, sampler, coord + (int2)(-1, 0)).x;
    } else if (angle == 45.0f) {
        q = read_imageui(src, sampler, coord + (int2)(1, -1)).x;
        r = read_imageui(src, sampler, coord + (int2)(-1, 1)).x;
    } else if (angle == 90.0f) {
        q = read_imageui(src, sampler, coord + (int2)(0, 1)).x;
        r = read_imageui(src, sampler, coord + (int2)(0, -1)).x;
    } else if (angle == 135.0f) {
        q = read_imageui(src, sampler, coord + (int2)(1, 1)).x;
        r = read_imageui(src, sampler, coord + (int2)(-1, -1)).x;
    }

    if (gradient_magnitude >= q && gradient_magnitude >= r) {
        edge = gradient_magnitude;
    } else {
        edge = 0.0f;
    }

    // Hysteresis Thresholding (per-pixel classification only: strong = 255,
    // weak = 127, otherwise 0; weak pixels are not linked to strong ones).
    float edge_final = 0.0f;
    if (edge >= high_threshold) {
        edge_final = 255.0f;
    } else if (edge >= low_threshold) {
        edge_final = 127.0f;
    }

    // The destination has an unsigned-integer channel type, so the pixel is
    // written with the unsigned variant of the image write function. Alpha is
    // carried over from the source pixel.
    uint level = (uint)edge_final;
    write_imageui(dst, coord, (uint4)(level, level, level, pixel.w));
}
"""

# Select the OpenCL platform and device: the first device of the first
# platform the runtime reports.
platforms = cl.get_platforms()
if not platforms:
    print("No OpenCL platforms found.")
    raise SystemExit(1)

platform = platforms[0]
devices = platform.get_devices(cl.device_type.ALL)
if not devices:
    print("No OpenCL devices found.")
    raise SystemExit(1)

device = devices[0]
print(f"Using device: {device.name}")

# Create the OpenCL context and command queue.
context = cl.Context([device])
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)

# Build the OpenCL program.
# The Program object is created before the try block because the handler
# queries it for the build log; constructing it inside the try would leave
# `program` unbound when build() raises and hide the real error behind a
# NameError.
program = cl.Program(context, KERNEL_SOURCE)
try:
    program.build()
except cl.RuntimeError as e:
    print("Error during kernel build:", e)
    print("Build log:\n", program.get_build_info(device, cl.program_build_info.LOG))
    raise SystemExit(1)

# Kernel entry point looked up by name in the compiled program.
grayscale_and_canny = program.grayscale_and_canny

# Enable OpenCL inside OpenCV and report whether it is active.
cv2.ocl.setUseOpenCL(True)
print(f"OpenCL 사용 여부: {cv2.ocl.useOpenCL()}")

# GStreamer pipeline description.
gst_pipeline = (
    'v4l2src device=/dev/video0 ! '
    'videoconvert ! '
    'video/x-raw,format=BGRx ! '
    'appsink'
)

# Create the OpenCV VideoCapture object.
cap = cv2.VideoCapture(gst_pipeline, cv2.CAP_GSTREAMER)

if not cap.isOpened():
    print("Error: Unable to open video source.")
    raise SystemExit(1)

# Build the Gaussian blur filter.
def gaussian_kernel(size, sigma=1.0):
    """Return a ``size`` x ``size`` Gaussian weight matrix that sums to 1.

    The Gaussian is centred on the middle of the grid, i.e. at index
    ``(size - 1) / 2`` along both axes.

    Args:
        size: Number of rows and columns of the returned matrix.
        sigma: Standard deviation of the Gaussian, in grid cells.

    Returns:
        A 2-D float array of shape ``(size, size)`` whose entries are the
        Gaussian weights divided by their total.
    """
    centre = (size - 1) / 2
    # Row and column index grids, as floats.
    rows, cols = np.indices((size, size), dtype=float)
    weights = (1 / (2 * np.pi * sigma**2)) * np.exp(
        -((rows - centre)**2 + (cols - centre)**2) / (2 * sigma**2)
    )
    return weights / np.sum(weights)

# Blur weights as float32, matching the kernel's `__global float*` argument.
gaussian_k = gaussian_kernel(5).astype(np.float32)

try:
    # Objects that do not depend on the captured frame are created once here
    # rather than once per loop iteration.
    mf = cl.mem_flags
    image_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8)

    # Copy the Gaussian kernel into OpenCL memory.
    gaussian_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=gaussian_k)
    # Take the filter width from the array itself so the value handed to the
    # kernel cannot drift from the size of the uploaded weights.
    kernel_size = np.int32(gaussian_k.shape[0])

    frame_count = 0
    # perf_counter is monotonic, so the intervals below cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame")
            break

        frame_count += 1

        print("Frame captured")

        # frame.shape starts with (rows, cols); the OpenCL image shape and the
        # kernel's global size are both given as (width, height).
        height, width = frame.shape[:2]
        image_shape = (width, height)
        print(f"Frame size: {width}x{height}")

        frame_bgra = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        print("Converted frame to BGRA")

        print("Frame data (BGRA):", frame_bgra)

        # Source image is initialised from the host frame; destination is
        # allocated empty for the kernel to fill.
        src_buf = cl.Image(context, mf.READ_ONLY | mf.COPY_HOST_PTR, image_format, shape=image_shape, hostbuf=frame_bgra)
        dst_buf = cl.Image(context, mf.WRITE_ONLY, image_format, shape=image_shape)

        # Set the kernel arguments (low / high thresholds: 50 / 150).
        grayscale_and_canny.set_args(src_buf, dst_buf, gaussian_buf, kernel_size, np.float32(50.0), np.float32(150.0))

        # Start measuring the kernel execution time.
        kernel_start_time = time.perf_counter()

        cl.enqueue_nd_range_kernel(queue, grayscale_and_canny, image_shape, None)
        queue.finish()

        # Stop measuring the kernel execution time.
        kernel_end_time = time.perf_counter()

        print("Kernel executed")

        # Read the whole destination image back into a host array of the
        # same shape and dtype as the uploaded frame.
        result = np.empty_like(frame_bgra)
        cl.enqueue_copy(queue, result, dst_buf, origin=(0, 0), region=image_shape)
        queue.finish()
        print("Result copied from device")

        print("Result data (BGRA):", result)

        result_bgr = cv2.cvtColor(result, cv2.COLOR_BGRA2BGR)
        print("Converted result to BGR")

        cv2.imshow('Original', frame)
        cv2.imshow('Grayscale + Canny', result_bgr)

        # Print the average FPS since start and this frame's kernel time.
        elapsed_time = time.perf_counter() - start_time
        fps = frame_count / elapsed_time
        kernel_execution_time = kernel_end_time - kernel_start_time

        print(f"Current FPS: {fps:.2f}")
        print(f"Kernel execution time: {kernel_execution_time:.6f} seconds per frame")

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    print("KeyboardInterrupt detected, exiting gracefully.")

finally:
    # Always give the camera back and close the preview windows.
    cap.release()
    cv2.destroyAllWindows()

numbqq · May 17, 2024, 12:28pm

Hello @tyjk456789

Please post code with Code Blocks.

Your code here.

This will help for you. Basic Syntax | Markdown Guide