Video writing is too slow

Hello everyone!
I’m using a Khadas VIM3 Pro. I’m running YOLOv8n to detect objects in video, and it works fine at 12 fps (if you know how to make it faster, I’ll be glad to hear it). My problem is that when I also write the video with the detected objects to a file, processing becomes slow: a frame takes 70 ms without video writing, but 300 ms with video writing.
The code is here:

# Read frames from `source`, run the detector on each one, draw the detections,
# and optionally show a preview window and/or append the frame to a video file.
cap = cv.VideoCapture(source)
out = None
if output_path:
    # dirname() is '' for a bare file name (i.e. the current directory), which
    # must not be reported as a missing path.
    out_dir = os.path.dirname(output_path)
    if out_dir and not os.path.isdir(out_dir):
        sys.exit('Output Path doesnt exist')
    frame_width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
    size = (frame_width, frame_height)
    fourcc = cv.VideoWriter_fourcc(*'XVID')
    out = cv.VideoWriter(output_path, fourcc, 20.0, size, True)
    # Fail early instead of running the whole loop against a writer that
    # could not be opened (unsupported codec/container, unwritable path, ...).
    if not out.isOpened():
        sys.exit('Could not open output video for writing')

try:
    while True:
        ret, img = cap.read()
        if not ret:
            # End of stream or read failure: img is not a valid frame here.
            break
        start = time.perf_counter()

        # default input_tensor is 1
        data = yolov3.nn_inference([img], platform='ONNX', reorder='2 1 0', output_tensor=3, output_format=output_format.OUT_FORMAT_FLOAT32)

        # The outputs are consumed in reverse order (data[2] pairs with GRID0);
        # each is reshaped and transposed into the layout handed to
        # yolov3_post_process.
        input_data = [
            np.transpose(
                tensor.reshape(SPAN, LISTSIZE, grid, grid), (2, 3, 0, 1)
            ).astype(np.float32)
            for tensor, grid in zip(
                (data[2], data[1], data[0]), (GRID0, GRID1, GRID2)
            )
        ]

        boxes, scores, classes = yolov3_post_process(input_data)

        if boxes is not None:
            draw(img, boxes, scores, classes)

        if args.visualize:
            # Resize into a separate image: the writer below was opened with
            # the capture's native frame size and must receive frames of
            # exactly that size.
            preview = cv.resize(img, (960, 540))
            cv.imshow("capture", preview)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break
        if out is not None:
            out.write(img)
        end = time.perf_counter()
        print('1 frame per: {}s'.format(end - start))
finally:
    # Release even on an exception or Ctrl+C so the output file is finalized.
    cap.release()
    if out is not None:
        out.release()
    cv.destroyAllWindows()

Do you have any suggestions?