I have a NumPy three-dimensional array of shape (height, width, 3) with float data type; the values are between 0 and 1, and it represents a BGR image of resolution width×height.
I want to convert it to an LCh(ab) array and back. I spent days researching: I have read the Wikipedia article, another Wikipedia article, the GIMP source code, the babl source code, this program, this article, and another one…
I don’t like that all the values I find online are of low precision, so I took the full-precision values from the GIMP source code and reverse-engineered the whole process to build the matrices, arriving at the following:
import numpy as np
from typing import Tuple

def xy_to_XY(x: float, y: float) -> Tuple[float, float]:
    # xy chromaticity -> X and Z tristimulus components at Y = 1
    return x / y, (1 - x - y) / y

# Bradford cone response matrix, used for chromatic adaptation
BRADFORD = np.array(
    [
        [0.8951000, 0.2664000, -0.1614000],
        [-0.7502000, 1.7135000, 0.0367000],
        [0.0389000, -0.0685000, 1.0296000],
    ],
    dtype=float,
)
BRADFORD_INV = np.linalg.inv(BRADFORD)

# white points as XYZ tristimulus values
D65 = np.array([0.9504559270516716, 1, 1.0890577507598784], dtype=float)
D50 = np.array([0.96420288, 1, 0.82490540], dtype=float)

# Bradford chromatic adaptation matrix from D65 to D50
CHAD = np.zeros(9)
CHAD[:9:4] = (BRADFORD @ D50) / (BRADFORD @ D65)  # fill the diagonal
CHAD = BRADFORD_INV @ CHAD.reshape((3, 3)) @ BRADFORD
# sRGB primaries as xy chromaticities (full-precision values from GIMP)
Rx = 0.639998686
Ry = 0.330010138
Gx = 0.300003784
Gy = 0.600003357
Bx = 0.150002046
By = 0.059997204

Xr, Zr = xy_to_XY(Rx, Ry)
Xg, Zg = xy_to_XY(Gx, Gy)
Xb, Zb = xy_to_XY(Bx, By)

INTERIMAT = np.array([[Xr, Xg, Xb], [1, 1, 1], [Zr, Zg, Zb]], dtype=float)
MATINV = np.linalg.inv(INTERIMAT)
# luminances of the primaries that reproduce the D65 white point
D65_Y = MATINV @ D65
D65_XYZ = INTERIMAT * D65_Y  # scale each primary's column by its luminance

def xyY_to_XYZ(x: float, y: float, Y: float) -> Tuple[float, float, float]:
    return x * Y / y, Y, (1 - x - y) * Y / y
# adapt each sRGB primary from D65 to D50 and assemble the RGB -> XYZ matrix
D50_XYZ = np.vstack(
    [
        CHAD @ xyY_to_XYZ(Rx, Ry, D65_Y[0]),
        CHAD @ xyY_to_XYZ(Gx, Gy, D65_Y[1]),
        CHAD @ xyY_to_XYZ(Bx, By, D65_Y[2]),
    ]
).T
D65_RGB = np.linalg.inv(D65_XYZ)
D50_RGB = np.linalg.inv(D50_XYZ)
for i, c in enumerate("XYZ"):
    for j, d in enumerate("rgb"):
        print(f"D65_{c}{d} = {D65_XYZ[i, j]}")
print()
for i, c in enumerate("RGB"):
    for j, d in enumerate("xyz"):
        print(f"D65_{c}{d} = {D65_RGB[i, j]}")
print()
for i, c in enumerate("XYZ"):
    for j, d in enumerate("rgb"):
        print(f"D50_{c}{d} = {D50_XYZ[i, j]}")
print()
for i, c in enumerate("RGB"):
    for j, d in enumerate("xyz"):
        print(f"D50_{c}{d} = {D50_RGB[i, j]}")
I now have the full-precision values, so I used them in my working code that converts between BGR and LCh:
import numba as nb
import numpy as np
from math import atan2, cos, pi, sin
from typing import Callable, Tuple

# D65 white point (X and Z; Y is 1 by definition)
D65_Xw = 0.9504559270516716
D65_Zw = 1.0890577507598784
# sRGB -> XYZ matrix for D65 (rows X, Y, Z; columns r, g, b)
D65_Xr = 0.4123835774573348
D65_Xg = 0.35758636076837935
D65_Xb = 0.18048598882595746
D65_Yr = 0.21264225112116675
D65_Yg = 0.7151677022795175
D65_Yb = 0.07219004659931565
D65_Zr = 0.019324834131038457
D65_Zg = 0.11918543851645445
D65_Zb = 0.9505474781123853
# XYZ -> sRGB matrix for D65 (rows R, G, B; columns x, y, z)
D65_Rx = 3.2410639132702483
D65_Ry = -1.5374434989773638
D65_Rz = -0.49863738352233855
D65_Gx = -0.9692888172936756
D65_Gy = 1.875993314670902
D65_Gz = 0.04157078604801982
D65_Bx = 0.05564381729909414
D65_By = -0.20396692403457678
D65_Bz = 1.0569503107394616
# CIELAB constants
LAB_F0 = 216 / 24389       # epsilon: threshold of the linear segment
LAB_F1 = 841 / 108         # slope of the linear segment (= kappa / 116)
LAB_F2 = 4 / 29            # intercept of the linear segment
LAB_F3 = LAB_F0 ** (1 / 3) # same threshold in f-space (= 6 / 29)
LAB_F4 = 1 / LAB_F1
LAB_F5 = 1 / 2.4           # inverse sRGB gamma exponent
LAB_F6 = 0.04045 / 12.92   # sRGB linear-segment threshold, linear-light side
RtD = 180 / pi             # radians to degrees
DtR = pi / 180             # degrees to radians
@nb.njit(cache=True, fastmath=True)
def gamma_expand(c: float) -> float:
    # sRGB gamma decoding: encoded value -> linear light
    return c / 12.92 if c <= 0.04045 else ((c + 0.055) / 1.055) ** 2.4

@nb.njit(cache=True, fastmath=True)
def LABF(f: float) -> float:
    # CIELAB f function: cube root with a linear segment near zero
    return f ** (1 / 3) if f >= LAB_F0 else LAB_F1 * f + LAB_F2

@nb.njit(cache=True, fastmath=True)
def LABINVF(f: float) -> float:
    # inverse of LABF
    return f**3 if f >= LAB_F3 else LAB_F4 * (f - 4 / 29)

@nb.njit(cache=True, fastmath=True)
def gamma_contract(n: float) -> float:
    # sRGB gamma encoding, with clamping to [0, 1]
    n = n * 12.92 if n <= LAB_F6 else (1.055 * n**LAB_F5) - 0.055
    return 0.0 if n < 0 else (1.0 if n > 1 else n)
@nb.njit(cache=True, fastmath=True)
def BGR_to_LCh_D65(b: float, g: float, r: float) -> Tuple[float, float, float]:
    b = gamma_expand(b)
    g = gamma_expand(g)
    r = gamma_expand(r)
    # linear RGB -> XYZ, normalized by the white point, then the LAB f function
    x = LABF((D65_Xr * r + D65_Xg * g + D65_Xb * b) / D65_Xw)
    y = LABF(D65_Yr * r + D65_Yg * g + D65_Yb * b)
    z = LABF((D65_Zr * r + D65_Zg * g + D65_Zb * b) / D65_Zw)
    m = 500 * (x - y)  # a*
    n = 200 * (y - z)  # b*
    # L, C (chroma), h (hue angle in degrees)
    return 116 * y - 16, (m * m + n * n) ** 0.5, (atan2(n, m) * RtD) % 360
@nb.njit(cache=True, fastmath=True)
def LCh_D65_to_BGR(l: float, c: float, h: float) -> Tuple[float, float, float]:
    h *= DtR
    l = (l + 16) / 116
    x = D65_Xw * LABINVF(l + c * cos(h) / 500)
    y = LABINVF(l)
    z = D65_Zw * LABINVF(l - c * sin(h) / 200)
    r = D65_Rx * x + D65_Ry * y + D65_Rz * z
    g = D65_Gx * x + D65_Gy * y + D65_Gz * z
    b = D65_Bx * x + D65_By * y + D65_Bz * z
    # crude gamut mapping: shift out-of-gamut colors back into range
    m = min(r, g, b)
    if m < 0:
        r -= m
        g -= m
        b -= m
    return gamma_contract(b), gamma_contract(g), gamma_contract(r)
@nb.njit(cache=True, parallel=True)
def loop_LCh(img: np.ndarray, mode: Callable) -> np.ndarray:
    height, width = img.shape[:2]
    out = np.empty_like(img)
    for y in nb.prange(height):
        for x in nb.prange(width):
            b, g, r = img[y, x]
            out[y, x] = mode(b, g, r)
    return out

@nb.njit
def IMG_to_LCh_D65(img: np.ndarray) -> np.ndarray:
    return loop_LCh(img, BGR_to_LCh_D65)

@nb.njit
def LCh_D65_to_IMG(lch: np.ndarray) -> np.ndarray:
    return loop_LCh(lch, LCh_D65_to_BGR)
It works, and I have rigorously verified its correctness, but it is very slow.
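For example, the round trip can be spot-checked like this (a minimal sketch; the tolerance is my choice):

img = np.random.random((64, 64, 3))  # random BGR image in [0, 1]
lch = IMG_to_LCh_D65(img)
back = LCh_D65_to_IMG(lch)
# the conversion and its inverse should cancel out up to rounding error
assert np.allclose(img, back, atol=1e-9)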
For reference, below is the code I use to convert from BGR to HSL:
@nb.njit(cache=True, fastmath=True)
def extrema(a: float, b: float, c: float) -> Tuple[float, float, float]:
    # sort the three channels; i identifies the largest one
    # (2 = third argument, 1 = second, 0 = first)
    i = 2
    if b > c:
        b, c = c, b
        i = 1
    if a > b:
        a, b = b, a
    if b > c:
        b, c = c, b
        i = 0
    return i, a, c
@nb.njit(cache=True, fastmath=True)
def hue(b: float, g: float, r: float, d: float, i: float) -> float:
    if i == 2:  # red is the largest channel
        h = (g - b) / (6 * d)
    elif i:  # green is the largest channel
        h = 1 / 3 + (b - r) / (6 * d)
    else:  # blue is the largest channel
        h = 2 / 3 + (r - g) / (6 * d)
    return h % 1
@nb.njit(cache=True, fastmath=True)
def HSL_pixel(
    b: float, g: float, r: float, i: float, x: float, z: float
) -> Tuple[float, float, float]:
    s = x + z
    d = z - x
    avg = s / 2
    return (hue(b, g, r, d, i), d / (1 - abs(s - 1)), avg) if d else (0, 0, avg)
@nb.njit(cache=True, parallel=True)
def from_BGR(img: np.ndarray, mode: Callable) -> np.ndarray:
    height, width = img.shape[:2]
    out = np.empty_like(img)
    for y in nb.prange(height):
        for x in nb.prange(width):
            b, g, r = img[y, x]
            i, a, c = extrema(b, g, r)
            out[y, x] = mode(b, g, r, i, a, c)
    return out

@nb.njit
def BGR_to_HSL(img: np.ndarray) -> np.ndarray:
    return from_BGR(img, HSL_pixel)
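A single pixel can also be sanity-checked against the standard library's colorsys (a minimal sketch; note that colorsys.rgb_to_hls takes RGB and returns (h, l, s), while my function returns (h, s, l)):

import colorsys

b, g, r = 0.25, 0.5, 0.75
i, lo, hi = extrema(b, g, r)
h, s, l = HSL_pixel(b, g, r, i, lo, hi)
hh, ll, ss = colorsys.rgb_to_hls(r, g, b)
assert max(abs(h - hh), abs(s - ss), abs(l - ll)) < 1e-12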
BGR_to_HSL takes about 20 milliseconds to process a 1920×1080 image, but IMG_to_LCh_D65 takes about 256 milliseconds for the same image, which is over ten times slower.
In [2]: img = np.random.random((1080, 1920, 3))
In [3]: %timeit BGR_to_HSL(img)
19.4 ms ± 889 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [4]: %timeit BGR_to_HSL(img)
22.1 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [5]: %timeit IMG_to_LCh_D65(img)
257 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [6]: %timeit IMG_to_LCh_D65(img)
268 ms ± 18.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [7]: b, g, r = np.random.random(3)
In [8]: %timeit i, a, c = extrema(b, g, r)
479 ns ± 14.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
In [9]: %timeit i, a, c = extrema(b, g, r); HSL_pixel(b, g, r, i, a, c)
1.02 µs ± 4.98 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
In [10]: %timeit BGR_to_LCh_D65(b, g, r)
913 ns ± 19.5 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
The two functions have very similar structures: both use a two-level nested loop and process the pixels in parallel; only the per-pixel computation differs.
As the timings above show, the per-pixel work in BGR_to_LCh_D65 (913 ns) is actually faster than the extrema plus HSL_pixel path (1.02 µs). So why does the function whose per-pixel computation is faster take far longer overall, and how do I fix that?
Furthermore, people often say Python is slow and C++ is very fast, and I agree, so I want to offload the heavy computation to C++ and use it as a shared library.
I have no idea how to do that: I want to compile to a .dll, but so far I have only managed to produce .exe files. I am still a beginner in C++, but I did manage to port the BGR-to-HSL code. It compiled successfully, but it was extremely inefficient, so I didn’t port the LCh code:
#include <algorithm>
#include <chrono>
#include <cmath>    // fmod, abs
#include <cstdlib>  // rand, RAND_MAX
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

constexpr double one_third = 1.0 / 3.0;
constexpr double two_third = 2.0 / 3.0;

using std::tuple;
using std::vector;
using pixel = vector<double>;
using line = vector<pixel>;
using image = vector<line>;
using std::chrono::steady_clock;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::microseconds;
using std::cout;
using std::function;

double wrap(double d) {
    d = std::fmod(d, 1.0);
    return d < 0 ? d + 1.0 : d;
}
void fill(pixel& p) {
    p[0] = double(rand()) / RAND_MAX;
    p[1] = double(rand()) / RAND_MAX;
    p[2] = double(rand()) / RAND_MAX;
}

image random_image(int64_t height, int64_t width) {
    image image(height, line(width, pixel(3)));
    for (auto& l : image) {
        std::for_each(l.begin(), l.end(), fill);
    }
    return image;
}
tuple<int, double, double> extrema(double a, double b, double c) {
    int i = 2;
    if (b > c) {
        std::swap(b, c);
        i = 1;
    }
    if (a > b) {
        std::swap(a, b);
    }
    if (b > c) {
        std::swap(b, c);
        i = 0;
    }
    return { i, a, c };
}
double hue(double b, double g, double r, double d, double i) {
    double h;
    if (i == 2) {
        h = (g - b) / (6 * d);
    } else if (i) {
        h = one_third + (b - r) / (6 * d);
    } else {
        h = two_third + (r - g) / (6 * d);
    }
    return wrap(h);
}

pixel HSL_pixel(double b, double g, double r) {
    auto [i, x, z] = extrema(b, g, r);
    double s = x + z;
    double d = z - x;
    double avg = s / 2;
    return d ? pixel { hue(b, g, r, d, i), d / (1 - std::abs(s - 1)), avg }
             : pixel { 0.0, 0.0, avg };
}
image BGR_to_HSL(const image& img) {
    size_t height = img.size();
    size_t width = img[0].size();
    image out(height, line(width, pixel(3)));
    for (size_t y = 0; y < height; y++) {
        for (size_t x = 0; x < width; x++) {
            pixel px = img[y][x];
            out[y][x] = HSL_pixel(px[0], px[1], px[2]);
        }
    }
    return out;
}
double timeit(function<image(image)> func, image img, int runs = 256) {
    auto start = steady_clock::now();
    for (int64_t i = 0; i < runs; i++) {
        func(img);
    }
    auto end = steady_clock::now();
    duration<double, std::nano> time = end - start;
    return time.count() / runs / 1000.0;
}

int main()
{
    image img = random_image(1080, 1920);
    double once = timeit(BGR_to_HSL, img, 16);
    cout << "converting 1920x1080 BGR image to HSL vector: " + std::to_string(once) + " microseconds\n";
}
PS C:\Users\Xeni> C:\Users\Xeni\source\repos\CIELCh\x64\Release\CIELCh.exe
converting 1920x1080 BGR image to HSL vector: 1024189.362500 microseconds
I don’t know whether the output is correct; presumably it is. Why is it so terribly slow, and how can I fix that so that I can use it from Python?
There are two major problems in your code:
- allocation of vectors
- data that is not contiguous in memory

Both can be solved by converting the std::vector<std::vector<std::vector<double>>> into a vector<array<double,3>> in which the image is flattened in memory, while keeping track of the width and height separately in a struct.
Also, to get good benchmarks you should move the allocation of the output vector outside of the benchmarked code, and fix your random seed.
Lastly, when the input of a function falls within a known set, it is better to use a switch than multiple if-else branches.
#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

constexpr double one_third = 1.0 / 3.0;
constexpr double two_third = 2.0 / 3.0;

using std::tuple;
using std::vector;
using pixel = std::array<double, 3>;

// flat image: pixels stored contiguously, dimensions kept alongside
struct image
{
    vector<pixel> data;
    int height;
    int width;
};

using std::chrono::steady_clock;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::microseconds;
using std::cout;
using std::function;

double wrap(double d) {
    d = std::fmod(d, 1.0);
    return d < 0 ? d + 1.0 : d;
}
void fill(pixel& p) {
    p[0] = double(rand()) / RAND_MAX;
    p[1] = double(rand()) / RAND_MAX;
    p[2] = double(rand()) / RAND_MAX;
}

image random_image(int height, int width) {
    srand(0);  // fixed seed for reproducible benchmarks
    image image{vector<pixel>(height * width), height, width};
    for (auto& l : image.data) {
        fill(l);
    }
    return image;
}
tuple<int, double, double> extrema(double a, double b, double c) {
    int i = 2;
    if (b > c) {
        std::swap(b, c);
        i = 1;
    }
    if (a > b) {
        std::swap(a, b);
    }
    if (b > c) {
        std::swap(b, c);
        i = 0;
    }
    return { i, a, c };
}
double hue(double b, double g, double r, double d, int i) {
    double h;
    switch (i)
    {
    case 2:
        h = (g - b) / (6 * d);
        break;
    case 1:
        h = one_third + (b - r) / (6 * d);
        break;
    default:
        h = two_third + (r - g) / (6 * d);
        break;
    }
    return wrap(h);
}
pixel HSL_pixel(double b, double g, double r) {
    auto [i, x, z] = extrema(b, g, r);
    double s = x + z;
    double d = z - x;
    double avg = s / 2;
    return d ? pixel { hue(b, g, r, d, i), d / (1 - std::abs(s - 1)), avg }
             : pixel { 0.0, 0.0, avg };
}
void BGR_to_HSL(const image& img, image& out) {
    size_t height = img.height;
    size_t width = img.width;
    //#pragma omp parallel for
    for (size_t y = 0; y < height; y++) {
        for (size_t x = 0; x < width; x++) {
            pixel px = img.data[width * y + x];
            out.data[width * y + x] = HSL_pixel(px[0], px[1], px[2]);
        }
    }
}
double timeit(function<void(image&, image&)> func, image img, int runs = 256) {
    // allocate the output once, outside the timed region
    image out{vector<pixel>(img.width * img.height), img.height, img.width};
    auto start = steady_clock::now();
    for (int64_t i = 0; i < runs; i++) {
        func(img, out);
    }
    auto end = steady_clock::now();
    duration<double, std::nano> time = end - start;
    return time.count() / runs / 1000.0;
}

int main()
{
    image img = random_image(1080, 1920);
    double once = timeit(BGR_to_HSL, img, 256);
    cout << "converting 1920x1080 BGR image to HSL vector: " + std::to_string(once) + " microseconds\n";
}
The resulting time went from about 0.9 seconds to around 60 ms on a 2.8 GHz core, and without allocations you get almost perfect parallelization: with 10 cores you get roughly a tenth of the time, just by uncommenting the #pragma omp parallel for and enabling OpenMP in your compiler with -fopenmp.
More optimization is possible by using smaller data types such as float or uint8, but that depends on your required accuracy.
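As for calling this from Python, one option is ctypes. Below is a minimal sketch, assuming the loop is wrapped in an exported function with C linkage, something like extern "C" void bgr_to_hsl(const double* in, double* out, int height, int width), and compiled as a shared library (for example g++ -O3 -fopenmp -shared -fPIC on GCC, or a DLL project in Visual Studio). The function name, its signature, and the library name are illustrative assumptions, not taken from the code above:

import ctypes
import numpy as np

# hypothetical shared library exposing
#   extern "C" void bgr_to_hsl(const double*, double*, int, int);
lib = ctypes.CDLL("./bgr2hsl.so")  # "bgr2hsl.dll" on Windows
lib.bgr_to_hsl.restype = None
lib.bgr_to_hsl.argtypes = [
    ctypes.POINTER(ctypes.c_double),  # input, flat C-contiguous (h, w, 3)
    ctypes.POINTER(ctypes.c_double),  # output, preallocated, same shape
    ctypes.c_int,                     # height
    ctypes.c_int,                     # width
]

def bgr_to_hsl(img: np.ndarray) -> np.ndarray:
    # NumPy owns the memory; C++ only reads and writes through raw pointers
    img = np.ascontiguousarray(img, dtype=np.float64)
    out = np.empty_like(img)
    lib.bgr_to_hsl(
        img.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        out.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        img.shape[0],
        img.shape[1],
    )
    return out

Because a C-contiguous NumPy array matches the flat layout of the struct above, no copying is needed beyond what ascontiguousarray may do for non-contiguous inputs.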
Any question regarding the speed of a C++ program must be accompanied by the compiler, the compiler version, and the options used to build the program. Most important are the optimization settings. If you are timing a “debug” or unoptimized build, the timings you are showing are meaningless.
Representing an image as std::vector<std::vector<std::vector<double>>> is a pretty bad choice. You want a flat, linear array of pixel intensities. Did you profile your code? I bet even allocating that takes a noticeable amount of time. Using double to represent the intensities is probably overkill too.
As in, your memory layout is likely cache-hostile and prevents effective vectorization. Also, integrating that with Python would be impractical (you would always need to copy data back and forth). If you are after performance and interoperability with Python, you want to be compatible with NumPy arrays, which are the common data structure used to represent image data (although you might want to limit what layouts you support, at least initially: things like negative strides, or even just strides at all).
Had a look at it with VTune, and as I expected, the vast majority of the time is spent allocating/deallocating memory and copying things around.
I’m not sure this question is well suited to Stack Overflow. It is extremely unlikely to be asked by more than one person, which runs counter to SO’s purpose of being a repository of questions that someone else might ask and need an answer to.