How do I print a range of Unicode characters in C++?

Question 1

I want to print a range of Unicode characters, from U+00B2 to U+00B5 (and many other ranges). How can I print these programmatically?

The problem is that I don’t know how to store a Unicode character in a variable and then do arithmetic on it (for example, increment it).

I am looking for the loop equivalent of the code below.

How can I programmatically print an arbitrary range of Unicode characters?

#include <iostream>

int main() {
    // The four characters U+00B2..U+00B5 are written out by hand, one escape
    // per character and one character per line, each followed by a flush.
    std::cout << "\u00B2" << std::endl   // SUPERSCRIPT TWO
              << "\u00B3" << std::endl   // SUPERSCRIPT THREE
              << "\u00B4" << std::endl   // ACUTE ACCENT
              << "\u00B5" << std::endl;  // MICRO SIGN
    return 0;
}

Output (Unicode characters):

²
³
´
µ

Answer 1

Not sure if I understood your question correctly. However, you can achieve this programmatically by using a loop and incrementing the Unicode code points. Here’s an example in C++:

#include <iostream>
#include <locale>

int main() {
    // Switch from the default "C" locale to the one configured in the user's
    // environment, so wide characters can be converted to the terminal's
    // encoding.
    std::locale::global(std::locale(""));

    // std::wcout already exists at this point and keeps the locale it was
    // created with; replacing the global locale only affects locale objects
    // and streams created afterwards. Imbue the stream explicitly so that its
    // wide-to-narrow conversion uses the new locale on every standard library.
    std::wcout.imbue(std::locale());

    const wchar_t start = 0x00B2;  // Unicode code point for '²'
    const wchar_t end = 0x00B5;    // Unicode code point for 'µ'

    // Inclusive range: prints every code point from start up to and including end.
    for (wchar_t codePoint = start; codePoint <= end; ++codePoint) {
        std::wcout << codePoint << std::endl;
    }

    return 0;
}

The above code uses wchar_t for wide characters and std::wcout for wide character output. The std::locale::global(std::locale("")) line sets the global locale to the user’s default, which is necessary for properly displaying Unicode characters based on the user’s environment.

Answer 2

You can encode code points to UTF-8, and decode them from UTF-8, yourself:

// Appends the UTF-8 encoding of one Unicode code point to s.
//
// s        string that receives the encoded bytes; its existing content is kept.
// unicode  code point to encode. Only Unicode scalar values can be written as
//          well-formed UTF-8, so a surrogate (U+D800..U+DFFF) or a value above
//          U+10FFFF is encoded as U+FFFD REPLACEMENT CHARACTER instead of
//          producing an ill-formed or silently truncated byte sequence.
void append_utf8 (::std::string& s, ::std::uint32_t unicode)
{
    const bool is_surrogate = 0xD800 <= unicode && unicode <= 0xDFFF;
    if (is_surrogate || unicode > 0x10'FF'FF)
    {
        unicode = 0xFFFD;
    }

    if (unicode < 0x80)
    {
        // 1 byte: 0xxxxxxx
        s.push_back (static_cast<char> (unicode & 0x7F));
    }
    else if (unicode < 0x08'00)
    {
        // 2 bytes: 110xxxxx 10xxxxxx
        s.push_back (static_cast<char> (((unicode >> 6) & 0x1F) | 0xC0));
        s.push_back (static_cast<char> ((unicode & 0x3F) | 0x80));
    }
    else if (unicode < 0x00'01'00'00)
    {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        s.push_back (static_cast<char> (((unicode >> 12) & 0x0F) | 0xE0));
        s.push_back (static_cast<char> (((unicode >> 6) & 0x3F) | 0x80));
        s.push_back (static_cast<char> ((unicode & 0x3F) | 0x80));
    }
    else
    {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s.push_back (static_cast<char> (((unicode >> 18) & 0x07) | 0xF0));
        s.push_back (static_cast<char> (((unicode >> 12) & 0x3F) | 0x80));
        s.push_back (static_cast<char> (((unicode >> 6) & 0x3F) | 0x80));
        s.push_back (static_cast<char> ((unicode & 0x3F) | 0x80));
    }
}

// Returns the UTF-8 encoding of a single code point as a new string.
// The encoding itself is delegated to append_utf8, starting from an empty string.
::std::string to_utf8 (::std::uint32_t unicode)
{
    ::std::string encoded{};
    append_utf8 (encoded, unicode);
    return encoded;
}

//--------------------------------------------------------------------------
// Decodes the single UTF-8 encoded code point that starts at text[pos].
//
// text  bytes to decode.
// pos   in: index of the first byte of the sequence;
//       out: index of the first byte after the sequence.
//       Left unchanged when an exception is thrown.
//
// Returns the decoded code point.
//
// Throws ::std::out_of_range  if pos is not inside text, if the sequence is cut
//                             off by the end of text, or if one of the expected
//                             continuation bytes is not of the form 10xxxxxx.
// Throws ::std::runtime_error if text[pos] cannot start a sequence, or if the
//                             sequence is ill-formed UTF-8: an overlong encoding,
//                             an encoded surrogate (U+D800..U+DFFF) or a value
//                             above U+10FFFF.
//
// Only core-language constructs are used so that every successful path can be
// evaluated in a constant expression under C++17.
constexpr ::std::uint32_t decodeUtf8 (const ::std::string_view& text, ::std::size_t& pos)
{
    if (text.size() <= pos)
    {
        throw ::std::out_of_range ("invalid position for decodeUtf8");
    }
    const ::std::uint32_t c0 = static_cast<unsigned char> (text[pos]);

    // Classify the lead byte: how many continuation bytes follow, which payload
    // bits the lead byte contributes, and the smallest code point that really
    // needs a sequence of this length (anything below it is an overlong form).
    ::std::size_t extra = 0;
    ::std::uint32_t code_point = 0;
    ::std::uint32_t smallest = 0;
    if ((c0 & 0b1000'0000) == 0)
    {
        code_point = c0;  // 0xxxxxxx
    }
    else if ((c0 & 0b1110'0000) == 0b1100'0000)
    {
        extra = 1;  // 110xxxxx
        code_point = c0 & 0b0001'1111;
        smallest = 0x80;
    }
    else if ((c0 & 0b1111'0000) == 0b1110'0000)
    {
        extra = 2;  // 1110xxxx
        code_point = c0 & 0b0000'1111;
        smallest = 0x08'00;
    }
    else if ((c0 & 0b1111'1000) == 0b1111'0000)
    {
        extra = 3;  // 11110xxx
        code_point = c0 & 0b0000'0111;
        smallest = 0x01'00'00;
    }
    else
    {
        // A continuation byte (10xxxxxx) or 11111xxx cannot start a sequence.
        throw ::std::runtime_error ("Invalid utf8 character for decodeUtf8");
    }

    // pos < text.size() was established above, so the subtraction cannot wrap.
    if (text.size() - pos <= extra)
    {
        throw ::std::out_of_range ("invalid position for decodeUtf8");
    }
    for (::std::size_t i = 1; i <= extra; ++i)
    {
        const ::std::uint32_t c = static_cast<unsigned char> (text[pos + i]);
        if ((c & 0b1100'0000) != 0b1000'0000)
        {
            throw ::std::out_of_range ("invalid position for decodeUtf8");
        }
        code_point = (code_point << 6) | (c & 0b0011'1111);
    }

    const bool is_overlong = code_point < smallest;
    const bool is_surrogate = 0xD800 <= code_point && code_point <= 0xDFFF;
    if (is_overlong || is_surrogate || code_point > 0x10'FF'FF)
    {
        throw ::std::runtime_error ("Invalid utf8 character for decodeUtf8");
    }

    pos += extra + 1;
    return code_point;
}

// Convenience overload: decodes the code point that starts at the first byte
// of text. Anything after that first sequence is not looked at.
constexpr ::std::uint32_t decodeUtf8 (const ::std::string_view& text)
{
    ::std::size_t cursor{0};  // the advanced position is discarded
    return decodeUtf8 (text, cursor);
}

And then

// Prints U+00B2 through U+00B5, one per line; U+00B6 is the exclusive end bound.
// Both bounds are decoded once in the loop initializer: the end bound never
// changes, so decoding it again for every comparison would be wasted work.
// NOTE: decodeUtf8 parses UTF-8, so this relies on the compiler encoding narrow
// string literals as UTF-8.
for (auto c = decodeUtf8 ("\u00B2"), past_last = decodeUtf8 ("\u00B6"); c != past_last; ++c) {
    std::cout << to_utf8 (c) << std::endl;
}

or

for (auto c = 0x00B2; c != 0x00B6; ++c) {
    std::cout << to_utf8 (c) << std::endl;
}

Demo

Leave a Comment Cancel reply