I want to print a range of Unicode characters, from U+00B2 to U+00B5 (and many other ranges). How can I print these programmatically?
Obviously, I don’t know how to define a variable that holds a Unicode character and then do arithmetic (increment) on it.
I am looking for the loop equivalent of the code below.
How can I programmatically print an arbitrary range of Unicode characters?
#include <iostream>
// Prints the four characters U+00B2, U+00B3, U+00B4 and U+00B5, one per line,
// with every character spelled out by hand (no loop).
int main() {
    // Each std::endl ends the line and flushes std::cout.
    std::cout << "\u00B2" << std::endl
              << "\u00B3" << std::endl
              << "\u00B4" << std::endl
              << "\u00B5" << std::endl;
    return 0;
}
Output (Unicode characters):
²
³
´
µ
Not sure if I understood your question correctly. However, you can achieve this programmatically by using a loop and incrementing the Unicode code points. Here’s an example in C++:
#include <iostream>
#include <locale>
// Prints every code point from U+00B2 through U+00B5 (inclusive), one per line,
// as wide characters on std::wcout.
int main() {
    // Switch from the default "C" locale to the user's environment locale so that
    // wide characters can be converted to the terminal's encoding.
    // std::locale("") throws std::runtime_error if the environment names a locale
    // the system does not provide.
    std::locale::global(std::locale(""));
    // Replacing the global locale does not re-imbue streams that already exist,
    // and std::wcout was constructed before the call above. Imbue it explicitly so
    // the conversion does not depend on how the standard library implements wcout.
    std::wcout.imbue(std::locale());

    const wchar_t start = 0x00B2; // Unicode code point for '²'
    const wchar_t end = 0x00B5;   // Unicode code point for 'µ'
    // wchar_t is an integer type, so a code point can be incremented directly.
    for (wchar_t codePoint = start; codePoint <= end; ++codePoint) {
        std::wcout << codePoint << std::endl;
    }
    return 0;
}
The above code uses wchar_t for wide characters and std::wcout for wide character output. The std::locale::global(std::locale("")) line sets the global locale to the user’s default, which is necessary for properly displaying Unicode characters based on the user’s environment.
Alternatively, you can encode to and decode from UTF-8 yourself:
/// Appends the UTF-8 encoding of one Unicode code point to `s`.
///
/// Values that UTF-8 must not encode (surrogates U+D800..U+DFFF and anything
/// above U+10FFFF) are replaced by U+FFFD REPLACEMENT CHARACTER, so the bytes
/// appended are always well-formed UTF-8 and never a silently truncated value.
///
/// @param s        String that receives the 1 to 4 encoded bytes.
/// @param unicode  Code point to encode.
void append_utf8 (::std::string& s, ::std::uint32_t unicode)
{
    const bool is_surrogate = 0xD8'00 <= unicode && unicode <= 0xDF'FF;
    if (is_surrogate || 0x10'FF'FF < unicode)
    {
        unicode = 0xFF'FD;
    }

    if (unicode < 0x80)
    {
        // 1 byte: 0xxxxxxx
        s.push_back (static_cast<char> (unicode));
    }
    else if (unicode < 0x08'00)
    {
        // 2 bytes: 110xxxxx 10xxxxxx
        s.push_back (static_cast<char> ((unicode >> 6) | 0xC0));
        s.push_back (static_cast<char> ((unicode & 0x3F) | 0x80));
    }
    else if (unicode < 0x00'01'00'00)
    {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        s.push_back (static_cast<char> ((unicode >> 12) | 0xE0));
        s.push_back (static_cast<char> (((unicode >> 6) & 0x3F) | 0x80));
        s.push_back (static_cast<char> ((unicode & 0x3F) | 0x80));
    }
    else
    {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s.push_back (static_cast<char> ((unicode >> 18) | 0xF0));
        s.push_back (static_cast<char> (((unicode >> 12) & 0x3F) | 0x80));
        s.push_back (static_cast<char> (((unicode >> 6) & 0x3F) | 0x80));
        s.push_back (static_cast<char> ((unicode & 0x3F) | 0x80));
    }
}
/// Returns the UTF-8 encoding of a single code point as a freshly built string.
///
/// Convenience wrapper: starts from an empty string and lets append_utf8 add
/// the encoded bytes.
///
/// @param unicode  Code point to encode.
/// @return A string holding only the bytes append_utf8 produced for `unicode`.
::std::string to_utf8 (::std::uint32_t unicode)
{
    ::std::string encoded{};
    append_utf8 (encoded, unicode);
    return encoded;
}
//--------------------------------------------------------------------------
/// Decodes the UTF-8 sequence whose lead byte is `text[pos]`.
///
/// On success `pos` is advanced past the decoded sequence (1 to 4 bytes).
/// On failure an exception is thrown and `pos` is left unchanged.
///
/// Usable in constant expressions under C++17: the continuation bytes are
/// checked with a hand-written loop because std::all_of is only constexpr from
/// C++20 onwards.
///
/// @param text  UTF-8 encoded input.
/// @param pos   In/out byte offset of the lead byte of the sequence to decode.
/// @return The decoded Unicode code point.
/// @throws std::out_of_range  if `pos` is not inside `text`, or if the sequence
///         is truncated or contains a byte that is not a continuation byte.
/// @throws std::runtime_error if the lead byte cannot start a sequence, or if
///         the sequence is an overlong form, encodes a surrogate
///         (U+D800..U+DFFF), or encodes a value above U+10FFFF.
constexpr ::std::uint32_t decodeUtf8 (const ::std::string_view& text, ::std::size_t& pos)
{
    if (text.size() <= pos)
    {
        throw ::std::out_of_range ("invalid position for decodeUtf8");
    }
    const ::std::uint32_t c0 = static_cast<unsigned char> (text[pos]);

    // Classify the lead byte: total sequence length, the payload bits it carries,
    // and the smallest code point that genuinely needs that many bytes.
    ::std::size_t length = 1;
    ::std::uint32_t value = c0;
    ::std::uint32_t smallest = 0;
    if ((c0 & 0b1111'1000) == 0b1111'0000)
    {
        length = 4;
        value = c0 & 0b0000'0111;
        smallest = 0x01'00'00;
    }
    else if ((c0 & 0b1111'0000) == 0b1110'0000)
    {
        length = 3;
        value = c0 & 0b0000'1111;
        smallest = 0x08'00;
    }
    else if ((c0 & 0b1110'0000) == 0b1100'0000)
    {
        length = 2;
        value = c0 & 0b0001'1111;
        smallest = 0x80;
    }
    else if ((c0 & 0b1000'0000) != 0)
    {
        // A stray continuation byte (10xxxxxx) or a lead byte of 0xF8 and above.
        throw ::std::runtime_error ("Invalid utf8 character for decodeUtf8");
    }

    // pos < text.size() was established above, so the subtraction cannot wrap.
    if (text.size() - pos < length)
    {
        throw ::std::out_of_range ("invalid position for decodeUtf8");
    }
    for (::std::size_t i = 1; i < length; ++i)
    {
        const ::std::uint32_t extra = static_cast<unsigned char> (text[pos + i]);
        if ((extra & 0b1100'0000) != 0b1000'0000)
        {
            throw ::std::out_of_range ("invalid position for decodeUtf8");
        }
        value = (value << 6) | (extra & 0b0011'1111);
    }

    // Reject what well-formed UTF-8 never contains: overlong forms (a value that
    // fits in a shorter sequence), surrogates, and values beyond U+10FFFF.
    const bool is_surrogate = 0xD8'00 <= value && value <= 0xDF'FF;
    if (value < smallest || is_surrogate || 0x10'FF'FF < value)
    {
        throw ::std::runtime_error ("Invalid utf8 character for decodeUtf8");
    }

    pos += length;
    return value;
}
/// Decodes the UTF-8 sequence at the very start of `text`.
///
/// Forwards to the (text, pos) overload with a starting offset of 0 and
/// discards the updated offset, so any bytes after the first sequence are
/// ignored. Exceptions thrown by that overload propagate unchanged.
///
/// @param text  UTF-8 encoded input.
/// @return The code point decoded from the beginning of `text`.
constexpr ::std::uint32_t decodeUtf8 (const ::std::string_view& text)
{
    ::std::size_t offset{0};
    return decodeUtf8 (text, offset);
}
And then
// Prints U+00B2 .. U+00B5, one per line. Both bounds are decoded once in the
// for-init instead of re-decoding the end bound on every iteration; the end
// bound U+00B6 is exclusive and is not printed.
for (auto c = decodeUtf8 ("\u00B2"), last = decodeUtf8 ("\u00B6"); c != last; ++c) {
    std::cout << to_utf8 (c) << std::endl;
}
or
for (auto c = 0x00B2; c != 0x00B6; ++c) {
std::cout << to_utf8 (c) << std::endl;
}
@RemyLebeau, that’s right, I am that ignorant of string manipulation. I wasn’t even sure if wchar was for this, never mind the use of wcout. I was bound to get it wrong.