C++ UTF-8 codepoint conversion
#include <iostream>
#include <sstream>
using namespace std;
// Decodes the UTF-8 sequence at the start of u and returns its code point;
// returns -1 when the input cannot be decoded (e.g. empty or truncated).
int codepoint(const string &u);
// Returns codepoint(u) formatted as hexadecimal text with a base prefix
// (e.g. "0x41" for "A").
string codepointhex(const string &u);
// Encodes code point cp as a UTF-8 byte string; the result is empty for the
// surrogate range 0xD800-0xDFFF and for values above 0x10FFFF.
string utf8chr(int cp);
// Demo driver: prints the UTF-8 encoding of a few code point ranges, then
// decodes three sample strings back to their code points.
int main(int argc, char *argv[])
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    // Inclusive [first, last] code point ranges to print. Keeping the bounds
    // inclusive makes each range match its comment exactly (the Latin range
    // previously stopped one short of ž, U+017E = 382).
    static const int ranges[][2] = {
        { 32,     126    }, // printable ascii range
        { 192,    382    }, // À to ž
        { 0x4f60, 0x4f7f }  // 你 to 使
    };
    const int range_count = sizeof(ranges) / sizeof(ranges[0]);

    for (int r = 0; r < range_count; r++)
    {
        for (int i = ranges[r][0]; i <= ranges[r][1]; i++)
        {
            std::cout << "i" << i << ":" << utf8chr(i) << '\n';
        }
    }

    // Round-trip samples; expected output after the echoed character:
    //   A -> 65,0x41    è -> 232,0xe8    你 -> 20320,0x4f60
    const std::string samples[] = {
        "A",            // A is ascii 65
        "\xc3\xa8",     // è
        "\xe4\xbd\xa0"  // 你
    };
    const int sample_count = sizeof(samples) / sizeof(samples[0]);

    for (int s = 0; s < sample_count; s++)
    {
        std::cout << samples[s] << codepoint(samples[s]) << ","
                  << codepointhex(samples[s]) << '\n';
    }
    return 0;
}
// Decodes the first UTF-8 encoded character in `u` and returns its Unicode
// code point. Any bytes after that first character are ignored.
//
// Returns -1 when `u` does not start with a well-formed UTF-8 sequence:
//   - empty or truncated input,
//   - an invalid lead byte (a stray continuation byte, or 0xF8 and above),
//   - a trailing byte that is not a continuation byte (10xxxxxx),
//   - an overlong encoding (value encoded with more bytes than necessary),
//   - a surrogate code point (0xD800-0xDFFF) or a value above 0x10FFFF.
int codepoint(const std::string &u)
{
    const std::string::size_type len = u.length();
    if (len < 1) return -1;

    // Always work on unsigned bytes: plain char may be signed, in which case
    // comparisons against constants such as 0xed would never be true.
    const unsigned char lead = static_cast<unsigned char>(u[0]);
    if (lead <= 0x7f) return lead; // single byte: U+0000..U+007F

    std::string::size_type total; // number of bytes in the sequence
    int cp;                       // payload bits accumulated so far
    int min_cp;                   // smallest value that needs `total` bytes
    if (lead >= 0xc0 && lead <= 0xdf)      { total = 2; cp = lead & 0x1f; min_cp = 0x80; }
    else if (lead >= 0xe0 && lead <= 0xef) { total = 3; cp = lead & 0x0f; min_cp = 0x800; }
    else if (lead >= 0xf0 && lead <= 0xf7) { total = 4; cp = lead & 0x07; min_cp = 0x10000; }
    else return -1; // continuation byte used as lead, or 0xF8..0xFF

    if (len < total) return -1; // truncated sequence

    for (std::string::size_type i = 1; i < total; ++i)
    {
        const unsigned char b = static_cast<unsigned char>(u[i]);
        if ((b & 0xc0) != 0x80) return -1; // not a continuation byte
        cp = (cp << 6) | (b & 0x3f);
    }

    if (cp < min_cp) return -1;                   // overlong encoding
    if (cp >= 0xd800 && cp <= 0xdfff) return -1;  // surrogates are not valid in UTF-8
    if (cp > 0x10ffff) return -1;                 // beyond the Unicode range
    return cp;
}
// Returns the code point of the first character of `u` as hexadecimal text
// with a base prefix, e.g. "0x41" for "A". The value formatted is whatever
// codepoint(u) returns, including its error value.
std::string codepointhex(const std::string &u)
{
    std::ostringstream formatted;
    formatted << std::showbase << std::hex << codepoint(u);
    return formatted.str();
}
// Encodes the Unicode code point `cp` as a UTF-8 byte string (1 to 4 bytes).
//
// Returns an empty string when `cp` is not an encodable scalar value:
// negative (such as the -1 error value of codepoint()), inside the surrogate
// block 0xD800-0xDFFF, or above 0x10FFFF.
//
// U+0000 is encoded as a one-byte string holding a NUL byte, so it can be
// told apart from the empty error result.
std::string utf8chr(int cp)
{
    std::string out;
    if (cp < 0)
    {
        return out; // negative values are not code points
    }

    if (cp <= 0x7f)
    {
        out += static_cast<char>(cp);
    }
    else if (cp <= 0x7ff)
    {
        out += static_cast<char>(0xc0 | (cp >> 6));
        out += static_cast<char>(0x80 | (cp & 0x3f));
    }
    else if (cp >= 0xd800 && cp <= 0xdfff)
    {
        // surrogate block: has no UTF-8 encoding, leave the result empty
    }
    else if (cp <= 0xffff)
    {
        out += static_cast<char>(0xe0 | (cp >> 12));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        out += static_cast<char>(0x80 | (cp & 0x3f));
    }
    else if (cp <= 0x10ffff)
    {
        out += static_cast<char>(0xf0 | (cp >> 18));
        out += static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        out += static_cast<char>(0x80 | (cp & 0x3f));
    }
    return out;
}
// code snippets are licensed under Creative Commons CC-By-SA 3.0 (unless otherwise specified)
|
|
abhishek
on
2014-08-01 12:12:58
Thank you so much. I struggled a lot, but nowhere was it mentioned how to map a codepoint to a UTF-8 character sequence. Great work, buddy.
|
|
|
Teus Benschop
on
2015-07-17 18:40:17
Thank you for the conversion from a code point to a UTF-8 string. At first I used:
// wstring_convert<codecvt_utf8<char32_t>, char32_t> conv1; // string u8str = conv1.to_bytes(codepoint); That worked well on some OSes, but was not available in GNU libstdc++. Your code replaced it well. Thanks! |
|