C++ UTF-8 substr function
#include <climits>
#include <iostream>
#include <string>
using namespace std;
// Forward declaration of utf8_substr (defined after main); it carries the
// default arguments used by the calls in main.
// NOTE(review): string::npos has the string's size type; converting it to the
// unsigned int parameter `leng` yields the largest unsigned int value, which on
// platforms where that size type is wider than unsigned int no longer compares
// equal to string::npos.
string utf8_substr(const string& str, unsigned int start=0, unsigned int leng=string::npos);
int main(int argc, char *argv[])
{
string str = string("ni hao");
string ni_hao = string("\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c");
cout << utf8_substr(ni_hao) << endl;
cout << utf8_substr(ni_hao,0) << endl;
cout << utf8_substr(ni_hao,1,0) << endl;
cout << utf8_substr(ni_hao,0,2) << endl;
cout << utf8_substr(ni_hao,2,2) << endl;
return 0;
}
// Returns the part of the UTF-8 encoded string `str` that begins at character
// (code point) index `start` and spans at most `leng` characters.
//
//   str   - UTF-8 encoded input.
//   start - zero-based index of the first character to return.
//   leng  - maximum number of characters to return. The largest unsigned int
//           value (what string::npos becomes when converted to unsigned int)
//           means "up to the end of the string".
//
// Returns an empty string when `leng` is 0, when `start` is at or past the
// last character, or when any byte in a lead position is not a valid UTF-8
// lead byte (the whole string is scanned, so invalid data after the requested
// range is also rejected).
std::string utf8_substr(const std::string& str, unsigned int start, unsigned int leng)
{
    if (leng == 0) { return ""; }

    // Sentinel for "no length limit"; compared in unsigned int so the check
    // also works where the string size type is wider than unsigned int.
    const bool to_end = (leng == static_cast<unsigned int>(std::string::npos));

    const std::string::size_type size = str.length();
    std::string::size_type first = std::string::npos;  // byte offset of character `start`
    std::string::size_type last = size;                // byte offset one past the result
    std::string::size_type i = 0;                      // current byte offset
    std::string::size_type q = 0;                      // current character index

    while (i < size)
    {
        if (q == start) { first = i; }
        // Written as a subtraction so that start + leng can never wrap around.
        if (!to_end && q > start && q - start == leng) { last = i; }

        const unsigned char c = static_cast<unsigned char>(str[i]);
        if (c <= 0x7F) { i += 1; }                    // 0bbbbbbb: single byte
        else if ((c & 0xE0) == 0xC0) { i += 2; }      // 110bbbbb: 2-byte sequence
        else if ((c & 0xF0) == 0xE0) { i += 3; }      // 1110bbbb: 3-byte sequence
        else if ((c & 0xF8) == 0xF0) { i += 4; }      // 11110bbb: 4-byte sequence
        else { return ""; }                           // invalid UTF-8 lead byte
        ++q;
    }

    if (first == std::string::npos) { return ""; }    // start is past the last character
    // substr takes (position, count); a truncated final sequence is clamped by `last`.
    return str.substr(first, last - first);
}
// Here is a two-pass version that also supports negative indices; it requires #include <climits>.
// Maps a byte in lead position to the length in bytes of the UTF-8 sequence it
// starts: 1-4 for a valid lead byte, 0 for a byte that cannot start a sequence
// (continuation bytes 0x80-0xBF and the values 0xF8-0xFF).
static int utf8_substr2_seq_len(unsigned char lead)
{
    if (lead <= 0x7F) return 1;
    if (lead >= 0xC0 && lead <= 0xDF) return 2;
    if (lead >= 0xE0 && lead <= 0xEF) return 3;
    if (lead >= 0xF0 && lead <= 0xF7) return 4;
    return 0;
}

// Two-pass UTF-8 substring that also accepts negative indices.
//
//   str    - UTF-8 encoded input.
//   start  - zero-based character (code point) index of the first character.
//            A negative value counts back from the end of the string; a value
//            that reaches before the first character is clamped to 0.
//   length - maximum number of characters to return. INT_MAX (the default)
//            means "up to the end"; a negative value drops that many
//            characters from the end of the string.
//
// Returns an empty string when `length` is 0, when the requested range is
// empty or starts past the last character, or when any byte in a lead position
// is not a valid UTF-8 lead byte.
std::string utf8_substr2(const std::string &str, int start, int length = INT_MAX)
{
    if (length == 0) return "";

    const std::string::size_type size = str.length();
    const bool to_end = (length == INT_MAX);

    // Widened copies so that normalising negative values cannot overflow.
    long long first = start;   // index of the first character to return
    long long count = length;  // number of characters to return (unused when to_end)

    if (start < 0 || length < 0)
    {
        // Pass 1: count the characters so negative values can be made absolute.
        long long total = 0;
        for (std::string::size_type i = 0; i < size; ++total)
        {
            const int step = utf8_substr2_seq_len(static_cast<unsigned char>(str[i]));
            if (step == 0) return "";  // invalid utf8
            i += static_cast<std::string::size_type>(step);
        }
        if (first < 0)
        {
            first += total;
            if (first < 0) first = 0;  // reaches before the beginning: clamp
        }
        if (count < 0)
        {
            count += total - first;    // characters left after dropping the tail
            if (count <= 0) return "";
        }
    }

    // Pass 2: translate the character range into byte offsets.
    std::string::size_type begin_byte = std::string::npos;  // offset of character `first`
    std::string::size_type end_byte = size;                 // offset one past the result
    std::string::size_type pos = 0;
    long long index = 0;
    while (pos < size)
    {
        if (index == first) begin_byte = pos;
        // Subtraction form avoids computing first + count.
        if (!to_end && index - first == count) end_byte = pos;

        const int step = utf8_substr2_seq_len(static_cast<unsigned char>(str[pos]));
        if (step == 0) return "";  // invalid utf8
        pos += static_cast<std::string::size_type>(step);
        ++index;
    }

    if (begin_byte == std::string::npos) return "";  // start is past the last character
    return str.substr(begin_byte, end_byte - begin_byte);
}
// Code snippets are licensed under Creative Commons CC-By-SA 3.0 (unless otherwise specified).
|