C++ UTF-8 substr function
// Demo program and implementation of utf8_substr(), a substring function that
// counts UTF-8 encoded characters instead of bytes.
//
// Define UTF8_SUBSTR_NO_MAIN before this point to leave out the demo main()
// when the function is compiled into another program.
#include <iostream>
#include <string>

// Kept: the later snippet in this file uses unqualified `string`.
using namespace std;

/**
 * Returns the part of a UTF-8 string selected by character (not byte) position.
 *
 * @param str   UTF-8 encoded text.
 * @param start Zero-based index of the first character to return.
 * @param leng  Maximum number of characters to return. The default value
 *              (string::npos narrowed to unsigned int) means "through the end".
 * @return The selected characters; an empty string when leng is 0, when start
 *         is at or past the number of characters in str, or when str contains
 *         a byte that cannot start a UTF-8 sequence of at most 4 bytes.
 */
string utf8_substr(const string& str, unsigned int start = 0,
                   unsigned int leng = static_cast<unsigned int>(string::npos));

#ifndef UTF8_SUBSTR_NO_MAIN
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // Four 3-byte characters (12 bytes in total).
    const string ni_hao = "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c";

    cout << utf8_substr(ni_hao) << '\n';        // whole string
    cout << utf8_substr(ni_hao, 0) << '\n';     // whole string
    cout << utf8_substr(ni_hao, 1, 0) << '\n';  // empty line
    cout << utf8_substr(ni_hao, 0, 2) << '\n';  // first two characters
    cout << utf8_substr(ni_hao, 2, 2) << '\n';  // last two characters
    return 0;
}
#endif  // UTF8_SUBSTR_NO_MAIN

string utf8_substr(const string& str, unsigned int start, unsigned int leng)
{
    if (leng == 0) {
        return "";
    }

    // The default argument arrives here as string::npos narrowed to unsigned
    // int, so compare against the narrowed value rather than the size_t one.
    const bool to_end = (leng == static_cast<unsigned int>(string::npos));

    // Character index one past the requested range. Computed in a wider type
    // so that start + leng cannot wrap around.
    const unsigned long long last = static_cast<unsigned long long>(start) + leng;

    const string::size_type size = str.size();
    string::size_type byte_first = string::npos;  // byte offset of character `start`
    string::size_type byte_last = size;           // byte offset of character `last`

    unsigned long long index = 0;  // index of the character beginning at `pos`
    string::size_type pos = 0;
    while (pos < size) {
        if (index == start) {
            byte_first = pos;
        }
        if (!to_end && index == last) {
            byte_last = pos;
        }

        // The lead byte tells how many bytes the character occupies. The scan
        // always covers the whole string, so an invalid lead byte anywhere
        // makes the call return an empty string.
        const unsigned char lead = static_cast<unsigned char>(str[pos]);
        if (lead <= 0x7F) {
            pos += 1;
        } else if ((lead & 0xE0) == 0xC0) {
            pos += 2;
        } else if ((lead & 0xF0) == 0xE0) {
            pos += 3;
        } else if ((lead & 0xF8) == 0xF0) {
            pos += 4;
        } else {
            // Continuation byte in lead position, or a 5-/6-byte lead
            // (0xF8..0xFF), which 4-byte UTF-8 does not use.
            return "";
        }
        ++index;
    }

    if (byte_first == string::npos) {
        return "";  // start is at or beyond the end of the text
    }

    // substr() takes a byte count, not an end offset.
    return str.substr(byte_first, byte_last - byte_first);
}
Here is a two-pass version. The extra pass lets it support negative indices, but it requires #include <climits>.
#include <climits>
#include <string>

// Number of bytes in the UTF-8 character introduced by `lead`, or 0 when
// `lead` is 0xF8..0xFF, which is rejected as invalid. Bytes 0x80..0xBF
// (continuation bytes) in lead position are counted as one-byte characters.
static std::string::size_type utf8_substr2_seq_len(unsigned char lead)
{
    if (lead >= 248) return 0;
    if (lead >= 240) return 4;
    if (lead >= 224) return 3;
    if (lead >= 192) return 2;
    return 1;
}

/**
 * Returns the part of a UTF-8 string selected by character (not byte)
 * position; start and length may be negative.
 *
 * @param str    UTF-8 encoded text.
 * @param start  Zero-based index of the first character. A negative value
 *               counts back from the end of the text.
 * @param length Number of characters to return. INT_MAX (the default) means
 *               "through the end"; a negative value stops that many
 *               characters before the end.
 * @return The characters inside the requested window. A window that extends
 *         before the first character or past the last one is cut to the text,
 *         so the result may be shorter than `length`. Returns an empty string
 *         when length is 0, when the window is empty, or when str contains a
 *         byte in the range 0xF8..0xFF.
 */
std::string utf8_substr2(const std::string &str, int start, int length = INT_MAX)
{
    typedef std::string::size_type size_type;

    if (length == 0) {
        return "";
    }

    const size_type size = str.size();
    const bool to_end = (length == INT_MAX);

    // The window is [first, last) in character indices. long long keeps
    // start + length from overflowing int for large finite lengths.
    long long first = start;
    long long last = 0;

    if (start < 0 || length < 0) {
        // Pass 1: negative arguments are relative to the character count.
        long long total = 0;
        for (size_type pos = 0; pos < size; ++total) {
            const size_type step = utf8_substr2_seq_len(static_cast<unsigned char>(str[pos]));
            if (step == 0) {
                return "";  // invalid UTF-8
            }
            pos += step;
        }
        if (start < 0) {
            first += total;
        }
        last = (length < 0) ? total + length : first + length;
    } else {
        last = first + length;
    }

    // A window that begins before the text is cut at the first character; its
    // end stays where it was.
    if (first < 0) {
        first = 0;
    }
    if (!to_end && last <= first) {
        return "";
    }

    // Pass 2: translate the character window into byte offsets. The scan
    // always runs to the end so that an invalid byte anywhere is detected.
    size_type byte_first = size;
    size_type byte_last = size;
    long long index = 0;
    for (size_type pos = 0; pos < size; ++index) {
        if (index == first) {
            byte_first = pos;
        }
        if (!to_end && index == last) {
            byte_last = pos;
        }
        const size_type step = utf8_substr2_seq_len(static_cast<unsigned char>(str[pos]));
        if (step == 0) {
            return "";  // invalid UTF-8
        }
        pos += step;
    }

    return str.substr(byte_first, byte_last - byte_first);
}
code snippets are licensed under Creative Commons CC-By-SA 3.0 (unless otherwise specified)