diff --git a/googletest/src/gtest-printers.cc b/googletest/src/gtest-printers.cc
index 4e1ccad8..8fc7a491 100644
--- a/googletest/src/gtest-printers.cc
+++ b/googletest/src/gtest-printers.cc
@@ -375,6 +375,29 @@ bool ContainsUnprintableControlCodes(const char* str, size_t length) {
 
 bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
 
+bool Is2ByteChar(const unsigned char lead, const size_t i, const size_t length, const unsigned char *s) {
+  return lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i]);
+}
+
+bool Is3ByteChar(const unsigned char lead, const size_t i, const size_t length, const unsigned char *s) {
+  return 0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
+         IsUTF8TrailByte(s[i]) &&
+         IsUTF8TrailByte(s[i + 1]) &&
+         // check for non-shortest form and surrogate
+         (lead != 0xe0 || s[i] >= 0xa0) &&
+         (lead != 0xed || s[i] < 0xa0);
+}
+
+bool Is4ByteChar(const unsigned char lead, const size_t i, const size_t length, const unsigned char *s) {
+  return 0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
+         IsUTF8TrailByte(s[i]) &&
+         IsUTF8TrailByte(s[i + 1]) &&
+         IsUTF8TrailByte(s[i + 2]) &&
+         // check for non-shortest form
+         (lead != 0xf0 || s[i] >= 0x90) &&
+         (lead != 0xf4 || s[i] < 0x90);
+}
+
 bool IsValidUTF8(const char* str, size_t length) {
   const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
 
@@ -386,23 +409,12 @@ bool IsValidUTF8(const char* str, size_t length) {
     }
     if (lead < 0xc2) {
       return false;  // trail byte or non-shortest form
-    } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
-      ++i;  // 2-byte character
-    } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
-               IsUTF8TrailByte(s[i]) &&
-               IsUTF8TrailByte(s[i + 1]) &&
-               // check for non-shortest form and surrogate
-               (lead != 0xe0 || s[i] >= 0xa0) &&
-               (lead != 0xed || s[i] < 0xa0)) {
-      i += 2;  // 3-byte character
-    } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
-               IsUTF8TrailByte(s[i]) &&
-               IsUTF8TrailByte(s[i + 1]) &&
-               IsUTF8TrailByte(s[i + 2]) &&
-               // check for non-shortest form
-               (lead != 0xf0 || s[i] >= 0x90) &&
-               (lead != 0xf4 || s[i] < 0x90)) {
-      i += 3;  // 4-byte character
+    } else if (Is2ByteChar(lead, i, length, s)) {
+      ++i;
+    } else if (Is3ByteChar(lead, i, length, s)) {
+      i += 2;
+    } else if (Is4ByteChar(lead, i, length, s)) {
+      i += 3;
     } else {
       return false;
     }