问如何在纯C中检测UTF-8？
EN

Stack Overflow用户

提问于 2009-06-23 09:57:19

回答 8查看 36.1K关注 0票数 37

我正在寻找一个用普通老式C编写的代码片段，它可以检测给定的字符串是UTF-8编码的。我知道使用正则表达式的解决方案，但是由于各种原因，在这种情况下最好避免使用纯C之外的任何东西。

使用正则表达式的解决方案如下所示(警告:忽略各种检查)：

#define UTF8_DETECT_REGEXP  "^([\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$"

const char *error;
int         error_off;
int         rc;
int         vect[100];

utf8_re = pcre_compile(UTF8_DETECT_REGEXP, PCRE_CASELESS, &error, &error_off, NULL);
utf8_pe = pcre_study(utf8_re, 0, &error);

rc = pcre_exec(utf8_re, utf8_pe, str, len, 0, 0, vect, sizeof(vect)/sizeof(vect[0]));

if (rc > 0) {
    printf("string is in UTF8\n");
} else {
    printf("string is not in UTF8\n")
}

utf-8

回答 8

Stack Overflow用户

发布于 2009-06-23 10:34:16

下面是一个用纯C实现的this expression (希望没有bug)：

_Bool is_utf8(const char * string)
{
    if(!string)
        return 0;

    const unsigned char * bytes = (const unsigned char *)string;
    while(*bytes)
    {
        if( (// ASCII
             // use bytes[0] <= 0x7F to allow ASCII control characters
                bytes[0] == 0x09 ||
                bytes[0] == 0x0A ||
                bytes[0] == 0x0D ||
                (0x20 <= bytes[0] && bytes[0] <= 0x7E)
            )
        ) {
            bytes += 1;
            continue;
        }

        if( (// non-overlong 2-byte
                (0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
                (0x80 <= bytes[1] && bytes[1] <= 0xBF)
            )
        ) {
            bytes += 2;
            continue;
        }

        if( (// excluding overlongs
                bytes[0] == 0xE0 &&
                (0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
                (0x80 <= bytes[2] && bytes[2] <= 0xBF)
            ) ||
            (// straight 3-byte
                ((0xE1 <= bytes[0] && bytes[0] <= 0xEC) ||
                    bytes[0] == 0xEE ||
                    bytes[0] == 0xEF) &&
                (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
                (0x80 <= bytes[2] && bytes[2] <= 0xBF)
            ) ||
            (// excluding surrogates
                bytes[0] == 0xED &&
                (0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
                (0x80 <= bytes[2] && bytes[2] <= 0xBF)
            )
        ) {
            bytes += 3;
            continue;
        }

        if( (// planes 1-3
                bytes[0] == 0xF0 &&
                (0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
                (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
                (0x80 <= bytes[3] && bytes[3] <= 0xBF)
            ) ||
            (// planes 4-15
                (0xF1 <= bytes[0] && bytes[0] <= 0xF3) &&
                (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
                (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
                (0x80 <= bytes[3] && bytes[3] <= 0xBF)
            ) ||
            (// plane 16
                bytes[0] == 0xF4 &&
                (0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
                (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
                (0x80 <= bytes[3] && bytes[3] <= 0xBF)
            )
        ) {
            bytes += 4;
            continue;
        }

        return 0;
    }

    return 1;
}

请注意，这是W3C推荐用于表单验证的正则表达式的忠实翻译，它确实拒绝了一些有效的UTF-8序列(特别是那些包含ASCII控制字符的序列)。

此外，即使通过进行注释中提到的更改来修复此问题，它仍然假定为零终止，这将阻止嵌入NUL字符，尽管从技术上讲这应该是合法的。

当我尝试创建自己的字符串库时，我使用了修改后的UTF-8 (即将NUL编码为超长的两个字节的序列)--可以随意使用this header作为模板来提供一个验证例程，它不会受到上述缺点的影响。

票数 51

Stack Overflow用户

发布于 2014-03-03 07:15:42

Bjoern Hoermann的这个解码器是我发现的最简单的解码器。它的工作方式也是提供给它一个单字节，以及保持一个状态。该状态对于解析通过网络成块进入的UTF8非常有用。

http://bjoern.hoehrmann.de/utf-8/decoder/dfa/

// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

uint32_t inline
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[256 + *state*16 + type];
  return *state;
}

一个简单的验证器/检测器不需要代码点，因此可以这样编写(初始状态设置为UTF8_ACCEPT)：

uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
   size_t i;
   uint32_t type;

    for (i = 0; i < len; i++) {
        // We don't care about the codepoint, so this is
        // a simplified version of the decode function.
        type = utf8d[(uint8_t)str[i]];
        *state = utf8d[256 + (*state) * 16 + type];

        if (*state == UTF8_REJECT)
            break;
    }

    return *state;
}

如果文本有效，则返回utf8 UTF8_ACCEPT。如果它是无效的UTF8_REJECT。如果需要更多数据，则返回其他一些整数。

以块为单位(例如从网络)馈送数据的使用示例：

char buf[128];
size_t bytes_read;
uint32_t state = UTF8_ACCEPT;

// Validate the UTF8 data in chunks.
while ((bytes_read = get_new_data(buf, sizeof(buf))) {
    if (validate_utf8(&state, buf, bytes_read) == UTF8_REJECT)) {
        fprintf(stderr, "Invalid UTF8 data!\n");
        return -1;
    }
}

// If everything went well we should have proper UTF8,
// the data might instead have ended in the middle of a UTF8
// codepoint.
if (state != UTF8_ACCEPT) {
    fprintf(stderr, "Invalid UTF8, incomplete codepoint\n");
}

票数 37

Stack Overflow用户

发布于 2009-06-23 10:10:46

您无法检测给定的字符串(或字节序列)是否是UTF-8编码的文本，例如，每个UTF-8八位字节序列也是一系列有效的(如果没有意义的话)拉丁-1(或其他编码)八位字节序列。然而，并不是每一系列有效的拉丁文-1八位字节都是有效的UTF-8系列。因此，您可以排除不符合UTF-8编码模式的字符串：

U+0000-U+007F    0xxxxxxx
U+0080-U+07FF    110yyyxx    10xxxxxx
U+0800-U+FFFF    1110yyyy    10yyyyxx    10xxxxxx
U+10000-U+10FFFF 11110zzz    10zzyyyy    10yyyyxx    10xxxxxx

票数 9

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/1031645

复制

相似问题

问如何在纯C中检测UTF-8？
EN

回答 8

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在纯C中检测UTF-8？EN

回答 8

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在纯C中检测UTF-8？
EN