C# · 12月 20, 2021

c语言判断是否是utf8字符串,计算字符个数

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/****************************************************************************

Unicode符号范围 | UTF-8编码方式

    (十六进制) | (二进制)

0000 0000-0000 007F:0xxxxxxx

0000 0080-0000 07FF:110xxxxx 10xxxxxx

0000 0800-0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx

0001 0000-001F FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

0020 0000-03FF FFFF:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

0400 0000-7FFF FFFF:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

**************************************************************************/

/*
 * Lookup table indexed by a byte value (0x00-0xFF) giving the total length,
 * in bytes, of the UTF-8 sequence that the byte introduces (legacy 1-6 byte
 * scheme).
 *
 * The table needs one entry per possible byte value because UTFLEN() indexes
 * it with a raw byte; a shorter table is read out of bounds for most inputs.
 *
 * Bytes that cannot start a sequence (continuation bytes 0x80-0xBF and
 * 0xFE/0xFF) map to 1, so a scanner that advances by UTFLEN() always makes
 * forward progress.
 */
unsigned char utf8_look_for_table[] =
    {
        /* 0x00-0x7F: single-byte (ASCII) */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x80-0xBF: continuation bytes, treated as length 1 */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0xC0-0xDF: two-byte lead bytes */
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xE0-0xEF: three-byte lead bytes */
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: one */
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1};

/* Sequence length for lead byte x; the cast keeps a negative (signed) char
 * from producing a negative index. */
#define UTFLEN(x) (utf8_look_for_table[(unsigned char)(x)])

/*
 * Return the total number of bytes in the UTF-8 sequence introduced by the
 * lead byte `ch`, using the legacy 1-6 byte encoding scheme.
 *
 * Returns 0 when `ch` cannot start a sequence: continuation bytes
 * (0x80-0xBF) and the bytes 0xFE/0xFF, which match none of the lead-byte
 * patterns.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 provides
 * no external definition, so any call the compiler does not inline would
 * fail at link time.
 */
static inline int GetUtf8charByteNum(unsigned char ch)
{
    if (ch < 0x80) {
        return 1; /* 0xxxxxxx */
    }
    if (ch < 0xC0) {
        return 0; /* 10xxxxxx: continuation byte, not a lead byte */
    }
    if (ch < 0xE0) {
        return 2; /* 110xxxxx */
    }
    if (ch < 0xF0) {
        return 3; /* 1110xxxx */
    }
    if (ch < 0xF8) {
        return 4; /* 11110xxx */
    }
    if (ch < 0xFC) {
        return 5; /* 111110xx */
    }
    if (ch < 0xFE) {
        return 6; /* 1111110x */
    }
    return 0; /* 0xFE / 0xFF */
}

/*
 * Check whether the NUL-terminated string `str` is structurally well-formed
 * UTF-8: every lead byte is recognised by GetUtf8charByteNum() and is
 * followed by the right number of 10xxxxxx continuation bytes.
 *
 * Only the byte structure is checked; overlong encodings and code-point
 * ranges are not examined.
 *
 * Returns 1 if the string is well-formed (an empty string counts as
 * well-formed), 0 if it is malformed, ends in the middle of a multi-byte
 * sequence, or `str` is NULL.
 */
int IsUtf8Format(const char *str)
{
    int pending = 0; /* continuation bytes still expected for the current character */
    const char *ptr;

    if (NULL == str) {
        return 0;
    }

    for (ptr = str; *ptr != '\0'; ptr++) {
        unsigned char ch = (unsigned char)*ptr;

        if (pending == 0) {
            /* Start of a character: the lead byte determines its length. */
            int byteNum = GetUtf8charByteNum(ch);

            if (byteNum == 0) {
                return 0;
            }
            pending = byteNum - 1;
        } else {
            /* Inside a multi-byte character: must look like 10xxxxxx. */
            if ((ch & 0xC0) != 0x80) {
                return 0;
            }
            pending--;
        }
    }

    /* A non-zero count means the string ended mid-character. */
    return pending == 0;
}

/*
 * Count the characters (not bytes) in the NUL-terminated UTF-8 string `str`
 * by stepping from lead byte to lead byte.
 *
 * Continuation bytes are skipped without being inspected, so this is not a
 * validator.
 *
 * Returns the character count, or 0 if `str` is NULL or empty, a byte that
 * cannot start a sequence is found at a character boundary, or the final
 * sequence is truncated (its lead byte announces more bytes than remain
 * before the terminator).
 */
int GetUtf8Length(char *str)
{
    size_t total;      /* byte length of the string */
    size_t offset = 0; /* byte offset of the current lead byte */
    int count = 0;     /* characters seen so far */

    if (NULL == str) {
        return 0;
    }

    total = strlen(str);
    while (offset < total) {
        int byteNum = GetUtf8charByteNum((unsigned char)str[offset]);

        if (byteNum == 0) {
            return 0;
        }
        /* Never step past the terminator: a truncated trailing sequence
         * would otherwise move the cursor outside the string. */
        if ((size_t)byteNum > total - offset) {
            return 0;
        }
        offset += (size_t)byteNum;
        count++;
    }

    return count;
}

/*
 * Compute the charge count for a text of `len` characters.
 *
 *   len <= 0          -> 0
 *   1  <= len <= 70   -> 1
 *   71 <= len <= 500  -> ceil(len / 67)
 *   len > 500         -> 1
 *
 * NOTE(review): the `len > 500` result of 1 is carried over unchanged from
 * the original control flow; it looks unintended - confirm the desired
 * behaviour for over-long texts.
 */
int GetChargeNum(int len)
{
    enum {
        SINGLE_MAX_LEN = 70, /* longest text charged as a single unit */
        PART_LEN = 67,       /* characters per unit once the text is split */
        MULTI_MAX_LEN = 500  /* longest text handled by the split formula */
    };

    if (len <= 0) {
        return 0;
    }

    if (len > SINGLE_MAX_LEN && len <= MULTI_MAX_LEN) {
        /* Ceiling division. The original tested `!len % 67`, which parses
         * as `(!len) % 67` and so overcharged exact multiples of 67. */
        return (len + PART_LEN - 1) / PART_LEN;
    }

    return 1;
}

/*
 * Command-line driver: takes the text to analyse as argv[1] (for example
 * "hello 你好呀!"), echoes it, checks that it is UTF-8, then prints its
 * character count and its charge count.
 *
 * Exit status: 1 after all three lines were printed, 0 on every early exit
 * (missing argument, non-UTF-8 text, zero length, zero charge count).
 * NOTE(review): this is the reverse of the usual "0 means success"
 * convention; it is left unchanged in case callers depend on it.
 */
int main(int argc, char **argv)
{
    char *str;
    int len;
    int num;

    if (argc < 2) {
        return 0;
    }

    str = argv[1];
    printf("%s\n", str);

    if (!IsUtf8Format(str)) {
        printf("the text is not the Format of utf8\n");
        return 0;
    }

    len = GetUtf8Length(str);
    if (len == 0) {
        return 0;
    }
    printf("the length of text: %d\n", len);

    num = GetChargeNum(len);
    if (num == 0) {
        return 0;
    }
    printf("the chargeNumber of sms: %d\n", num);

    return 1;
}