src/utf.c

   1 #include <string.h>
   2
   3 int utf8_check_first(char byte)
   4 {
   5     unsigned char u = (unsigned char)byte;
   6
   7     if(u < 0x80)
   8         return 1;
   9
  10     if(0x80 <= u && u <= 0xBF) {
  11         /* second, third or fourth byte of a multi-byte
  12            sequence, i.e. a "continuation byte" */
  13         return 0;
  14     }
  15     else if(u == 0xC0 || u == 0xC1) {
  16         /* overlong encoding of an ASCII byte */
  17         return 0;
  18     }
  19     else if(0xC2 <= u && u <= 0xDF) {
  20         /* 2-byte sequence */
  21         return 2;
  22     }
  23
  24     else if(0xE0 <= u && u <= 0xEF) {
  25         /* 3-byte sequence */
  26         return 3;
  27     }
  28     else if(0xF0 <= u && u <= 0xF4) {
  29         /* 4-byte sequence */
  30         return 4;
  31     }
  32     else { /* u >= 0xF5 */
  33         /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
  34            UTF-8 */
  35         return 0;
  36     }
  37 }
  38
  39 int utf8_check_full(const char *buffer, int size)
  40 {
  41     int i, value = 0;
  42     unsigned char u = (unsigned char)buffer[0];
  43
  44     if(size == 2)
  45     {
  46         value = u & 0x1F;
  47     }
  48     else if(size == 3)
  49     {
  50         value = u & 0xF;
  51     }
  52     else if(size == 4)
  53     {
  54         value = u & 0x7;
  55     }
  56     else
  57         return 0;
  58
  59     for(i = 1; i < size; i++)
  60     {
  61         u = (unsigned char)buffer[i];
  62
  63         if(u < 0x80 || u > 0xBF) {
  64             /* not a continuation byte */
  65             return 0;
  66         }
  67
  68         value = (value << 6) + (u & 0x3F);
  69     }
  70
  71     if(value > 0x10FFFF) {
  72         /* not in Unicode range */
  73         return 0;
  74     }
  75
  76     else if(0xD800 <= value && value <= 0xDFFF) {
  77         /* invalid code point (UTF-16 surrogate halves) */
  78         return 0;
  79     }
  80
  81     else if((size == 2 && value < 0x80) ||
  82             (size == 3 && value < 0x800) ||
  83             (size == 4 && value < 0x10000)) {
  84         /* overlong encoding */
  85         return 0;
  86     }
  87
  88     return 1;
  89 }
  90
  91 int utf8_check_string(const char *string, int length)
  92 {
  93     int i;
  94
  95     if(length == -1)
  96         length = strlen(string);
  97
  98     for(i = 0; i < length; i++)
  99     {
 100         int count = utf8_check_first(string[i]);
 101         if(count == 0)
 102             return 0;
 103         else if(count > 1)
 104         {
 105             if(i + count > length)
 106                 return 0;
 107
 108             if(!utf8_check_full(&string[i], count))
 109                 return 0;
 110
 111             i += count - 1;
 112         }
 113     }
 114
 115     return 1;
 116 }