src/utf.c

   1 /*
   2  * Copyright (c) 2009 Petri Lehtinen <petri@digip.org>
   3  *
   4  * Jansson is free software; you can redistribute it and/or modify
   5  * it under the terms of the MIT license. See LICENSE for details.
   6  */
   7
   8 #include <string.h>
   9 #include <stdint.h>
  10
  11 int utf8_encode(int32_t codepoint, char *buffer, int *size)
  12 {
  13     if(codepoint < 0)
  14         return -1;
  15     else if(codepoint < 0x80)
  16     {
  17         buffer[0] = (char)codepoint;
  18         *size = 1;
  19     }
  20     else if(codepoint < 0x800)
  21     {
  22         buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
  23         buffer[1] = 0x80 + ((codepoint & 0x03F));
  24         *size = 2;
  25     }
  26     else if(codepoint < 0x10000)
  27     {
  28         buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
  29         buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
  30         buffer[2] = 0x80 + ((codepoint & 0x003F));
  31         *size = 3;
  32     }
  33     else if(codepoint <= 0x10FFFF)
  34     {
  35         buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
  36         buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
  37         buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
  38         buffer[3] = 0x80 + ((codepoint & 0x00003F));
  39         *size = 4;
  40     }
  41     else
  42         return -1;
  43
  44     return 0;
  45 }
  46
  47 int utf8_check_first(char byte)
  48 {
  49     unsigned char u = (unsigned char)byte;
  50
  51     if(u < 0x80)
  52         return 1;
  53
  54     if(0x80 <= u && u <= 0xBF) {
  55         /* second, third or fourth byte of a multi-byte
  56            sequence, i.e. a "continuation byte" */
  57         return 0;
  58     }
  59     else if(u == 0xC0 || u == 0xC1) {
  60         /* overlong encoding of an ASCII byte */
  61         return 0;
  62     }
  63     else if(0xC2 <= u && u <= 0xDF) {
  64         /* 2-byte sequence */
  65         return 2;
  66     }
  67
  68     else if(0xE0 <= u && u <= 0xEF) {
  69         /* 3-byte sequence */
  70         return 3;
  71     }
  72     else if(0xF0 <= u && u <= 0xF4) {
  73         /* 4-byte sequence */
  74         return 4;
  75     }
  76     else { /* u >= 0xF5 */
  77         /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
  78            UTF-8 */
  79         return 0;
  80     }
  81 }
  82
  83 int utf8_check_full(const char *buffer, int size)
  84 {
  85     int i;
  86     int32_t value = 0;
  87     unsigned char u = (unsigned char)buffer[0];
  88
  89     if(size == 2)
  90     {
  91         value = u & 0x1F;
  92     }
  93     else if(size == 3)
  94     {
  95         value = u & 0xF;
  96     }
  97     else if(size == 4)
  98     {
  99         value = u & 0x7;
 100     }
 101     else
 102         return 0;
 103
 104     for(i = 1; i < size; i++)
 105     {
 106         u = (unsigned char)buffer[i];
 107
 108         if(u < 0x80 || u > 0xBF) {
 109             /* not a continuation byte */
 110             return 0;
 111         }
 112
 113         value = (value << 6) + (u & 0x3F);
 114     }
 115
 116     if(value > 0x10FFFF) {
 117         /* not in Unicode range */
 118         return 0;
 119     }
 120
 121     else if(0xD800 <= value && value <= 0xDFFF) {
 122         /* invalid code point (UTF-16 surrogate halves) */
 123         return 0;
 124     }
 125
 126     else if((size == 2 && value < 0x80) ||
 127             (size == 3 && value < 0x800) ||
 128             (size == 4 && value < 0x10000)) {
 129         /* overlong encoding */
 130         return 0;
 131     }
 132
 133     return 1;
 134 }
 135
 136 int utf8_check_string(const char *string, int length)
 137 {
 138     int i;
 139
 140     if(length == -1)
 141         length = strlen(string);
 142
 143     for(i = 0; i < length; i++)
 144     {
 145         int count = utf8_check_first(string[i]);
 146         if(count == 0)
 147             return 0;
 148         else if(count > 1)
 149         {
 150             if(i + count > length)
 151                 return 0;
 152
 153             if(!utf8_check_full(&string[i], count))
 154                 return 0;
 155
 156             i += count - 1;
 157         }
 158     }
 159
 160     return 1;
 161 }