src/utf.c

   1 /*
   2  * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
   3  *
   4  * Jansson is free software; you can redistribute it and/or modify
   5  * it under the terms of the MIT license. See LICENSE for details.
   6  */
   7
   8 #include <string.h>
   9 #include <stdint.h>
  10
  11 int utf8_encode(int32_t codepoint, char *buffer, int *size)
  12 {
  13     if(codepoint < 0)
  14         return -1;
  15     else if(codepoint < 0x80)
  16     {
  17         buffer[0] = (char)codepoint;
  18         *size = 1;
  19     }
  20     else if(codepoint < 0x800)
  21     {
  22         buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
  23         buffer[1] = 0x80 + ((codepoint & 0x03F));
  24         *size = 2;
  25     }
  26     else if(codepoint < 0x10000)
  27     {
  28         buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
  29         buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
  30         buffer[2] = 0x80 + ((codepoint & 0x003F));
  31         *size = 3;
  32     }
  33     else if(codepoint <= 0x10FFFF)
  34     {
  35         buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
  36         buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
  37         buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
  38         buffer[3] = 0x80 + ((codepoint & 0x00003F));
  39         *size = 4;
  40     }
  41     else
  42         return -1;
  43
  44     return 0;
  45 }
  46
  47 int utf8_check_first(char byte)
  48 {
  49     unsigned char u = (unsigned char)byte;
  50
  51     if(u < 0x80)
  52         return 1;
  53
  54     if(0x80 <= u && u <= 0xBF) {
  55         /* second, third or fourth byte of a multi-byte
  56            sequence, i.e. a "continuation byte" */
  57         return 0;
  58     }
  59     else if(u == 0xC0 || u == 0xC1) {
  60         /* overlong encoding of an ASCII byte */
  61         return 0;
  62     }
  63     else if(0xC2 <= u && u <= 0xDF) {
  64         /* 2-byte sequence */
  65         return 2;
  66     }
  67
  68     else if(0xE0 <= u && u <= 0xEF) {
  69         /* 3-byte sequence */
  70         return 3;
  71     }
  72     else if(0xF0 <= u && u <= 0xF4) {
  73         /* 4-byte sequence */
  74         return 4;
  75     }
  76     else { /* u >= 0xF5 */
  77         /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
  78            UTF-8 */
  79         return 0;
  80     }
  81 }
  82
  83 int utf8_check_full(const char *buffer, int size, int32_t *codepoint)
  84 {
  85     int i;
  86     int32_t value = 0;
  87     unsigned char u = (unsigned char)buffer[0];
  88
  89     if(size == 2)
  90     {
  91         value = u & 0x1F;
  92     }
  93     else if(size == 3)
  94     {
  95         value = u & 0xF;
  96     }
  97     else if(size == 4)
  98     {
  99         value = u & 0x7;
 100     }
 101     else
 102         return 0;
 103
 104     for(i = 1; i < size; i++)
 105     {
 106         u = (unsigned char)buffer[i];
 107
 108         if(u < 0x80 || u > 0xBF) {
 109             /* not a continuation byte */
 110             return 0;
 111         }
 112
 113         value = (value << 6) + (u & 0x3F);
 114     }
 115
 116     if(value > 0x10FFFF) {
 117         /* not in Unicode range */
 118         return 0;
 119     }
 120
 121     else if(0xD800 <= value && value <= 0xDFFF) {
 122         /* invalid code point (UTF-16 surrogate halves) */
 123         return 0;
 124     }
 125
 126     else if((size == 2 && value < 0x80) ||
 127             (size == 3 && value < 0x800) ||
 128             (size == 4 && value < 0x10000)) {
 129         /* overlong encoding */
 130         return 0;
 131     }
 132
 133     if(codepoint)
 134         *codepoint = value;
 135
 136     return 1;
 137 }
 138
 139 const char *utf8_iterate(const char *buffer, int32_t *codepoint)
 140 {
 141     int count;
 142     int32_t value;
 143
 144     if(!*buffer)
 145         return buffer;
 146
 147     count = utf8_check_first(buffer[0]);
 148     if(count <= 0)
 149         return NULL;
 150
 151     if(count == 1)
 152         value = (unsigned char)buffer[0];
 153     else
 154     {
 155         if(!utf8_check_full(buffer, count, &value))
 156             return NULL;
 157     }
 158
 159     if(codepoint)
 160         *codepoint = value;
 161
 162     return buffer + count;
 163 }
 164
 165 int utf8_check_string(const char *string, int length)
 166 {
 167     int i;
 168
 169     if(length == -1)
 170         length = strlen(string);
 171
 172     for(i = 0; i < length; i++)
 173     {
 174         int count = utf8_check_first(string[i]);
 175         if(count == 0)
 176             return 0;
 177         else if(count > 1)
 178         {
 179             if(i + count > length)
 180                 return 0;
 181
 182             if(!utf8_check_full(&string[i], count, NULL))
 183                 return 0;
 184
 185             i += count - 1;
 186         }
 187     }
 188
 189     return 1;
 190 }