Decode and check for correct UTF-8

author Petri Lehtinen <petri@digip.org>

Mon, 13 Jul 2009 18:03:09 +0000 (21:03 +0300)

committer Petri Lehtinen <petri@digip.org>

Mon, 13 Jul 2009 20:38:01 +0000 (23:38 +0300)
author Petri Lehtinen <petri@digip.org>
Mon, 13 Jul 2009 18:03:09 +0000 (21:03 +0300)
committer Petri Lehtinen <petri@digip.org>
Mon, 13 Jul 2009 20:38:01 +0000 (23:38 +0300)
diff --git a/src/Makefile.am b/src/Makefile.am

index 79712d8..6fe3e9a 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,9 +5,12 @@ libjansson_la_SOURCES = \
         dump.c \
         hashtable.c \
         hashtable.h \
+       jansson_private.h \
         load.c \
         strbuffer.c \
         strbuffer.h \
+       utf.c \
+       utf.h \
         util.h \
         value.c
  libjansson_la_LDFLAGS = -version-info 0:0:0
diff --git a/src/jansson_private.h b/src/jansson_private.h

new file mode 100644 (file)

index 0000000..d359ed5
--- /dev/null
+++ b/src/jansson_private.h
@@ -0,0 +1,8 @@
+#ifndef JANSSON_PRIVATE_H
+#define JANSSON_PRIVATE_H /* library-internal declarations; listed in libjansson_la_SOURCES */
+
+int json_object_set_nocheck(json_t *json, const char *key, json_t *value); /*
+ * Variant of json_object_set() that does not validate `key` as UTF-8;
+ * json_object_set() runs utf8_check_string() on the key and then
+ * delegates to this function. */
+json_t *json_string_nocheck(const char *value); /*
+ * Variant of json_string() that does not validate `value` as UTF-8;
+ * json_string() runs utf8_check_string() on the value and then
+ * delegates to this function. */
+
+
+#endif
diff --git a/src/load.c b/src/load.c

index f576df1..cae8c53 100644 (file)
--- a/src/load.c
+++ b/src/load.c
@@ -9,7 +9,9 @@
  #include <assert.h>
  
  #include <jansson.h>
+#include "jansson_private.h"
  #include "strbuffer.h"
+#include "utf.h"
  
  #define TOKEN_INVALID         -1
  #define TOKEN_EOF              0
@@ -101,8 +103,37 @@ static char stream_get(stream_t *stream)
  {
      if(!stream->buffer[stream->buffer_pos])
      {
+        char c;
+
          stream->buffer[0] = stream->get(stream->data);
          stream->buffer_pos = 0;
+
+        c = stream->buffer[0];
+
+        if(c == EOF && stream->eof(stream->data))
+            return EOF;
+
+        if(c < 0)
+        {
+            /* multi-byte UTF-8 sequence */
+            int i, count;
+
+            count = utf8_check_first(c);
+            if(!count)
+                return 0;
+
+            assert(count >= 2);
+
+            for(i = 1; i < count; i++)
+                stream->buffer[i] = stream->get(stream->data);
+
+            if(!utf8_check_full(stream->buffer, count))
+                return 0;
+
+            stream->buffer[count] = '\0';
+        }
+        else
+            stream->buffer[1] = '\0';
      }
  
      return (char)stream->buffer[stream->buffer_pos++];
@@ -439,7 +470,7 @@ static json_t *parse_object(lex_t *lex, json_error_t *error)
              goto error;
          }
  
-        if(json_object_set(object, key, value)) {
+        if(json_object_set_nocheck(object, key, value)) {
              free(key);
              json_decref(value);
              goto error;
@@ -513,7 +544,7 @@ static json_t *parse_value(lex_t *lex, json_error_t *error)
  
      switch(lex->token) {
          case TOKEN_STRING: {
-            json = json_string(lex->value.string);
+            json = json_string_nocheck(lex->value.string);
              break;
          }
  
diff --git a/src/utf.c b/src/utf.c

new file mode 100644 (file)

index 0000000..092959d
--- /dev/null
+++ b/src/utf.c
@@ -0,0 +1,116 @@
+#include <string.h>
+
+int utf8_check_first(char byte) /*
+ * Classify `byte` as the first byte of a UTF-8 encoded character.
+ *
+ * Returns the total length in bytes of the sequence that `byte` starts:
+ * 1 for an ASCII byte, or 2, 3 or 4 for the lead byte of a multi-byte
+ * sequence. Returns 0 if `byte` cannot start a valid sequence
+ * (continuation byte, 0xC0/0xC1, or anything from 0xF5 upwards).
+ *
+ * Only the first byte is examined; the remaining bytes of a multi-byte
+ * sequence are validated by utf8_check_full().
+ */
+{
+    unsigned char u = (unsigned char)byte; /* compare as unsigned: plain char may be signed */
+
+    if(u < 0x80)
+        return 1; /* single-byte (ASCII) character */
+
+    if(0x80 <= u && u <= 0xBF) {
+        /* second, third or fourth byte of a multi-byte
+           sequence, i.e. a "continuation byte" */
+        return 0;
+    }
+    else if(u == 0xC0 || u == 0xC1) {
+        /* overlong encoding of an ASCII byte */
+        return 0;
+    }
+    else if(0xC2 <= u && u <= 0xDF) {
+        /* 2-byte sequence */
+        return 2;
+    }
+
+    else if(0xE0 <= u && u <= 0xEF) {
+        /* 3-byte sequence */
+        return 3;
+    }
+    else if(0xF0 <= u && u <= 0xF4) {
+        /* 4-byte sequence */
+        return 4;
+    }
+    else { /* u >= 0xF5 */
+        /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
+           UTF-8 */
+        return 0;
+    }
+}
+
+int utf8_check_full(const char *buffer, int size) /*
+ * Validate one multi-byte UTF-8 sequence of `size` bytes starting at
+ * `buffer`.
+ *
+ * `size` must be 2, 3 or 4; any other value makes the function return 0.
+ * The code point is decoded from the lead byte and the continuation
+ * bytes, and the sequence is rejected if a continuation byte is
+ * malformed, if the value is above 0x10FFFF, if it is a UTF-16 surrogate
+ * (0xD800-0xDFFF), or if it is an overlong encoding.
+ *
+ * The high bits of the lead byte are only masked off, not inspected;
+ * utf8_check_string() obtains `size` from utf8_check_first() before
+ * calling this function.
+ *
+ * Returns 1 if the sequence is valid, 0 otherwise.
+ */
+{
+    int i, value = 0;
+    unsigned char u = (unsigned char)buffer[0];
+
+    if(size == 2)
+    {
+        value = u & 0x1F; /* keep the low 5 bits of the lead byte */
+    }
+    else if(size == 3)
+    {
+        value = u & 0xF; /* keep the low 4 bits of the lead byte */
+    }
+    else if(size == 4)
+    {
+        value = u & 0x7; /* keep the low 3 bits of the lead byte */
+    }
+    else
+        return 0; /* unsupported sequence length */
+
+    for(i = 1; i < size; i++)
+    {
+        u = (unsigned char)buffer[i];
+
+        if(u < 0x80 || u > 0xBF) {
+            /* not a continuation byte */
+            return 0;
+        }
+
+        value = (value << 6) + (u & 0x3F); /* append the low 6 bits of this byte */
+    }
+
+    if(value > 0x10FFFF) {
+        /* not in Unicode range */
+        return 0;
+    }
+
+    else if(0xD800 <= value && value <= 0xDFFF) {
+        /* invalid code point (UTF-16 surrogate halves) */
+        return 0;
+    }
+
+    else if((size == 2 && value < 0x80) ||
+            (size == 3 && value < 0x800) ||
+            (size == 4 && value < 0x10000)) {
+        /* overlong encoding */
+        return 0;
+    }
+
+    return 1;
+}
+
+int utf8_check_string(const char *string, int length) /*
+ * Check that the first `length` bytes of `string` are valid UTF-8.
+ *
+ * If `length` is -1, the length is taken from strlen(string).
+ *
+ * Returns 1 if every character is valid (this includes a zero-length
+ * input), and 0 on an invalid first byte, on a multi-byte sequence that
+ * would extend past `length`, or on a sequence rejected by
+ * utf8_check_full().
+ */
+{
+    int i;
+
+    if(length == -1)
+        length = strlen(string);
+
+    for(i = 0; i < length; i++)
+    {
+        int count = utf8_check_first(string[i]);
+        if(count == 0)
+            return 0; /* byte cannot start a character */
+        else if(count > 1)
+        {
+            if(i + count > length) /* sequence would run past the end of the input */
+                return 0;
+
+            if(!utf8_check_full(&string[i], count))
+                return 0;
+
+            i += count - 1; /* the loop's i++ then steps over the last byte of the sequence */
+        }
+    }
+
+    return 1;
+}
diff --git a/src/utf.h b/src/utf.h

new file mode 100644 (file)

index 0000000..b8b0662
--- /dev/null
+++ b/src/utf.h
@@ -0,0 +1,9 @@
+#ifndef UTF_H
+#define UTF_H
+
+int utf8_check_first(char byte); /*
+ * Length (1-4) of the UTF-8 sequence started by `byte`, or 0 if `byte`
+ * cannot start a valid sequence. */
+int utf8_check_full(const char *buffer, int size); /*
+ * 1 if the `size`-byte (2, 3 or 4) multi-byte sequence at `buffer` is
+ * valid UTF-8, 0 otherwise. */
+
+int utf8_check_string(const char *string, int length); /*
+ * 1 if the first `length` bytes of `string` are valid UTF-8, 0
+ * otherwise; a `length` of -1 means "use strlen(string)". */
+
+#endif
diff --git a/src/value.c b/src/value.c

index 00501d5..6f0094b 100644 (file)
--- a/src/value.c
+++ b/src/value.c
@@ -4,6 +4,8 @@
  
  #include <jansson.h>
  #include "hashtable.h"
+#include "jansson_private.h"
+#include "utf.h"
  #include "util.h"
  
  #define container_of(ptr_, type_, member_)  \
@@ -109,7 +111,7 @@ json_t *json_object_get(const json_t *json, const char *key)
      return hashtable_get(&object->hashtable, key);
  }
  
-int json_object_set(json_t *json, const char *key, json_t *value)
+int json_object_set_nocheck(json_t *json, const char *key, json_t *value)
  {
      json_object_t *object;
  
@@ -120,6 +122,14 @@ int json_object_set(json_t *json, const char *key, json_t *value)
      return hashtable_set(&object->hashtable, strdup(key), json_incref(value));
  }
  
+int json_object_set(json_t *json, const char *key, json_t *value) /*
+ * Checked entry point for setting `key` to `value` in the object `json`.
+ *
+ * The NUL-terminated `key` is first validated with utf8_check_string();
+ * if it is not valid UTF-8 the function returns -1 without calling
+ * json_object_set_nocheck(). Otherwise the result of
+ * json_object_set_nocheck() is returned unchanged.
+ */
+{
+    if(!utf8_check_string(key, -1)) /* -1: let the checker take the length from strlen() */
+        return -1;
+
+    return json_object_set_nocheck(json, key, value);
+}
+
  int json_object_del(json_t *json, const char *key)
  {
      json_object_t *object;
@@ -255,7 +265,7 @@ int json_array_append(json_t *json, json_t *value)
  
  /*** string ***/
  
-json_t *json_string(const char *value)
+json_t *json_string_nocheck(const char *value)
  {
      json_string_t *string = malloc(sizeof(json_string_t));
      if(!string)
@@ -266,6 +276,14 @@ json_t *json_string(const char *value)
      return &string->json;
  }
  
+json_t *json_string(const char *value) /*
+ * Checked entry point for creating a JSON string value from `value`.
+ *
+ * The NUL-terminated `value` is first validated with
+ * utf8_check_string(); if it is not valid UTF-8 the function returns
+ * NULL without calling json_string_nocheck(). Otherwise the result of
+ * json_string_nocheck() is returned unchanged.
+ */
+{
+    if(!utf8_check_string(value, -1)) /* -1: let the checker take the length from strlen() */
+        return NULL;
+
+    return json_string_nocheck(value);
+}
+
  const char *json_string_value(const json_t *json)
  {
      if(!json_is_string(json))
author	Petri Lehtinen <petri@digip.org>
	Mon, 13 Jul 2009 18:03:09 +0000 (21:03 +0300)
committer	Petri Lehtinen <petri@digip.org>
	Mon, 13 Jul 2009 20:38:01 +0000 (23:38 +0300)
src/Makefile.am		patch \| blob \| history
src/jansson_private.h	[new file with mode: 0644]	patch \| blob
src/load.c		patch \| blob \| history
src/utf.c	[new file with mode: 0644]	patch \| blob
src/utf.h	[new file with mode: 0644]	patch \| blob
src/value.c		patch \| blob \| history