From 902bcdaa5e0d0273b1ba576c2fd676a5565b6d5e Mon Sep 17 00:00:00 2001
From: Petri Lehtinen
Date: Mon, 13 Jul 2009 21:03:09 +0300
Subject: [PATCH] Decode and check for correct UTF-8

All strings (decoded JSON text, the argument of json_string(), the key
argument of json_object_set()) are checked for valid UTF-8.
---
 src/Makefile.am       |   3 ++
 src/jansson_private.h |   8 ++++
 src/load.c            |  35 ++++++++++++++-
 src/utf.c             | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/utf.h             |   9 ++++
 src/value.c           |  22 +++++++++-
 6 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 src/jansson_private.h
 create mode 100644 src/utf.c
 create mode 100644 src/utf.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 79712d8..6fe3e9a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,9 +5,12 @@ libjansson_la_SOURCES = \
 	dump.c \
 	hashtable.c \
 	hashtable.h \
+	jansson_private.h \
 	load.c \
 	strbuffer.c \
 	strbuffer.h \
+	utf.c \
+	utf.h \
 	util.h \
 	value.c
 libjansson_la_LDFLAGS = -version-info 0:0:0
diff --git a/src/jansson_private.h b/src/jansson_private.h
new file mode 100644
index 0000000..d359ed5
--- /dev/null
+++ b/src/jansson_private.h
@@ -0,0 +1,8 @@
+#ifndef JANSSON_PRIVATE_H
+#define JANSSON_PRIVATE_H
+
+int json_object_set_nocheck(json_t *json, const char *key, json_t *value);
+json_t *json_string_nocheck(const char *value);
+
+
+#endif
diff --git a/src/load.c b/src/load.c
index f576df1..cae8c53 100644
--- a/src/load.c
+++ b/src/load.c
@@ -9,7 +9,9 @@
 #include
 
 #include
+#include "jansson_private.h"
 #include "strbuffer.h"
+#include "utf.h"
 
 #define TOKEN_INVALID -1
 #define TOKEN_EOF 0
@@ -101,8 +103,37 @@ static char stream_get(stream_t *stream)
 {
     if(!stream->buffer[stream->buffer_pos])
     {
+        char c;
+
         stream->buffer[0] = stream->get(stream->data);
         stream->buffer_pos = 0;
+
+        c = stream->buffer[0];
+
+        if(c == EOF && stream->eof(stream->data))
+            return EOF;
+
+        if(c < 0)
+        {
+            /* multi-byte UTF-8 sequence */
+            int i, count;
+
+            count = utf8_check_first(c);
+            if(!count)
+                return 0;
+
+            assert(count >= 2);
+
+            for(i = 1; i < count; i++)
+                stream->buffer[i] = stream->get(stream->data);
+
+            if(!utf8_check_full(stream->buffer, count))
+                return 0;
+
+            stream->buffer[count] = '\0';
+        }
+        else
+            stream->buffer[1] = '\0';
     }
 
     return (char)stream->buffer[stream->buffer_pos++];
@@ -439,7 +470,7 @@ static json_t *parse_object(lex_t *lex, json_error_t *error)
             goto error;
         }
 
-        if(json_object_set(object, key, value)) {
+        if(json_object_set_nocheck(object, key, value)) {
             free(key);
             json_decref(value);
             goto error;
@@ -513,7 +544,7 @@ static json_t *parse_value(lex_t *lex, json_error_t *error)
 
     switch(lex->token) {
         case TOKEN_STRING: {
-            json = json_string(lex->value.string);
+            json = json_string_nocheck(lex->value.string);
             break;
         }
 
diff --git a/src/utf.c b/src/utf.c
new file mode 100644
index 0000000..092959d
--- /dev/null
+++ b/src/utf.c
@@ -0,0 +1,116 @@
+#include <string.h>
+
+int utf8_check_first(char byte)
+{
+    unsigned char u = (unsigned char)byte;
+
+    if(u < 0x80)
+        return 1;
+
+    if(0x80 <= u && u <= 0xBF) {
+        /* second, third or fourth byte of a multi-byte
+           sequence, i.e. a "continuation byte" */
+        return 0;
+    }
+    else if(u == 0xC0 || u == 0xC1) {
+        /* overlong encoding of an ASCII byte */
+        return 0;
+    }
+    else if(0xC2 <= u && u <= 0xDF) {
+        /* 2-byte sequence */
+        return 2;
+    }
+
+    else if(0xE0 <= u && u <= 0xEF) {
+        /* 3-byte sequence */
+        return 3;
+    }
+    else if(0xF0 <= u && u <= 0xF4) {
+        /* 4-byte sequence */
+        return 4;
+    }
+    else { /* u >= 0xF5 */
+        /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
+           UTF-8 */
+        return 0;
+    }
+}
+
+int utf8_check_full(const char *buffer, int size)
+{
+    int i, value = 0;
+    unsigned char u = (unsigned char)buffer[0];
+
+    if(size == 2)
+    {
+        value = u & 0x1F;
+    }
+    else if(size == 3)
+    {
+        value = u & 0xF;
+    }
+    else if(size == 4)
+    {
+        value = u & 0x7;
+    }
+    else
+        return 0;
+
+    for(i = 1; i < size; i++)
+    {
+        u = (unsigned char)buffer[i];
+
+        if(u < 0x80 || u > 0xBF) {
+            /* not a continuation byte */
+            return 0;
+        }
+
+        value = (value << 6) + (u & 0x3F);
+    }
+
+    if(value > 0x10FFFF) {
+        /* not in Unicode range */
+        return 0;
+    }
+
+    else if(0xD800 <= value && value <= 0xDFFF) {
+        /* invalid code point (UTF-16 surrogate halves) */
+        return 0;
+    }
+
+    else if((size == 2 && value < 0x80) ||
+            (size == 3 && value < 0x800) ||
+            (size == 4 && value < 0x10000)) {
+        /* overlong encoding */
+        return 0;
+    }
+
+    return 1;
+}
+
+int utf8_check_string(const char *string, int length)
+{
+    int i;
+
+    if(length == -1)
+        length = strlen(string);
+
+    for(i = 0; i < length; i++)
+    {
+        int count = utf8_check_first(string[i]);
+        if(count == 0)
+            return 0;
+        else if(count > 1)
+        {
+            if(i + count > length)
+                return 0;
+
+            if(!utf8_check_full(&string[i], count))
+                return 0;
+
+            i += count - 1;
+        }
+    }
+
+    return 1;
+}
diff --git a/src/utf.h b/src/utf.h
new file mode 100644
index 0000000..b8b0662
--- /dev/null
+++ b/src/utf.h
@@ -0,0 +1,9 @@
+#ifndef UTF_H
+#define UTF_H
+
+int utf8_check_first(char byte);
+int utf8_check_full(const char *buffer, int size);
+
+int utf8_check_string(const char *string, int length);
+
+#endif
diff --git a/src/value.c b/src/value.c
index 00501d5..6f0094b 100644
--- a/src/value.c
+++ b/src/value.c
@@ -4,6 +4,8 @@
 
 #include
 #include "hashtable.h"
+#include "jansson_private.h"
+#include "utf.h"
 #include "util.h"
 
 #define container_of(ptr_, type_, member_) \
@@ -109,7 +111,7 @@ json_t *json_object_get(const json_t *json, const char *key)
     return hashtable_get(&object->hashtable, key);
 }
 
-int json_object_set(json_t *json, const char *key, json_t *value)
+int json_object_set_nocheck(json_t *json, const char *key, json_t *value)
 {
     json_object_t *object;
 
@@ -120,6 +122,14 @@ int json_object_set(json_t *json, const char *key, json_t *value)
     return hashtable_set(&object->hashtable, strdup(key), json_incref(value));
 }
 
+int json_object_set(json_t *json, const char *key, json_t *value)
+{
+    if(!utf8_check_string(key, -1))
+        return -1;
+
+    return json_object_set_nocheck(json, key, value);
+}
+
 int json_object_del(json_t *json, const char *key)
 {
     json_object_t *object;
@@ -255,7 +265,7 @@ int json_array_append(json_t *json, json_t *value)
 
 /*** string ***/
 
-json_t *json_string(const char *value)
+json_t *json_string_nocheck(const char *value)
 {
     json_string_t *string = malloc(sizeof(json_string_t));
     if(!string)
@@ -266,6 +276,14 @@ json_t *json_string(const char *value)
     return &string->json;
 }
 
+json_t *json_string(const char *value)
+{
+    if(!utf8_check_string(value, -1))
+        return NULL;
+
+    return json_string_nocheck(value);
+}
+
 const char *json_string_value(const json_t *json)
 {
     if(!json_is_string(json))
-- 
2.1.4