From 9240146c102f97045836d33fa1a79b267a050172 Mon Sep 17 00:00:00 2001 From: Petri Lehtinen Date: Tue, 14 Jul 2009 20:47:57 +0300 Subject: [PATCH] Implement support for \u escapes --- src/load.c | 99 ++++++++++++++++++++++++++++++++++++++++++++------- src/utf.c | 36 +++++++++++++++++++ src/utf.h | 2 ++ test/testdata/invalid | 20 +++++++++++ test/testdata/valid | 8 +++++ 5 files changed, 152 insertions(+), 13 deletions(-) diff --git a/src/load.c b/src/load.c index f9bcf7b..af6635a 100644 --- a/src/load.c +++ b/src/load.c @@ -83,8 +83,12 @@ static void error_set(json_error_t *error, const lex_t *lex, error->line = lex->line; if(saved_text && saved_text[0]) { - snprintf(error->text, JSON_ERROR_TEXT_LENGTH, - "%s near '%s'", text, saved_text); + if(lex->saved_text.length <= 20) { + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, + "%s near '%s'", text, saved_text); + } + else + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); } else { @@ -208,11 +212,36 @@ static void lex_save_cached(lex_t *lex) } } +/* assumes that str points to 'u' plus at least 4 valid hex digits */ +static int decode_unicode_escape(const char *str) +{ + int i; + int value = 0; + + assert(str[0] == 'u'); + + for(i = 1; i <= 4; i++) { + char c = str[i]; + value <<= 4; + if(isdigit(c)) + value += c - '0'; + else if(islower(c)) + value += c - 'a' + 10; + else if(isupper(c)) + value += c - 'A' + 10; + else + assert(0); + } + + return value; +} + static void lex_scan_string(lex_t *lex, json_error_t *error) { char c; const char *p; char *t; + int i; lex->token = TOKEN_INVALID; @@ -240,7 +269,7 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) c = lex_get_save(lex, error); if(c == 'u') { c = lex_get_save(lex, error); - for(int i = 0; i < 4; i++) { + for(i = 0; i < 4; i++) { if(!isxdigit(c)) { lex_unget_unsave(lex, c); error_set(error, lex, "invalid escape"); @@ -285,12 +314,57 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) if(*p == '\\') { p++; if(*p == 'u') { - /* TODO */ - error_set(error, lex, "\\u escapes are not yet supported"); - free(lex->value.string); - lex->value.string = NULL; - goto out; - } else { + char buffer[4]; + int length; + int value; + + value = decode_unicode_escape(p); + p += 5; + + if(0xD800 <= value && value <= 0xDBFF) { + /* surrogate pair */ + if(*p == '\\' && *(p + 1) == 'u') { + int value2 = decode_unicode_escape(++p); + p += 5; + + if(0xDC00 <= value2 && value2 <= 0xDFFF) { + /* valid second surrogate */ + value = ((value - 0xD800) << 10) + + (value2 - 0xDC00) + + 0x10000; + } + else { + /* invalid second surrogate */ + error_set(error, lex, + "invalid Unicode '\\u%04X\\u%04X'", + value, value2); + goto out; + } + } + else { + /* no second surrogate */ + error_set(error, lex, "invalid Unicode '\\u%04X'", + value); + goto out; + } + } + else if(0xDC00 <= value && value <= 0xDFFF) { + error_set(error, lex, "invalid Unicode '\\u%04X'", value); + goto out; + } + else if(value == 0) + { + error_set(error, lex, "\\u0000 is not allowed"); + goto out; + } + + if(utf8_encode(value, buffer, &length)) + assert(0); + + memcpy(t, buffer, length); + t += length; + } + else { switch(*p) { case '"': case '\\': case '/': *t = *p; break; @@ -301,13 +375,12 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) case 't': *t = '\t'; break; default: assert(0); } + t++; + p++; } } else - *t = *p; - - t++; - p++; + *(t++) = *(p++); } *t = '\0'; lex->token = TOKEN_STRING; diff --git a/src/utf.c b/src/utf.c index 092959d..0adf01b 100644 --- a/src/utf.c +++ b/src/utf.c @@ -1,5 +1,41 @@ #include +int utf8_encode(int codepoint, char *buffer, int *size) +{ + if(codepoint < 0) + return -1; + else if(codepoint < 0x80) + { + buffer[0] = (char)codepoint; + *size = 1; + } + else if(codepoint < 0x800) + { + buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); + buffer[1] = 0x80 + ((codepoint & 0x03F)); + *size = 2; + } + else if(codepoint < 0x10000) + { + buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); + buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); + buffer[2] = 0x80 + ((codepoint & 0x003F)); + *size = 3; + } + else if(codepoint <= 0x10FFFF) + { + buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); + buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); + buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); + buffer[3] = 0x80 + ((codepoint & 0x00003F)); + *size = 4; + } + else + return -1; + + return 0; +} + int utf8_check_first(char byte) { unsigned char u = (unsigned char)byte; diff --git a/src/utf.h b/src/utf.h index b8b0662..b49d8a1 100644 --- a/src/utf.h +++ b/src/utf.h @@ -1,6 +1,8 @@ #ifndef UTF_H #define UTF_H +int utf8_encode(int codepoint, char *buffer, int *size); + int utf8_check_first(char byte); int utf8_check_full(const char *buffer, int size); diff --git a/test/testdata/invalid b/test/testdata/invalid index df167ae..1871d85 100644 --- a/test/testdata/invalid +++ b/test/testdata/invalid @@ -136,3 +136,23 @@ invalid token near '-0' ==== 1 control character 0x9 near '"' +======== +["\u0000 (null byte not allowed)"] +==== +1 +\u0000 is not allowed +======== +["\uDADA (first surrogate without the second)"] +==== +1 +invalid Unicode '\uDADA' +======== +["\uD888\u3210 (first surrogate and invalid second surrogate)"] +==== +1 +invalid Unicode '\uD888\u3210' +======== +["\uDFAA (second surrogate on it's own)"] +==== +1 +invalid Unicode '\uDFAA' diff --git a/test/testdata/valid b/test/testdata/valid index 863933d..ebe7f95 100644 --- a/test/testdata/valid +++ b/test/testdata/valid @@ -8,6 +8,14 @@ ======== ["\"\\\/\b\f\n\r\t"] ======== +["\u002c one-byte UTF-8"] +======== +["\u0123 two-byte UTF-8"] +======== +["\u0821 three-byte UTF-8"] +======== +["\uD834\uDD1E surrogate, four-byte UTF-8"] +======== [0] ======== [1] -- 2.1.4