From 50031440a3b7ab2623e9468bd20e837250250cd9 Mon Sep 17 00:00:00 2001 From: Petri Lehtinen Date: Sat, 5 Dec 2009 22:55:30 +0200 Subject: [PATCH] Implement JSON_ENSURE_ASCII encoding flag With this flag, all Unicode characters outside the ASCII range are escaped. --- doc/apiref.rst | 7 +++++ src/dump.c | 67 +++++++++++++++++++++++++++++++++++----------- src/jansson.h | 5 ++-- src/load.c | 2 +- src/utf.c | 33 +++++++++++++++++++++-- src/utf.h | 3 ++- test/testprogs/test_dump.c | 57 +++++++++++++++++++++++++++++++++++++-- 7 files changed, 150 insertions(+), 24 deletions(-) diff --git a/doc/apiref.rst b/doc/apiref.rst index cf8842d..dcd401c 100644 --- a/doc/apiref.rst +++ b/doc/apiref.rst @@ -519,6 +519,13 @@ can be ORed together to obtain *flags*. .. versionadded:: 1.2 +``JSON_ENSURE_ASCII`` + If this flag is used, the output is guaranteed to consist only of + ASCII characters. This is achieved by escaping all Unicode + characters outside the ASCII range. + + .. versionadded:: 1.2 + The following functions perform the actual JSON encoding. The result is in UTF-8. 
diff --git a/src/dump.c b/src/dump.c index 8d2a82b..dc3fcbc 100644 --- a/src/dump.c +++ b/src/dump.c @@ -14,6 +14,7 @@ #include #include "jansson_private.h" #include "strbuffer.h" +#include "utf.h" #define MAX_INTEGER_STR_LENGTH 100 #define MAX_REAL_STR_LENGTH 100 @@ -65,34 +66,49 @@ static int dump_indent(unsigned long flags, int depth, int space, dump_func dump return 0; } -static int dump_string(const char *str, dump_func dump, void *data) +static int dump_string(const char *str, int ascii, dump_func dump, void *data) { - const char *end; + const char *pos, *end; + int32_t codepoint; if(dump("\"", 1, data)) return -1; - end = str; + end = pos = str; while(1) { const char *text; - char seq[7]; + char seq[13]; int length; - while(*end && *end != '\\' && *end != '"' && (unsigned char)*end > 0x1F) - end++; + while(*end) + { + end = utf8_iterate(pos, &codepoint); + if(!end) + return -1; - if(end != str) { - if(dump(str, end - str, data)) + /* mandatory escape or control char */ + if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20) + break; + + /* non-ASCII */ + if(ascii && codepoint > 0x7F) + break; + + pos = end; + } + + if(pos != str) { + if(dump(str, pos - str, data)) return -1; } - if(!*end) + if(end == pos) break; /* handle \, ", and control codes */ length = 2; - switch(*end) + switch(codepoint) { case '\\': text = "\\\\"; break; case '\"': text = "\\\""; break; @@ -103,9 +119,27 @@ static int dump_string(const char *str, dump_func dump, void *data) case '\t': text = "\\t"; break; default: { - sprintf(seq, "\\u00%02x", *end); + /* codepoint is in BMP */ + if(codepoint < 0x10000) + { + sprintf(seq, "\\u%04x", codepoint); + length = 6; + } + + /* not in BMP -> construct a UTF-16 surrogate pair */ + else + { + int32_t first, last; + + codepoint -= 0x10000; + first = 0xD800 | ((codepoint & 0xffc00) >> 10); + last = 0xDC00 | (codepoint & 0x003ff); + + sprintf(seq, "\\u%04x\\u%04x", first, last); + length = 12; + } + text = seq; - length = 6; break; } } @@ 
-113,8 +147,7 @@ static int dump_string(const char *str, dump_func dump, void *data) if(dump(text, length, data)) return -1; - end++; - str = end; + str = pos = end; } return dump("\"", 1, data); @@ -123,6 +156,8 @@ static int dump_string(const char *str, dump_func dump, void *data) static int do_dump(const json_t *json, unsigned long flags, int depth, dump_func dump, void *data) { + int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0; + switch(json_typeof(json)) { case JSON_NULL: return dump("null", 4, data); @@ -158,7 +193,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, } case JSON_STRING: - return dump_string(json_string_value(json), dump, data); + return dump_string(json_string_value(json), ascii, dump, data); case JSON_ARRAY: { @@ -238,7 +273,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, { void *next = json_object_iter_next((json_t *)json, iter); - dump_string(json_object_iter_key(iter), dump, data); + dump_string(json_object_iter_key(iter), ascii, dump, data); if(dump(separator, separator_length, data) || do_dump(json_object_iter_value(iter), flags, depth + 1, dump, data)) diff --git a/src/jansson.h b/src/jansson.h index c8a5a90..d59fe10 100644 --- a/src/jansson.h +++ b/src/jansson.h @@ -141,8 +141,9 @@ json_t *json_loads(const char *input, json_error_t *error); json_t *json_loadf(FILE *input, json_error_t *error); json_t *json_load_file(const char *path, json_error_t *error); -#define JSON_INDENT(n) (n & 0xFF) -#define JSON_COMPACT 0x100 +#define JSON_INDENT(n) (n & 0xFF) +#define JSON_COMPACT 0x100 +#define JSON_ENSURE_ASCII 0x200 char *json_dumps(const json_t *json, unsigned long flags); int json_dumpf(const json_t *json, FILE *output, unsigned long flags); diff --git a/src/load.c b/src/load.c index 32d6500..278f35e 100644 --- a/src/load.c +++ b/src/load.c @@ -149,7 +149,7 @@ static char stream_get(stream_t *stream, json_error_t *error) for(i = 1; i < count; i++) stream->buffer[i] = 
stream->get(stream->data); - if(!utf8_check_full(stream->buffer, count)) + if(!utf8_check_full(stream->buffer, count, NULL)) goto out; stream->stream_pos += count; diff --git a/src/utf.c b/src/utf.c index 2efcb68..dda80f0 100644 --- a/src/utf.c +++ b/src/utf.c @@ -80,7 +80,7 @@ int utf8_check_first(char byte) } } -int utf8_check_full(const char *buffer, int size) +int utf8_check_full(const char *buffer, int size, int32_t *codepoint) { int i; int32_t value = 0; @@ -130,9 +130,38 @@ int utf8_check_full(const char *buffer, int size) return 0; } + if(codepoint) + *codepoint = value; + return 1; } +const char *utf8_iterate(const char *buffer, int32_t *codepoint) +{ + int count; + int32_t value; + + if(!*buffer) + return buffer; + + count = utf8_check_first(buffer[0]); + if(count <= 0) + return NULL; + + if(count == 1) + value = (unsigned char)buffer[0]; + else + { + if(!utf8_check_full(buffer, count, &value)) + return NULL; + } + + if(codepoint) + *codepoint = value; + + return buffer + count; +} + int utf8_check_string(const char *string, int length) { int i; @@ -150,7 +179,7 @@ int utf8_check_string(const char *string, int length) if(i + count > length) return 0; - if(!utf8_check_full(&string[i], count)) + if(!utf8_check_full(&string[i], count, NULL)) return 0; i += count - 1; diff --git a/src/utf.h b/src/utf.h index 75d7b6e..03fba69 100644 --- a/src/utf.h +++ b/src/utf.h @@ -11,7 +11,8 @@ int utf8_encode(int codepoint, char *buffer, int *size); int utf8_check_first(char byte); -int utf8_check_full(const char *buffer, int size); +int utf8_check_full(const char *buffer, int size, int32_t *codepoint); +const char *utf8_iterate(const char *buffer, int32_t *codepoint); int utf8_check_string(const char *string, int length); diff --git a/test/testprogs/test_dump.c b/test/testprogs/test_dump.c index 2532fca..548de06 100644 --- a/test/testprogs/test_dump.c +++ b/test/testprogs/test_dump.c @@ -131,8 +131,8 @@ static void test_compact() #define INDENTED_COMPACT_OBJECT \ "{\n" \ 
- " \"a\":1,\n" \ - " \"b\":2\n" \ + " \"a\":1,\n" \ + " \"b\":2\n" \ "}" #define INDENTED_COMPACT_ARRAY \ "[\n" \ @@ -163,12 +163,65 @@ static void test_compact_indent() json_decref(array); } + +static const char *test_ensure_ascii_data[][2] = { + /* + { "input", "output" } + */ + + /* ascii */ + { "foo", "foo" }, + + /* BMP */ + { "\xc3\xa4 \xc3\xb6 \xc3\xa5", "\\u00e4 \\u00f6 \\u00e5" }, + { "foo \xc3\xa4\xc3\xa5", "foo \\u00e4\\u00e5" }, + { "\xc3\xa4\xc3\xa5 foo", "\\u00e4\\u00e5 foo" }, + { "\xc3\xa4 foo \xc3\xa5", "\\u00e4 foo \\u00e5" }, + + /* non-BMP */ + { "clef g: \xf0\x9d\x84\x9e", "clef g: \\ud834\\udd1e" }, +}; + +static void test_ensure_ascii() +{ + int i; + int num_tests = sizeof(test_ensure_ascii_data) / sizeof(const char *) / 2; + + for(i = 0; i < num_tests; i++) { + json_t *array, *string; + const char *input, *output; + char *result, *stripped; + + input = test_ensure_ascii_data[i][0]; + output = test_ensure_ascii_data[i][1]; + + array = json_array(); + string = json_string(input); + if(!array || !string) + fail("unable to create json values"); + + json_array_append(array, string); + result = json_dumps(array, JSON_ENSURE_ASCII); + + /* strip leading [" and trailing "] */ + stripped = &result[2]; + stripped[strlen(stripped) - 2] = '\0'; + + if(strcmp(stripped, output) != 0) { + free(result); + fail("the result of json_dumps is invalid"); + } + free(result); + } +} + int main(void) { test_normal(); test_indent(); test_compact(); test_compact_indent(); + test_ensure_ascii(); return 0; } -- 2.1.4