X-Git-Url: http://www.project-moonshot.org/gitweb/?a=blobdiff_plain;f=src%2Fload.c;h=f05230ba41e0d1f2867b3d314a8d47ccd99212a9;hb=5422a862de2354b3419ef628bac5a18c6ef522da;hp=ace963a91f64932a8ffbbcd9ee9b7bdefa743017;hpb=17a69c2d66a86757877c3d4159e999da3a5434bf;p=jansson.git diff --git a/src/load.c b/src/load.c index ace963a..f05230b 100644 --- a/src/load.c +++ b/src/load.c @@ -1,97 +1,306 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + #define _GNU_SOURCE +#include +#include +#include #include #include #include -#include +#include #include #include +#include "jansson_private.h" +#include "strbuffer.h" +#include "utf.h" + +#define TOKEN_INVALID -1 +#define TOKEN_EOF 0 +#define TOKEN_STRING 256 +#define TOKEN_INTEGER 257 +#define TOKEN_REAL 258 +#define TOKEN_TRUE 259 +#define TOKEN_FALSE 260 +#define TOKEN_NULL 261 +/* read one byte from stream, return EOF on end of file */ +typedef int (*get_func)(void *data); + +/* return non-zero if end of file has been reached */ +typedef int (*eof_func)(void *data); + +typedef struct { + get_func get; + eof_func eof; + void *data; + int stream_pos; + char buffer[5]; + int buffer_pos; +} stream_t; -#define JSON_TOKEN_INVALID -1 -#define JSON_TOKEN_EOF 0 -#define JSON_TOKEN_STRING 256 -#define JSON_TOKEN_NUMBER 257 -#define JSON_TOKEN_TRUE 258 -#define JSON_TOKEN_FALSE 259 -#define JSON_TOKEN_NULL 260 typedef struct { - const char *input; - const char *start; + stream_t stream; + strbuffer_t saved_text; int token; int line, column; union { char *string; - double number; + json_int_t integer; + double real; } value; -} json_lex; +} lex_t; /*** error reporting ***/ -static __thread char *json_error_msg = NULL; +static void error_set(json_error_t *error, const lex_t *lex, + const char *msg, ...) +{ + va_list ap; + char msg_text[JSON_ERROR_TEXT_LENGTH]; + + int line = -1, col = -1; + const char *result = msg_text; + + if(!error) + return; + + va_start(ap, msg); + vsnprintf(msg_text, JSON_ERROR_TEXT_LENGTH, msg, ap); + va_end(ap); + + if(lex) + { + const char *saved_text = strbuffer_value(&lex->saved_text); + char msg_with_context[JSON_ERROR_TEXT_LENGTH]; + + line = lex->line; -static void json_set_error(const json_lex *lex, const char *msg) + if(saved_text && saved_text[0]) + { + if(lex->saved_text.length <= 20) { + snprintf(msg_with_context, JSON_ERROR_TEXT_LENGTH, + "%s near '%s'", msg_text, saved_text); + result = msg_with_context; + } + } + else + { + snprintf(msg_with_context, JSON_ERROR_TEXT_LENGTH, + "%s near end of file", msg_text); + result = msg_with_context; + } + } + + jsonp_error_set(error, line, col, "%s", result); +} + + +/*** lexical analyzer ***/ + +static void +stream_init(stream_t *stream, get_func get, eof_func eof, void *data) { - free(json_error_msg); - if(*lex->start) - asprintf(&json_error_msg, "%s near '%.*s' on line %d", msg, - (int)(lex->input - lex->start), lex->start, lex->line); - else - asprintf(&json_error_msg, "%s near end of file", msg); + stream->get = get; + stream->eof = eof; + stream->data = data; + stream->stream_pos = 0; + stream->buffer[0] = '\0'; + stream->buffer_pos = 0; } -const char *json_get_error(void) +static char stream_get(stream_t *stream, json_error_t *error) { - if(!json_error_msg) - json_error_msg = strdup("success"); - return json_error_msg; + char c; + + if(!stream->buffer[stream->buffer_pos]) + { + stream->buffer[0] = stream->get(stream->data); + stream->buffer_pos = 0; + + c = stream->buffer[0]; + + if((unsigned char)c >= 0x80 && c != (char)EOF) + { + /* multi-byte UTF-8 sequence */ + int i, count; + + count = utf8_check_first(c); + if(!count) + goto out; + + assert(count >= 2); + + for(i = 1; i < count; i++) + stream->buffer[i] = stream->get(stream->data); + + if(!utf8_check_full(stream->buffer, count, NULL)) + goto out; + + stream->stream_pos += count; + stream->buffer[count] = '\0'; + } + else { + stream->buffer[1] = '\0'; + stream->stream_pos++; + } + } + + return stream->buffer[stream->buffer_pos++]; + +out: + error_set(error, NULL, "unable to decode byte 0x%x at position %d", + (unsigned char)c, stream->stream_pos); + + stream->buffer[0] = EOF; + stream->buffer[1] = '\0'; + stream->buffer_pos = 1; + + return EOF; +} + +static void stream_unget(stream_t *stream, char c) +{ + assert(stream->buffer_pos > 0); + stream->buffer_pos--; + assert(stream->buffer[stream->buffer_pos] == c); } -/*** lexical analyzer ***/ +static int lex_get(lex_t *lex, json_error_t *error) +{ + return stream_get(&lex->stream, error); +} -static void json_scan_string(json_lex *lex) +static int lex_eof(lex_t *lex) { - /* skip the " */ - const char *p = lex->input + 1; + return lex->stream.eof(lex->stream.data); +} + +static void lex_save(lex_t *lex, char c) +{ + strbuffer_append_byte(&lex->saved_text, c); +} + +static int lex_get_save(lex_t *lex, json_error_t *error) +{ + char c = stream_get(&lex->stream, error); + lex_save(lex, c); + return c; +} + +static void lex_unget_unsave(lex_t *lex, char c) +{ + char d; + stream_unget(&lex->stream, c); + d = strbuffer_pop(&lex->saved_text); + assert(c == d); +} + +static void lex_save_cached(lex_t *lex) +{ + while(lex->stream.buffer[lex->stream.buffer_pos] != '\0') + { + lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]); + lex->stream.buffer_pos++; + } +} + +/* assumes that str points to 'u' plus at least 4 valid hex digits */ +static int32_t decode_unicode_escape(const char *str) +{ + int i; + int32_t value = 0; + + assert(str[0] == 'u'); + + for(i = 1; i <= 4; i++) { + char c = str[i]; + value <<= 4; + if(isdigit(c)) + value += c - '0'; + else if(islower(c)) + value += c - 'a' + 10; + else if(isupper(c)) + value += c - 'A' + 10; + else + assert(0); + } + + return value; +} + +static void lex_scan_string(lex_t *lex, json_error_t *error) +{ + char c; + const char *p; char *t; + int i; - lex->token = JSON_TOKEN_INVALID; + lex->value.string = NULL; + lex->token = TOKEN_INVALID; - while(*p != '"') { - if(*p == '\0') { - /* unterminated string literal */ + c = lex_get_save(lex, error); + + while(c != '"') { + if(c == (char)EOF) { + lex_unget_unsave(lex, c); + if(lex_eof(lex)) + error_set(error, lex, "premature end of input"); goto out; } - if(0 <= *p && *p <= 31) { + else if((unsigned char)c <= 0x1F) { /* control character */ + lex_unget_unsave(lex, c); + if(c == '\n') + error_set(error, lex, "unexpected newline", c); + else + error_set(error, lex, "control character 0x%x", c); goto out; } - else if(*p == '\\') { - p++; - if(*p == 'u') { - p++; - for(int i = 0; i < 4; i++, p++) { - if(!isxdigit(*p)) + + else if(c == '\\') { + c = lex_get_save(lex, error); + if(c == 'u') { + c = lex_get_save(lex, error); + for(i = 0; i < 4; i++) { + if(!isxdigit(c)) { + lex_unget_unsave(lex, c); + error_set(error, lex, "invalid escape"); goto out; + } + c = lex_get_save(lex, error); } } - if(*p == '"' || *p == '\\' || *p == '/' || *p == 'b' || - *p == 'f' || *p == 'n' || *p == 'r' || *p == 't') - p++; - else + else if(c == '"' || c == '\\' || c == '/' || c == 'b' || + c == 'f' || c == 'n' || c == 'r' || c == 't') + c = lex_get_save(lex, error); + else { + lex_unget_unsave(lex, c); + error_set(error, lex, "invalid escape"); goto out; + } } else - p++; + c = lex_get_save(lex, error); } /* the actual value is at most of the same length as the source - string */ - lex->value.string = malloc(p - lex->start); + string, because: + - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte + - a single \uXXXX escape (length 6) is converted to at most 3 bytes + - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair + are converted to 4 bytes + */ + lex->value.string = malloc(lex->saved_text.length + 1); if(!lex->value.string) { /* this is not very nice, since TOKEN_INVALID is returned */ goto out; @@ -100,16 +309,65 @@ static void json_scan_string(json_lex *lex) /* the target */ t = lex->value.string; - p = lex->input + 1; + /* + 1 to skip the " */ + p = strbuffer_value(&lex->saved_text) + 1; + while(*p != '"') { if(*p == '\\') { p++; if(*p == 'u') { - /* TODO: \uXXXX not supported yet */ - free(lex->value.string); - lex->value.string = NULL; - goto out; - } else { + char buffer[4]; + int length; + int32_t value; + + value = decode_unicode_escape(p); + p += 5; + + if(0xD800 <= value && value <= 0xDBFF) { + /* surrogate pair */ + if(*p == '\\' && *(p + 1) == 'u') { + int32_t value2 = decode_unicode_escape(++p); + p += 5; + + if(0xDC00 <= value2 && value2 <= 0xDFFF) { + /* valid second surrogate */ + value = + ((value - 0xD800) << 10) + + (value2 - 0xDC00) + + 0x10000; + } + else { + /* invalid second surrogate */ + error_set(error, lex, + "invalid Unicode '\\u%04X\\u%04X'", + value, value2); + goto out; + } + } + else { + /* no second surrogate */ + error_set(error, lex, "invalid Unicode '\\u%04X'", + value); + goto out; + } + } + else if(0xDC00 <= value && value <= 0xDFFF) { + error_set(error, lex, "invalid Unicode '\\u%04X'", value); + goto out; + } + else if(value == 0) + { + error_set(error, lex, "\\u0000 is not allowed"); + goto out; + } + + if(utf8_encode(value, buffer, &length)) + assert(0); + + memcpy(t, buffer, length); + t += length; + } + else { switch(*p) { case '"': case '\\': case '/': *t = *p; break; @@ -120,186 +378,271 @@ static void json_scan_string(json_lex *lex) case 't': *t = '\t'; break; default: assert(0); } + t++; + p++; } } else - *t = *p; - - t++; - p++; + *(t++) = *(p++); } - /* skip the " */ - p++; - *t = '\0'; - lex->token = JSON_TOKEN_STRING; + lex->token = TOKEN_STRING; + return; out: - lex->input = p; + free(lex->value.string); } -static void json_scan_number(json_lex *lex) +#if JSON_INTEGER_IS_LONG_LONG +#define json_strtoint strtoll +#else +#define json_strtoint strtol +#endif + +static int lex_scan_number(lex_t *lex, char c, json_error_t *error) { - const char *p = lex->input; + const char *saved_text; char *end; + double value; - lex->token = JSON_TOKEN_INVALID; + lex->token = TOKEN_INVALID; - if(*p == '-') - p++; + if(c == '-') + c = lex_get_save(lex, error); - if(*p == '0') - p++; - else /* *p != '0' */ { - p++; - while(isdigit(*p)) - p++; + if(c == '0') { + c = lex_get_save(lex, error); + if(isdigit(c)) { + lex_unget_unsave(lex, c); + goto out; + } + } + else if(isdigit(c)) { + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); } + else { + lex_unget_unsave(lex, c); + goto out; + } + + if(c != '.' && c != 'E' && c != 'e') { + json_int_t value; + + lex_unget_unsave(lex, c); - if(*p == '.') { - p++; - if(!isdigit(*(p++))) + saved_text = strbuffer_value(&lex->saved_text); + + errno = 0; + value = json_strtoint(saved_text, &end, 10); + if(errno == ERANGE) { + if(value < 0) + error_set(error, lex, "too big negative integer"); + else + error_set(error, lex, "too big integer"); goto out; + } - while(isdigit(*p)) - p++; + assert(end == saved_text + lex->saved_text.length); + + lex->token = TOKEN_INTEGER; + lex->value.integer = value; + return 0; } - if(*p == 'E' || *p == 'e') { - p++; - if(*p == '+' || *p == '-') - p++; + if(c == '.') { + c = lex_get(lex, error); + if(!isdigit(c)) + goto out; + lex_save(lex, c); + + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } - if(!isdigit(*(p++))) + if(c == 'E' || c == 'e') { + c = lex_get_save(lex, error); + if(c == '+' || c == '-') + c = lex_get_save(lex, error); + + if(!isdigit(c)) { + lex_unget_unsave(lex, c); goto out; + } - while(isdigit(*p)) - p++; + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); } - lex->token = JSON_TOKEN_NUMBER; + lex_unget_unsave(lex, c); - lex->value.number = strtod(lex->start, &end); - assert(end == p); + saved_text = strbuffer_value(&lex->saved_text); + value = strtod(saved_text, &end); + assert(end == saved_text + lex->saved_text.length); + + if(errno == ERANGE && value != 0) { + error_set(error, lex, "real number overflow"); + goto out; + } + + lex->token = TOKEN_REAL; + lex->value.real = value; + return 0; out: - lex->input = p; + return -1; } -static int json_lex_scan(json_lex *lex) +static int lex_scan(lex_t *lex, json_error_t *error) { char c; - if(lex->token == JSON_TOKEN_STRING) { - free(lex->value.string); - lex->value.string = NULL; + strbuffer_clear(&lex->saved_text); + + if(lex->token == TOKEN_STRING) { + free(lex->value.string); + lex->value.string = NULL; } - while(isspace(*lex->input)) { - if(*lex->input == '\n') + c = lex_get(lex, error); + while(c == ' ' || c == '\t' || c == '\n' || c == '\r') + { + if(c == '\n') lex->line++; - lex->input++; + c = lex_get(lex, error); } - lex->start = lex->input; - c = *lex->input; + if(c == (char)EOF) { + if(lex_eof(lex)) + lex->token = TOKEN_EOF; + else + lex->token = TOKEN_INVALID; + goto out; + } - if(c == '\0') - lex->token = JSON_TOKEN_EOF; + lex_save(lex, c); - else if(c == '{' || c == '}' || c == '[' || c == ']' || - c == ':' || c == ',') { + if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',') lex->token = c; - lex->input++; - } else if(c == '"') - json_scan_string(lex); + lex_scan_string(lex, error); - else if(isdigit(c) || c == '-') - json_scan_number(lex); + else if(isdigit(c) || c == '-') { + if(lex_scan_number(lex, c, error)) + goto out; + } - else if(isalpha(c)) { + else if(isupper(c) || islower(c)) { /* eat up the whole identifier for clearer error messages */ - int len; - - while(isalpha(*lex->input)) - lex->input++; - len = lex->input - lex->start; - - if(strncmp(lex->start, "true", len) == 0) - lex->token = JSON_TOKEN_TRUE; - else if(strncmp(lex->start, "false", len) == 0) - lex->token = JSON_TOKEN_FALSE; - else if(strncmp(lex->start, "null", len) == 0) - lex->token = JSON_TOKEN_NULL; + const char *saved_text; + + c = lex_get_save(lex, error); + while(isupper(c) || islower(c)) + c = lex_get_save(lex, error); + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + + if(strcmp(saved_text, "true") == 0) + lex->token = TOKEN_TRUE; + else if(strcmp(saved_text, "false") == 0) + lex->token = TOKEN_FALSE; + else if(strcmp(saved_text, "null") == 0) + lex->token = TOKEN_NULL; else - lex->token = JSON_TOKEN_INVALID; + lex->token = TOKEN_INVALID; } else { - lex->token = JSON_TOKEN_INVALID; - lex->input++; + /* save the rest of the input UTF-8 sequence to get an error + message of valid UTF-8 */ + lex_save_cached(lex); + lex->token = TOKEN_INVALID; } +out: return lex->token; } -static int json_lex_init(json_lex *lex, const char *input) +static char *lex_steal_string(lex_t *lex) +{ + char *result = NULL; + if(lex->token == TOKEN_STRING) + { + result = lex->value.string; + lex->value.string = NULL; + } + return result; +} + +static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data) { - lex->input = input; - lex->token = JSON_TOKEN_INVALID; + stream_init(&lex->stream, get, eof, data); + if(strbuffer_init(&lex->saved_text)) + return -1; + + lex->token = TOKEN_INVALID; lex->line = 1; - json_lex_scan(lex); return 0; } -static void json_lex_close(json_lex *lex) +static void lex_close(lex_t *lex) { - if(lex->token == JSON_TOKEN_STRING) + if(lex->token == TOKEN_STRING) free(lex->value.string); + strbuffer_close(&lex->saved_text); } /*** parser ***/ -static json_t *json_parse(json_lex *lex); +static json_t *parse_value(lex_t *lex, json_error_t *error); -static json_t *json_parse_object(json_lex *lex) +static json_t *parse_object(lex_t *lex, json_error_t *error) { json_t *object = json_object(); if(!object) return NULL; - json_lex_scan(lex); + lex_scan(lex, error); + if(lex->token == '}') + return object; + while(1) { char *key; json_t *value; - if(lex->token != JSON_TOKEN_STRING) { - json_set_error(lex, "string expected"); + if(lex->token != TOKEN_STRING) { + error_set(error, lex, "string or '}' expected"); goto error; } - key = strdup(lex->value.string); + key = lex_steal_string(lex); if(!key) return NULL; - json_lex_scan(lex); + lex_scan(lex, error); if(lex->token != ':') { - json_set_error(lex, "':' expected"); + free(key); + error_set(error, lex, "':' expected"); goto error; } - json_lex_scan(lex); - - value = json_parse(lex); - if(!value) + lex_scan(lex, error); + value = parse_value(lex, error); + if(!value) { + free(key); goto error; + } - if(json_object_set(object, key, value)) { + if(json_object_set_nocheck(object, key, value)) { + free(key); json_decref(value); goto error; } @@ -307,14 +650,15 @@ static json_t *json_parse_object(json_lex *lex) json_decref(value); free(key); + lex_scan(lex, error); if(lex->token != ',') break; - json_lex_scan(lex); + lex_scan(lex, error); } if(lex->token != '}') { - json_set_error(lex, "'}' expected"); + error_set(error, lex, "'}' expected"); goto error; } @@ -325,34 +669,36 @@ error: return NULL; } -static json_t *json_parse_array(json_lex *lex) +static json_t *parse_array(lex_t *lex, json_error_t *error) { json_t *array = json_array(); if(!array) return NULL; - json_lex_scan(lex); - if(lex->token != ']') { - while(1) { - json_t *elem = json_parse(lex); - if(!elem) - goto error; - - if(json_array_append(array, elem)) { - json_decref(elem); - goto error; - } - json_decref(elem); + lex_scan(lex, error); + if(lex->token == ']') + return array; - if(lex->token != ',') - break; + while(lex->token) { + json_t *elem = parse_value(lex, error); + if(!elem) + goto error; - json_lex_scan(lex); + if(json_array_append(array, elem)) { + json_decref(elem); + goto error; } + json_decref(elem); + + lex_scan(lex, error); + if(lex->token != ',') + break; + + lex_scan(lex, error); } if(lex->token != ']') { - json_set_error(lex, "']' expected"); + error_set(error, lex, "']' expected"); goto error; } @@ -363,81 +709,177 @@ error: return NULL; } -static json_t *json_parse(json_lex *lex) +static json_t *parse_value(lex_t *lex, json_error_t *error) { json_t *json; switch(lex->token) { - case JSON_TOKEN_STRING: { - json = json_string(lex->value.string); + case TOKEN_STRING: { + json = json_string_nocheck(lex->value.string); break; } - case JSON_TOKEN_NUMBER: { - json = json_number(lex->value.number); + case TOKEN_INTEGER: { + json = json_integer(lex->value.integer); break; } - case JSON_TOKEN_TRUE: + case TOKEN_REAL: { + json = json_real(lex->value.real); + break; + } + + case TOKEN_TRUE: json = json_true(); break; - case JSON_TOKEN_FALSE: + case TOKEN_FALSE: json = json_false(); break; - case JSON_TOKEN_NULL: + case TOKEN_NULL: json = json_null(); break; case '{': - json = json_parse_object(lex); + json = parse_object(lex, error); break; case '[': - json = json_parse_array(lex); + json = parse_array(lex, error); break; - case JSON_TOKEN_INVALID: - json_set_error(lex, "invalid token"); + case TOKEN_INVALID: + error_set(error, lex, "invalid token"); return NULL; default: - json_set_error(lex, "unexpected token"); + error_set(error, lex, "unexpected token"); return NULL; } if(!json) return NULL; - json_lex_scan(lex); return json; } -json_t *json_loads(const char *string) +static json_t *parse_json(lex_t *lex, json_error_t *error) { - json_lex lex; - json_t *result = NULL; + lex_scan(lex, error); + if(lex->token != '[' && lex->token != '{') { + error_set(error, lex, "'[' or '{' expected"); + return NULL; + } - if(json_lex_init(&lex, string)) + return parse_value(lex, error); +} + +typedef struct +{ + const char *data; + int pos; +} string_data_t; + +static int string_get(void *data) +{ + char c; + string_data_t *stream = (string_data_t *)data; + c = stream->data[stream->pos]; + if(c == '\0') + return EOF; + else + { + stream->pos++; + return c; + } +} + +static int string_eof(void *data) +{ + string_data_t *stream = (string_data_t *)data; + return (stream->data[stream->pos] == '\0'); +} + +json_t *json_loads(const char *string, size_t flags, json_error_t *error) +{ + lex_t lex; + json_t *result; + (void)flags; /* unused */ + + string_data_t stream_data = {string, 0}; + + if(lex_init(&lex, string_get, string_eof, (void *)&stream_data)) return NULL; - if(lex.token != '[' && lex.token != '{') { - json_set_error(&lex, "'[' or '{' expected"); + jsonp_error_init(error, ""); + + result = parse_json(&lex, error); + if(!result) goto out; + + lex_scan(&lex, error); + if(lex.token != TOKEN_EOF) { + error_set(error, &lex, "end of file expected"); + json_decref(result); + result = NULL; } - result = json_parse(&lex); +out: + lex_close(&lex); + return result; +} + +json_t *json_loadf(FILE *input, size_t flags, json_error_t *error) +{ + lex_t lex; + const char *source; + json_t *result; + (void)flags; /* unused */ + + if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input)) + return NULL; + + if(input == stdin) + source = ""; + else + source = ""; + + jsonp_error_init(error, source); + + result = parse_json(&lex, error); if(!result) goto out; - if(lex.token != JSON_TOKEN_EOF) { - json_set_error(&lex, "end of file expected"); + lex_scan(&lex, error); + if(lex.token != TOKEN_EOF) { + error_set(error, &lex, "end of file expected"); json_decref(result); result = NULL; } out: - json_lex_close(&lex); + lex_close(&lex); + return result; +} + +json_t *json_load_file(const char *path, size_t flags, json_error_t *error) +{ + json_t *result; + FILE *fp; + + jsonp_error_init(error, path); + + fp = fopen(path, "r"); + if(!fp) + { + error_set(error, NULL, "unable to open %s: %s", + path, strerror(errno)); + return NULL; + } + + result = json_loadf(fp, flags, error); + + fclose(fp); return result; }