error->line = lex->line;
if(saved_text && saved_text[0])
{
- snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
- "%s near '%s'", text, saved_text);
+ if(lex->saved_text.length <= 20) {
+ snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
+ "%s near '%s'", text, saved_text);
+ }
+ else
+ snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
}
else
{
}
}
+/* assumes that str points to 'u' plus at least 4 valid hex digits */
+static int decode_unicode_escape(const char *str)
+{
+ int i;
+ int value = 0;
+
+ assert(str[0] == 'u');
+
+ for(i = 1; i <= 4; i++) {
+ char c = str[i];
+ value <<= 4;
+ if(isdigit(c))
+ value += c - '0';
+ else if(islower(c))
+ value += c - 'a' + 10;
+ else if(isupper(c))
+ value += c - 'A' + 10;
+ else
+ assert(0);
+ }
+
+ return value;
+}
+
static void lex_scan_string(lex_t *lex, json_error_t *error)
{
char c;
const char *p;
char *t;
+ int i;
lex->token = TOKEN_INVALID;
c = lex_get_save(lex, error);
if(c == 'u') {
c = lex_get_save(lex, error);
- for(int i = 0; i < 4; i++) {
+ for(i = 0; i < 4; i++) {
if(!isxdigit(c)) {
lex_unget_unsave(lex, c);
error_set(error, lex, "invalid escape");
if(*p == '\\') {
p++;
if(*p == 'u') {
- /* TODO */
- error_set(error, lex, "\\u escapes are not yet supported");
- free(lex->value.string);
- lex->value.string = NULL;
- goto out;
- } else {
+ char buffer[4];
+ int length;
+ int value;
+
+ value = decode_unicode_escape(p);
+ p += 5;
+
+ if(0xD800 <= value && value <= 0xDBFF) {
+ /* surrogate pair */
+ if(*p == '\\' && *(p + 1) == 'u') {
+ int value2 = decode_unicode_escape(++p);
+ p += 5;
+
+ if(0xDC00 <= value2 && value2 <= 0xDFFF) {
+ /* valid second surrogate */
+ value = ((value - 0xD800) << 10) +
+ (value2 - 0xDC00) +
+ 0x10000;
+ }
+ else {
+ /* invalid second surrogate */
+ error_set(error, lex,
+ "invalid Unicode '\\u%04X\\u%04X'",
+ value, value2);
+ goto out;
+ }
+ }
+ else {
+ /* no second surrogate */
+ error_set(error, lex, "invalid Unicode '\\u%04X'",
+ value);
+ goto out;
+ }
+ }
+ else if(0xDC00 <= value && value <= 0xDFFF) {
+ error_set(error, lex, "invalid Unicode '\\u%04X'", value);
+ goto out;
+ }
+ else if(value == 0)
+ {
+ error_set(error, lex, "\\u0000 is not allowed");
+ goto out;
+ }
+
+ if(utf8_encode(value, buffer, &length))
+ assert(0);
+
+ memcpy(t, buffer, length);
+ t += length;
+ }
+ else {
switch(*p) {
case '"': case '\\': case '/':
*t = *p; break;
case 't': *t = '\t'; break;
default: assert(0);
}
+ t++;
+ p++;
}
}
else
- *t = *p;
-
- t++;
- p++;
+ *(t++) = *(p++);
}
*t = '\0';
lex->token = TOKEN_STRING;
#include <string.h>
+int utf8_encode(int codepoint, char *buffer, int *size)
+{
+ if(codepoint < 0)
+ return -1;
+ else if(codepoint < 0x80)
+ {
+ buffer[0] = (char)codepoint;
+ *size = 1;
+ }
+ else if(codepoint < 0x800)
+ {
+ buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
+ buffer[1] = 0x80 + ((codepoint & 0x03F));
+ *size = 2;
+ }
+ else if(codepoint < 0x10000)
+ {
+ buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
+ buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
+ buffer[2] = 0x80 + ((codepoint & 0x003F));
+ *size = 3;
+ }
+ else if(codepoint <= 0x10FFFF)
+ {
+ buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
+ buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
+ buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
+ buffer[3] = 0x80 + ((codepoint & 0x00003F));
+ *size = 4;
+ }
+ else
+ return -1;
+
+ return 0;
+}
+
int utf8_check_first(char byte)
{
unsigned char u = (unsigned char)byte;