Implement support for \u escapes

author Petri Lehtinen <petri@digip.org>

Tue, 14 Jul 2009 17:47:57 +0000 (20:47 +0300)

committer Petri Lehtinen <petri@digip.org>

Thu, 16 Jul 2009 06:58:23 +0000 (09:58 +0300)
author Petri Lehtinen <petri@digip.org>
Tue, 14 Jul 2009 17:47:57 +0000 (20:47 +0300)
committer Petri Lehtinen <petri@digip.org>
Thu, 16 Jul 2009 06:58:23 +0000 (09:58 +0300)
diff --git a/src/load.c b/src/load.c

index f9bcf7b..af6635a 100644 (file)
--- a/src/load.c
+++ b/src/load.c
@@ -83,8 +83,12 @@ static void error_set(json_error_t *error, const lex_t *lex,
          error->line = lex->line;
          if(saved_text && saved_text[0])
          {
-            snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
-                     "%s near '%s'", text, saved_text);
+            if(lex->saved_text.length <= 20) {
+                snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
+                         "%s near '%s'", text, saved_text);
+            }
+            else
+                snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
          }
          else
          {
@@ -208,11 +212,36 @@ static void lex_save_cached(lex_t *lex)
      }
  }
  
+/* Decode the 4 hex digits following 'u' into an int; assumes that str points to 'u' plus at least 4 valid hex digits */
+static int decode_unicode_escape(const char *str)
+{
+    int i;
+    int value = 0;
+
+    assert(str[0] == 'u');
+
+    for(i = 1; i <= 4; i++) { /* str[1]..str[4] hold the hex digits, most significant first */
+        char c = str[i];
+        value <<= 4; /* make room for the next 4-bit digit */
+        if(isdigit(c))
+            value += c - '0';
+        else if(islower(c)) /* presumed to be 'a'..'f'; the range is not re-checked here */
+            value += c - 'a' + 10;
+        else if(isupper(c)) /* presumed to be 'A'..'F'; the range is not re-checked here */
+            value += c - 'A' + 10;
+        else
+            assert(0); /* non-alphanumeric character: the documented precondition was violated */
+    }
+
+    return value;
+}
+
  static void lex_scan_string(lex_t *lex, json_error_t *error)
  {
      char c;
      const char *p;
      char *t;
+    int i;
  
      lex->token = TOKEN_INVALID;
  
@@ -240,7 +269,7 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
              c = lex_get_save(lex, error);
              if(c == 'u') {
                  c = lex_get_save(lex, error);
-                for(int i = 0; i < 4; i++) {
+                for(i = 0; i < 4; i++) {
                      if(!isxdigit(c)) {
                          lex_unget_unsave(lex, c);
                          error_set(error, lex, "invalid escape");
@@ -285,12 +314,57 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
          if(*p == '\\') {
              p++;
              if(*p == 'u') {
-                /* TODO */
-                error_set(error, lex, "\\u escapes are not yet supported");
-                free(lex->value.string);
-                lex->value.string = NULL;
-                goto out;
-            } else {
+                char buffer[4];
+                int length;
+                int value;
+
+                value = decode_unicode_escape(p);
+                p += 5;
+
+                if(0xD800 <= value && value <= 0xDBFF) {
+                    /* surrogate pair */
+                    if(*p == '\\' && *(p + 1) == 'u') {
+                        int value2 = decode_unicode_escape(++p);
+                        p += 5;
+
+                        if(0xDC00 <= value2 && value2 <= 0xDFFF) {
+                            /* valid second surrogate */
+                            value = ((value - 0xD800) << 10) +
+                                    (value2 - 0xDC00) +
+                                    0x10000;
+                        }
+                        else {
+                            /* invalid second surrogate */
+                            error_set(error, lex,
+                                      "invalid Unicode '\\u%04X\\u%04X'",
+                                      value, value2);
+                            goto out;
+                        }
+                    }
+                    else {
+                        /* no second surrogate */
+                        error_set(error, lex, "invalid Unicode '\\u%04X'",
+                                  value);
+                        goto out;
+                    }
+                }
+                else if(0xDC00 <= value && value <= 0xDFFF) {
+                    error_set(error, lex, "invalid Unicode '\\u%04X'", value);
+                    goto out;
+                }
+                else if(value == 0)
+                {
+                    error_set(error, lex, "\\u0000 is not allowed");
+                    goto out;
+                }
+
+                if(utf8_encode(value, buffer, &length))
+                    assert(0);
+
+                memcpy(t, buffer, length);
+                t += length;
+            }
+            else {
                  switch(*p) {
                      case '"': case '\\': case '/':
                          *t = *p; break;
@@ -301,13 +375,12 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
                      case 't': *t = '\t'; break;
                      default: assert(0);
                  }
+                t++;
+                p++;
              }
          }
          else
-            *t = *p;
-
-        t++;
-        p++;
+            *(t++) = *(p++);
      }
      *t = '\0';
      lex->token = TOKEN_STRING;
diff --git a/src/utf.c b/src/utf.c

index 092959d..0adf01b 100644 (file)
--- a/src/utf.c
+++ b/src/utf.c
@@ -1,5 +1,41 @@
  #include <string.h>
  
+int utf8_encode(int codepoint, char *buffer, int *size) /* writes the UTF-8 bytes to buffer and their count to *size; returns 0, or -1 if codepoint is out of range */
+{
+    if(codepoint < 0)
+        return -1; /* buffer and *size are not written on failure */
+    else if(codepoint < 0x80)
+    { /* 1 byte: 0xxxxxxx */
+        buffer[0] = (char)codepoint;
+        *size = 1;
+    }
+    else if(codepoint < 0x800)
+    { /* 2 bytes: 110xxxxx 10xxxxxx */
+        buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
+        buffer[1] = 0x80 + ((codepoint & 0x03F));
+        *size = 2;
+    }
+    else if(codepoint < 0x10000)
+    { /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx; values 0xD800..0xDFFF are not rejected here */
+        buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
+        buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
+        buffer[2] = 0x80 + ((codepoint & 0x003F));
+        *size = 3;
+    }
+    else if(codepoint <= 0x10FFFF)
+    { /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+        buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
+        buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
+        buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
+        buffer[3] = 0x80 + ((codepoint & 0x00003F));
+        *size = 4;
+    }
+    else
+        return -1; /* greater than 0x10FFFF */
+
+    return 0;
+}
+
  int utf8_check_first(char byte)
  {
      unsigned char u = (unsigned char)byte;
diff --git a/src/utf.h b/src/utf.h

index b8b0662..b49d8a1 100644 (file)
--- a/src/utf.h
+++ b/src/utf.h
@@ -1,6 +1,8 @@
  #ifndef UTF_H
  #define UTF_H
  
+int utf8_encode(int codepoint, char *buffer, int *size);
+
  int utf8_check_first(char byte);
  int utf8_check_full(const char *buffer, int size);
  
diff --git a/test/testdata/invalid b/test/testdata/invalid

index df167ae..1871d85 100644 (file)
--- a/test/testdata/invalid
+++ b/test/testdata/invalid
@@ -136,3 +136,23 @@ invalid token near '-0'
  ====
  1
  control character 0x9 near '"'
+========
+["\u0000 (null byte not allowed)"]
+====
+1
+\u0000 is not allowed
+========
+["\uDADA (first surrogate without the second)"]
+====
+1
+invalid Unicode '\uDADA'
+========
+["\uD888\u3210 (first surrogate and invalid second surrogate)"]
+====
+1
+invalid Unicode '\uD888\u3210'
+========
+["\uDFAA (second surrogate on it's own)"]
+====
+1
+invalid Unicode '\uDFAA'
diff --git a/test/testdata/valid b/test/testdata/valid

index 863933d..ebe7f95 100644 (file)
--- a/test/testdata/valid
+++ b/test/testdata/valid
@@ -8,6 +8,14 @@
  ========
  ["\"\\\/\b\f\n\r\t"]
  ========
+["\u002c one-byte UTF-8"]
+========
+["\u0123 two-byte UTF-8"]
+========
+["\u0821 three-byte UTF-8"]
+========
+["\uD834\uDD1E surrogate, four-byte UTF-8"]
+========
  [0]
  ========
  [1]
author	Petri Lehtinen <petri@digip.org>
	Tue, 14 Jul 2009 17:47:57 +0000 (20:47 +0300)
committer	Petri Lehtinen <petri@digip.org>
	Thu, 16 Jul 2009 06:58:23 +0000 (09:58 +0300)
src/load.c		patch \| blob \| history
src/utf.c		patch \| blob \| history
src/utf.h		patch \| blob \| history
test/testdata/invalid		patch \| blob \| history
test/testdata/valid		patch \| blob \| history