JSON-Parse-0.62/000755 001751 001751 00000000000 14264373424 012647 5ustar00benben000000 000000 JSON-Parse-0.62/t/000755 001751 001751 00000000000 14264373424 013112 5ustar00benben000000 000000 JSON-Parse-0.62/examples/000755 001751 001751 00000000000 14264373424 014465 5ustar00benben000000 000000 JSON-Parse-0.62/json-common.c000644 001751 001751 00000107153 14010625307 015246 0ustar00benben000000 000000 /* These things are common between the validation and the parsing routines. This is #included into "Json3.xs". */ /* The following matches bytes which are not allowed in JSON strings. "All Unicode characters may be placed within the quotation marks except for the characters that must be escaped: quotation mark, reverse solidus, and the control characters (U+0000 through U+001F)." - from section 2.5 of RFC 4627 */ #define BADBYTES \ '\0':case 0x01:case 0x02:case 0x03: \ case 0x04:case 0x05:case 0x06:case 0x07: \ case 0x08:case 0x09:case 0x0A:case 0x0B: \ case 0x0C:case 0x0D:case 0x0E:case 0x0F: \ case 0x10:case 0x11:case 0x12:case 0x13: \ case 0x14:case 0x15:case 0x16:case 0x17: \ case 0x18:case 0x19:case 0x1A:case 0x1B: \ case 0x1C:case 0x1D:case 0x1E:case 0x1F /* Match whitespace. Whitespace is as defined by the JSON standard, not by Perl. "Insignificant whitespace is allowed before or after any of the six structural characters. ws = *( %x20 / ; Space %x09 / ; Horizontal tab %x0A / ; Line feed or New line %x0D ; Carriage return )" From JSON RFC. */ #define WHITESPACE \ '\n': \ parser->line++; \ /* Fallthrough. */ \ case ' ': \ case '\t': \ case '\r' /* Match digits. */ #define DIGIT \ '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9' /* Match digits from 1-9. This is handled differently because JSON disallows leading zeros in numbers. */ #define DIGIT19 \ '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9' /* Hexadecimal, in upper and lower case. 
*/ #define UHEX 'A': case 'B': case 'C': case 'D': case 'E': case 'F' #define LHEX 'a': case 'b': case 'c': case 'd': case 'e': case 'f' /* As of version 0.45 of JSON::Parse, most of the UTF-8 switches are now in "unicode.c", but the following one is JSON-specific. */ /* This excludes '"' and '\'. */ #define BYTE_20_7F \ 0x20: case 0x21:\ case 0x23: case 0x24: case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:\ case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: case 0x2F: case 0x30:\ case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:\ case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E:\ case 0x3F: case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45:\ case 0x46: case 0x47: case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:\ case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: case 0x52: case 0x53:\ case 0x54: case 0x55: case 0x56: case 0x57: case 0x58: case 0x59: case 0x5A:\ case 0x5B: case 0x5D: case 0x5E: case 0x5F: case 0x60: case 0x61:\ case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: case 0x68:\ case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:\ case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76:\ case 0x77: case 0x78: case 0x79: case 0x7A: case 0x7B: case 0x7C: case 0x7D:\ case 0x7E: case 0x7F /* A "string_t" is a pointer into the input, which lives in "parser->input". The "string_t" structure is used for copying strings when the string does not contain any escapes. When a string contains escapes, it is copied into "parser->buffer". */ typedef struct string { unsigned char * start; #ifdef NOPERL int length; #else /* def NOPERL */ STRLEN length; #endif /* def NOPERL */ /* The "contains_escapes" flag is set if there are backslash escapes in the string like "\r", so that it needs to be cleaned up before using it. That means we use "parser->buffer". 
This is to speed things up, by not doing the cleanup when it isn't necessary. */ unsigned contains_escapes : 1; } string_t; typedef enum { json_invalid, json_initial_state, json_string, json_number, json_literal, json_object, json_array, json_unicode_escape, json_overflow } json_type_t; const char * type_names[json_overflow] = { "invalid", "initial state", "string", "number", "literal", "object", "array", "unicode escape" }; /* The maximum value of bytes to check for. */ #define JSON3MAXBYTE 0x100 // uncomment this when running random test to terminal otherwise the // random characters will mess up the terminal. //#define JSON3MAXBYTE 0x80 #include "errors.c" /* Anything which could be the start of a value. */ #define VALUE_START (XARRAYOBJECTSTART|XSTRING_START|XDIGIT|XMINUS|XLITERAL) typedef struct parser { /* The length of "input". */ unsigned int length; /* The input. This is fixed at the beginning throughout parsing. */ unsigned char * input; /* The end-point of the parsing, the last parsed thing. This increments through "input". */ unsigned char * end; /* The last byte of "input", "parser->input + parser->length". This is used to detect overflows. */ unsigned char * last_byte; /* Allocated size of "buffer". */ int buffer_size; /* Buffer to stick strings into temporarily. */ unsigned char * buffer; /* Line number. */ int line; /* Where the beginning of the series of unfortunate events was. For example if we are parsing an array, this points to the "[" at the start of the array, or if we are parsing a string, this points to the byte after the '"' at the start of the string. */ unsigned char * bad_beginning; /* The bad type itself. */ json_type_t bad_type; /* What we were expecting to see when the error occurred. */ int expected; /* The byte which caused the parser to fail. */ unsigned char * bad_byte; unsigned bad_length; /* The type of error encountered. */ json_error_t error; /* If we were parsing a literal and found a bad character, what were we expecting? 
*/ unsigned char literal_char; /* The end expected. */ int end_expected; /* Number of mallocs. */ int n_mallocs; /* Bytes we accept. */ int valid_bytes[JSON3MAXBYTE]; /* Current depth into arrays or objects. */ int depth; /* Maximum depth we accept. */ int max_depth; /* Perl SV * pointers to copy for our true, false, and null values. */ void * user_true; void * user_false; void * user_null; /* If this is 1, we copy the literals into new SVs. */ unsigned int copy_literals : 1; /* If this is 1, we don't die on errors. */ unsigned int warn_only : 1; /* If this is 1, we check for hash collisions before inserting values. */ unsigned int detect_collisions : 1; /* Don't warn the user about non-false false and untrue true values, etc. */ unsigned int no_warn_literals : 1; /* Are we tokenizing the input? */ unsigned int tokenizing : 1; #ifdef TESTRANDOM /* Return point for longjmp. */ jmp_buf biscuit; char * last_error; #endif /* Unicode? */ unsigned int unicode : 1; /* Force unicode. This happens when we hit "\uxyzy". */ unsigned int force_unicode : 1; /* Upgrade the input from bytes to characters. */ unsigned int upgrade_utf8 : 1; /* Top-level value? We need to know this for the case when we are parsing a number and suddenly meet a '\0' byte. If it's a top level value then we can assume that is just the end of the JSON, but if it's not a top-level value then that is an error, since the end array or end object at least are missing. */ unsigned int top_level_value : 1; /* Produce diagnostics as a hash rather than a string. */ unsigned int diagnostics_hash : 1; #ifdef TESTRANDOM /* This is true if we are testing with random bytes. */ unsigned randomtest : 1; #endif /* def TESTRANDOM */ } json_parse_t; /* Maximum depth of parsing. 
*/ #define JSON_PARSE_DEFAULT_MAX_DEPTH 10000 static void json_parse_init (json_parse_t * parser) { parser->max_depth = JSON_PARSE_DEFAULT_MAX_DEPTH; } /* Check if the user has set something different from the default, and don't croak if we are still OK. */ /* Increment the parsing depth, with check. */ //#define DEBUG_DEPTH #ifdef DEBUG_DEPTH #define PRINT_DEPTH \ printf ("%s:%d: %d\n", __FILE__, __LINE__, parser->depth); #else #define PRINT_DEPTH #endif #define INCDEPTH \ PRINT_DEPTH; \ parser->depth++; \ if (parser->depth > parser->max_depth) { \ croak ("error: too many [ or {, maximum is %d", \ parser->max_depth); \ } #define DECDEPTH \ parser->depth--; \ PRINT_DEPTH; #ifndef NOPERL static SV * error_to_hash (json_parse_t * parser, char * error_as_string); #endif /* ndef NOPERL */ #ifdef __GNUC__ #define INLINE inline #else #define INLINE #endif /* def __GNUC__ */ /* The size of the buffer for printing errors. */ #define ERRORMSGBUFFERSIZE 0x1000 /* Declare all bad inputs as non-returning. */ #ifdef __GNUC__ #if 0 static void failbadinput_json (json_parse_t * parser) __attribute__ ((noreturn)); #endif /* 0 */ static void failbadinput (json_parse_t * parser) __attribute__ ((noreturn)); static INLINE void failbug (char * file, int line, json_parse_t * parser, const char * format, ...) __attribute__ ((noreturn)); #endif /* Assert failure handler. Coming here means there is a bug in the code rather than in the JSON input. We still send it to Perl via "croak". */ static INLINE void failbug (char * file, int line, json_parse_t * parser, const char * format, ...) { char buffer[ERRORMSGBUFFERSIZE]; va_list a; va_start (a, format); vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a); va_end (a); croak ("JSON::Parse: %s:%d: Internal error at line %d: %s", file, line, parser->line, buffer); } /* This is a test for whether the string has ended, which we use when we catch a zero byte in an unexpected part of the input. 
Here we use ">" rather than ">=" because "parser->end" is incremented by one after each access. See the NEXTBYTE macro. */ #define STRINGEND (parser->end > parser->last_byte) /* One of the types which demands a specific next byte. */ #define SPECIFIC(c) (((c) & XIN_LITERAL) || ((c) & XIN_SURROGATE_PAIR)) /* Make the list of valid bytes. */ static void make_valid_bytes (json_parse_t * parser) { int i; for (i = 0; i < JSON3MAXBYTE; i++) { parser->valid_bytes[i] = 0; } for (i = 0; i < n_expectations; i++) { int X; X = 1<expected & X) { int j; for (j = 0; j < JSON3MAXBYTE; j++) { parser->valid_bytes[j] |= allowed[i][j]; } } } if (SPECIFIC (parser->expected)) { parser->valid_bytes[parser->literal_char] = 1; } } /* Repeated arguments to snprintf. */ #define SNEND buffer + string_end #define SNSIZE ERRORMSGBUFFERSIZE - string_end /* Disabled due to clash with Darwin compiler: http://www.cpantesters.org/cpan/report/7c69e0f0-70c0-11e3-95aa-bcf4d95af652 http://www.cpantesters.org/cpan/report/6cde36da-6fd1-11e3-946f-2b87da5af652 #define SNEND, SNSIZE buffer + string_end, ERRORMSGBUFFERSIZE - string_end */ #define EROVERFLOW \ if (string_end >= ERRORMSGBUFFERSIZE - 0x100) { \ failbug (__FILE__, __LINE__, parser, \ "Error string length is %d" \ " of maximum %d. Bailing out.", \ string_end, ERRORMSGBUFFERSIZE); \ } #if 0 /* Coming in to this routine, we have checked the error for validity and converted at failbadinput. If this is called directly the bug traps won't work. 
*/ static void failbadinput_json (json_parse_t * parser) { char buffer[ERRORMSGBUFFERSIZE]; int string_end; string_end = 0; string_end += snprintf (SNEND, SNSIZE, "{" "\"input length\":%d" ",\"bad type\":\"%s\"" ",\"error\":\"%s\"", parser->length, type_names[parser->bad_type], json_errors[parser->error]); EROVERFLOW; if (parser->bad_byte) { int position; position = (int) (parser->bad_byte - parser->input) + 1, string_end += snprintf (SNEND, SNSIZE, ",\"bad byte position\":%d" ",\"bad byte contents\":%d", position, * parser->bad_byte); EROVERFLOW; } if (parser->bad_beginning) { int bcstart; bcstart = (int) (parser->bad_beginning - parser->input) + 1; string_end += snprintf (SNEND, SNSIZE, ",\"start of broken component\":%d", bcstart); EROVERFLOW; } if (parser->error == json_error_unexpected_character) { int j; make_valid_bytes (parser); string_end += snprintf (SNEND, SNSIZE, ",\"valid bytes\":[%d", parser->valid_bytes[0]); EROVERFLOW; for (j = 1; j < JSON3MAXBYTE; j++) { string_end += snprintf (SNEND, SNSIZE, ",%d", parser->valid_bytes[j]); } EROVERFLOW; string_end += snprintf (SNEND, SNSIZE, "]"); EROVERFLOW; } string_end += snprintf (SNEND, SNSIZE, "}\n"); EROVERFLOW; croak (buffer); } #endif /* 0 */ static void failbadinput (json_parse_t * parser) { char buffer[ERRORMSGBUFFERSIZE]; int string_end; int i; int l; const char * format; /* If the error is "unexpected character", and we are at the end of the input, change to "unexpected end of input". This is probably triggered by reading a byte with value '\0', but we don't check the value of "* parser->bad_byte" in the following "if" statement, since it's an error to go past the expected end of the string regardless of whether the byte is '\0'. */ if (parser->error == json_error_unexpected_character && STRINGEND) { parser->error = json_error_unexpected_end_of_input; /* We don't care about what byte it was, we went past the end of the string, which is already a failure. 
*/ parser->bad_byte = 0; /* It trips an assertion if "parser->expected" is set for anything other than an "unexpected character" error. */ parser->expected = 0; } /* Array bounds check for error message. */ if (parser->error <= json_error_invalid && parser->error >= json_error_overflow) { failbug (__FILE__, __LINE__, parser, "Bad value for parser->error: %d\n", parser->error); } format = json_errors[parser->error]; l = strlen (format); if (l >= ERRORMSGBUFFERSIZE - 1) { l = ERRORMSGBUFFERSIZE - 1; } for (i = 0; i < l; i++) { buffer[i] = format[i]; } buffer[l] = '\0'; string_end = l; /* If we got an unexpected character somewhere, append the exact value of the character to the error message. */ if (parser->error == json_error_unexpected_character) { /* This contains the unexpected character itself, from the "parser->bad_byte" pointer. */ unsigned char bb; /* Make sure that we were told where the unexpected character was. Unlocated unexpected characters are a bug. */ if (! parser->bad_byte) { failbug (__FILE__, __LINE__, parser, "unexpected character error but " "parser->bad_byte is invalid"); } bb = * parser->bad_byte; /* We have to check what kind of character. For example printing '\0' with %c will just give a message which suddenly ends when printed to the terminal, and other control characters will be invisible. So display the character in a different way depending on whether it's printable or not. */ /* Don't use "isprint" because on Windows it seems to think that 0x80 is printable: http://www.cpantesters.org/cpan/report/d6438b68-6bf4-1014-8647-737bdb05e747 */ if (bb >= 0x20 && bb < 0x7F) { /* Printable character, print the character itself. */ string_end += snprintf (SNEND, SNSIZE, " '%c'", bb); EROVERFLOW; } else { /* Unprintable character, print its hexadecimal value. 
*/ string_end += snprintf (SNEND, SNSIZE, " 0x%02x", bb); EROVERFLOW; } } else if (parser->error == json_error_name_is_not_unique) { string_end += snprintf (SNEND, SNSIZE, ": \"%.*s\"", parser->bad_length, parser->bad_byte); } /* "parser->bad_type" contains what was being parsed when the error occurred. This should never be undefined. */ if (parser->bad_type <= json_invalid || parser->bad_type >= json_overflow) { failbug (__FILE__, __LINE__, parser, "parsing type set to invalid value %d in error message", parser->bad_type); } string_end += snprintf (SNEND, SNSIZE, " parsing %s", type_names[parser->bad_type]); EROVERFLOW; if (parser->bad_beginning) { int bad_byte; bad_byte = (parser->bad_beginning - parser->input) + 1; string_end += snprintf (SNEND, SNSIZE, " starting from byte %d", bad_byte); EROVERFLOW; } /* "parser->expected" is set for the "unexpected character" error and it tells the user what kind of input was expected. It contains various flags or'd together, so this goes through each possible flag and prints a message for it. */ if (parser->expected) { if (parser->error == json_error_unexpected_character) { int i; int joined; unsigned char bb; bb = * parser->bad_byte; string_end += snprintf (SNEND, SNSIZE, ": expecting "); EROVERFLOW; joined = 0; if (SPECIFIC (parser->expected)) { if (! parser->literal_char) { failbug (__FILE__, __LINE__, parser, "expected literal character unset"); } string_end += snprintf (SNEND, SNSIZE, "'%c'", parser->literal_char); EROVERFLOW; } for (i = 0; i < n_expectations; i++) { int X; X = 1<expected & X) { /* Check that this really is disallowed. 
*/ if (allowed[i][bb]) { failbug (__FILE__, __LINE__, parser, "mismatch parsing %s: got %X " "but it's allowed by %s (%d)", type_names[parser->bad_type], bb, input_expectation[i], i); } if (joined) { string_end += snprintf (SNEND, SNSIZE, " or "); EROVERFLOW; } string_end += snprintf (SNEND, SNSIZE, "%s", input_expectation[i]); EROVERFLOW; joined = 1; } } } else { failbug (__FILE__, __LINE__, parser, "'expected' is set but error %s != unexp. char", json_errors[parser->error]); } } else if (parser->error == json_error_unexpected_character) { failbug (__FILE__, __LINE__, parser, "unexpected character error for 0X%02X at byte %d " "with no expected value set", * parser->bad_byte, parser->bad_byte - parser->input); } #undef SNEND #undef SNSIZE #ifdef TESTRANDOM /* Go back to where we came from. */ if (parser->randomtest) { parser->last_error = buffer; make_valid_bytes (parser); longjmp (parser->biscuit, 1); } #endif /* def TESTRANDOM */ #ifndef NOPERL if (parser->diagnostics_hash) { #if PERL_VERSION > 12 croak_sv (error_to_hash (parser, buffer)); #endif /* PERL_VERSION > 12 */ } #endif /* ndef NOPERL */ if (parser->length > 0) { if (parser->end - parser->input > parser->length) { croak ("JSON error at line %d: %s", parser->line, buffer); } else if (parser->bad_byte) { croak ("JSON error at line %d, byte %d/%d: %s", parser->line, (int) (parser->bad_byte - parser->input + 1), parser->length, buffer); } else { croak ("JSON error at line %d: %s", parser->line, buffer); } } else { croak ("JSON error: %s", buffer); } } #undef SPECIFIC /* This is for failures not due to errors in the input or to bugs but to exhaustion of resources, i.e. out of memory, or file errors would go here if there were any C file opening things anywhere. */ static INLINE void failresources (json_parse_t * parser, const char * format, ...) 
{ char buffer[ERRORMSGBUFFERSIZE]; va_list a; va_start (a, format); vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a); va_end (a); croak ("Parsing failed at line %d, byte %d/%d: %s", parser->line, (int) (parser->end - parser->input), parser->length, buffer); } #undef ERRORMSGBUFFERSIZE /* Get more memory for "parser->buffer". */ static void expand_buffer (json_parse_t * parser, int length) { if (parser->buffer_size < 2 * length + 0x100) { parser->buffer_size = 2 * length + 0x100; if (parser->buffer) { Renew (parser->buffer, parser->buffer_size, unsigned char); } else { Newx (parser->buffer, parser->buffer_size, unsigned char); parser->n_mallocs++; } if (! parser->buffer) { failresources (parser, "out of memory"); } } } #define UNIFAIL(err) \ parser->bad_type = json_unicode_escape; \ parser->error = json_error_ ## err; \ failbadinput (parser) /* Parse the hex bit of a \uXYZA escape. */ static INLINE int parse_hex_bytes (json_parse_t * parser, unsigned char * p) { int k; int unicode; unicode = 0; for (k = 0; k < strlen ("ABCD"); k++) { unsigned char c; c = p[k]; switch (c) { case DIGIT: unicode = unicode * 16 + c - '0'; break; case UHEX: unicode = unicode * 16 + c - 'A' + 10; break; case LHEX: unicode = unicode * 16 + c - 'a' + 10; break; case '\0': if (p + k - parser->input >= parser->length) { UNIFAIL (unexpected_end_of_input); } break; default: parser->bad_byte = p + k; parser->expected = XHEXADECIMAL_CHARACTER; UNIFAIL (unexpected_character); } } return unicode; } /* STRINGFAIL applies for any kind of failure within a string, not just unexpected character errors. 
*/ #define STRINGFAIL(err) \ parser->error = json_error_ ## err; \ parser->bad_type = json_string; \ failbadinput (parser) #define FAILSURROGATEPAIR(c) \ parser->expected = XIN_SURROGATE_PAIR; \ parser->literal_char = c; \ parser->bad_beginning = start - 2; \ parser->error = json_error_unexpected_character; \ parser->bad_type = json_unicode_escape; \ parser->bad_byte = p - 1; \ failbadinput (parser) static INLINE unsigned char * do_unicode_escape (json_parse_t * parser, unsigned char * p, unsigned char ** b_ptr) { int unicode; unsigned int plus; unsigned char * start; start = p; unicode = parse_hex_bytes (parser, p); p += 4; plus = ucs2_to_utf8 (unicode, *b_ptr); if (plus == UTF8_BAD_LEADING_BYTE || plus == UTF8_BAD_CONTINUATION_BYTE) { failbug (__FILE__, __LINE__, parser, "Failed to parse unicode input %.4s", start); } else if (plus == UNICODE_SURROGATE_PAIR) { int unicode2; int plus2; if (parser->last_byte - p < 6) { parser->bad_beginning = start - 2; parser->bad_type = json_unicode_escape; parser->error = json_error_unexpected_end_of_input; failbadinput (parser); } if (*p++ == '\\') { if (*p++ == 'u') { unicode2 = parse_hex_bytes (parser, p); p += 4; plus2 = surrogate_to_utf8 (unicode, unicode2, * b_ptr); if (plus2 <= 0) { if (plus2 == UNICODE_NOT_SURROGATE_PAIR) { parser->bad_byte = 0; parser->bad_beginning = p - 4; UNIFAIL (not_surrogate_pair); } else { failbug (__FILE__, __LINE__, parser, "unhandled error %d from surrogate_to_utf8", plus2); } } * b_ptr += plus2; goto end; } else { FAILSURROGATEPAIR ('u'); } } else { FAILSURROGATEPAIR ('\\'); } } else if (plus <= 0) { failbug (__FILE__, __LINE__, parser, "unhandled error code %d while decoding unicode escape", plus); } * b_ptr += plus; end: if (unicode >= 0x80 && ! parser->unicode) { /* Force the UTF-8 flag on for this string. */ parser->force_unicode = 1; } return p; } /* Handle backslash escapes. We can't use the NEXTBYTE macro here for the reasons outlined below. 
*/ #if 0 /* I expected a switch statement to compile to faster code, but it doesn't seem to. */ #define HANDLE_ESCAPES(p,start) \ switch (c = * ((p)++)) { \ \ case '\\': \ case '/': \ case '"': \ *b++ = c; \ break; \ \ case 'b': \ *b++ = '\b'; \ break; \ \ case 'f': \ *b++ = '\f'; \ break; \ \ case 'n': \ *b++ = '\n'; \ break; \ \ case 'r': \ *b++ = '\r'; \ break; \ \ case 't': \ *b++ = '\t'; \ break; \ \ case 'u': \ p = do_unicode_escape (parser, p, & b); \ break; \ \ default: \ parser->bad_beginning = start; \ parser->bad_byte = p - 1; \ parser->expected = XESCAPE; \ STRINGFAIL (unexpected_character); \ } #else /* This is identical to the above macro, but it uses if statements rather than a switch statement. Using the Clang compiler, this results in about 2.5% faster code, for some reason or another. */ #define HANDLE_ESCAPES(p,start) \ c = * ((p)++); \ if (c == '\\' || c == '/' || c == '"') { \ *b++ = c; \ } \ else if (c == 'b') { \ *b++ = '\b'; \ } \ else if (c == 'f') { \ *b++ = '\f'; \ } \ else if (c == 'n') { \ *b++ = '\n'; \ } \ else if (c == 'r') { \ *b++ = '\r'; \ } \ else if (c == 't') { \ *b++ = '\t'; \ } \ else if (c == 'u') { \ p = do_unicode_escape (parser, p, & b); \ } \ else { \ parser->bad_beginning = start; \ parser->bad_byte = p - 1; \ parser->expected = XESCAPE; \ STRINGFAIL (unexpected_character); \ } #endif /* Resolve "s" by converting escapes into the appropriate things. Put the result into "parser->buffer". The return value is the length of the string. */ static INLINE int resolve_string (json_parse_t * parser, string_t * s) { /* The pointer where we copy the string. This points into "parser->buffer". */ unsigned char * b; /* "p" is the pointer into "parser->input", using "s->start" to get the start point. We don't use "parser->end" for this job because "resolve_string" is called only after the value of the object is resolved. E.g. 
if the object goes like {"hot":{"potatoes":"tomatoes"}} then this routine is called first for "potatoes" and then for "hot" as each sub-element of the hashes is resolved. We don't want to mess around with the value of "parser->end", which is always pointing to one after the last byte viewed. */ unsigned char * p; p = s->start; /* Ensure we have enough memory to fit the string. */ expand_buffer (parser, s->length); b = parser->buffer; while (p - s->start < s->length) { unsigned char c; c = *p++; if (c == '\\') { HANDLE_ESCAPES (p, s->start - 1); } else { *b++ = c; } } /* This is the length of the string in bytes. */ return b - parser->buffer; } #define NEXTBYTE (c = *parser->end++) /* Get an object key value and put it into "key". Check for escapes. */ static INLINE void get_key_string (json_parse_t * parser, string_t * key) { unsigned char c; int i; key->start = parser->end; key->contains_escapes = 0; key_string_next: switch (NEXTBYTE) { case '"': /* Go on eating bytes until we find a ". */ break; case '\\': /* Mark this string as containing escapes. */ key->contains_escapes = 1; switch (NEXTBYTE) { case '\\': case '/': case '"': case 'b': case 'f': case 'n': case 'r': case 't': /* Eat another byte. */ goto key_string_next; case 'u': /* i counts the bytes, from 0 to 3. */ i = 0; unitunes: switch (NEXTBYTE) { case DIGIT: case UHEX: case LHEX: i++; if (i >= strlen ("ABCD")) { goto key_string_next; } else { goto unitunes; } /* not a fall through, we always "goto" above. */ default: parser->bad_beginning = parser->end - 1 - i; parser->expected = XHEXADECIMAL_CHARACTER; parser->bad_byte = parser->end - 1; UNIFAIL (unexpected_character); } /* not a fall through, we either UNIFAIL or goto above. */ default: parser->bad_beginning = key->start - 1; parser->expected = XESCAPE; parser->bad_byte = parser->end - 1; STRINGFAIL (unexpected_character); } /* Not a fall through, we never arrive here. 
*/ case BADBYTES: parser->bad_beginning = key->start - 1; parser->expected = XSTRINGCHAR; parser->bad_byte = parser->end - 1; STRINGFAIL (unexpected_character); /* Not a fall through, STRINGFAIL does not return. */ #define ADDBYTE #define string_start key_string_next #define startofutf8string (key->start) #include "utf8-byte-one.c" /* Not a fall through. */ default: parser->bad_beginning = key->start - 1; parser->expected = XSTRINGCHAR; parser->bad_byte = parser->end - 1; STRINGFAIL (unexpected_character); } key->length = parser->end - key->start - 1; return; #include "utf8-next-byte.c" #undef startofutf8string #undef string_start #undef ADDBYTE } /* "start - 1" puts the start on the " rather than after it. "start" is usually after the quote because the quote is eaten on the way here. */ #define ILLEGALBYTE \ parser->bad_beginning = start - 1; \ parser->bad_byte = parser->end - 1; \ parser->expected = XSTRINGCHAR; \ STRINGFAIL (unexpected_character) /* Resolve the string pointed to by "parser->end" into "parser->buffer". The return value is the length of the string. This is only called if the string has \ escapes in it. */ static INLINE int get_string (json_parse_t * parser) { unsigned char * b; unsigned char c; unsigned char * start; start = parser->end; if (! parser->buffer) { expand_buffer (parser, 0x1000); } b = parser->buffer; string_start: if (b - parser->buffer >= parser->buffer_size - 0x100) { /* Save our offset in parser->buffer, because "realloc" is called by "expand_buffer", and "b" may no longer point to a meaningful location. */ int size = b - parser->buffer; expand_buffer (parser, 2 * parser->buffer_size); b = parser->buffer + size; } switch (NEXTBYTE) { case '"': goto string_end; break; case '\\': HANDLE_ESCAPES (parser->end, start - 1); goto string_start; #define ADDBYTE (* b++ = c) #define startofutf8string start #include "utf8-byte-one.c" /* Not a fall through. 
*/ default: /* fall through */ case BADBYTES: ILLEGALBYTE; } if (STRINGEND) { STRINGFAIL (unexpected_end_of_input); } string_end: return b - parser->buffer; #include "utf8-next-byte.c" #undef ADDBYTE goto string_end; } static void parser_free (json_parse_t * parser) { if (parser->buffer) { Safefree (parser->buffer); parser->n_mallocs--; } /* There is a discrepancy between the number of things used and the number freed. */ if (parser->n_mallocs != 0) { /* The tokenizing parser is freed before the tokens themselves are freed. Whether or not the tokens are freed correctly can be checked in "tokenize_free" in "json-entry-points.c". */ if (! parser->tokenizing) { fprintf (stderr, "%s:%d: %d pieces of unfreed memory remain.\n", __FILE__, __LINE__, parser->n_mallocs); } } parser->buffer = 0; parser->buffer_size = 0; } typedef enum json_token_type { json_token_invalid, json_token_number, json_token_string, json_token_key, json_token_literal, json_token_comma, json_token_colon, json_token_object, json_token_array, n_json_tokens } json_token_type_t; const char * token_names[n_json_tokens] = { "invalid", "number", "string", "key", "literal", "comma", "colon", "object", "array" }; typedef struct json_token json_token_t; struct json_token { json_token_t * child; json_token_t * next; unsigned int start; unsigned int end; json_token_type_t type; unsigned int parent; unsigned blessed : 1; }; #define JSON_TOKEN_PARENT_INVALID 0 /* "start" is the first character of the thing. "end" is the last character of the thing. If the thing only takes one character then "start == end" should be true. */ static json_token_t * json_token_new (json_parse_t * parser, unsigned char * start, unsigned char * end, json_token_type_t type) { json_token_t * new; /* Check the token in various ways. 
*/ switch (type) { case json_token_string: case json_token_key: if (* start != '"') { if (end) { failbug (__FILE__, __LINE__, parser, "no quotes at start of string '%.*s'", end - start, start); } else { failbug (__FILE__, __LINE__, parser, "no quotes at start of string '%.10s'", start); } } if (end && * end != '"') { failbug (__FILE__, __LINE__, parser, "'%c' is not a quote at end of string '%.*s'", * end, end - start, start); } break; case json_token_number: if (* start - '0' > 9 && * start != '-') { failbug (__FILE__, __LINE__, parser, "bad character %c at start of number", * start); } if (* end - '0' > 9) { failbug (__FILE__, __LINE__, parser, "bad character %c at end of number", * end); } break; case json_token_object: if (* start != '{' || (end && * end != '}')) { failbug (__FILE__, __LINE__, parser, "no { or } in object %.*s: char %X", end ? end - start : strlen ((char *) start), start, * start); } break; case json_token_array: if (* start != '[' || (end && * end != ']')) { failbug (__FILE__, __LINE__, parser, "no [ or ] in array"); } break; case json_token_comma: if (end - start != 0 || * start != ',') { failbug (__FILE__, __LINE__, parser, "not a comma %.*s", end - start); } break; case json_token_colon: if (end - start != 0 || * start != ':') { failbug (__FILE__, __LINE__, parser, "not a colon %.*s", end - start); } break; case json_token_literal: break; default: croak ("%s:%d: bad type %d\n", __FILE__, __LINE__, type); } Newx (new, 1, json_token_t); // static int nnew; // nnew++; // fprintf (stderr, "New %d %p\n", nnew, new); parser->n_mallocs++; #if 0 fprintf (stderr, "%s:%d: parser->n_mallocs = %d\n", __FILE__, __LINE__, parser->n_mallocs); #endif /* 0 */ new->start = start - parser->input; if (end) { new->end = end - parser->input + 1; } else { new->end = 0; } new->type = type; new->parent = JSON_TOKEN_PARENT_INVALID; new->child = 0; new->next = 0; return new; } static void json_token_set_end (json_parse_t * parser, json_token_t * jt, unsigned char * 
end) { if (jt->end != 0) { int offset = (int) (end - parser->input); failbug (__FILE__, __LINE__, parser, "attempt to set end as %d is now %d\n", offset, jt->end); } switch (jt->type) { case json_token_string: case json_token_key: if (* end != '"') { failbug (__FILE__, __LINE__, parser, "no quotes at end of string"); } break; case json_token_object: if (* end != '}') { failbug (__FILE__, __LINE__, parser, "no } at end of object"); } break; case json_token_array: if (* end != ']') { failbug (__FILE__, __LINE__, parser, "no ] at end of array"); } break; default: failbug (__FILE__, __LINE__, parser, "set end for unknown type %d", jt->type); break; } jt->end = end - parser->input + 1; } static json_token_t * json_token_set_child (json_parse_t * parser, json_token_t * parent, json_token_t * child) { switch (parent->type) { case json_token_object: case json_token_array: break; default: failbug (__FILE__, __LINE__, parser, "bad parent type %d\n", parent->type); } parent->child = child; return child; } static json_token_t * json_token_set_next (json_token_t * prev, json_token_t * next) { prev->next = next; return next; } JSON-Parse-0.62/README000644 001751 001751 00000006454 14264373417 013542 0ustar00benben000000 000000 _ ____ ___ _ _ ____ | / ___| / _ \| \ | | _ _ | _ \ __ _ _ __ ___ ___ _ | \___ \| | | | \| | (_|_) | |_) / _` | '__/ __|/ _ \ | |_| |___) | |_| | |\ | _ _ | __/ (_| | | \__ \ __/ \___/|____/ \___/|_| \_| (_|_) |_| \__,_|_| |___/\___| This is the README for JSON::Parse version 0.62. JSON::Parse is a "module" for the Perl computer programming language, a library of computer code to install on a computer. This document contains four sections: 1. About - what the module does 2. Documentation - how to learn more about the module 3. Installation - how to install this module on a computer 4. Help - what to do if you get stuck ----------------------------------------------------------------------------- 1. ABOUT JSON::Parse - Parse JSON A module for parsing JSON. 
(JSON means "JavaScript Object Notation" and it is specified in "RFC 8259".) JSON::Parse offers the function "parse_json", which takes a string containing JSON, and returns an equivalent Perl structure. It also offers validation of JSON via "valid_json", which returns true or false depending on whether the JSON is correct or not, and "assert_valid_json", which produces a descriptive fatal error if the JSON is invalid. A function "read_json" reads JSON from a file, and there is a safer version of "parse_json" called "parse_json_safe" which doesn't throw exceptions. For special cases of parsing, there are also methods "new" and "parse", which create a JSON parsing object and run it on text. See "METHODS". JSON::Parse accepts only UTF-8 as input. See "UTF-8 only" and "Handling of Unicode". ----------------------------------------------------------------------------- 2. DOCUMENTATION You can read the documentation for the module online at the following website: * http://metacpan.org/release/JSON-Parse (This link goes to the latest version of the module.) After installing the module, you can read the documentation on your computer using perldoc JSON::Parse ----------------------------------------------------------------------------- 3. INSTALLATION This module requires Perl version 5.8.9 or later. To install the module from CPAN, use cpan JSON::Parse If you have the App::cpanminus installer, you may prefer cpanm JSON::Parse To install the module from the source file, JSON-Parse-0.62.tar.gz, follow this sequence of commands: tar xfz JSON-Parse-0.62.tar.gz cd JSON-Parse-0.62 perl Makefile.PL make make install If you want to test the module before installing it, use "make test" after "make" and before "make install". ----------------------------------------------------------------------------- 4. HELP To get help with the module, you can email the author, Ben Bullock, at . 
If you think there is a problem in the module, you can report a bug at , or if you want to alter the source code of JSON::Parse, try the public repository on github at . ----------------------------------------------------------------------------- This README was written on Sat Jul 16 08:23:59 2022. ----------------------------------------------------------------------------- JSON-Parse-0.62/unicode.h000444 001751 001751 00000015164 14264373365 014457 0ustar00benben000000 000000 /* This file was generated by the following command: /home/ben/software/install/bin/cfunctions unicode.c */ #ifndef CFH_UNICODE_H #define CFH_UNICODE_H #line 8 "unicode.c" #define UTF8_MAX_LENGTH 5 #define UNICODE_MAXIMUM 0x10ffff #define UNICODE_UTF8_4 0x1fffff #define UNICODE_OK 0 #define UTF8_BAD_LEADING_BYTE -1 #define UNICODE_SURROGATE_PAIR -2 #define UNICODE_NOT_SURROGATE_PAIR -3 #define UTF8_BAD_CONTINUATION_BYTE -4 #define UNICODE_EMPTY_INPUT -5 #define UTF8_NON_SHORTEST -6 #define UNICODE_TOO_BIG -7 #define UNICODE_NOT_CHARACTER -8 #define UTF8_VALID 1 #define UTF8_INVALID 0 #line 98 "unicode.c" extern const uint8_t utf8_sequence_len[]; #line 103 "unicode.c" int32_t utf8_bytes (uint8_t c); #line 146 "unicode.c" int32_t utf8_no_checks (const uint8_t* input, const uint8_t** end_ptr); #line 197 "unicode.c" int32_t utf8_to_ucs2 (const uint8_t* input, const uint8_t** end_ptr); #line 295 "unicode.c" int32_t ucs2_to_utf8 (int32_t ucs2, uint8_t* utf8); #line 339 "unicode.c" int32_t unicode_to_surrogates (int32_t unicode, int32_t* hi_ptr, int32_t* lo_ptr); #line 358 "unicode.c" int32_t surrogates_to_unicode (int32_t hi, int32_t lo); #line 381 "unicode.c" int32_t surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t* utf8); #line 394 "unicode.c" int32_t unicode_chars_to_bytes (const uint8_t* utf8, int32_t n_chars); #line 414 "unicode.c" int32_t unicode_count_chars_fast (const uint8_t* utf8); #line 436 "unicode.c" int32_t unicode_count_chars (const uint8_t* utf8); #line 459 "unicode.c" #define 
BYTE_80_8F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F #define BYTE_80_9F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F #define BYTE_80_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_80_8F_B0_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_80_B6_B8_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: 
case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_80_BD \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD #define BYTE_90_BF \ 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \ case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \ case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \ case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \ case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \ case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \ case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_A0_BF \ 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \ case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: \ case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \ case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 
0xBA: case 0xBB: \ case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_C2_DF \ 0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \ case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \ case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \ case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \ case 0xDE: case 0xDF #define BYTE_E1_EC \ 0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \ case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC #define BYTE_F1_F3 \ 0xF1: case 0xF2: case 0xF3 #line 549 "unicode.c" int32_t valid_utf8 (const uint8_t* input, int32_t input_length); #line 563 "unicode.c" typedef struct utf8_info { int32_t len_read; int32_t runes_read; } utf8_info_t; #line 573 "unicode.c" int32_t validate_utf8 (const uint8_t* input, int32_t len, utf8_info_t* info); #line 775 "unicode.c" int32_t trim_to_utf8_start (const uint8_t** ptr); #line 802 "unicode.c" const char* unicode_code_to_error (int32_t code); #endif /* CFH_UNICODE_H */ JSON-Parse-0.62/Changes000644 001751 001751 00000012215 14264373325 014143 0ustar00benben000000 000000 0.62 2022-07-16 * Bug fix for tokenize with backslash-double quote 0.61 2021-02-11 * Add upgrade_utf8 method 0.60 2021-01-26 * Add "read" method to read a file from an object * Rename "run" to "parse" (old name is still OK) * "tokenize_child" no longer returns parent on failure * Add JSON::Whitespace (manipulate JSON whitespace) to the module 0.59 2021-01-26 * "json_file_to_perl" renamed "read_json" * Documentation work -- Benchmarks section updated with latest versions, add Cpanel::JSON::XS -- Add Metacpan vote ratings to other CPAN modules section -- Sections reordered to put methods above parsing details 0.58 2021-01-01 * Protect against stack overflows by having maximum parsing depth -- set_max_depth, get_max_depth methods added * Documentation updated -- JSON RFC changed to 8259 -- Discussion of Unicode 
tests in JSON Test Suite 0.57 2020-07-09 * Bug fix for long strings 0.56 2020-02-06 * Allow build on Solaris and SunOS 0.55_02 2019-01-24 * For testing of Daxim segfault through CPAN testers 0.55 2017-10-21 * Versions synchronised * Memory problem on Cygwin addressed 0.54 2017-10-20 * Refuse to support Solaris 0.53 2017-10-18 * Windows build fix 0.52 2017-10-18 * Removed unnecessary test 0.50 2017-10-18 * tokenize_text method added to JSON::Tokenize 0.49 2016-11-26 * Behaviour for undefined, whitespace-only inputs was altered * JSON Parsing Test Suite compliance 0.48 2016-11-20 * Add "check" method for the benefit of JSON::Repair 0.47 2016-11-16 * Bug fixes for Perl 12 and earlier regarding diagnostics_hash method 0.46 2016-11-16 * Remove experimental feature $json_diagnostics * Add diagnostics_hash method as a replacement for $json_diagnostics * Better warnings for parse_json_safe * Add the examples directory files to the CPAN distribution * Documentation rewrites 0.45 2016-11-15 * Remove non-functioning JSON::Whitespace module from CPAN distribution 0.44 2016-11-13 * Add a primitive interface to JSON::Tokenize * Document the JSON::Tokenize interface 0.43 2016-11-05 * Remove validjson dependence on Path::Tiny. * Produce better outputs from validjson script 0.42 2016-08-22 * Long string copying bug fixed. 0.41 2016-06-14 * Implement "warn_only" * Documentation clarifications 0.40 2016-05-22 * Compilation warning, possible memory leaks (SHLOMIF) 0.39 2015-11-18 * Fix for 64 bit * Changed error message documentation to machine-readable 0.38 2015-10-30 * Add object interface for more flexible parsing. * Add "parse_json_safe" with no exceptions and hash collision detection. * Add choice to copy literals. * Better documentation of changes and examples 0.37 2015-10-29 * Remove $JSON::Parse::true and false, use &PL_sv_yes and &PL_sv_no instead. 0.36 2015-10-24 * Document and install "validjson" script. * Document the "$JSON::Parse::json_diagnostics" global variable. 
0.35 2015-10-19 * Minor documentation changes 0.34 2015-10-19 * Document key collisions * Extend SEE ALSO section on other JSON modules on CPAN. 0.33 2015-10-09 * Memory leak plug * Small speed improvements * Minor bug about commas after bare values * Documentation rewritten to clarify changes in 0.32 and testing. 0.32 2015-07-29 * Update to accept bare values, per RFC 7159. 0.31 2014-12-07 * Make sure to cast to STRLEN in newSVpv(n) calls. * Remove reference to deleted Google Group from documentation. 0.30 2014-03-23 * Prototypes removed from functions * Documentation rewrites * Changes to compile on Darwin OS * Use "assert_valid_json" in DIAGNOSTICS section of pod. 0.29 2013-12-25 * UTF-8 validation in strings * Number, object, array parsing improvements * Fewer error messages & documented (see DIAGNOSTICS in pod) * Testing done with random input to root out problems (see TESTING in pod) 0.28 2013-12-12 * Change function name from "validate_json" to "assert_valid_json" (old name is still accepted). * More tests * Style of error messages changed for "end of input" errors. * More error checking for invalid \u escapes. 0.27 2013-12-11 * More thorough checking of input for format errors (missing commas etc.) * Check for bytes 0x00 - 0x1F in strings and throw error if present * Don't upgrade \u0000 - \u0080 to character strings * Documentation rewrites 0.26 2013-12-10 * Remove core file from distribution. 0.25 2013-12-10 * Fix bug where missing commas were not detected. * Now parses \uXXXX constructions. * "valid_json" now back to being a fast validator. * Add new "validate_json", a fast, exception throwing validator. * Surrogate pairs now handled (tentative) * Upgrade \uXXXX to character string. 0.24 2013-11-27 * Update "Changes" to conform to CPAN::Changes::Spec 0.23 2013-09-15 0.22 2013-08-26 * Typos and minor details in documentation. * Changes to compile on Microsoft C compiler. 0.21 2013-08-07 * Changes documented. 
0.20 2013-08-06 * From-scratch rewrite of module. 0.19 2013-03-15 0.18 2013-03-12 0.17 2013-03-10 0.16 2013-03-08 0.15 2013-03-08 0.14 2012-06-12 0.13 2012-06-12 0.12 2012-06-12 0.11 2012-04-13 0.10 2012-03-17 0.09 2012-03-16 0.07 2011-06-09 0.06 2011-06-02 0.05 2011-05-31 * Changed name from JSON::Argo to JSON::Parse 0.04 2011-05-29 0.03 2011-05-25 0.02 2011-05-25 0.01 2011-05-23 JSON-Parse-0.62/errors.c000444 001751 001751 00000035561 14264373417 014341 0ustar00benben000000 000000 typedef enum { json_error_invalid, json_error_unexpected_character, json_error_unexpected_end_of_input, json_error_not_surrogate_pair, json_error_empty_input, json_error_name_is_not_unique, json_error_overflow } json_error_t; const char * json_errors[json_error_overflow] = { "Invalid", "Unexpected character", "Unexpected end of input", "Not surrogate pair", "Empty input", "Name is not unique", }; enum expectation { xwhitespace, xcomma, xvalue_separator, xobject_end, xarray_end, xhexadecimal_character, xstring_start, xdigit, xdot, xminus, xplus, xexponential, xarrayobjectstart, xescape, xstringchar, xliteral, xin_literal, xin_surrogate_pair, xbytes_80_8f, xbytes_80_9f, xbytes_80_bf, xbytes_90_bf, xbytes_a0_bf, n_expectations }; #define XWHITESPACE (1<bad_type = json_initial_state; parser->expected = 0; parser->error = json_error_empty_input; failbadinput (parser); } /* Check for stray non-whitespace after the end and free memory. */ static void check_end (json_parse_t * parser) { int c; end: switch (NEXTBYTE) { case WHITESPACE: goto end; case '\0': parser_free (parser); return; default: parser->bad_type = json_initial_state; parser->bad_byte = parser->end - 1; parser->expected = XWHITESPACE; parser->error = json_error_unexpected_character; failbadinput (parser); } } #define ENTRYDECL \ /* Our collection of bits and pieces. */ \ \ json_parse_t parser_o = {0}; \ json_parse_t * parser = & parser_o; \ json_parse_init (parser) #ifndef NOPERL /* Set up "parser" with the string from "json". 
*/ static void getstring (SV * json, json_parse_t * parser) { STRLEN length; parser->input = (unsigned char *) SvPV (json, length); parser->end = parser->input; parser->length = (unsigned int) length; parser->unicode = SvUTF8 (json) ? 1 : 0; } #endif /* ndef NOPERL */ #define SETUPPARSER \ parser->line = 1; \ parser->last_byte = parser->input + parser->length /* Error to throw if there is a character other than whitespace, "[" or "{" at the start of the JSON. */ #define BADCHAR \ parser->bad_byte = parser->end - 1; \ parser->bad_type = json_initial_state; \ parser->expected = XARRAYOBJECTSTART | VALUE_START | XWHITESPACE; \ parser->error = json_error_unexpected_character; \ failbadinput (parser) #ifndef NOPERL static SV * json_parse_run (json_parse_t * parser, SV * json) { /* The currently-parsed character. */ char c; /* The returned object. */ SV * r = & PL_sv_undef; getstring (json, parser); if (parser->length == 0) { fail_empty (parser); } SETUPPARSER; parse_start: switch (NEXTBYTE) { case '{': INCDEPTH; r = object (parser); break; case '[': INCDEPTH; r = array (parser); break; case '-': case '0': case DIGIT19: parser->top_level_value = 1; r = number (parser); break; case '"': parser->top_level_value = 1; r = string (parser); break; case 't': parser->top_level_value = 1; r = literal_true (parser); break; case 'f': parser->top_level_value = 1; r = literal_false (parser); break; case 'n': parser->top_level_value = 1; r = literal_null (parser); break; case WHITESPACE: goto parse_start; case '\0': /* We have an empty string. */ fail_empty (parser); default: BADCHAR; } check_end (parser); return r; } /* This is the entry point for non-object parsing. */ SV * parse (SV * json) { /* Make our own parser object on the stack. */ ENTRYDECL; /* Run it. */ return json_parse_run (parser, json); } /* This is the entry point for "safe" non-object parsing. */ SV * parse_safe (SV * json) { /* Make our own parser object on the stack. 
*/ ENTRYDECL; parser_o.detect_collisions = 1; parser_o.copy_literals = 1; parser_o.warn_only = 1; parser_o.diagnostics_hash = 1; /* Run it. */ return json_parse_run (parser, json); } #endif /* ndef NOPERL */ /* Validation without Perl structures. */ static void c_validate (json_parse_t * parser) { /* The currently-parsed character. */ char c; /* If the string is empty, throw an exception. */ if (parser->length == 0) { fail_empty (parser); } SETUPPARSER; validate_start: switch (NEXTBYTE) { case '{': INCDEPTH; valid_object (parser); break; case '[': INCDEPTH; valid_array (parser); break; case '-': case '0': case DIGIT19: parser->top_level_value = 1; valid_number (parser); break; case '"': parser->top_level_value = 1; valid_string (parser); break; case 't': parser->top_level_value = 1; valid_literal_true (parser); break; case 'f': parser->top_level_value = 1; valid_literal_false (parser); break; case 'n': parser->top_level_value = 1; valid_literal_null (parser); break; case WHITESPACE: goto validate_start; default: BADCHAR; } check_end (parser); } static INLINE void print_tokens (json_token_t * t) { printf ("Start: %d End: %d: Type: %s\n", t->start, t->end, token_names[t->type]); if (t->child) { printf ("Children:\n"); print_tokens (t->child); } if (t->next) { printf ("Next:\n"); print_tokens (t->next); } } #ifndef NOPERL static json_token_t * c_tokenize (json_parse_t * parser) { /* The currently-parsed character. */ char c; json_token_t * r; SETUPPARSER; tokenize_start: switch (NEXTBYTE) { case '{': r = tokenize_object (parser); break; case '[': r = tokenize_array (parser); break; case WHITESPACE: goto tokenize_start; default: BADCHAR; } check_end (parser); return r; } static void tokenize_free (json_token_t * token) { json_token_t * next; next = token->child; if (next) { if (! next->blessed) { tokenize_free (next); } token->child = 0; } next = token->next; if (next) { if (! next->blessed) { tokenize_free (next); } token->next = 0; } if (! 
token->blessed) { Safefree (token); } } /* This is the entry point for validation. */ static void validate (SV * json, unsigned int flags) { ENTRYDECL; getstring (json, parser); if (parser->length == 0) { fail_empty (parser); } c_validate (& parser_o); } static void check (json_parse_t * parser, SV * json) { getstring (json, parser); c_validate (parser); } static json_token_t * tokenize (SV * json) { ENTRYDECL; getstring (json, parser); /* Mark this parser as being used for tokenizing to bypass the checks for memory leaks when the parser is freed. */ parser_o.tokenizing = 1; return c_tokenize (& parser_o); } /* Make a hash containing a diagnostic error from the parser. */ static SV * error_to_hash (json_parse_t * parser, char * error_as_string) { HV * error; error = newHV (); #ifdef HK #warn "Redefinition of macro HK" #endif /* def HK */ #undef HK #define HK(x, val) (void) hv_store (error, x, strlen (x), val, 0) HK("length", newSViv (parser->length)); HK("bad type", newSVpv (type_names[parser->bad_type], 0)); HK("error", newSVpv (json_errors[parser->error], 0)); HK("error as string", newSVpv (error_as_string, 0)); if (parser->bad_byte) { int position; position = (int) (parser->bad_byte - parser->input) + 1; HK("bad byte position", newSViv (position)); HK("bad byte contents", newSViv (* parser->bad_byte)); } if (parser->bad_beginning) { int bcstart; bcstart = (int) (parser->bad_beginning - parser->input) + 1; HK("start of broken component", newSViv (bcstart)); } if (parser->error == json_error_unexpected_character) { int j; AV * valid_bytes; valid_bytes = newAV (); make_valid_bytes (parser); for (j = 0; j < JSON3MAXBYTE; j++) { av_push (valid_bytes, newSViv (parser->valid_bytes[j])); } HK("valid bytes", newRV_inc ((SV *) valid_bytes)); } return newRV_inc ((SV *) error); #undef HK } #endif /* ndef NOPERL */ JSON-Parse-0.62/lib/000755 001751 001751 00000000000 14264373424 013415 5ustar00benben000000 000000 JSON-Parse-0.62/json-whitespace.c000644 001751 001751 
00000007747 14004651620 016122 0ustar00benben000000 000000 /* Type for adding whitespace. */ typedef struct json_s { SV * sv; char * s; STRLEN sl; } json_s_t; typedef struct json_ws { SV * news; SV * olds; /* Length of original string. */ STRLEN olds_l; /* Length of new string. */ unsigned int news_l; /* Copy point. */ char * q; /* Origin */ char * p; /* Top of token tree. */ json_token_t * t; /* Token under examination now. */ json_token_t * next; /* Whitespace to add before and after. */ char * before[n_json_tokens]; char * after[n_json_tokens]; int array_depth; int object_depth; char * array_indent; char * object_indent; } json_ws_t; static void copy_whitespace (json_ws_t * ws, char * w) { char * q; q = ws->q; while (*w) { *q++ = *w++; } ws->q = q; } static INLINE int whitespace_json (json_ws_t * ws) { /* Copy place. */ char * c; /* Value of q at entry to this routine, used to calculate added length. */ char * qorig; json_token_t * next; char * q; q = ws->q; qorig = q; next = ws->next; while (next) { /* Copy start of string. */ copy_whitespace (ws, ws->before[next->type]); switch (next->type) { case json_token_object: *q++ = '{'; ws->object_depth++; ws->q = q; q += whitespace_json (ws); ws->object_depth--; *q++ = '}'; break; case json_token_array: *q++ = '['; ws->array_depth++; ws->q = q; q += whitespace_json (ws); ws->object_depth--; *q++ = ']'; break; case json_token_string: case json_token_key: case json_token_literal: case json_token_number: for (c = ws->p + next->start; c <= ws->p + next->end; c++) { *q++ = *c; } break; case json_token_comma: *q++ = ','; break; case json_token_colon: *q++ = ':'; break; default: croak ("unhandled token type %d", next->type); } /* Copy end of string. */ c = ws->after[next->type]; while (*c) { *q++ = *c++; } next = next->next; } return q - qorig; } static int copy_json (char * p, char * q, json_token_t * t) { /* Loop variable. */ json_token_t * next; /* Copy place. 
*/ char * c; /* Value of q at entry to this routine, used to calculate added length. */ char * qorig; next = t; qorig = q; while (next) { switch (next->type) { case json_token_object: *q++ = '{'; q += copy_json (p, q, next->child); *q++ = '}'; break; case json_token_array: *q++ = '['; q += copy_json (p, q, next->child); *q++ = ']'; break; case json_token_string: case json_token_key: case json_token_literal: case json_token_number: for (c = p + next->start; c < p + next->end; c++) { *q++ = *c; } break; case json_token_comma: *q++ = ','; break; case json_token_colon: *q++ = ':'; break; default: croak ("unhandled token type %d", next->type); } next = next->next; } return q - qorig; } /* Remove all the whitespace. */ static SV * strip_whitespace (json_token_t * tokens, SV * json) { SV * stripped; char * p; char * q; /* Original length. */ STRLEN l; /* Length of output. */ unsigned int m; p = SvPV (json, l); stripped = newSV (l); /* Tell Perl it's a string. */ SvPOK_on (stripped); /* Set UTF-8 if necessary. */ if (SvUTF8 (json)) { SvUTF8_on (stripped); } /* Get a pointer to the string inside "stripped". */ q = SvPVX (stripped); m = copy_json (p, q, tokens); /* Set the length. */ SvCUR_set (stripped, m); return stripped; } static SV * indent (json_token_t * tokens, SV * json) { int i; json_ws_t j = {0}; j.olds = json; j.p = SvPV (j.olds, j.olds_l); j.t = tokens; j.next = tokens; for (i = 0; i < n_json_tokens; i++) { j.before[i] = ""; j.after[i] = ""; } j.after[json_token_comma] = "\n"; j.after[json_token_object] = "\n"; j.after[json_token_array] = "\n"; return &PL_sv_undef; } JSON-Parse-0.62/CONTRIBUTING.md000644 001751 001751 00000000537 13773471644 015114 0ustar00benben000000 000000 If you want to contribute to this module, you can file a bug report at github issues or email us with your suggestions. 
As of version 0.58, it is just about possible to install this module from the github repository, so you may be able to fork the module and install it successfully on your local computer. If so, please try sending a pull request. JSON-Parse-0.62/json-perl.c000644 001751 001751 00000063213 14264115541 014723 0ustar00benben000000 000000 /* The C part is broken into three pieces, "json-common.c", "json-perl.c", and "json-entry-points.c". This file contains the "Perl" stuff, for example if we have a string, the stuff to convert it into a Perl hash key or a Perl scalar is in this file. */ /* There are two routes through the code, the PERLING route and the non-PERLING route. If we go via the non-PERLING route, we never create or alter any Perl-related stuff, we just parse each byte and possibly throw an error. This makes validation faster. */ #ifdef PERLING /* We are creating Perl structures from the JSON. */ #define PREFIX(x) x #define SVPTR SV * #define SETVALUE value = #elif defined(TOKENING) /* We are just tokenizing the JSON. */ #define PREFIX(x) tokenize_ ## x #define SVPTR json_token_t * #define SETVALUE value = #else /* not def PERLING/TOKENING */ /* Turn off everything to do with creating Perl things. */ #define PREFIX(x) valid_ ## x #define SVPTR void #define SETVALUE #endif /* def PERLING */ /* This is what INT_MAX_DIGITS is, but #defining it like this causes huge amounts of unnecessary calculation, so this is commented out. #define INT_MAX_DIGITS ((int) (log (INT_MAX) / log (10)) - 1) */ /* The maximum digits we allow an integer before throwing in the towel and returning a Perl string type. */ #define INT_MAX_DIGITS 8 #define USEDIGIT guess = guess * 10 + (c - '0') static INLINE SVPTR PREFIX (number) (json_parse_t * parser) { /* End marker for strtod. */ char * end; /* Start marker for strtod. */ char * start; /* A guess for integer numbers. */ int guess; /* The parsed character itself, the cause of our motion. 
*/ unsigned char c; /* If it has exp or dot in it. */ double d; /* Negative number. */ int minus; /* When this is called, it means that a byte indicating a number was found. We need to re-examine that byte as a number. */ parser->end--; start = (char *) parser->end; #define FAILNUMBER(err) \ if (STRINGEND && \ parser->top_level_value && \ c == '\0') { \ goto exp_number_end; \ } \ parser->bad_byte = parser->end - 1; \ parser->error = json_error_ ## err; \ parser->bad_type = json_number; \ parser->bad_beginning = \ (unsigned char*) start; \ failbadinput (parser) #define NUMBEREND \ WHITESPACE: \ case ']': \ case '}': \ case ',' #define XNUMBEREND (XCOMMA|XWHITESPACE|parser->end_expected) guess = 0; minus = 0; switch (NEXTBYTE) { case DIGIT19: guess = c - '0'; goto leading_digit19; case '0': goto leading_zero; case '-': minus = 1; goto leading_minus; default: parser->expected = XDIGIT | XMINUS; FAILNUMBER (unexpected_character); } leading_digit19: switch (NEXTBYTE) { case DIGIT: USEDIGIT; goto leading_digit19; case '.': goto dot; case 'e': case 'E': goto exp; case NUMBEREND: goto int_number_end; default: parser->expected = XDIGIT | XDOT | XEXPONENTIAL | XNUMBEREND; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } leading_zero: switch (NEXTBYTE) { case '.': /* "0." */ goto dot; case 'e': case 'E': /* "0e" */ goto exp; case NUMBEREND: /* "0" */ goto int_number_end; default: parser->expected = XDOT | XEXPONENTIAL | XNUMBEREND; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } leading_minus: switch (NEXTBYTE) { case DIGIT19: USEDIGIT; goto leading_digit19; case '0': goto leading_zero; default: parser->expected = XDIGIT; FAILNUMBER (unexpected_character); } /* Things like "5." are not allowed so there is no NUMBEREND here. 
*/ dot: switch (NEXTBYTE) { case DIGIT: goto dot_digits; default: parser->expected = XDIGIT; FAILNUMBER (unexpected_character); } /* We have as much as 5.5 so we can stop. */ dot_digits: switch (NEXTBYTE) { case DIGIT: goto dot_digits; case 'e': case 'E': goto exp; case NUMBEREND: goto exp_number_end; default: parser->expected = XDIGIT | XNUMBEREND | XEXPONENTIAL; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } /* Things like "10E" are not allowed so there is no NUMBEREND here. */ exp: switch (NEXTBYTE) { case '-': case '+': goto exp_sign; case DIGIT: goto exp_digits; default: parser->expected = XDIGIT | XMINUS | XPLUS; FAILNUMBER (unexpected_character); } exp_sign: switch (NEXTBYTE) { case DIGIT: goto exp_digits; default: parser->expected = XDIGIT; FAILNUMBER (unexpected_character); } /* We have as much as "3.0e1" or similar. */ exp_digits: switch (NEXTBYTE) { case DIGIT: goto exp_digits; case NUMBEREND: goto exp_number_end; default: parser->expected = XDIGIT | XNUMBEREND; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } exp_number_end: parser->end--; #ifdef PERLING d = strtod (start, & end); #else strtod (start, & end); #endif if ((unsigned char *) end == parser->end) { /* Success, strtod worked as planned. */ #ifdef PERLING return newSVnv (d); #elif defined (TOKENING) return json_token_new (parser, (unsigned char *) start, parser->end, json_token_number); #else return; #endif } else { /* Failure, strtod rejected the number. 
*/ goto string_number_end; } int_number_end: parser->end--; if (parser->end - (unsigned char *) start < INT_MAX_DIGITS + minus) { if (minus) { guess = -guess; } /* printf ("number debug: '%.*s': %d\n", parser->end - (unsigned char *) start, start, guess); */ #ifdef PERLING return newSViv (guess); #elif defined (TOKENING) return json_token_new (parser, (unsigned char *) start, parser->end - 1, json_token_number); #else return; #endif } else { goto string_number_end; } string_number_end: /* We could not convert this number using a number conversion routine, so we are going to convert it to a string. This might happen with ridiculously long numbers or something. The JSON standard doesn't explicitly disallow integers with a million digits. */ #ifdef PERLING return newSVpv (start, (STRLEN) ((char *) parser->end - start)); #elif defined (TOKENING) return json_token_new (parser, (unsigned char *) start, parser->end - 1, json_token_number); #else return; #endif } #ifdef PERLING /* This copies our on-stack buffer "buffer" of size "size" into the end of a Perl SV called "string". */ #define COPYBUFFER { \ if (! string) { \ string = newSVpvn ((char *) buffer, size); \ } \ else { \ char * svbuf; \ STRLEN cur = SvCUR (string); \ if (SvLEN (string) <= cur + size) { \ SvGROW (string, cur + size); \ } \ svbuf = SvPVX (string); \ memcpy (svbuf + cur, buffer, size); \ SvCUR_set (string, cur + size); \ } \ } /* The size of the on-stack buffer. */ #define BUFSIZE 0x1000 /* We need a safety margin when dealing with the buffer, for example if we hit a Unicode \uabcd escape which needs to be decoded, we need to have enough bytes to write into the buffer. */ #define MARGIN 0x10 /* Speedup hack, a special "get_string" for Perl parsing which doesn't use parser->buffer but its own buffer on the stack. 
*/ static INLINE SV * perl_get_string (json_parse_t * parser, STRLEN prefixlen) { unsigned char * b; unsigned char c; unsigned char * start; unsigned char buffer[BUFSIZE]; STRLEN size; SV * string; string = 0; start = parser->end; b = buffer; if (prefixlen > 0) { /* The string from parser->end to parser->end + prefixlen has already been checked and found not to contain the end of the string or any escapes, so we just copy the memory straight into the buffer. This was supposed to speed things up, but it didn't seem to. However this presumably cannot hurt either. */ if (prefixlen > BUFSIZE - MARGIN) { /* This is to account for the very unlikely case that the key of the JSON object is more than BUFSIZE - MARGIN bytes long and has an escape after more than BUFSIZE - MARGIN bytes. */ prefixlen = BUFSIZE - MARGIN; } memcpy (buffer, parser->end, prefixlen); start += prefixlen; } string_start: size = b - buffer; if (size >= BUFSIZE - MARGIN) { /* Spot-check for an overflow. */ if (STRINGEND) { STRINGFAIL (unexpected_end_of_input); } /* "string_start" is a label for a goto which is applied until we get to the end of the string, so size keeps getting larger and larger. Now the string being parsed has proved to be too big for our puny BUFSIZE buffer, so we copy the contents of the buffer into the nice Perl scalar. */ COPYBUFFER; /* Set the point of copying bytes back to the beginning of buffer. We don't reset the memory in buffer. */ b = buffer; size = b - buffer; } NEXTBYTE; /* "if" statements seem to compile to something marginally faster than "switch" statements, for some reason. */ if (c < 0x20) { ILLEGALBYTE; } else if (c >= 0x20 && c <= 0x80) { /* For some reason or another, putting the following "if" statements after the above one results in about 4% faster code than putting them before it. 
*/ if (c == '"') { goto string_end; } if (c == '\\') { HANDLE_ESCAPES (parser->end, start - 1); goto string_start; } * b++ = c; goto string_start; } else { /* Resort to switch statements for the UTF-8 stuff. This actually also contains statements to handle ASCII but they will never be executed. */ switch (c) { #define ADDBYTE * b = c; b++ #define startofutf8string start #include "utf8-byte-one.c" default: /* We have to give up, this byte is too mysterious for our weak minds. */ ILLEGALBYTE; } } string_end: if (STRINGEND) { STRINGFAIL (unexpected_end_of_input); } COPYBUFFER; return string; /* The rest of the UTF-8 stuff goes in here. */ #include "utf8-next-byte.c" #undef ADDBYTE goto string_end; } #endif /* PERLING */ static SVPTR PREFIX (string) (json_parse_t * parser) { unsigned char c; #ifdef PERLING SV * string; STRLEN len; STRLEN prefixlen; #elif defined (TOKENING) json_token_t * string; int len; #else int len; #endif unsigned char * start; start = parser->end; len = 0; /* First of all, we examine the string to work out how long it is and to look for escapes. If we find them, we go to "contains_escapes" and go back and do all the hard work of converting the escapes into the right things. If we don't find any escapes, we just use "start" and "len" and copy the string from inside "input". This is a trick to increase the speed of processing. */ string_start: switch (NEXTBYTE) { case '"': goto string_end; case '\\': goto contains_escapes; #define ADDBYTE len++ #include "utf8-byte-one.c" /* Not a fall through. */ case BADBYTES: ILLEGALBYTE; } /* Parsing of the string ended due to a \0 byte flipping the "while" switch and we dropped into this section before reaching the string's end. */ ILLEGALBYTE; #include "utf8-next-byte.c" #undef ADDBYTE string_end: #ifdef PERLING /* Our string didn't contain any escape sequences, so we can just make a new SV * by copying the string from "start", the old position within the thing we're parsing to start + len. 
*/ string = newSVpvn ((char *) start, len); #elif defined (TOKENING) string = json_token_new (parser, start - 1, start + len, json_token_string); #endif goto string_done; contains_escapes: #ifdef PERLING /* Use "perl_get_string" which keeps the buffer on the stack. Results in a minor speed increase. */ parser->end = start; prefixlen = (STRLEN) (parser->end - start); string = perl_get_string (parser, prefixlen); #elif defined (TOKENING) /* Don't use "len" here since it subtracts the escapes. */ parser->end = start; len = get_string (parser); string = json_token_new (parser, /* Location of first quote. */ start - 1, /* Location of last quote. */ parser->end - 1, json_token_string); #else parser->end = start; len = get_string (parser); #endif string_done: #ifdef PERLING if (parser->unicode || parser->force_unicode || parser->upgrade_utf8) { SvUTF8_on (string); parser->force_unicode = 0; } #endif #if defined (PERLING) || defined (TOKENING) return string; #else return; #endif } #define FAILLITERAL(c) \ parser->expected = XIN_LITERAL; \ parser->literal_char = c; \ parser->bad_beginning = start; \ parser->error = json_error_unexpected_character; \ parser->bad_type = json_literal; \ parser->bad_byte = parser->end - 1; \ failbadinput (parser) static SVPTR PREFIX (literal_true) (json_parse_t * parser) { unsigned char * start; start = parser->end - 1; if (* parser->end++ == 'r') { if (* parser->end++ == 'u') { if (* parser->end++ == 'e') { #ifdef PERLING if (parser->user_true) { return newSVsv (parser->user_true); } else if (parser->copy_literals) { return newSVsv (&PL_sv_yes); } else { return &PL_sv_yes; } #elif defined (TOKENING) return json_token_new (parser, start, parser->end - 1, json_token_literal); #else return; #endif } FAILLITERAL ('e'); } FAILLITERAL ('u'); } FAILLITERAL ('r'); } static SVPTR PREFIX (literal_false) (json_parse_t * parser) { unsigned char * start; start = parser->end - 1; if (* parser->end++ == 'a') { if (* parser->end++ == 'l') { if (* 
parser->end++ == 's') { if (* parser->end++ == 'e') { #ifdef PERLING if (parser->user_false) { return newSVsv (parser->user_false); } else if (parser->copy_literals) { return newSVsv (&PL_sv_no); } else { return &PL_sv_no; } #elif defined (TOKENING) return json_token_new (parser, start, parser->end - 1, json_token_literal); #else return; #endif } FAILLITERAL ('e'); } FAILLITERAL ('s'); } FAILLITERAL ('l'); } FAILLITERAL ('a'); } static SVPTR PREFIX (literal_null) (json_parse_t * parser) { unsigned char * start; start = parser->end - 1; if (* parser->end++ == 'u') { if (* parser->end++ == 'l') { if (* parser->end++ == 'l') { #ifdef PERLING if (parser->user_null) { return newSVsv (parser->user_null); } else if (parser->copy_literals) { return newSVsv (&PL_sv_undef); } else { SvREFCNT_inc (json_null); return json_null; } #elif defined (TOKENING) return json_token_new (parser, start, parser-> end - 1, json_token_literal); #else return; #endif } FAILLITERAL ('l'); } FAILLITERAL ('l'); } FAILLITERAL ('u'); } static SVPTR PREFIX (object) (json_parse_t * parser); /* Given one character, decide what to do next. This goes in the switch statement in both "object ()" and "array ()". 
*/ #define PARSE(start,expected) \ \ case WHITESPACE: \ goto start; \ \ case '"': \ SETVALUE PREFIX (string) (parser); \ break; \ \ case '-': \ case DIGIT: \ parser->end_expected = expected; \ SETVALUE PREFIX (number) (parser); \ break; \ \ case '{': \ INCDEPTH; \ SETVALUE PREFIX (object) (parser); \ break; \ \ case '[': \ INCDEPTH; \ SETVALUE PREFIX (array) (parser); \ break; \ \ case 'f': \ SETVALUE PREFIX (literal_false) (parser); \ break; \ \ case 'n': \ SETVALUE PREFIX (literal_null) (parser); \ break; \ \ case 't': \ SETVALUE PREFIX (literal_true) (parser); \ break #define FAILARRAY(err) \ parser->bad_byte = parser->end - 1; \ parser->bad_type = json_array; \ parser->bad_beginning = start; \ parser->error = json_error_ ## err; \ failbadinput (parser) /* We have seen "[", so now deal with the contents of an array. At the end of this routine, "parser->end" is pointing one beyond the final "]" of the array. */ static SVPTR PREFIX (array) (json_parse_t * parser) { unsigned char c; unsigned char * start; #ifdef PERLING AV * av; SV * value = & PL_sv_undef; #elif defined (TOKENING) json_token_t * av; json_token_t * prev; json_token_t * value; #endif start = parser->end - 1; #ifdef PERLING av = newAV (); #elif defined (TOKENING) av = json_token_new (parser, start, 0, json_token_array); prev = 0; #endif array_start: switch (NEXTBYTE) { PARSE (array_start, XARRAY_END); case ']': goto array_end; default: parser->expected = VALUE_START | XWHITESPACE | XARRAY_END; FAILARRAY (unexpected_character); } #ifdef PERLING av_push (av, value); #elif defined (TOKENING) prev = json_token_set_child (parser, av, value); #endif /* Accept either a comma or whitespace or the end of the array. */ array_middle: switch (NEXTBYTE) { case WHITESPACE: goto array_middle; case ',': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, parser->end - 1, json_token_comma); prev = json_token_set_next (prev, value); #endif goto array_next; case ']': /* Array with at least one element. 
*/ goto array_end; default: parser->expected = XWHITESPACE | XCOMMA | XARRAY_END; FAILARRAY (unexpected_character); } array_next: switch (NEXTBYTE) { PARSE (array_next, XARRAY_END); default: parser->expected = VALUE_START | XWHITESPACE; FAILARRAY (unexpected_character); } #ifdef PERLING av_push (av, value); #elif defined (TOKENING) prev = json_token_set_next (prev, value); #endif goto array_middle; array_end: DECDEPTH; #ifdef PERLING return newRV_noinc ((SV *) av); #elif defined (TOKENING) /* We didn't know where the end was until now. */ json_token_set_end (parser, av, parser->end - 1); return av; #else return; #endif } #define FAILOBJECT(err) \ parser->bad_byte = parser->end - 1; \ parser->bad_type = json_object; \ parser->bad_beginning = start; \ parser->error = json_error_ ## err; \ failbadinput (parser) /* We have seen "{", so now deal with the contents of an object. At the end of this routine, "parser->end" is pointing one beyond the final "}" of the object. */ static SVPTR PREFIX (object) (json_parse_t * parser) { char c; #ifdef PERLING HV * hv; SV * value; /* This is set to -1 if we want a Unicode key. See "perldoc perlapi" under "hv_store". */ int uniflag; #elif defined (TOKENING) json_token_t * hv; json_token_t * value; json_token_t * prev; #endif string_t key; /* Start of parsing. */ unsigned char * start; start = parser->end - 1; #ifdef PERLING if (parser->unicode || parser->upgrade_utf8) { /* Keys are unicode. */ uniflag = -1; } else { /* Keys are not unicode. */ uniflag = 1; } hv = newHV (); #elif defined (TOKENING) hv = json_token_new (parser, start, 0, json_token_object); prev = 0; #endif hash_start: switch (NEXTBYTE) { case WHITESPACE: goto hash_start; case '}': goto hash_end; case '"': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, 0, json_token_string); /* We only come past the label "hash_start" once, so we don't need to check that there is not already a child. 
*/ json_token_set_child (parser, hv, value); prev = value; #endif get_key_string (parser, & key); #ifdef TOKENING /* We didn't know where the end of the string was until now so we wait until after "get_key_string" to set the end. */ json_token_set_end (parser, value, parser->end - 1); #endif goto hash_next; default: parser->expected = XWHITESPACE | XSTRING_START | XOBJECT_END; FAILOBJECT (unexpected_character); } hash_middle: /* We are in the middle of a hash. We have seen a key:value pair, and now we're looking for either a comma and then another key-value pair, or a closing curly brace and the end of the hash. */ switch (NEXTBYTE) { case WHITESPACE: goto hash_middle; case '}': goto hash_end; case ',': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, parser->end - 1, json_token_comma); prev = json_token_set_next (prev, value); #endif goto hash_key; default: parser->expected = XWHITESPACE | XCOMMA | XOBJECT_END; FAILOBJECT (unexpected_character); } hash_key: /* We're looking for a key in the hash, which is a string starting with a double quotation mark. */ switch (NEXTBYTE) { case WHITESPACE: goto hash_key; case '"': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, 0, json_token_string); prev = json_token_set_next (prev, value); #endif get_key_string (parser, & key); #ifdef TOKENING /* We didn't know where the end of the string was until now so we wait until after "get_key_string" to set the end. */ json_token_set_end (parser, value, parser->end - 1); #endif goto hash_next; default: parser->expected = XWHITESPACE | XSTRING_START; FAILOBJECT (unexpected_character); } hash_next: /* We've seen a key, now we're looking for a colon. 
*/ switch (NEXTBYTE) { case WHITESPACE: goto hash_next; case ':': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, parser->end - 1, json_token_colon); prev = json_token_set_next (prev, value); #endif goto hash_value; default: parser->expected = XWHITESPACE | XVALUE_SEPARATOR; FAILOBJECT (unexpected_character); } hash_value: /* We've seen a colon, now we're looking for a value, which can be anything at all, including another hash. Most of the cases are dealt with in the PARSE macro. */ switch (NEXTBYTE) { PARSE (hash_value, XOBJECT_END); default: parser->expected = XWHITESPACE | VALUE_START; FAILOBJECT (unexpected_character); } if (key.contains_escapes) { /* The key had something like "\n" in it, so we can't just copy the value but have to process it to remove the escapes. */ #ifdef PERLING int klen; klen = resolve_string (parser, & key); key.start = parser->buffer; key.length = klen; #else resolve_string (parser, & key); #endif } #ifdef PERLING if (parser->detect_collisions) { /* Look in hv for an existing key with our values. */ SV ** sv_ptr; sv_ptr = hv_fetch (hv, (char *) key.start, key.length * uniflag, 0); if (sv_ptr) { parser->bad_byte = key.start; parser->bad_length = key.length; parser->bad_type = json_object; parser->bad_beginning = start; parser->error = json_error_name_is_not_unique; failbadinput (parser); } } (void) hv_store (hv, (char *) key.start, key.length * uniflag, value, 0); #endif #if defined(TOKENING) prev = json_token_set_next (prev, value); #endif goto hash_middle; hash_end: DECDEPTH; #ifdef PERLING return newRV_noinc ((SV *) hv); #elif defined (TOKENING) json_token_set_end (parser, hv, parser->end - 1); return hv; #else return; #endif } #undef PREFIX #undef SVPTR #undef SETVALUE #ifdef PERLING /* Set and delete user-defined literals. 
*/ static void json_parse_delete_true (json_parse_t * parser) { if (parser->user_true) { SvREFCNT_dec (parser->user_true); parser->user_true = 0; } } static void json_parse_set_true (json_parse_t * parser, SV * user_true) { json_parse_delete_true (parser); if (! SvTRUE (user_true) && ! parser->no_warn_literals) { warn ("User-defined value for JSON true evaluates as false"); } if (parser->copy_literals && ! parser->no_warn_literals) { warn ("User-defined value overrules copy_literals"); } parser->user_true = user_true; SvREFCNT_inc (user_true); } static void json_parse_delete_false (json_parse_t * parser) { if (parser->user_false) { SvREFCNT_dec (parser->user_false); parser->user_false = 0; } } static void json_parse_set_false (json_parse_t * parser, SV * user_false) { json_parse_delete_false (parser); if (SvTRUE (user_false) && ! parser->no_warn_literals) { warn ("User-defined value for JSON false evaluates as true"); } if (parser->copy_literals && ! parser->no_warn_literals) { warn ("User-defined value overrules copy_literals"); } parser->user_false = user_false; SvREFCNT_inc (user_false); } static void json_parse_delete_null (json_parse_t * parser) { if (parser->user_null) { SvREFCNT_dec (parser->user_null); parser->user_null = 0; } } static void json_parse_set_null (json_parse_t * parser, SV * user_null) { if (parser->copy_literals && ! parser->no_warn_literals) { warn ("User-defined value overrules copy_literals"); } json_parse_delete_null (parser); parser->user_null = user_null; SvREFCNT_inc (user_null); } static void json_parse_free (json_parse_t * parser) { /* We can get here with depth > 0 if the parser fails and then the error is caught. */ if (parser->depth < 0) { warn ("Parser depth underflow %d", parser->depth); } json_parse_delete_true (parser); json_parse_delete_false (parser); json_parse_delete_null (parser); Safefree (parser); } static void json_parse_copy_literals (json_parse_t * parser, SV * onoff) { if (! 
parser->no_warn_literals && (parser->user_true || parser->user_false || parser->user_null)) { warn ("User-defined value overrules copy_literals"); } parser->copy_literals = SvTRUE (onoff) ? 1 : 0; } #endif /* def PERLING */ JSON-Parse-0.62/utf8-next-byte.c000644 001751 001751 00000002423 13012464667 015617 0ustar00benben000000 000000 #define FAILUTF8(want) \ parser->bad_beginning = startofutf8string - 1; \ parser->bad_type = json_string; \ parser->bad_byte = parser->end - 1; \ parser->expected = want; \ parser->error = json_error_unexpected_character; \ failbadinput (parser) byte_last_80_bf: switch (NEXTBYTE) { case BYTE_80_BF: ADDBYTE; goto string_start; default: FAILUTF8 (XBYTES_80_BF); } byte_penultimate_80_bf: switch (NEXTBYTE) { case BYTE_80_BF: ADDBYTE; goto byte_last_80_bf; default: FAILUTF8 (XBYTES_80_BF); } byte24_90_bf: switch (NEXTBYTE) { case BYTE_90_BF: ADDBYTE; goto byte_penultimate_80_bf; default: FAILUTF8 (XBYTES_90_BF); } byte23_80_9f: switch (NEXTBYTE) { case BYTE_80_9F: ADDBYTE; goto byte_last_80_bf; default: FAILUTF8 (XBYTES_80_9F); } byte23_a0_bf: switch (NEXTBYTE) { case BYTE_A0_BF: ADDBYTE; goto byte_last_80_bf; default: FAILUTF8 (XBYTES_A0_BF); } byte24_80_bf: switch (NEXTBYTE) { case BYTE_80_BF: ADDBYTE; goto byte_penultimate_80_bf; default: FAILUTF8 (XBYTES_80_BF); } byte24_80_8f: switch (NEXTBYTE) { case BYTE_80_8F: ADDBYTE; goto byte_penultimate_80_bf; default: FAILUTF8 (XBYTES_80_8F); } JSON-Parse-0.62/script/000755 001751 001751 00000000000 14264373424 014153 5ustar00benben000000 000000 JSON-Parse-0.62/MANIFEST.SKIP000644 001751 001751 00000002552 14003431376 014541 0ustar00benben000000 000000 (build|make-pod|clean|make-errors|run)\.pl blib/.* .gitignore .git/.* Makefile$ pm_to_blib MYMETA..* .*\.bak ^Json3-[0-9.]/$ ^Json3-[0-9.]\.tar\.gz$ ^Parse.(?:c|o|bs)$ ^peek\.pl$ ^test-empty-string\.pl$ ^xt/.* ^lib/JSON/Parse\.pod\.tmpl$ ^Parse\.bs$ # Can download from github if needed. 
benchmarks # Spurious build guff ^makeitfile$ # Corefiles, a.out sometimes get into the distribution. ^.*\.core$ ^a\.out$ # There aren't any files I plan to distribute with these file types. ^[^/]*\.pl$ ^[^/]*\.txt$ # Don't put the "torture test" code into the CPAN distribution. It # contains bad stuff like "setjmp" which is incompatible with many # people's operating systems. People who want this kind of stuff can # get it from the repository. ^Json3-random-test\.c$ ^(?:json-)?random-test\.c$ ^random-test$ ^randomjson\.pl$ # Don't add the UTF-8 test code to the repo: ^utf8test/.*$ # Makefile for C random stuff ^randmakefile$ # Not necessary or useful for CPAN ^FILES$ # Design document directory, no use to put on CPAN. ^doc/.*$ # ^examples/.*-out\.txt$ # Template for POD ^lib/JSON/Tokenize\.pod\.tmpl$ ^lib/JSON/Whitespace\.pod\.tmpl$ # Obsolete files ^obsolete/.*$ # Author version change script ^versionup\.pl$ # CI control files ^\.travis\.yml$ ^build/.*$ # These are used to make the documentation. ^see-also-info\.json$ ^modules\.pl$ # Local variables: # comment-start: "#" # End: JSON-Parse-0.62/Parse.xs000644 001751 001751 00000012426 14010625451 014267 0ustar00benben000000 000000 #include "EXTERN.h" #include "perl.h" #include #include "XSUB.h" /* TESTRANDOM should never be defined in the code released to CPAN. That this is not defined is tested in "xt/testrandom-invalid.t". */ //#define TESTRANDOM #ifdef TESTRANDOM #include #endif /* def TESTRANDOM */ /* A structure representing the "null" in JSON. Although we're now using PL_sv_yes and PL_sv_no, we don't use PL_sv_undef, because perldoc perlguts says it's a bad idea. */ static SV * json_null; /* Code starts here. 
*/ #include "unicode.h" #include "unicode.c" #include "json-common.c" #define PERLING #include "json-perl.c" #undef PERLING #define TOKENING #include "json-perl.c" #undef TOKENING #include "json-perl.c" #include "json-entry-points.c" #ifdef TESTRANDOM #include "json-random-test.c" #endif /* def TESTRANDOM */ #include "json-whitespace.c" #ifdef NOPERL #error "Cannot define NOPERL error when compiling Perl version" #endif /* def NOPERL */ typedef json_parse_t * JSON__Parse; typedef json_token_t * JSON__Tokenize; MODULE=JSON::Parse PACKAGE=JSON::Parse PROTOTYPES: DISABLE BOOT: { json_null = get_sv ("JSON::Parse::null", GV_ADD); SvREADONLY_on (json_null); } SV * parse_json (json) SV * json; CODE: RETVAL = parse (json); OUTPUT: RETVAL SV * parse_json_safer (json) SV * json; CODE: RETVAL = parse_safe (json); OUTPUT: RETVAL void assert_valid_json (json) SV * json; CODE: validate (json, 0); JSON::Parse new (char * class, ...) CODE: if (! class) { croak ("no class"); } Newxz (RETVAL, 1, json_parse_t); json_parse_init (RETVAL); OUTPUT: RETVAL SV * run_internal (parser, json) JSON::Parse parser SV * json CODE: RETVAL = json_parse_run (parser, json); OUTPUT: RETVAL void check (parser, json) JSON::Parse parser SV * json CODE: check (parser, json); void DESTROY (parser) JSON::Parse parser; CODE: json_parse_free (parser); void set_true (parser, user_true) JSON::Parse parser; SV * user_true; CODE: json_parse_set_true (parser, user_true); void set_false (parser, user_false) JSON::Parse parser; SV * user_false; CODE: json_parse_set_false (parser, user_false); void set_null (parser, user_null) JSON::Parse parser; SV * user_null; CODE: json_parse_set_null (parser, user_null); void delete_true (parser) JSON::Parse parser; CODE: json_parse_delete_true (parser); void delete_false (parser) JSON::Parse parser; CODE: json_parse_delete_false (parser); void copy_literals (parser, onoff) JSON::Parse parser; SV * onoff; CODE: json_parse_copy_literals (parser, onoff); void delete_null (parser) 
JSON::Parse parser; CODE: json_parse_delete_null (parser); void no_warn_literals (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->no_warn_literals = SvTRUE (onoff) ? 1 : 0; void detect_collisions (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->detect_collisions = SvTRUE (onoff) ? 1 : 0; void diagnostics_hash (parser, onoff) JSON::Parse parser; SV * onoff; CODE: #if PERL_VERSION > 12 parser->diagnostics_hash = SvTRUE (onoff) ? 1 : 0; #else warn ("diagnostics_hash () requires Perl 5.14 or later; this is 5.%d", PERL_VERSION); #endif void warn_only (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->warn_only = SvTRUE (onoff) ? 1 : 0; int get_warn_only (parser) JSON::Parse parser; CODE: if (parser->warn_only) { RETVAL = 1; } else { RETVAL = 0; } OUTPUT: RETVAL void set_max_depth (json, max_depth) JSON::Parse json; int max_depth; CODE: if (max_depth < 0) { croak ("Invalid max depth %d", max_depth); } json->max_depth = max_depth; int get_max_depth (json) JSON::Parse json; CODE: RETVAL = json->max_depth; if (json->max_depth == 0) { RETVAL = JSON_PARSE_DEFAULT_MAX_DEPTH; } OUTPUT: RETVAL #ifdef TESTRANDOM int random_json () CODE: RETVAL = random_json (); OUTPUT: RETVAL #endif /* def TESTRANDOM */ void upgrade_utf8 (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->upgrade_utf8 = SvTRUE (onoff) ? 
1 : 0; MODULE=JSON::Parse PACKAGE=JSON::Tokenize JSON::Tokenize tokenize_json (json) SV * json; CODE: RETVAL = tokenize (json); RETVAL->blessed = 1; OUTPUT: RETVAL JSON::Tokenize tokenize_child (token) JSON::Tokenize token CODE: RETVAL = 0; if (token->child) { RETVAL = token->child; RETVAL->blessed = 1; } OUTPUT: RETVAL JSON::Tokenize tokenize_next (token) JSON::Tokenize token CODE: if (token->next) { RETVAL = token->next; RETVAL->blessed = 1; } else { RETVAL = 0; } OUTPUT: RETVAL int tokenize_start (token) JSON::Tokenize token CODE: RETVAL = token->start; OUTPUT: RETVAL int tokenize_end (token) JSON::Tokenize token CODE: RETVAL = token->end; OUTPUT: RETVAL SV * tokenize_type (token) JSON::Tokenize token CODE: /* Only set this to the real value if everything is OK. */ RETVAL = & PL_sv_undef; if (token->type > json_token_invalid && token->type < n_json_tokens) { RETVAL = newSVpv (token_names[token->type], 0); } else { warn ("Invalid JSON token type %d", token->type); } OUTPUT: RETVAL void DESTROY (token) JSON::Tokenize token CODE: tokenize_free (token); MODULE=JSON::Parse PACKAGE=JSON::Whitespace SV * strip_whitespace (tokens, json) JSON::Tokenize tokens; SV * json; CODE: RETVAL = strip_whitespace (tokens, json); OUTPUT: RETVAL JSON-Parse-0.62/Makefile.PL000644 001751 001751 00000002400 14003431361 014577 0ustar00benben000000 000000 use strict; use warnings; use ExtUtils::MakeMaker; my $pm = 'lib/JSON/Parse.pm'; my $pod = 'lib/JSON/Parse.pod'; my $github = 'github.com/benkasminbullock/JSON-Parse'; my $repo = "https://$github"; WriteMakefile ( NAME => 'JSON::Parse', VERSION_FROM => $pm, ABSTRACT_FROM => $pod, AUTHOR => 'Ben Bullock ', LICENSE => 'perl', PREREQ_PM => { 'Carp' => '0', }, META_MERGE => { 'meta-spec' => { version => 2, }, resources => { repository => { type => 'git', url => "git://$github.git", web => $repo, }, bugtracker => { web => "$repo/issues", }, }, no_index => { file => [ 'build/JPB.pm', 't/JPT.pm', 'xt/JPXT.pm', ], }, }, # All the C files are 
actually #included into Parse.xs so there is # only one object file. OBJECT => 'Parse.o', # Currently the oldest version to pass at CPAN testers. # http://matrix.cpantesters.org/?dist=JSON-Parse+0.24 MIN_PERL_VERSION => '5.008009', EXE_FILES => [ 'script/validjson', ], # Uncomment the following to get "author tests" # OPTIMIZE => '-Wall -g', # OPTIMIZE => '-Wall -O', # This achieves only small speedups with GCC. # OPTIMIZE => '-O3', # CC => 'gcc', ); JSON-Parse-0.62/unicode.c000444 001751 001751 00000111357 14264373365 014453 0ustar00benben000000 000000 /* This is a Unicode library in the programming language C which deals with conversions to and from the UTF-8 format. */ /* Author: Ben Bullock , Repository: https://github.com/benkasminbullock/unicode-c */ #include #include #include "unicode.h" #ifdef HEADER /* _ _ _ _ | | (_)_ __ ___ (_) |_ ___ | | | | '_ ` _ \| | __/ __| | |___| | | | | | | | |_\__ \ |_____|_|_| |_| |_|_|\__|___/ */ /* The maximum number of bytes we need to contain any Unicode code point as UTF-8 as a C string. This length includes one trailing nul byte. */ #define UTF8_MAX_LENGTH 5 /* The maximum possible value of a Unicode code point. See http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */ #define UNICODE_MAXIMUM 0x10ffff /* The maximum possible value which will fit into four bytes of UTF-8. This is larger than UNICODE_MAXIMUM. */ #define UNICODE_UTF8_4 0x1fffff /* ____ _ _ | _ \ ___| |_ _ _ _ __ _ __ __ ____ _| |_ _ ___ ___ | |_) / _ \ __| | | | '__| '_ \ \ \ / / _` | | | | |/ _ \/ __| | _ < __/ |_| |_| | | | | | | \ V / (_| | | |_| | __/\__ \ |_| \_\___|\__|\__,_|_| |_| |_| \_/ \__,_|_|\__,_|\___||___/ */ /* All of the functions in this library return an "int32_t". Negative values are used to indicate errors. */ /* This return value indicates the successful completion of a routine which doesn't use the return value to communicate data back to the caller. 
*/ #define UNICODE_OK 0 /* This return value means that the leading byte of a UTF-8 sequence was not valid. */ #define UTF8_BAD_LEADING_BYTE -1 /* This return value means the caller attempted to turn a code point for a surrogate pair to or from UTF-8. */ #define UNICODE_SURROGATE_PAIR -2 /* This return value means that code points which did not form a surrogate pair were tried to be converted into a code point as if they were a surrogate pair. */ #define UNICODE_NOT_SURROGATE_PAIR -3 /* This return value means that input which was supposed to be UTF-8 encoded contained an invalid continuation byte. If the leading byte of a UTF-8 sequence is not valid, UTF8_BAD_LEADING_BYTE is returned instead of this. */ #define UTF8_BAD_CONTINUATION_BYTE -4 /* This return value indicates a zero byte was found in a string which was supposed to contain UTF-8 bytes. It is returned only by the functions which are documented as not allowing zero bytes. */ #define UNICODE_EMPTY_INPUT -5 /* This return value indicates that UTF-8 bytes were not in the shortest possible form. See http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8. This return value is currently unused. If a character is not in the shortest form, the error UTF8_BAD_CONTINUATION_BYTE is returned. */ #define UTF8_NON_SHORTEST -6 /* This return value indicates that there was an attempt to convert a code point which was greater than UNICODE_MAXIMUM or UNICODE_UTF8_4 into UTF-8 bytes. */ #define UNICODE_TOO_BIG -7 /* This return value indicates that the Unicode code-point ended with either 0xFFFF or 0xFFFE, meaning it cannot be used as a character code point, or it was in the disallowed range FDD0 to FDEF. */ #define UNICODE_NOT_CHARACTER -8 /* This return value indicates that the UTF-8 is valid. It is only used by "valid_utf8". */ #define UTF8_VALID 1 /* This return value indicates that the UTF-8 is not valid. It is only used by "valid_utf8". 
*/ #define UTF8_INVALID 0 #endif /* def HEADER */ /* This table contains the length of a sequence which begins with the byte given. A value of zero indicates that the byte can not begin a UTF-8 sequence. */ /* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */ const uint8_t utf8_sequence_len[0x100] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */ 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */ 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */ }; /* This function returns the number of bytes of UTF-8 a sequence starting with byte "c" will become, either 1 (c = 0000xxxx), 2 (c = 110xxxxx), 3 (c = 1110xxxx), or 4 (c = 111100xx or c = 11110100). If "c" is not a valid UTF-8 first byte, the value UTF8_BAD_LEADING_BYTE is returned. */ int32_t utf8_bytes (uint8_t c) { int32_t r; r = utf8_sequence_len[c]; if (r == 0) { return UTF8_BAD_LEADING_BYTE; } return r; } /* This macro converts four bytes of UTF-8 into the corresponding code point. */ #define FOUR(x) \ (((int32_t) (x[0] & 0x07)) << 18) \ | (((int32_t) (x[1] & 0x3F)) << 12) \ | (((int32_t) (x[2] & 0x3F)) << 6) \ | (((int32_t) (x[3] & 0x3F))) /* Reject code points which end in either FFFE or FFFF. */ #define REJECT_FFFF(x) \ if ((x & 0xFFFF) >= 0xFFFE) { \ return UNICODE_NOT_CHARACTER; \ } /* Reject code points in a certain range. 
*/ #define REJECT_NOT_CHAR(r) \ if (r >= UNI_NOT_CHAR_MIN && r <= UNI_NOT_CHAR_MAX) { \ return UNICODE_NOT_CHARACTER; \ } /* Reject surrogates. */ #define REJECT_SURROGATE(ucs2) \ if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) { \ /* Ill-formed. */ \ return UNICODE_SURROGATE_PAIR; \ } /* Try to convert "input" from UTF-8 to UCS-2, and return a value even if the input is partly broken. This checks the first byte of the input, but it doesn't check the subsequent bytes. */ int32_t utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr) { uint8_t c; c = input[0]; switch (utf8_sequence_len[c]) { case 1: * end_ptr = input + 1; return c; case 2: * end_ptr = input + 2; return (c & 0x1F) << 6 | (input[1] & 0x3F); case 3: * end_ptr = input + 3; return (c & 0x0F) << 12 | (input[1] & 0x3F) << 6 | (input[2] & 0x3F); case 4: * end_ptr = input + 4; return FOUR (input); case 0: /* fall through */ default: return UTF8_BAD_LEADING_BYTE; } } /* Surrogate pair zone. */ #define UNI_SUR_HIGH_START 0xD800 #define UNI_SUR_HIGH_END 0xDBFF #define UNI_SUR_LOW_START 0xDC00 #define UNI_SUR_LOW_END 0xDFFF /* Start of the "not character" range. */ #define UNI_NOT_CHAR_MIN 0xFDD0 /* End of the "not character" range. */ #define UNI_NOT_CHAR_MAX 0xFDEF /* This function converts UTF-8 encoded bytes in "input" into the equivalent Unicode code point. The return value is the Unicode code point corresponding to the UTF-8 character in "input" if successful, and a negative number if not successful. Nul bytes are rejected. "*end_ptr" is set to the next character after the read character on success. "*end_ptr" is set to the start of input on all failures. "end_ptr" may not be NULL. If the first byte of "input" is zero, in other words a NUL or '\0', UNICODE_EMPTY_INPUT is returned. If the first byte of "input" is not valid UTF-8, UTF8_BAD_LEADING_BYTE is returned. If the second or later bytes of "input" are not valid UTF-8, including NUL, UTF8_BAD_CONTINUATION_BYTE is returned. 
If the value extrapolated from "input" is greater than UNICODE_MAXIMUM, UNICODE_TOO_BIG is returned. If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE, UNICODE_NOT_CHARACTER is returned. If the value extrapolated from "input" is between 0xFDD0 and 0xFDEF, UNICODE_NOT_CHARACTER is returned. If the value is within the range of surrogate pairs, the error UNICODE_SURROGATE_PAIR is returned. */ int32_t utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr) { uint8_t c; uint8_t l; *end_ptr = input; c = input[0]; if (c == 0) { return UNICODE_EMPTY_INPUT; } l = utf8_sequence_len[c]; if (l == 1) { * end_ptr = input + 1; return (int32_t) c; } if (l == 2) { uint8_t d; d = input[1]; /* Two byte case. */ if (d < 0x80 || d > 0xBF) { return UTF8_BAD_CONTINUATION_BYTE; } if (c <= 0xC1) { return UTF8_BAD_CONTINUATION_BYTE; } * end_ptr = input + 2; return ((int32_t) (c & 0x1F) << 6) | ((int32_t) (d & 0x3F)); } if (l == 3) { uint8_t d; uint8_t e; int32_t r; d = input[1]; e = input[2]; /* Three byte case. */ if (d < 0x80 || d > 0xBF || e < 0x80 || e > 0xBF) { return UTF8_BAD_CONTINUATION_BYTE; } if (c == 0xe0 && d < 0xa0) { /* We don't need to check the value of input[2], because the if statement above this one already guarantees that it is 10xxxxxx. */ return UTF8_BAD_CONTINUATION_BYTE; } r = ((int32_t) (c & 0x0F)) << 12 | ((int32_t) (d & 0x3F)) << 6 | ((int32_t) (e & 0x3F)); REJECT_SURROGATE(r); REJECT_FFFF(r); REJECT_NOT_CHAR(r); * end_ptr = input + 3; return r; } else if (l == 4) { /* Four byte case. */ uint8_t d; uint8_t e; uint8_t f; int32_t v; d = input[1]; e = input[2]; f = input[3]; if (/* c must be 11110xxx. */ c >= 0xf8 || /* d, e, f must be 10xxxxxx. */ d < 0x80 || d >= 0xC0 || e < 0x80 || e >= 0xC0 || f < 0x80 || f >= 0xC0) { return UTF8_BAD_CONTINUATION_BYTE; } if (c == 0xf0 && d < 0x90) { /* We don't need to check the values of e and f, because the if statement above this one already guarantees that e and f are 10xxxxxx. 
*/ return UTF8_BAD_CONTINUATION_BYTE; } /* Calculate the code point. */ v = FOUR (input); /* Greater than U+10FFFF */ if (v > UNICODE_MAXIMUM) { return UNICODE_TOO_BIG; } /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ REJECT_FFFF(v); /* We don't need to check for surrogate pairs here, since the minimum value of UCS2 if there are four bytes of UTF-8 is 0x10000. */ * end_ptr = input + 4; return v; } return UTF8_BAD_LEADING_BYTE; } /* Input: a Unicode code point, "ucs2". Output: UTF-8 characters in buffer "utf8". Return value: the number of bytes written into "utf8", or a negative number if there was an error. If the value of "ucs2" is invalid because of being in the surrogate pair range from 0xD800 to 0xDFFF, the return value is UNICODE_SURROGATE_PAIR. If the value of "ucs2" is in the range 0xFDD0 to 0xFDEF inclusive, the return value is UNICODE_NOT_CHARACTER. If the lower two bytes of "ucs2" are either 0xFFFE or 0xFFFF, the return value is UNICODE_NOT_CHARACTER. If the value is too big to fit into four bytes of UTF-8, UNICODE_UTF8_4, the return value is UNICODE_TOO_BIG. However, it does not insist on ucs2 being less than UNICODE_MAXIMUM, so the user needs to check that "ucs2" is a valid code point. This adds a zero byte to the end of the string. It assumes that the buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to write to, without checking. 
*/ int32_t ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8) { REJECT_FFFF(ucs2); if (ucs2 < 0x80) { utf8[0] = ucs2; utf8[1] = '\0'; return 1; } if (ucs2 < 0x800) { utf8[0] = (ucs2 >> 6) | 0xC0; utf8[1] = (ucs2 & 0x3F) | 0x80; utf8[2] = '\0'; return 2; } if (ucs2 < 0xFFFF) { utf8[0] = ((ucs2 >> 12) ) | 0xE0; utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80; utf8[2] = ((ucs2 ) & 0x3F) | 0x80; utf8[3] = '\0'; REJECT_SURROGATE(ucs2); REJECT_NOT_CHAR(ucs2); return 3; } if (ucs2 <= UNICODE_UTF8_4) { /* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */ utf8[0] = 0xF0 | (ucs2 >> 18); utf8[1] = 0x80 | ((ucs2 >> 12) & 0x3F); utf8[2] = 0x80 | ((ucs2 >> 6) & 0x3F); utf8[3] = 0x80 | ((ucs2 & 0x3F)); utf8[4] = '\0'; return 4; } return UNICODE_TOO_BIG; } /* For shifting by 10 bits. */ #define TEN_BITS 10 #define HALF_BASE 0x0010000UL /* 0b1111111111 */ #define LOW_TEN_BITS 0x3FF /* This converts the Unicode code point in "unicode" into a surrogate pair, and returns the two parts in "* hi_ptr" and "* lo_ptr". Return value: If "unicode" does not need to be a surrogate pair, the error UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr" and "*lo_ptr" are undefined. If the conversion is successful, UNICODE_OK is returned. */ int32_t unicode_to_surrogates (int32_t unicode, int32_t * hi_ptr, int32_t * lo_ptr) { int32_t hi = UNI_SUR_HIGH_START; int32_t lo = UNI_SUR_LOW_START; if (unicode < HALF_BASE) { /* Doesn't need to be a surrogate pair. */ return UNICODE_NOT_SURROGATE_PAIR; } unicode -= HALF_BASE; hi |= ((unicode >> TEN_BITS) & LOW_TEN_BITS); lo |= ((unicode) & LOW_TEN_BITS); * hi_ptr = hi; * lo_ptr = lo; return UNICODE_OK; } /* Convert a surrogate pair in "hi" and "lo" to a single Unicode value. The return value is the Unicode value. If the return value is negative, an error has occurred. If "hi" and "lo" do not form a surrogate pair, the error value UNICODE_NOT_SURROGATE_PAIR is returned. 
https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */ int32_t surrogates_to_unicode (int32_t hi, int32_t lo) { int32_t u; if (hi < UNI_SUR_HIGH_START || hi > UNI_SUR_HIGH_END || lo < UNI_SUR_LOW_START || lo > UNI_SUR_LOW_END) { return UNICODE_NOT_SURROGATE_PAIR; } u = ((hi - UNI_SUR_HIGH_START) << TEN_BITS) + (lo - UNI_SUR_LOW_START) + HALF_BASE; return u; } #undef UNI_SUR_HIGH_START #undef UNI_SUR_HIGH_END #undef UNI_SUR_LOW_START #undef UNI_SUR_LOW_END #undef TEN_BITS #undef HALF_BASE #undef LOW_TEN_BITS /* Convert the surrogate pair in "hi" and "lo" to UTF-8 in "utf8". This calls "surrogates_to_unicode" and "ucs2_to_utf8", thus it can return the same errors as them, and has the same restriction on "utf8" as "ucs2_to_utf8". */ int32_t surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t * utf8) { int32_t C; C = surrogates_to_unicode (hi, lo); if (C < 0) { return C; } return ucs2_to_utf8 (C, utf8); } /* Given a nul-terminated string "utf8" and a number of Unicode characters "n_chars", return the number of bytes into "utf8" at which the end of the characters occurs. A negative value indicates some kind of error. If "utf8" contains a zero byte, the return value is UNICODE_EMPTY_INPUT. This may also return any of the error values of "utf8_to_ucs2". */ int32_t unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars) { int32_t i; const uint8_t * p = utf8; int32_t len = strlen ((const char *) utf8); if (len == 0 && n_chars != 0) { return UNICODE_EMPTY_INPUT; } for (i = 0; i < n_chars; i++) { int32_t ucs2 = utf8_to_ucs2 (p, & p); if (ucs2 < 0) { return ucs2; } } return p - utf8; } /* Like unicode_count_chars, but without error checks or validation of the input. This only checks the first byte of each UTF-8 sequence, then jumps over the succeeding bytes. It may return UTF8_BAD_LEADING_BYTE if the first byte is invalid. 
*/ int32_t unicode_count_chars_fast (const uint8_t * utf8) { int32_t chars; const uint8_t * p; chars = 0; p = utf8; while (*p) { int32_t len; len = utf8_sequence_len[*p]; if (len == 0) { /* The first byte of a UTF-8 sequence is bad, so return this, not BAD_UTF8. */ return UTF8_BAD_LEADING_BYTE; } p += len; chars++; } return chars; } /* Given a nul-terminated string "utf8", return the total number of Unicode characters it contains. Return value If an error occurs, this may return UTF8_BAD_LEADING_BYTE or any of the errors of "utf8_to_ucs2". */ int32_t unicode_count_chars (const uint8_t * utf8) { int32_t chars = 0; const uint8_t * p = utf8; int32_t len = strlen ((const char *) utf8); if (len == 0) { return 0; } while (p - utf8 < len) { int32_t ucs2; ucs2 = utf8_to_ucs2 (p, & p); if (ucs2 < 0) { /* Return the error from utf8_to_ucs2. */ return ucs2; } chars++; if (*p == '\0') { return chars; } } /* Cannot be reached in practice, since strlen indicates the null byte. */ return UTF8_BAD_LEADING_BYTE; } #ifdef HEADER /* These are intended for use in switch statements, for example switch (c) { case BYTE_80_8F: do_something; They originally come from the Json3 project. 
*/ #define BYTE_80_8F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F #define BYTE_80_9F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F #define BYTE_80_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_80_8F_B0_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_80_B6_B8_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: 
case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_80_BD \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD #define BYTE_90_BF \ 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \ case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \ case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \ case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \ case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \ case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \ case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_A0_BF \ 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \ case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: \ case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \ case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 
0xB9: case 0xBA: case 0xBB: \ case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_C2_DF \ 0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \ case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \ case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \ case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \ case 0xDE: case 0xDF #define BYTE_E1_EC \ 0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \ case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC #define BYTE_F1_F3 \ 0xF1: case 0xF2: case 0xF3 #endif /* def HEADER */ #define UNICODEADDBYTE i++ #define UNICODEFAILUTF8(want) return UTF8_INVALID #define UNICODENEXTBYTE c = input[i] /* Given "input" and "input_length", validate "input" byte by byte up to "input_length". The return value may be UTF8_VALID or UTF8_INVALID. */ int32_t valid_utf8 (const uint8_t * input, int32_t input_length) { int32_t error; utf8_info_t info; error = validate_utf8 (input, input_length, & info); if (error < 0) { return UTF8_INVALID; } return UTF8_VALID; } #define FAIL(x) \ info->len_read = i; \ return x #ifdef HEADER typedef struct utf8_info { int32_t len_read; int32_t runes_read; } utf8_info_t; #endif /* def HEADER */ /* Given "input" and "len", validate "input" byte by byte up to "len". The return value is "UNICODE_OK" (zero) on success or the error found (a negative number) on failure. utf8_info_t is defined in "unicode.h". The value of "info.len_read" is the number of bytes processed. the value of "info.runes_read" is the number of Unicode code points in the input. */ int32_t validate_utf8 (const uint8_t * input, int32_t len, utf8_info_t * info) { int32_t i; uint8_t c; info->len_read = 0; /* We want to increment the runes after "string_start", but that would give us one too many. */ info->runes_read = -1; i = 0; string_start: /* We get here after successfully reading a "rune". 
*/ info->runes_read++; if (i >= len) { info->len_read = len; return UNICODE_OK; /* 0 */ } /* Set c separately here since we use a range comparison before the switch statement. */ c = input[i]; if (c == 0) { FAIL (UNICODE_EMPTY_INPUT); } /* Admit all bytes < 0x80. */ if (c < 0x80) { i++; goto string_start; } switch (c) { case BYTE_C2_DF: UNICODEADDBYTE; goto byte_last_80_bf; case 0xE0: UNICODEADDBYTE; goto byte23_a0_bf; case BYTE_E1_EC: UNICODEADDBYTE; goto byte_penultimate_80_bf; case 0xED: UNICODEADDBYTE; goto byte23_80_9f; case 0xEE: UNICODEADDBYTE; goto byte_penultimate_80_bf; case 0xEF: UNICODEADDBYTE; goto byte_ef_80_bf; case 0xF0: UNICODEADDBYTE; goto byte24_90_bf; case BYTE_F1_F3: UNICODEADDBYTE; goto byte24_80_bf; case 0xF4: UNICODEADDBYTE; goto byte24_80_8f; default: FAIL (UTF8_BAD_LEADING_BYTE); } byte_last_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_BF: UNICODEADDBYTE; goto string_start; default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte_ef_b7: switch (UNICODENEXTBYTE) { case BYTE_80_8F_B0_BF: UNICODEADDBYTE; goto string_start; default: if (c >= 0x90 && c <= 0xAF) { FAIL (UNICODE_NOT_CHARACTER); } else { FAIL (UTF8_BAD_CONTINUATION_BYTE); } } byte_last_80_bd: switch (UNICODENEXTBYTE) { case BYTE_80_BD: UNICODEADDBYTE; goto string_start; case 0xBE: case 0xBF: FAIL (UNICODE_NOT_CHARACTER); default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte_penultimate_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_BF: UNICODEADDBYTE; goto byte_last_80_bf; default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte_ef_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_B6_B8_BF: UNICODEADDBYTE; goto byte_last_80_bd; case 0xB7: UNICODEADDBYTE; /* FDD0 - FDE7 */ goto byte_ef_b7; default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte24_90_bf: switch (UNICODENEXTBYTE) { case BYTE_90_BF: UNICODEADDBYTE; goto byte_penultimate_80_bf; default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte23_80_9f: switch (UNICODENEXTBYTE) { case BYTE_80_9F: UNICODEADDBYTE; goto byte_last_80_bf; default: if (c >= 0xA0 
&& c <= 0xBF) { FAIL (UNICODE_SURROGATE_PAIR); } else { FAIL (UTF8_BAD_CONTINUATION_BYTE); } } byte23_a0_bf: switch (UNICODENEXTBYTE) { case BYTE_A0_BF: UNICODEADDBYTE; goto byte_last_80_bf; default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte24_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_BF: UNICODEADDBYTE; goto byte_ef_80_bf; default: FAIL (UTF8_BAD_CONTINUATION_BYTE); } byte24_80_8f: switch (UNICODENEXTBYTE) { case BYTE_80_8F: UNICODEADDBYTE; goto byte_ef_80_bf; default: if (c >= 0x90) { FAIL (UNICODE_TOO_BIG); } else { FAIL (UTF8_BAD_CONTINUATION_BYTE); } } } #define REJECT_FE_FF(c) \ if (c == 0xFF || c == 0xFE) { \ return UNICODE_NOT_CHARACTER; \ } /* Make "* ptr" point to the start of the first UTF-8 character after its initial value. This assumes that there are at least four bytes which can be read, and that "* ptr" points to valid UTF-8. If "** ptr" does not have its top bit set, 00xx_xxxx, this does not change the value of "* ptr", and it returns UNICODE_OK. If "** ptr" has its top two bits set, 11xx_xxxx, this does not change the value of "* ptr" and it returns UNICODE_OK. If "**ptr" has its top bit set but its second-to-top bit unset, 10xx_xxxx, so it is the second, third, or fourth byte of a multibyte sequence, "* ptr" is incremented until either "** ptr" is a valid first byte of a UTF-8 sequence, or too many bytes have passed for it to be valid UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is returned and "*ptr" is left unchanged. If a valid UTF-8 first byte was found, either 11xx_xxxx or 00xx_xxxx, UNICODE_OK is returned, and "*ptr" is set to the address of the valid byte. Nul bytes (bytes containing zero) are considered valid. If any of the bytes read contains invalid UTF-8 bytes 0xFE and 0xFF, the error code UNICODE_NOT_CHARACTER is returned and "*ptr" is left unchanged. */ int32_t trim_to_utf8_start (const uint8_t ** ptr) { const uint8_t * p = *ptr; uint8_t c; int32_t i; c = * p; REJECT_FE_FF (c); /* 0xC0 = 1100_0000. 
*/ c &= 0xC0; if (c == 0xC0 || c == 0x00) { return UNICODE_OK; } for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) { c = p[i]; REJECT_FE_FF (c); if ((c & 0x80) != 0x80 || (c & 0x40) != 0) { * ptr = p + i; return UNICODE_OK; } } return UTF8_BAD_CONTINUATION_BYTE; } /* Given a return value "code" which is negative or zero, return a string which describes what the return value means. Positive non-zero return values never indicate errors or statuses in this library. Unknown error codes result in a default string being returned. */ const char * unicode_code_to_error (int32_t code) { switch (code) { case UTF8_BAD_LEADING_BYTE: return "The leading byte of a UTF-8 sequence was invalid"; case UTF8_BAD_CONTINUATION_BYTE: return "A continuation byte of a UTF-8 sequence was invalid"; case UNICODE_SURROGATE_PAIR: return "A surrogate pair code point could not be converted to UTF-8"; case UNICODE_NOT_SURROGATE_PAIR: return "Input code points did not form a surrogate pair"; case UNICODE_OK: return "Successful completion"; case UNICODE_TOO_BIG: return "A code point was beyond limits"; case UNICODE_NOT_CHARACTER: return "A number ending in hex FFFF or FFFE is not valid Unicode"; case UTF8_NON_SHORTEST: return "A UTF-8 input was not in the shortest form"; case UNICODE_EMPTY_INPUT: return "A byte with value zero was found in UTF-8 input"; default: return "Unknown/invalid error code"; } } /* _____ _ |_ _|__ ___| |_ ___ | |/ _ \/ __| __/ __| | | __/\__ \ |_\__ \ |_|\___||___/\__|___/ */ /* Below this is code for testing which is not normally compiled. Use "make test" to compile the testing version. */ #ifdef TEST #include #include #include "c-tap-test.h" static const uint8_t * utf8 = (uint8_t *) "漢数字ÔÕÖX"; static const uint8_t bad[] = {0x99, 0x99, 0x99, 0x99, 0x99, 0x99, 0x0}; #define BUFFSIZE 0x100 static void test_ucs2_to_utf8 () { /* Buffer to print utf8 out into. */ uint8_t buffer[BUFFSIZE]; /* Offset into buffer. 
*/ uint8_t * offset; const uint8_t * start = utf8; offset = buffer; while (1) { int32_t unicode; int32_t bytes; const uint8_t * end; unicode = utf8_to_ucs2 (start, & end); if (unicode == UNICODE_EMPTY_INPUT) { break; } if (unicode < 0) { fprintf (stderr, "%s:%d: unexpected error %s converting unicode.\n", __FILE__, __LINE__, unicode_code_to_error (unicode)); // exit ok in test exit (EXIT_FAILURE); } bytes = ucs2_to_utf8 (unicode, offset); TAP_TEST_MSG (bytes > 0, "no bad conversion"); TAP_TEST_MSG (strncmp ((const char *) offset, (const char *) start, bytes) == 0, "round trip OK for %X (%d bytes)", unicode, bytes); start = end; offset += bytes; if (offset - buffer >= BUFFSIZE) { fprintf (stderr, "%s:%d: out of space in buffer.\n", __FILE__, __LINE__); // exit ok exit (EXIT_FAILURE); } } * offset = '\0'; TAP_TEST_MSG (strcmp ((const char *) buffer, (const char *) utf8) == 0, "input %s resulted in identical output %s", utf8, buffer); } static void test_invalid_utf8 () { uint8_t invalid_utf8[UTF8_MAX_LENGTH]; int32_t unicode; int32_t valid; const uint8_t * end; snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1, "%c%c%c", 0xe8, 0xe4, 0xe5); unicode = utf8_to_ucs2 (invalid_utf8, & end); TAP_TEST_MSG (unicode == UTF8_BAD_CONTINUATION_BYTE, "invalid UTF-8 gives incorrect result"); valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8)); TAP_TEST_MSG (valid == UTF8_INVALID, "Invalid UTF-8 fails valid_utf8"); } static void test_surrogate_pairs () { int32_t status; int32_t hi; int32_t lo; int32_t rt; /* This is the wide character space, which does not require representation as a surrogate pair. */ int32_t nogood = 0x3000; /* Two examples from the Wikipedia article on UTF-16 https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. 
*/ int32_t wikipedia_1 = 0x10437; int32_t wikipedia_2 = 0x24b62; /* An example from the JSON RFC http://rfc7159.net/rfc7159#rfc.section.7 */ int32_t json_spec = 0x1D11E; status = unicode_to_surrogates (nogood, & hi, & lo); TAP_TEST_MSG (status == UNICODE_NOT_SURROGATE_PAIR, "low value to surrogate pair breaker returns error"); status = unicode_to_surrogates (wikipedia_1, & hi, & lo); TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1); TAP_TEST_MSG (hi == 0xD801, "Got expected %X == 0xD801", hi); TAP_TEST_MSG (lo == 0xDC37, "Got expected %X == 0xDC37", lo); rt = surrogates_to_unicode (hi, lo); TAP_TEST_MSG (rt == wikipedia_1, "Round trip %X == initial %X", rt, wikipedia_1); status = unicode_to_surrogates (wikipedia_2, & hi, & lo); TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1); TAP_TEST_MSG (hi == 0xD852, "Got expected %X == 0xD852", hi); TAP_TEST_MSG (lo == 0xDF62, "Got expected %X == 0xDF62", lo); rt = surrogates_to_unicode (hi, lo); TAP_TEST_MSG (rt == wikipedia_2, "Round trip %X == initial %X", rt, wikipedia_2); status = unicode_to_surrogates (json_spec, & hi, & lo); TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", json_spec); TAP_TEST_MSG (hi == 0xD834, "Got expected %X == 0xD834", hi); TAP_TEST_MSG (lo == 0xDd1e, "Got expected %X == 0xDD1e", lo); rt = surrogates_to_unicode (hi, lo); TAP_TEST_MSG (rt == json_spec, "Round trip %X == initial %X", rt, json_spec); } /* Test sending various bytes into "utf8_bytes" and seeing whether the return value is what we expected. */ static void test_utf8_bytes () { struct tub { int32_t first; int32_t expect; } tests[] = { {'a', 1}, {0xb0, UTF8_BAD_LEADING_BYTE}, {0xc2, 2}, {0xff, UTF8_BAD_LEADING_BYTE}, }; int32_t n_tests = sizeof (tests) / sizeof (struct tub); int32_t i; for (i = 0; i < n_tests; i++) { /* Expected bytes. 
*/ int32_t xbytes; int32_t firstbyte; firstbyte = tests[i].first; xbytes = utf8_bytes (firstbyte); TAP_TEST_MSG (xbytes == tests[i].expect, "Got %d (%d) with input %d", xbytes, tests[i].expect, firstbyte); } } /* Test the conversion from utf-8 to ucs-2 (UTF-16). */ static void test_utf8_to_ucs2 () { const uint8_t * start = utf8; while (*start) { int32_t unicode; const uint8_t * end; unicode = utf8_to_ucs2 (start, & end); TAP_TEST_MSG (unicode > 0, "no bad value at %s", start); printf ("# %s is %04X, length is %d\n", start, unicode, (int) (end - start)); start = end; } } /* Test counting of unicode characters. */ static void test_unicode_count_chars () { int32_t cc; cc = unicode_count_chars (utf8); TAP_TEST_MSG (cc == 7, "unicode_count_chars gets seven characters for utf8"); cc = unicode_count_chars_fast (utf8); TAP_TEST_MSG (cc == 7, "unicode_count_chars_fast gets seven characters for utf8"); } static void test_valid_utf8 () { int32_t valid; valid = valid_utf8 (utf8, strlen ((const char *) utf8)); TAP_TEST_MSG (valid == UTF8_VALID, "Valid UTF-8 passes valid_utf8"); } static void test_trim_to_utf8_start () { int32_t status; const uint8_t * p; /* Invalid UTF-8. */ /* Valid UTF-8. 
*/ uint8_t good[] = "化苦"; uint8_t good2[] = "化abc"; p = bad; status = trim_to_utf8_start (& p); TAP_TEST_MSG (status == UTF8_BAD_CONTINUATION_BYTE, "Non-UTF-8 causes error"); TAP_TEST_MSG (p == bad, "Did not change pointer"); p = good + 1; status = trim_to_utf8_start (& p); TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result"); TAP_TEST_MSG (p != good + 1, "Moved p"); TAP_TEST_MSG (p == good + 3, "Moved p to the right position"); p = good2 + 1; status = trim_to_utf8_start (& p); TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result"); TAP_TEST_MSG (p != good2 + 1, "Moved p"); TAP_TEST_MSG (p == good2 + 3, "Moved p to the right position"); } static void test_constants () { TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM); } static void test_utf8_validate () { int r; int l; utf8_info_t info; r = validate_utf8 ((const uint8_t *) "", 0, & info); TAP_TEST_EQUAL (r, UNICODE_OK); TAP_TEST_EQUAL (info.len_read, 0); TAP_TEST_EQUAL (info.runes_read, 0); l = strlen ((const char *) utf8); r = validate_utf8 (utf8, l, & info); TAP_TEST_EQUAL (r, UNICODE_OK); TAP_TEST_EQUAL (info.len_read, l); TAP_TEST_EQUAL (info.runes_read, 7); l = strlen ((const char *) bad); r = validate_utf8 (bad, l, & info); TAP_TEST (r != UNICODE_OK); } int main () { test_utf8_to_ucs2 (); test_ucs2_to_utf8 (); test_invalid_utf8 (); test_unicode_count_chars (); test_surrogate_pairs (); test_utf8_bytes (); test_valid_utf8 (); test_trim_to_utf8_start (); test_constants (); test_utf8_validate (); TAP_PLAN; } #endif /* def TEST */ JSON-Parse-0.62/utf8-byte-one.c000644 001751 001751 00000000734 13772601034 015417 0ustar00benben000000 000000 case BYTE_20_7F: ADDBYTE; goto string_start; case BYTE_C2_DF: ADDBYTE; goto byte_last_80_bf; case 0xE0: ADDBYTE; goto byte23_a0_bf; case BYTE_E1_EC: ADDBYTE; goto byte_penultimate_80_bf; case 0xED: ADDBYTE; goto byte23_80_9f; case 0xEE: case 0xEF: ADDBYTE; goto byte_penultimate_80_bf; case 0xF0: ADDBYTE; goto byte24_90_bf; case BYTE_F1_F3: ADDBYTE; goto 
byte24_80_bf; case 0xF4: ADDBYTE; goto byte24_80_8f; JSON-Parse-0.62/typemap000644 001751 001751 00000000135 12614536151 014243 0ustar00benben000000 000000 json_parse_t * T_PTROBJ JSON::Parse T_PTROBJ json_token_t * T_PTROBJ JSON::Tokenize T_PTROBJ JSON-Parse-0.62/META.yml000644 001751 001751 00000001350 14264373424 014117 0ustar00benben000000 000000 --- abstract: 'Parse JSON' author: - 'Ben Bullock ' build_requires: ExtUtils::MakeMaker: '0' configure_requires: ExtUtils::MakeMaker: '0' dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 7.64, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: JSON-Parse no_index: directory: - t - inc file: - build/JPB.pm - t/JPT.pm - xt/JPXT.pm requires: Carp: '0' perl: '5.008009' resources: bugtracker: https://github.com/benkasminbullock/JSON-Parse/issues repository: git://github.com/benkasminbullock/JSON-Parse.git version: '0.62' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' JSON-Parse-0.62/META.json000644 001751 001751 00000002502 14264373424 014267 0ustar00benben000000 000000 { "abstract" : "Parse JSON", "author" : [ "Ben Bullock " ], "dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 7.64, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "JSON-Parse", "no_index" : { "directory" : [ "t", "inc" ], "file" : [ "build/JPB.pm", "t/JPT.pm", "xt/JPXT.pm" ] }, "prereqs" : { "build" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "runtime" : { "requires" : { "Carp" : "0", "perl" : "5.008009" } } }, "release_status" : "stable", "resources" : { "bugtracker" : { "web" : "https://github.com/benkasminbullock/JSON-Parse/issues" }, "repository" : { "type" : "git", "url" : "git://github.com/benkasminbullock/JSON-Parse.git", "web" 
: "https://github.com/benkasminbullock/JSON-Parse" } }, "version" : "0.62", "x_serialization_backend" : "JSON::PP version 4.06" } JSON-Parse-0.62/script/validjson000755 001751 001751 00000002201 14263225310 016052 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use lib '/home/ben/projects/json-parse/blib/lib'; use lib '/home/ben/projects/json-parse/blib/arch'; use JSON::Parse 'assert_valid_json'; use Getopt::Long; my $ok = GetOptions ( "verbose" => \my $verbose, "help" => \my $help, ); if (! $ok || $help) { usage (); exit; } for my $file (@ARGV) { eval { open my $in, "<:raw", $file or die "Can't open '$file': $!"; my $text = ''; while (my $line = <$in>) { $text .= $line; } close $in or die $!; assert_valid_json ($text); }; if ($@) { my $error = $@; $error =~ s/\n+$//; if ($error !~ /\Q$file/) { $error = "$file: $error"; } if ($error =~ /validjson line [0-9]+\.$/) { $error =~ s/\sat\s\S+\sline.*$/\./; } print "$error\n"; } else { if ($verbose) { print "'$file' is valid JSON.\n"; } } } sub usage { print < released on Sat Jul 16 08:23:13 2022 +0900. =head1 DESCRIPTION This is a module for tokenizing a JSON string. "Tokenizing" means breaking the string into individual tokens, without creating any Perl structures. It uses the same underlying code as L. Tokenizing can be used for tasks such as picking out or searching through parts of a large JSON structure without storing each part of the entire structure in memory. This module is an experimental part of L and its interface is likely to change. The tokenizing functions are currently written in a very primitive way. =head1 FUNCTIONS =head2 tokenize_child my $child = tokenize_child ($child); Walk the tree of tokens. =head2 tokenize_end my $end = tokenize_end ($token); Get the end of the token as a byte offset from the start of the string. Note this is a byte offset not a character offset. 
=head2 tokenize_json my $token = tokenize_json ($json); =head2 tokenize_next my $next = tokenize_next ($token); Walk the tree of tokens. =head2 tokenize_start my $start = tokenize_start ($token); Get the start of the token as a byte offset from the start of the string. Note this is a byte offset not a character offset. =head2 tokenize_text my $text = tokenize_text ($json, $token); Given a token C<$token> from this parsing and the JSON in C<$json>, return the text which corresponds to the token. This is a convenience function written in Perl which uses L and L and C to get the string from C<$json>. =head2 tokenize_type my $type = tokenize_type ($token); Get the type of the token as a string. The possible return values are "array", "initial state", "invalid", "literal", "number", "object", "string", "unicode escape" =head1 AUTHOR Ben Bullock, =head1 COPYRIGHT & LICENCE This package and associated files are copyright (C) 2016-2022 Ben Bullock. You can use, copy, modify and redistribute this package and associated files under the Perl Artistic Licence or the GNU General Public Licence. 
JSON-Parse-0.62/lib/JSON/Whitespace.pm000644 001751 001751 00000000662 14264373325 016624 0ustar00benben000000 000000 package JSON::Whitespace; use warnings; use strict; require Exporter; our @ISA = qw(Exporter); our @EXPORT_OK = qw/json_minify/; our %EXPORT_TAGS = ( all => \@EXPORT_OK, ); use warnings; use strict; use Carp; our $VERSION = '0.62'; use JSON::Tokenize 'tokenize_json'; sub json_minify { my ($json) = @_; my $tokens = tokenize_json ($json); my $nospace = strip_whitespace ($tokens, $json); return $nospace; } 1; JSON-Parse-0.62/lib/JSON/Tokenize.pm000644 001751 001751 00000001665 14264373325 016324 0ustar00benben000000 000000 package JSON::Tokenize; use warnings; use strict; require Exporter; our @ISA = qw(Exporter); use JSON::Parse; our @EXPORT_OK = qw/ tokenize_child tokenize_end tokenize_json tokenize_next tokenize_start tokenize_text tokenize_type /; our %EXPORT_TAGS = ('all' => \@EXPORT_OK); use Carp; our $VERSION = '0.62'; sub tokenize_text { my ($input, $token) = @_; if (! $input || ! $token) { croak "tokenize_text requires input string and JSON::Tokenize object"; } my $start = tokenize_start ($token); my $length = tokenize_end ($token) - $start; my $text; if (utf8::is_utf8 ($input)) { # $start and $length refer to bytes, so we need to convert # $input into bytes. my $copy = $input; utf8::encode ($copy); $text = substr ($copy, $start, $length); # Make the output utf8-flagged. utf8::decode ($text); } else { $text = substr ($input, $start, $length); } return $text; } 1; JSON-Parse-0.62/lib/JSON/Parse.pod000644 001751 001751 00000216257 14264373417 015763 0ustar00benben000000 000000 =pod =encoding UTF-8 =head1 NAME JSON::Parse - Parse JSON =head1 SYNOPSIS use JSON::Parse 'parse_json'; my $json = '["golden", "fleece"]'; my $perl = parse_json ($json); # Same effect as $perl = ['golden', 'fleece']; Convert JSON into Perl. =head1 VERSION This documents version 0.62 of JSON::Parse corresponding to L released on Sat Jul 16 08:23:13 2022 +0900. 
=head1 DESCRIPTION A module for parsing JSON. (JSON means "JavaScript Object Notation" and it is specified in L.) JSON::Parse offers the function L, which takes a string containing JSON, and returns an equivalent Perl structure. It also offers validation of JSON via L, which returns true or false depending on whether the JSON is correct or not, and L, which produces a descriptive fatal error if the JSON is invalid. A function L reads JSON from a file, and there is a safer version of L called L which doesn't throw exceptions. For special cases of parsing, there are also methods L and L, which create a JSON parsing object and run it on text. See L. JSON::Parse accepts only UTF-8 as input. See L and L. =head1 FUNCTIONS =head2 assert_valid_json use JSON::Parse 'assert_valid_json'; eval { assert_valid_json ('["xyz":"b"]'); }; if ($@) { print "Your JSON was invalid: $@\n"; } # Prints "Unexpected character ':' parsing array" produces output Your JSON was invalid: JSON error at line 1, byte 7/11: Unexpected character ':' parsing array starting from byte 1: expecting whitespace: 'n', '\r', '\t', ' ' or comma: ',' or end of array: ']' at /usr/home/ben/projects/json-parse/examples/assert.pl line 6. (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/assert.pl> in the distribution.) This is the underlying function for L. It runs at the same speed, but it throws an error if the JSON is wrong, rather than returning 1 or 0. See L for the error format, which is identical to L. This cannot detect key collisions in the JSON since it does not store values. See L for more on this module's handling of non-unique names in the JSON. The method equivalent to this is L. The behaviour of disallowing empty inputs was changed in version 0.49. =head2 parse_json use JSON::Parse 'parse_json'; my $perl = parse_json ('{"x":1, "y":2}'); This function converts JSON into a Perl structure, either an array reference, a hash reference, or a scalar. 
If the first argument does not contain a complete valid JSON text, is the undefined value, an empty string, or a string containing only whitespace C throws a fatal error ("dies"). If the argument contains valid JSON, the return value is either a hash reference, an array reference, or a scalar. If the input JSON text is a serialized object, a hash reference is returned: use JSON::Parse ':all'; my $perl = parse_json ('{"a":1, "b":2}'); print ref $perl, "\n"; produces output HASH (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/hash.pl> in the distribution.) If the input JSON text is a serialized array, an array reference is returned: use JSON::Parse ':all'; my $perl = parse_json ('["a", "b", "c"]'); print ref $perl, "\n"; produces output ARRAY (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/array.pl> in the distribution.) Otherwise a Perl scalar is returned. The behaviour of allowing a scalar was added in version 0.32 of this module. This brings it into line with the new specification for JSON. The behaviour of disallowing empty inputs was changed in version 0.49. The function L offers a version of this function with various safety features enabled. The method L is equivalent to this. =head2 parse_json_safe This is almost the same thing as L, but has the following differences: =over =item Does not throw exceptions If the JSON is invalid, a warning is printed and the undefined value is returned, as if calling L like this: eval { $out = parse_json ($json); }; if ($@) { carp $@; $out = undef; } =item Detects key collisions This switches on L, so that if the JSON contains non-unique names, a warning is printed and the undefined value is returned. See L for an explanation of what a key collision is. =item Booleans are not read-only This switches on L so that JSON true, false and null values are copied. 
These values can be modified, but they will not be converted back into C and C by L. =item Errors are reported by carp Parsing errors are reported by L, so the error line number refers to the caller's line. =back As the name implies, this is meant to be a "safety-first" version of L. 🎲 This function was added in version 0.38. =head2 read_json use JSON::Parse 'read_json'; my $p = read_json ('filename'); This is exactly the same as L except that it reads the JSON from the specified file rather than a scalar. The file must be in the UTF-8 encoding, and is opened as a character file using C<:encoding(utf8)> (see L and L). The output is marked as character strings. The method equivalent is L. This is a convenience function written in Perl. You may prefer to read the file yourself using another module if you need faster performance. This was renamed from L in version 0.59. The old name will also continue to work indefinitely. =head2 valid_json use JSON::Parse 'valid_json'; if (valid_json ($json)) { # do something } C returns I<1> if its argument is valid JSON and I<0> if not. It runs several times faster than L. This gain in speed is obtained because it discards the input data after reading it, rather than storing it into Perl variables. This does not supply the actual errors which caused invalidity. Use L to get error messages when the JSON is invalid. This cannot detect duplicate keys in JSON objects because it does not store values. See L for more on this module's handling of non-unique names in the JSON. =head1 METHODS If you need to parse JSON and you are not satisfied with the parsing options offered by L and L, you can create a JSON parsing object with L and set various options on the object, then use it with L or L. 
There are options to copy JSON literals (C, C, C) with L, switch off fatal errors with L, detect duplicate keys in objects with L, set the maximum depth of nested objects and arrays with L, produce machine-readable parsing errors with L, and set the JSON literals to user defined values with the methods described under L. These methods only affect the object created with L; they do not globally affect the behaviour of L or L. =head2 check eval { $jp->check ($json); }; This does the same thing as L, except its behaviour can be modified using the L method. 🎲 This method was added in version 0.48. This is for the benefit of L. =head2 copy_literals $jp->copy_literals (1); With a true value, copy JSON literal values (C, C, and C) into new Perl scalar values, and don't put read-only values into the output. With a false value, use read-only scalars: $jp->copy_literals (0); The C behaviour is the behaviour of L. The C behaviour is the behaviour of L. If the user also sets user-defined literals with L, L and L, that takes precedence over this. 🎲 This method was added in version 0.38. =head2 detect_collisions $jp->detect_collisions (1); This switches on a check for hash key collisions (non-unique names in JSON objects). If a collision is found, an error message L is printed, which also gives the non-unique name and the byte position where the start of the colliding string was found: use JSON::Parse; my $jp = JSON::Parse->new (); $jp->detect_collisions (1); eval { $jp->parse ('{"animals":{"cat":"moggy","cat":"feline","cat":"neko"}}'); }; print "$@\n" if $@; produces output JSON error at line 1, byte 28/55: Name is not unique: "cat" parsing object starting from byte 12 at /usr/home/ben/projects/json-parse/examples/../blib/lib/JSON/Parse.pm line 131. (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/collide.pl> in the distribution.) The C behaviour is the behaviour of L. The C behaviour is the behaviour of L. 
🎲 This method was added in version 0.38. =head2 diagnostics_hash $jp->diagnostics_hash (1); This changes diagnostics produced by errors from a simple string into a hash reference containing various fields. This is incompatible with L. This replaces the previous experimental global variable C<$json_diagnostics>, which was removed from the module. The hash keys and values are identical to those provided in the object returned by C<$json_diagnostics>, with the addition of a key C which returns the usual error. This requires Perl version 5.14 or later. An example of the use of this method to "repair" broken JSON is in the module L. 🎲 This method was added in version 0.46. =head2 get_max_depth my $max_depth = $jp->get_max_depth (); This returns the maximum nesting depth of objects or arrays in the input JSON. The default value is 10,000. 🎲 This method was added in version 0.58. =head2 new my $jp = JSON::Parse->new (); Create a new JSON::Parse object. 🎲 This method was added in version 0.38. =head2 parse my $out = $jp->parse ($json); This does the same thing as L, except its behaviour can be modified using object methods. 🎲 This method was added in version 0.38. This was renamed from C in version 0.60. =head2 read my $json = $jp->read ($file); Read a file, parse the contained JSON, and return the output. This method is equivalent to the function L. 🎲 This method was added in version 0.60. =head2 set_max_depth $jp->set_max_depth (42); Set the maximum nesting depth of objects or arrays in the input JSON. The default value is 10,000. 🎲 This method was added in version 0.58. =head2 upgrade_utf8 $jp->upgrade_utf8 (1); Upgrade input from bytes to characters automatically. This can be switched off again using any false value: $jp->upgrade_utf8 (0); 🎲 This method was added in version 0.61. =head2 warn_only $jp->warn_only (1); Warn, don't die, on error. Failed parsing returns the undefined value, C, and prints a warning. 
This can be switched off again using any false value: $jp->warn_only (''); 🎲 This method was added in version 0.41. =head2 Methods for manipulating literals These methods alter what is written into the Perl structure when the parser sees a literal value, C, C or C in the input JSON. This number of methods is needed because of the possibility that a user wants to set the output for C to be C: $jp->set_false (undef); Thus, we cannot use a single function C<< $jp->false (undef) >> to cover both setting and deleting of values. 🎲 This facility was added in version 0.38. =head3 set_true $jp->set_true ("Yes, that is so true"); Supply a scalar to be used in place of the JSON C literal. This example puts the string "Yes, that is so true" into the hash or array when we hit a "true" literal, rather than the default read-only scalar: use JSON::Parse; my $json = '{"yes":true,"no":false}'; my $jp = JSON::Parse->new (); $jp->set_true ('Yes, that is so true'); my $out = $jp->parse ($json); print $out->{yes}, "\n"; prints Yes, that is so true To override the previous value, call it again with a new value. To delete the value and revert to the default behaviour, use L. If you give this a value which is not "true", as in Perl will evaluate it as a false in an if statement, it prints a warning L. You can switch this warning off with L. 🎲 This method was added in version 0.38. =head3 delete_true $jp->delete_true (); Delete the user-defined true value. See L. This method is "safe" in that it has absolutely no effect if no user-defined value is in place. It does not return a value. 🎲 This method was added in version 0.38. =head3 set_false $jp->set_false (JSON::PP::Boolean::false); Supply a scalar to be used in place of the JSON C literal. In the above example, when we hit a "false" literal, we put C in the output, similar to L and other CPAN modules like L or L. To override the previous value, call it again with a new value. To delete the value and revert to the default behaviour, use L. 
If you give this a value which is not "false", as in Perl will evaluate it as a false in an if statement, it prints a warning L. You can switch this warning off with L. 🎲 This method was added in version 0.38. =head3 delete_false $jp->delete_false (); Delete the user-defined false value. See L. This method is "safe" in that it has absolutely no effect if no user-defined value is in place. It does not return a value. 🎲 This method was added in version 0.38. =head3 set_null $jp->set_null (0); Supply a scalar to be used in place of the JSON C literal. To override the previous value, call it again with a new value. To delete the value and revert to the default behaviour, use L. 🎲 This method was added in version 0.38. =head3 delete_null $jp->delete_null (); Delete the user-defined null value. See L. This method is "safe" in that it has absolutely no effect if no user-defined value is in place. It does not return a value. 🎲 This method was added in version 0.38. =head3 no_warn_literals $jp->no_warn_literals (1); Use a true value to switch off warnings about setting boolean values to contradictory things. For example if you want to set the JSON C literal to turn into the string "false", $jp->no_warn_literals (1); $jp->set_false ("false"); See also L. This also switches off the warning L. 🎲 This method was added in version 0.38. =head1 OLD INTERFACE The following alternative function names are accepted. These are the names used for the functions in old versions of this module. These names are not deprecated and will never be removed from the module. The names ending in "_to_perl" seem quite silly in retrospect since surely it is obvious that one is programming in Perl. =head2 json_to_perl This is exactly the same function as L. =head2 json_file_to_perl This is exactly the same function as L. The function was renamed in version 0.59, after the same function in L. =head2 run This is the old name for L. =head2 validate_json This is exactly the same function as L. 
=head1 Mapping from JSON to Perl JSON elements are mapped to Perl as follows: =head2 JSON numbers JSON numbers become Perl numbers, either integers or double-precision floating point numbers, or possibly strings containing the number if parsing of a number by the usual methods fails somehow. JSON does not allow leading zeros, like I<0123>, or leading plus signs, like I<+100>, in numbers, so these cause an L error. JSON also does not allow numbers of the form I<1.>, but it does allow things like I<0e0> or I<1E999999>. As far as possible these are accepted by JSON::Parse. =head2 JSON strings JSON strings become Perl strings. The JSON escape characters such as C<\t> for the tab character (see section 2.5 of L) are mapped to the equivalent ASCII character. =head3 Handling of Unicode Inputs must be in the UTF-8 format. See L. In addition, JSON::Parse rejects UTF-8 which encodes non-characters such as C and ill-formed characters such as incomplete halves of surrogate pairs. Unicode encoding points in the input of the form C<\u3000> are converted into the equivalent UTF-8 bytes. Surrogate pairs in the form C<\uD834\uDD1E> are also handled. If the second half of the surrogate pair is missing, an L or L error is thrown. If the second half of the surrogate pair is present but contains an impossible value, a L error is thrown. If the input to L is marked as Unicode characters, the output strings will be marked as Unicode characters. If the input is not marked as Unicode characters, the output strings will not be marked as Unicode characters. Thus, use JSON::Parse ':all'; # The scalar $sasori looks like Unicode to Perl use utf8; my $sasori = '["蠍"]'; my $p = parse_json ($sasori); print utf8::is_utf8 ($p->[0]); # Prints 1. but use JSON::Parse ':all'; # The scalar $ebi does not look like Unicode to Perl no utf8; my $ebi = '["海老"]'; my $p = parse_json ($ebi); print utf8::is_utf8 ($p->[0]); # Prints nothing. 
Escapes of the form \uXXXX (see page three of L) are mapped to ASCII if XXXX is less than 0x80, or to UTF-8 if XXXX is greater than or equal to 0x80. Strings containing \uXXXX escapes greater than 0x80 are also upgraded to character strings, regardless of whether the input is a character string or a byte string, thus regardless of whether Perl thinks the input string is Unicode, escapes like \u87f9 are converted into the equivalent UTF-8 bytes and the particular string in which they occur is marked as a character string: use JSON::Parse ':all'; no utf8; # 蟹 my $kani = '["\u87f9"]'; my $p = parse_json ($kani); print "It's marked as a character string" if utf8::is_utf8 ($p->[0]); # Prints "It's marked as a character string" because it's upgraded # regardless of the input string's flags. This is modelled on the behaviour of Perl's C: no utf8; my $kani = '87f9'; print "hex is character string\n" if utf8::is_utf8 ($kani); # prints nothing $kani = chr (hex ($kani)); print "chr makes it a character string\n" if utf8::is_utf8 ($kani); # prints "chr makes it a character string" However, JSON::Parse also upgrades the remaining part of the string into a character string, even when it's not marked as a character string. For example, use JSON::Parse ':all'; use Unicode::UTF8 'decode_utf8'; no utf8; my $highbytes = "か"; my $not_utf8 = "$highbytes\\u3042"; my $test = "{\"a\":\"$not_utf8\"}"; my $out = parse_json ($test); # JSON::Parse does something unusual here in promoting the first part # of the string into UTF-8. print "JSON::Parse gives this: ", $out->{a}, "\n"; # Perl cannot assume that $highbytes is in UTF-8, so it has to just # turn the initial characters into garbage. my $add_chr = $highbytes . chr (0x3042); print "Perl's output is like this: ", $add_chr, "\n"; # In fact JSON::Parse's behaviour is equivalent to this: my $equiv = decode_utf8 ($highbytes) . 
chr (0x3042); print "JSON::Parse did something like this: ", $equiv, "\n"; # With character strings switched on, Perl and JSON::Parse do the same # thing. use utf8; my $is_utf8 = "か"; my $test2 = "{\"a\":\"$is_utf8\\u3042\"}"; my $out2 = parse_json ($test2); print "JSON::Parse: ", $out2->{a}, "\n"; my $add_chr2 = $is_utf8 . chr (0x3042); print "Native Perl: ", $add_chr2, "\n"; produces output JSON::Parse gives this: かあ Perl's output is like this: かあ JSON::Parse did something like this: かあ JSON::Parse: かあ Native Perl: かあ (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/unicode-details.pl> in the distribution.) Although in general the above would be an unsafe practice, JSON::Parse can do things this way because JSON is a text-only, Unicode-only format. To ensure that invalid inputs are never upgraded, JSON::Parse checks each input byte to make sure that it forms UTF-8. See also L. Doing things this way, rather than the way that Perl does it, was one of the original motivations for writing this module. =head2 JSON arrays JSON arrays become Perl array references. The elements of the Perl array are in the same order as they appear in the JSON. Thus my $p = parse_json ('["monday", "tuesday", "wednesday"]'); has the same result as a Perl declaration of the form my $p = [ 'monday', 'tuesday', 'wednesday' ]; =head2 JSON objects JSON objects become Perl hashes. The members of the JSON object become key and value pairs in the Perl hash. The string part of each object member becomes the key of the Perl hash. The value part of each member is mapped to the value of the Perl hash. Thus my $j = < ['blue', 'black'], tuesday => ['grey', 'heart attack'], friday => 'Gotta get down on Friday', }; =head3 Key collisions A key collision is something like the following. 
use JSON::Parse qw/parse_json parse_json_safe/; my $j = '{"a":1, "a":2}'; my $p = parse_json ($j); print "Ambiguous key 'a' is ", $p->{a}, "\n"; my $q = parse_json_safe ($j); produces output JSON::Parse::parse_json_safe: Name is not unique: "a" parsing object starting from byte 1 at /usr/home/ben/projects/json-parse/examples/key-collision.pl line 8. Ambiguous key 'a' is 2 (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/key-collision.pl> in the distribution.) Here the key "a" could be either 1 or 2. As seen in the example, L overwrites the first value with the second value. L halts and prints a warning. If you use L you can switch key collision on and off with the L method. The rationale for L not to give warnings is that Perl doesn't give information about collisions when storing into hash values, and checking for collisions for every key will degrade performance for the sake of an unlikely occurrence. The JSON specification says "The names within an object SHOULD be unique." (see L, page 5), although it's not a requirement. For performance, L and L do not store hash keys, thus they cannot detect this variety of problem. =head2 Literals =head3 false L maps the JSON false literal to a readonly scalar which evaluates to the empty string, or to zero in a numeric context. (This behaviour changed from version 0.36 to 0.37. In versions up to 0.36, the false literal was mapped to a readonly scalar which evaluated to 0 only.) L maps the JSON literal to a similar scalar without the readonly constraints. If you use a parser created with L, you can choose either of these behaviours with L, or you can tell JSON::Parse to put your own value in place of falses using the L method. =head3 null L maps the JSON null literal to a readonly scalar C<$JSON::Parse::null> which evaluates to C. L maps the JSON literal to the undefined value. 
If you use a parser created with L, you can choose either of these behaviours with L, or you can tell JSON::Parse to put your own value in place of nulls using the L method. =head3 true L maps the JSON true literal to a readonly scalar which evaluates to C<1>. L maps the JSON literal to the value 1. If you use a parser created with L, you can choose either of these behaviours with L, or you can tell JSON::Parse to put your own value in place of trues using the L method. =head3 Round trips and compatibility The Perl versions of literals produced by L will be converted back to JSON literals if you use L. However, JSON::Parse's literals are incompatible with the other CPAN JSON modules. For compatibility with other CPAN modules, create a JSON::Parse object with L, and set JSON::Parse's literals with L, L, and L. =head3 A round trip with JSON::Tiny This example demonstrates round-trip compatibility using L, version 0.58: use utf8; use JSON::Tiny '0.58', qw(decode_json encode_json); use JSON::Parse; use JSON::Create; my $cream = '{"clapton":true,"hendrix":false}'; my $jp = JSON::Parse->new (); my $jc = JSON::Create->new (sort => 1); print "First do a round-trip of our modules:\n\n"; print $jc->create ($jp->parse ($cream)), "\n\n"; print "Now do a round-trip of JSON::Tiny:\n\n"; print encode_json (decode_json ($cream)), "\n\n"; print "🥴 First, incompatible mode:\n\n"; print 'tiny(parse): ', encode_json ($jp->parse ($cream)), "\n"; print 'create(tiny): ', $jc->create (decode_json ($cream)), "\n\n"; # Set our parser to produce these things as literals: $jp->set_true (JSON::Tiny::true); $jp->set_false (JSON::Tiny::false); print "🔄 Compatibility with JSON::Parse:\n\n"; print 'tiny(parse):', encode_json ($jp->parse ($cream)), "\n\n"; $jc->bool ('JSON::Tiny::_Bool'); print "🔄 Compatibility with JSON::Create:\n\n"; print 'create(tiny):', $jc->create (decode_json ($cream)), "\n\n"; print "🔄 JSON::Parse and JSON::Create are still compatible too:\n\n"; print $jc->create 
($jp->parse ($cream)), "\n"; produces output First do a round-trip of our modules: {"clapton":true,"hendrix":false} Now do a round-trip of JSON::Tiny: {"clapton":true,"hendrix":false} 🥴 First, incompatible mode: tiny(parse): {"clapton":1,"hendrix":""} create(tiny): {"clapton":1,"hendrix":0} 🔄 Compatibility with JSON::Parse: tiny(parse):{"clapton":true,"hendrix":false} 🔄 Compatibility with JSON::Create: create(tiny):{"clapton":true,"hendrix":false} 🔄 JSON::Parse and JSON::Create are still compatible too: {"clapton":true,"hendrix":false} (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.62/examples/json-tiny-round-trip-demo.pl> in the distribution.) Most of the other CPAN modules use similar methods to L, so the above example can easily be adapted. See also L for various examples. =head3 Modifying the values L maps all the literals to read-only values. Because of this, attempting to modifying the boolean values in the hash reference returned by L will cause "Modification of a read-only value attempted" errors: my $in = '{"hocus":true,"pocus":false,"focus":null}'; my $p = json_parse ($in); $p->{hocus} = 99; # "Modification of a read-only value attempted" error occurs Since the hash values are read-only scalars, C<< $p->{hocus} = 99 >> is like this: undef = 99; If you need to modify the returned hash reference, then delete the value first: my $in = '{"hocus":true,"pocus":false,"focus":null}'; my $p = json_parse ($in); delete $p->{pocus}; $p->{pocus} = 99; # OK Similarly with array references, delete the value before altering: my $in = '[true,false,null]'; my $q = json_parse ($in); delete $q->[1]; $q->[1] = 'magic'; Note that the return values from parsing bare literals are not read-only scalars, so my $true = JSON::Parse::json_parse ('true'); $true = 99; produces no error. This is because Perl copies the scalar. =head1 RESTRICTIONS This module imposes the following restrictions on its input. 
=over =item JSON only JSON::Parse is a strict parser. It only accepts input which exactly meets the criteria of L. That means, for example, JSON::Parse does not accept single quotes (') instead of double quotes ("), or numbers with leading zeros, like 0123. JSON::Parse does not accept control characters (0x00 - 0x1F) in strings, missing commas between array or hash elements like C<["a" "b"]>, or trailing commas like C<["a","b","c",]>. It also does not accept trailing non-whitespace, like the second "]" in C<["a"]]>. You may find L by the same authors as JSON::Parse useful if you need to process JSON-like text with tolerance for errors. =item No incremental parsing JSON::Parse does not parse incrementally. It only parses fully-formed JSON strings which include all opening and closing brackets. This is an inherent part of the design of the module. Incremental parsing in the style of L would require some kind of callback structure to deal with the elements of the partially digested structures, but JSON::Parse was never designed to do this; it merely converts what it sees into a Perl structure. Claims to offer incremental JSON parsing in other modules' documentation should be diligently verified. =item UTF-8 only JSON::Parse only parses the UTF-8 format. If input is in a different Unicode encoding than UTF-8, convert the input before handing it to this module. For example, for the UTF-16 format, use Encode 'decode'; my $input_utf8 = decode ('UTF-16', $input); my $perl = parse_json ($input_utf8); or, for a file, use C<:encoding> (see L and L): open my $input, "<:encoding(UTF-16)", 'some-json-file'; JSON::Parse does not try to determine the nature of the octet stream using BOM markers. A BOM marker in the input consists of bytes C<0xFE> and C<0xFF>, both of which are invalid as UTF-8, and thus will cause a fatal error. This restriction to UTF-8 applies regardless of whether Perl thinks that the input string is a character string or a byte string. 
Non-UTF-8 input will cause an L error. The latest specification for JSON, L, specifies it to be a UTF-8 only format. JSON::Parse does not accept Unicode non-characters (U+FFFF, UFDDO, etc.), UTF-8 representing surrogate pair code points, or bytes outside the range of Unicode code points as UTF-8 bytes. =back =head1 DIAGNOSTICS L does not produce error messages. L and L die on encountering invalid input. L uses L to pass error messages as warnings. Error messages have the line number, and the byte number where appropriate, of the input which caused the problem. The line number is formed simply by counting the number of "\n" (linefeed, ASCII 0x0A) characters in the whitespace part of the JSON. In L and L, parsing errors are fatal, so to continue after an error occurs, put the parsing into an C block: my $p; eval { $p = parse_json ($j); }; if ($@) { # handle error } The following error messages are produced: =over =item Unexpected character An unexpected character (byte) was encountered in the input. For example, when looking at the beginning of a string supposedly containing JSON, if the module encounters a plus sign, it will give an error like this: assert_valid_json ('+'); gives output JSON error at line 1, byte 1/1: Unexpected character '+' parsing initial state: expecting whitespace: 'n', '\r', '\t', ' ' or start of string: '"' or digit: '0-9' or minus: '-' or start of an array or object: '{', '[' or start of literal: 't', 'f', 'n' The message always includes a list of what characters are allowed. If there is some recognizable structure being parsed, the error message will include its starting point in the form "starting from byte n": assert_valid_json ('{"this":"\a"}'); gives output JSON error at line 1, byte 11/13: Unexpected character 'a' parsing string starting from byte 9: expecting escape: '', '/', '"', 'b', 'f', 'n', 'r', 't', 'u' A feature of JSON is that parsing it requires only one byte to be examined at a time. 
Thus almost all parsing problems can be handled using the "Unexpected character" error type, including spelling errors in literals: assert_valid_json ('[true,folse]'); gives output JSON error at line 1, byte 8/12: Unexpected character 'o' parsing literal starting from byte 7: expecting 'a' and the missing second half of a surrogate pair: assert_valid_json ('["\udc00? <-- should be a second half here"]'); gives output JSON error at line 1, byte 9/44: Unexpected character '?' parsing unicode escape starting from byte 3: expecting '\' All kinds of errors can occur parsing numbers, for example a missing fraction, assert_valid_json ('[1.e9]'); gives output JSON error at line 1, byte 4/6: Unexpected character 'e' parsing number starting from byte 2: expecting digit: '0-9' and a leading zero, assert_valid_json ('[0123]'); gives output JSON error at line 1, byte 3/6: Unexpected character '1' parsing number starting from byte 2: expecting whitespace: 'n', '\r', '\t', ' ' or comma: ',' or end of array: ']' or dot: '.' or exponential sign: 'e', 'E' The error message is this complicated because all of the following are valid here: whitespace: C<[0 ]>; comma: C<[0,1]>, end of array: C<[0]>, dot: C<[0.1]>, or exponential: C<[0e0]>. These are all handled by this error. Thus the error messages are a little confusing as diagnostics. Versions of this module prior to 0.29 gave more informative messages like "leading zero in number". (The messages weren't documented.) The reason to change over to the single message was because it makes the parsing code simpler, and because the testing code described in L makes use of the internals of this error to check that the error message produced actually do correspond to the invalid and valid bytes allowed by the parser, at the exact byte given. This is a bytewise error, thus for example if a miscoded UTF-8 appears in the input, an error message saying what bytes would be valid at that point will be printed. 
no utf8; use JSON::Parse 'assert_valid_json'; # Error in first byte: my $bad_utf8_1 = chr (hex ("81")); eval { assert_valid_json ("[\"$bad_utf8_1\"]"); }; print "$@\n"; # Error in third byte: my $bad_utf8_2 = chr (hex ('e2')) . chr (hex ('9C')) . 'b'; eval { assert_valid_json ("[\"$bad_utf8_2\"]"); }; print "$@\n"; prints JSON error at line 1, byte 3/5: Unexpected character 0x81 parsing string starting from byte 2: expecting printable ASCII or first byte of UTF-8: '\x20-\x7f', '\xC2-\xF4' at examples/bad-utf8.pl line 10. JSON error at line 1, byte 5/7: Unexpected character 'b' parsing string starting from byte 2: expecting bytes in range 80-bf: '\x80-\xbf' at examples/bad-utf8.pl line 16. =item Unexpected end of input The end of the string was encountered before the end of whatever was being parsed. For example, if a quote is missing from the end of the string, it will give an error like this: assert_valid_json ('{"first":"Suzuki","second":"Murakami","third":"Asada}'); gives output JSON error at line 1: Unexpected end of input parsing string starting from byte 47 =item Not surrogate pair While parsing a string, a surrogate pair was encountered. While trying to turn this into UTF-8, the second half of the surrogate pair turned out to be an invalid value. assert_valid_json ('["\uDC00\uABCD"]'); gives output JSON error at line 1: Not surrogate pair parsing unicode escape starting from byte 11 =item Empty input This error occurs for an input which is empty (no length or whitespace only) or an undefined value. assert_valid_json (''); gives output JSON error: Empty input parsing initial state Prior to version 0.49, this error was produced by L only, but it is now also produced by L. =item Name is not unique This error occurs when parsing JSON when the user has chosen L. 
For example an input like my $p = JSON::Parse->new (); $p->detect_collisions (1); $p->run ('{"hocus":1,"pocus":2,"hocus":3}'); gives output JSON error at line 1, byte 23/31: Name is not unique: "hocus" parsing object starting from byte 1 at blib/lib/JSON/Parse.pm line 131. where the JSON object has two keys with the same name, C<hocus>. The terminology "name is not unique" is from the JSON specification. =item Contradictory values for "true" and "false" =over =item User-defined value for JSON false evaluates as true This happens if you set JSON false to map to a true value: $jp->set_false (1); To switch off this warning, use L. 🎲 This warning was added in version 0.38. =item User-defined value for JSON true evaluates as false This happens if you set JSON true to map to a false value: $jp->set_true (undef); To switch off this warning, use L. 🎲 This warning was added in version 0.38. =item User-defined value overrules copy_literals This warning is given if you set up literals with L then you also set up your own true, false, or null values with L, L, or L. 🎲 This warning was added in version 0.38. =back =back =head1 PERFORMANCE On the author's computer, the module's speed of parsing is approximately the same as L, with small variations depending on the type of input. For validation, L is faster than any other module known to the author, and up to ten times faster than JSON::XS. Some special types of input, such as floating point numbers containing an exponential part, like "1e09", seem to be about two or three times faster to parse with this module than with L. In JSON::Parse, parsing of exponentials is done by the system's C<strtod> function, but JSON::XS contains its own parser for exponentials, so these results may be system-dependent. At the moment the main place JSON::XS wins over JSON::Parse is in strings containing escape characters, where JSON::XS is about 10% faster on the module author's computer and compiler. 
As of version 0.33, despite some progress in improving JSON::Parse, I haven't been able to fully work out the reason behind the better speed. There is some benchmarking code in the github repository under the directory "benchmarks" for those wishing to test these claims. The script L|https://github.com/benkasminbullock/JSON-Parse/d04630086f6c92fea720cba4568faa0cbbdde5a6/benchmarks/bench> is an adaptation of the similar script in the L distribution. The script L|https://github.com/benkasminbullock/JSON-Parse/d04630086f6c92fea720cba4568faa0cbbdde5a6/benchmarks/pub-bench.pl> runs the benchmarks and prints them out as POD. The following benchmark tests used version 0.58_01 of JSON::Parse, version 4.03 of L, and version 4.25 of L on Perl version v5.32.0 compiled with Clang version FreeBSD clang version 10.0.1 on FreeBSD 12.2. The files in the "benchmarks" directory of JSON::Parse. F and F are the benchmarks used by L. =over =item short.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| Cpanel | 313007.761 | 0.0000319 | JP::valid | 838860.800 | 0.0000119 | JSON::Parse | 310689.185 | 0.0000322 | JSON::XS | 303935.072 | 0.0000329 | --------------+------------+------------+ =item long.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| Cpanel | 5611.860 | 0.0017819 | JP::valid | 13586.991 | 0.0007360 | JSON::Parse | 4924.048 | 0.0020308 | JSON::XS | 6406.452 | 0.0015609 | --------------+------------+------------+ =item words-array.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| Cpanel | 34749.826 | 0.0002878 | JP::valid | 270600.258 | 0.0000370 | JSON::Parse | 34017.064 | 0.0002940 | JSON::XS | 35726.610 | 0.0002799 | --------------+------------+------------+ =item exp.json Repetitions: 10 x 100 = 1000 
--------------+------------+------------+ module | 1/min | min | --------------|------------|------------| Cpanel | 46759.242 | 0.0002139 | JP::valid | 117817.528 | 0.0000849 | JSON::Parse | 46759.242 | 0.0002139 | JSON::XS | 19195.899 | 0.0005209 | --------------+------------+------------+ =item literals.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| Cpanel | 33026.016 | 0.0003028 | JP::valid | 384798.532 | 0.0000260 | JSON::Parse | 40840.351 | 0.0002449 | JSON::XS | 33689.189 | 0.0002968 | --------------+------------+------------+ =item cpantesters.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| Cpanel | 212.377 | 0.0470860 | JP::valid | 1309.043 | 0.0076392 | JSON::Parse | 207.491 | 0.0481949 | JSON::XS | 226.439 | 0.0441620 | --------------+------------+------------+ =back =head1 SEE ALSO =over =item RFC 8259 JSON is specified in L. =item json.org L is the website for JSON, authored by Douglas Crockford. =back =head2 Other CPAN modules for parsing and producing JSON The ⭐ represents the number of votes this module has received on metacpan, on a logarithmic scale. Modules which we recommend are marked with 👍. Deprecated modules and modules which are definitely buggy (bug reports/pull requests ignored) and abandoned (no releases for several years) are marked with 👎 and/or 🐛. Modules we can't work out are marked with 😕. =over =item Modules by the same author =over =item JSON::Create 👍 L is a companion module to JSON::Parse by the same author. =item JSON::Repair L is an example module which demonstrates using JSON::Parse to apply some kinds of heuristics to repair "relaxed JSON" or otherwise broken JSON into compliant JSON. =item JSON::Server L is a module which offers a JSON-only, UTF-8 only server using C and L. 
=item JSON::Tokenize L is part of the JSON::Parse distribution, a tokenizer which reduces a JSON string to tokens. This makes the JSON::Parse tokenizer available to people who want to write their own JSON parsers. =item JSON::Whitespace L is for manipulating the "insignificant whitespace" part of JSON. =back =item Reading and writing JSON =over =item L [⭐⭐ Author: L; Date: C<2021-04-12>; Version: C<4.26>] This is a fork of L. Please see the module for details about the reasons for the fork. =item L [⭐ Author: L; Date: C<2020-11-18>; Version: C<1.00>] Slurp a JSON file into a data structure, and the reverse. It relies on L. =item L [⭐ Author: L; Date: C<2015-04-19>; Version: C<0.002>] Uses the JSON library from Glib, a library of C functions for the Linux GNOME desktop project, so it is independent of the other CPAN modules. Judging from the fairly sparse documentation, it seems to be a module where you build the JSON on the fly rather than converting a Perl structure wholesale into JSON. =item L [⭐⭐ Author: L; Date: C<2021-01-24>; Version: C<4.03>] This calls on either L or L. =item L [Author: L; Date: C<2010-09-29>; Version: C<0.47>] 👎🐛 This module "Does What I Want", where "I" refers to the module's author. Development seems to have ceased in 2010, there is a long list of unfixed bugs, and some of the module's features seem to predate Unicode support in Perl. It is written in XS, and it claims to accept a wide variety of non-JSON formats such as comments, single-quoted strings, trailing commas, etc. =item L [Author: L; Date: C<2021-03-16>; Version: C<0.04>] Uses L to parse JSON. =item L [⭐⭐ Author: L; Date: C<2021-01-23>; Version: C<4.06>] This is part of the Perl core, installed when you install Perl. "PP" stands for "Pure Perl", which means it is in Perl-only without the XS (C-based) parsing. This is slower but may be necessary if you cannot install modules requiring a C compiler. 
=item L [⭐ Author: L; Date: C<2019-10-30>; Version: C<0.12>] Convenient file slurping and spurting of data using JSON. Uses L or L if available. The basic idea seems to be that it uses context to return arrays or hashes as required, and to read and write files without extra stages of opening and closing the file. =item L [⭐⭐ Author: L; Date: C<2020-10-26>; Version: C<1.34>] 👎🐛 Takes advantage of a similarity between YAML (yet another markup language) and JSON to provide a JSON parser/producer using L. We have never tried this module, but it seems to be semi-deprecated (the ABSTRACT says "consider using JSON::XS instead!") and L about things like failing to process equals signs. However, the maintainer is fixing some of the bugs and making new releases, so we're not really sure. =item L [⭐⭐ Author: L; Date: C<2017-11-12>; Version: C<0.58>] This is a fork of L. =item L [⭐⭐⭐ Author: L; Date: C<2020-10-27>; Version: C<4.03>] This is an all-purpose JSON module in XS, which means it requires a C compiler to install. =item L [⭐ Author: L; Date: C<2011-08-05>; Version: C<0.10>] 👎🐛 Wraps a C library called yajl. The module has been abandoned for about ten years. Bug reports include serious errors, and pull requests have been ignored. =item L [⭐⭐⭐ Author: L; Date: C<2021-04-13>; Version: C<9.17>] Part of the L standalone web framework, "pure Perl" JSON reader/writer. As of version 8.70 of Mojolicious, this actually depends on L but will load L if it is available. =back =item Combination modules These modules rely on more than one back-end module to process JSON for you. =over =item L [⭐ Author: L; Date: C<2015-06-10>; Version: C<1.39>] 👎 This now-deprecated module combines L, L versions one and two, and L. =item L [⭐⭐ Author: L; Date: C<2020-11-13>; Version: C<1.004003>] A module which combines L, L, and L. The original L combines L and L, but this prioritizes L over L. 
=item L [Author: L; Date: C<2008-02-13>; Version: C<0.31>] 👎 A "combination module" which supports two different interfaces of L. However, JSON::XS is now onto version 4. =item L [⭐ Author: L; Date: C<2019-08-07>; Version: C<1.002>] 👎 This pulls in L instead of L for L users. It seems to have been rendered obsolete by modern versions of Mojolicious due to changes to make that depend on L if available. =back =item Test-related modules =over =item L [Author: L; Date: C<2019-08-07>; Version: C<0.05>] =item L [⭐ Author: L; Date: C<2018-04-24>; Version: C<0.05>] Compare JSON with L. As of version 0.05, it relies on L. =item L [⭐ Author: L; Date: C<2009-08-09>; Version: C<0.11>] 👎 This offers a way to compare two different JSON strings to see if they refer to the same object. The most recent version, 0.11, was released in 2009, and it relies on the deprecated L, which makes it essentially abandoned. =item L [Author: L; Date: C<2012-09-14>; Version: C<0.2>] 👎 Test whether one JSON or Perl structure entails/subsumes another. The most recent version is from 2012, and it relies on L, so it is probably abandoned. Also, oddly but not uniquely for CPAN modules with the name JSON in the title, it seems to not actually have that much to do with JSON, which is a data serialisation format, but actually be testing Perl hashes and arrays. =item L [Author: L; Date: C<2016-04-28>; Version: C<0.02>] JSON Test Utility. As of version 0.02, it relies on L but it is able to use L instead, and so probably L would be OK too. According to the documentation, it can test JSON for validity and compare JSON strings with keys in a different order, and presumably with different whitespace. =back =item Type-related modules These untangle numbers, strings, and booleans into JSON types. =over =item L [Author: L; Date: C<2015-10-26>; Version: C] 😕 Virtually undocumented, it's not clear what this does. 
=item L [⭐ Author: L; Date: C<2012-10-17>; Version: C<0.05>] Change the type of a Perl variable so that it comes out as a number, a string, or a boolean in the output JSON. =item L [Author: L; Date: C<2017-04-01>; Version: C<0.03>] The module is barely documented, but from looking at L, this seems to enable you to change the output type of a number or a string so that you can, for example, make the number C<1> come out as either a number, C<1>, a string C<"1">, or a boolean, C<true>, in the output JSON. =item L [⭐ Author: L; Date: C<2021-05-03>; Version: C<0.007>] "Replace mushy strings and numbers with rigidly typed replacements" Since Perl muddles strings and numbers, this enables you to work out whether your input JSON was C<"123"> (a string) or C<123> (a number). =back =item Special-purpose modules =over =item L [⭐ Author: L; Date: C<2015-03-04>; Version: C<1.000>] Convert JSON data to other formats. It reads your JSON file or input and converts it into either YAML or Perl native format using L. =item L [⭐⭐ Author: L; Date: C<2016-07-08>; Version: C<0.46>] 👍 This module offers C<true> and C<false> literals in Perl, so you just have use boolean; my $something = true; This is very useful for dealing with JSON. =item L [Author: L; Date: C<2014-12-25>; Version: C<1.5202>] Configuration files in JSON, with hash comments also allowed. =item L [⭐ Author: L; Date: C<2017-09-03>; Version: C<1.001>] For one-liners. =over If you use this module from the command-line, the last value of your one-liner (-e) code will be serialized as JSON data. =back =item L [Author: L; Date: C<2012-07-27>; Version: C] "Embed JSON data structures directly into your Perl code". Relies on L. =item L [Author: L; Date: C<2015-04-16>; Version: C<0.04>] Create JSON under memory limitations. =item L [⭐ Author: L; Date: C<2021-05-07>; Version: C<0.131>] 🌈 This module generates JSON colorized with ANSI escape sequences. 
=item L [⭐ Author: L; Date: C<2014-09-11>; Version: C<0.004>] =item L [Author: L; Date: C<2020-01-27>; Version: C<0.006>] C is a special-purpose module for parsing JSON objects which have key collisions (something like C<{"a":1,"a":2}>) within objects. (JSON::Parse's handling of key collisions is discussed in L in this document.) =item L [Author: L; Date: C<2015-02-04>; Version: C] Automatically change a JSON string when a data structure changes using tied scalars. =back =item Patch, path, pointer, schema, and transform modules =over =item L [Author: L; Date: C<2017-07-07>; Version: C<0.08>] "Asserts JSONPaths into a JSON data structure for correct values/matches" =item L [Author: L; Date: C<2021-03-29>; Version: C<1.00>] =item L [Author: L; Date: C<2014-09-11>; Version: C<0.002>] =item L [⭐ Author: L; Date: C<2012-10-12>; Version: C<0.011>] =item L [Author: L; Date: C<2021-05-16>; Version: C<0.06>] Perl access to the C tool via L. =item L [⭐ Author: L; Date: C<2016-02-24>; Version: C<0.04>] =item L [Author: L; Date: C<2018-10-25>; Version: C<0.04>] 😕 We don't know what this does, or how it relates to JSON. The example in the synopsis section of the document doesn't show any JSON, it shows an example of altering nested hashes in Perl. =item L [⭐ Author: L; Date: C<2021-01-28>; Version: C<0.431>] Search nested hashref/arrayref structures using JSONPath. =item L [⭐ Author: L; Date: C<2015-08-13>; Version: C<0.07>] Extract parts of a JSON string. =item L [⭐ Author: L; Date: C<2021-04-06>; Version: C<0.19>] "Generate example JSON structures from JSON Schema definitions" =item L [⭐ Author: L; Date: C<2014-09-28>; Version: C<0.104>] Transform JSON using JsonT =item L [⭐ Author: L; Date: C<2020-01-01>; Version: C<0.03>] =item L [⭐⭐ Author: L; Date: C<2021-04-28>; Version: C<4.17>] "Validate data against a JSON schema" - you can decide what the JSON is supposed to contain. =item L [Author: L; Date: C<2019-03-07>; Version: C<0.08>] "Adds a .json vmethod for all TT values." 
- for use with L