JSON-Parse-0.56/000755 001751 001751 00000000000 13622411134 012636 5ustar00benben000000 000000 JSON-Parse-0.56/t/000755 001751 001751 00000000000 13622411134 013101 5ustar00benben000000 000000 JSON-Parse-0.56/json-whitespace.c000644 001751 001751 00000006704 13166405765 016134 0ustar00benben000000 000000 /* Type for adding whitespace. */ typedef struct json_ws { SV * news; SV * olds; /* Length of new string. */ unsigned int news_l; /* Copy point. */ char * q; /* Origin */ char * p; /* Top of token tree. */ json_token_t * t; /* Token under examination now. */ json_token_t * next; /* Whitespace to add before and after. */ char * before[n_json_tokens]; char * after[n_json_tokens]; int array_depth; int object_depth; char * array_indent; char * object_indent; } json_ws_t; static void copy_whitespace (json_ws_t * ws, char * w) { char * q; q = ws->q; while (*w) { *q++ = *w++; } ws->q = q; } static INLINE int whitespace_json (json_ws_t * ws) { /* Copy place. */ char * c; /* Value of q at entry to this routine, used to calculate added length. */ char * qorig; json_token_t * next; char * q; q = ws->q; qorig = q; next = ws->next; while (next) { /* Copy start of string. */ copy_whitespace (ws, ws->before[next->type]); switch (next->type) { case json_token_object: *q++ = '{'; ws->object_depth++; ws->q = q; q += whitespace_json (ws); ws->object_depth--; *q++ = '}'; break; case json_token_array: *q++ = '['; ws->array_depth++; ws->q = q; q += whitespace_json (ws); ws->object_depth--; *q++ = ']'; break; case json_token_string: case json_token_key: case json_token_literal: case json_token_number: for (c = ws->p + next->start; c <= ws->p + next->end; c++) { *q++ = *c; } break; case json_token_comma: *q++ = ','; break; case json_token_colon: *q++ = ':'; break; default: croak ("unhandled token type %d", next->type); } /* Copy end of string. 
*/ c = ws->after[next->type]; while (*c) { *q++ = *c++; } next = next->next; } return q - qorig; } static int copy_json (char * p, char * q, json_token_t * t) { /* Loop variable. */ json_token_t * next; /* Copy place. */ char * c; /* Value of q at entry to this routine, used to calculate added length. */ char * qorig; next = t; qorig = q; while (next) { switch (next->type) { case json_token_object: *q++ = '{'; q += copy_json (p, q, next->child); *q++ = '}'; break; case json_token_array: *q++ = '['; q += copy_json (p, q, next->child); *q++ = ']'; break; case json_token_string: case json_token_key: case json_token_literal: case json_token_number: for (c = p + next->start; c < p + next->end; c++) { *q++ = *c; } break; case json_token_comma: *q++ = ','; break; case json_token_colon: *q++ = ':'; break; default: croak ("unhandled token type %d", next->type); } next = next->next; } return q - qorig; } /* Remove all the whitespace. */ static SV * strip_whitespace (json_token_t * tokens, SV * json) { SV * stripped; char * p; char * q; /* Original length. */ STRLEN l; /* Length of output. */ unsigned int m; p = SvPV (json, l); stripped = newSV (l); /* Tell Perl it's a string. */ SvPOK_on (stripped); /* Set UTF-8 if necessary. */ if (SvUTF8 (json)) { SvUTF8_on (stripped); } /* Get a pointer to the string inside "stripped". */ q = SvPVX (stripped); m = copy_json (p, q, tokens); /* Set the length. 
*/ SvCUR_set (stripped, m); return stripped; } JSON-Parse-0.56/examples/000755 001751 001751 00000000000 13622411134 014454 5ustar00benben000000 000000 JSON-Parse-0.56/typemap000644 001751 001751 00000000135 12614536151 014246 0ustar00benben000000 000000 json_parse_t * T_PTROBJ JSON::Parse T_PTROBJ json_token_t * T_PTROBJ JSON::Tokenize T_PTROBJ JSON-Parse-0.56/Makefile.PL000644 001751 001751 00000002420 13616666517 014632 0ustar00benben000000 000000 use strict; use warnings; use ExtUtils::MakeMaker; my $pm = 'lib/JSON/Parse.pm'; my $pod = 'lib/JSON/Parse.pod'; my $github = 'github.com/benkasminbullock/JSON-Parse'; my $repo = "https://$github"; WriteMakefile ( NAME => 'JSON::Parse', VERSION_FROM => $pm, ABSTRACT_FROM => $pod, AUTHOR => 'Ben Bullock ', LICENSE => 'perl', PREREQ_PM => { 'Carp' => '0', }, META_MERGE => { 'meta-spec' => { version => 2, }, resources => { repository => { type => 'git', url => "git://$github.git", web => $repo, }, bugtracker => { web => "$repo/issues", }, }, x_contributors => [ 'Shlomi Fish ', 'kolmogorov42', ], }, # All the C files are actually #included into Json3.xs so there is # only one object file. OBJECT => 'Json3.o', # Currently the oldest version to pass at CPAN testers. # http://matrix.cpantesters.org/?dist=JSON-Parse+0.24 MIN_PERL_VERSION => '5.008009', EXE_FILES => [ 'script/validjson', ], # Uncomment the following to get "author tests" # OPTIMIZE => '-Wall -g', # OPTIMIZE => '-Wall -O', # This achieves only small speedups with GCC. # OPTIMIZE => '-O3', # CC => 'gcc', ); JSON-Parse-0.56/json-perl.c000644 001751 001751 00000062363 13423032745 014733 0ustar00benben000000 000000 /* The C part is broken into three pieces, "json-common.c", "json-perl.c", and "json-entry-points.c". This file contains the "Perl" stuff, for example if we have a string, the stuff to convert it into a Perl hash key or a Perl scalar is in this file. */ /* There are two routes through the code, the PERLING route and the non-PERLING route. 
If we go via the non-PERLING route, we never create or alter any Perl-related stuff, we just parse each byte and possibly throw an error. This makes validation faster. */ #ifdef PERLING /* We are creating Perl structures from the JSON. */ #define PREFIX(x) x #define SVPTR SV * #define SETVALUE value = #elif defined(TOKENING) /* We are just tokenizing the JSON. */ #define PREFIX(x) tokenize_ ## x #define SVPTR json_token_t * #define SETVALUE value = #else /* not def PERLING/TOKENING */ /* Turn off everything to do with creating Perl things. */ #define PREFIX(x) valid_ ## x #define SVPTR void #define SETVALUE #endif /* def PERLING */ /* This is what INT_MAX_DIGITS is, but #defining it like this causes huge amounts of unnecessary calculation, so this is commented out. #define INT_MAX_DIGITS ((int) (log (INT_MAX) / log (10)) - 1) */ /* The maximum digits we allow an integer before throwing in the towel and returning a Perl string type. */ #define INT_MAX_DIGITS 8 #define USEDIGIT guess = guess * 10 + (c - '0') static INLINE SVPTR PREFIX (number) (json_parse_t * parser) { /* End marker for strtod. */ char * end; /* Start marker for strtod. */ char * start; /* A guess for integer numbers. */ int guess; /* The parsed character itself, the cause of our motion. */ unsigned char c; /* If it has exp or dot in it. */ double d; /* Negative number. */ int minus; /* When this is called, it means that a byte indicating a number was found. We need to re-examine that byte as a number. 
*/ parser->end--; start = (char *) parser->end; #define FAILNUMBER(err) \ if (STRINGEND && \ parser->top_level_value && \ c == '\0') { \ goto exp_number_end; \ } \ parser->bad_byte = parser->end - 1; \ parser->error = json_error_ ## err; \ parser->bad_type = json_number; \ parser->bad_beginning = \ (unsigned char*) start; \ failbadinput (parser) #define NUMBEREND \ WHITESPACE: \ case ']': \ case '}': \ case ',' #define XNUMBEREND (XCOMMA|XWHITESPACE|parser->end_expected) guess = 0; minus = 0; switch (NEXTBYTE) { case DIGIT19: guess = c - '0'; goto leading_digit19; case '0': goto leading_zero; case '-': minus = 1; goto leading_minus; default: parser->expected = XDIGIT | XMINUS; FAILNUMBER (unexpected_character); } leading_digit19: switch (NEXTBYTE) { case DIGIT: USEDIGIT; goto leading_digit19; case '.': goto dot; case 'e': case 'E': goto exp; case NUMBEREND: goto int_number_end; default: parser->expected = XDIGIT | XDOT | XEXPONENTIAL | XNUMBEREND; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } leading_zero: switch (NEXTBYTE) { case '.': /* "0." */ goto dot; case 'e': case 'E': /* "0e" */ goto exp; case NUMBEREND: /* "0" */ goto int_number_end; default: parser->expected = XDOT | XEXPONENTIAL | XNUMBEREND; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } leading_minus: switch (NEXTBYTE) { case DIGIT19: USEDIGIT; goto leading_digit19; case '0': goto leading_zero; default: parser->expected = XDIGIT; FAILNUMBER (unexpected_character); } /* Things like "5." are not allowed so there is no NUMBEREND here. */ dot: switch (NEXTBYTE) { case DIGIT: goto dot_digits; default: parser->expected = XDIGIT; FAILNUMBER (unexpected_character); } /* We have as much as 5.5 so we can stop. 
*/ dot_digits: switch (NEXTBYTE) { case DIGIT: goto dot_digits; case 'e': case 'E': goto exp; case NUMBEREND: goto exp_number_end; default: parser->expected = XDIGIT | XNUMBEREND | XEXPONENTIAL; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } /* Things like "10E" are not allowed so there is no NUMBEREND here. */ exp: switch (NEXTBYTE) { case '-': case '+': goto exp_sign; case DIGIT: goto exp_digits; default: parser->expected = XDIGIT | XMINUS | XPLUS; FAILNUMBER (unexpected_character); } exp_sign: switch (NEXTBYTE) { case DIGIT: goto exp_digits; default: parser->expected = XDIGIT; FAILNUMBER (unexpected_character); } /* We have as much as "3.0e1" or similar. */ exp_digits: switch (NEXTBYTE) { case DIGIT: goto exp_digits; case NUMBEREND: goto exp_number_end; default: parser->expected = XDIGIT | XNUMBEREND; if (parser->top_level_value) { parser->expected &= ~XCOMMA; } FAILNUMBER (unexpected_character); } exp_number_end: parser->end--; d = strtod (start, & end); if ((unsigned char *) end == parser->end) { /* Success, strtod worked as planned. */ #ifdef PERLING return newSVnv (d); #elif defined (TOKENING) return json_token_new (parser, (unsigned char *) start, parser->end, json_token_number); #else return; #endif } else { /* Failure, strtod rejected the number. */ goto string_number_end; } int_number_end: parser->end--; if (parser->end - (unsigned char *) start < INT_MAX_DIGITS + minus) { if (minus) { guess = -guess; } /* printf ("number debug: '%.*s': %d\n", parser->end - (unsigned char *) start, start, guess); */ #ifdef PERLING return newSViv (guess); #elif defined (TOKENING) return json_token_new (parser, (unsigned char *) start, parser->end - 1, json_token_number); #else return; #endif } else { goto string_number_end; } string_number_end: /* We could not convert this number using a number conversion routine, so we are going to convert it to a string. This might happen with ridiculously long numbers or something. 
The JSON standard doesn't explicitly disallow integers with a million digits. */ #ifdef PERLING return newSVpv (start, (STRLEN) ((char *) parser->end - start)); #elif defined (TOKENING) return json_token_new (parser, (unsigned char *) start, parser->end - 1, json_token_number); #else return; #endif } #ifdef PERLING /* This copies our on-stack buffer "buffer" of size "size" into the end of a Perl SV called "string". */ #define COPYBUFFER { \ if (! string) { \ string = newSVpvn ((char *) buffer, size); \ } \ else { \ char * svbuf; \ STRLEN cur = SvCUR (string); \ if (SvLEN (string) <= cur + size) { \ SvGROW (string, cur + size); \ } \ svbuf = SvPVX (string); \ memcpy (svbuf + cur, buffer, size); \ SvCUR_set (string, cur + size); \ } \ } /* The size of the on-stack buffer. */ #define BUFSIZE 0x1000 /* We need a safety margin when dealing with the buffer, for example if we hit a Unicode \uabcd escape which needs to be decoded, we need to have enough bytes to write into the buffer. */ #define MARGIN 0x10 /* Speedup hack, a special "get_string" for Perl parsing which doesn't use parser->buffer but its own buffer on the stack. */ static INLINE SV * perl_get_string (json_parse_t * parser, STRLEN prefixlen) { unsigned char * b; unsigned char c; unsigned char * start; unsigned char buffer[BUFSIZE]; STRLEN size; SV * string; string = 0; start = parser->end; b = buffer; if (prefixlen > 0) { /* The string from parser->end to parser->end + prefixlen has already been checked and found not to contain the end of the string or any escapes, so we just copy the memory straight into the buffer. This was supposed to speed things up, but it didn't seem to. However this presumably cannot hurt either. */ if (prefixlen > BUFSIZE - MARGIN) { /* This is to account for the very unlikely case that the key of the JSON object is more than BUFSIZE - MARGIN bytes long and has an escape after more than BUFSIZE - MARGIN bytes. 
*/ prefixlen = BUFSIZE - MARGIN; } memcpy (buffer, parser->end, prefixlen); start += prefixlen; } string_start: size = b - buffer; if (size >= BUFSIZE - MARGIN) { /* Spot-check for an overflow. */ if (STRINGEND) { STRINGFAIL (unexpected_end_of_input); } /* "string_start" is a label for a goto which is applied until we get to the end of the string, so size keeps getting larger and larger. Now the string being parsed has proved to be too big for our puny BUFSIZE buffer, so we copy the contents of the buffer into the nice Perl scalar. */ COPYBUFFER; /* Set the point of copying bytes back to the beginning of buffer. We don't reset the memory in buffer. */ b = buffer; } NEXTBYTE; /* "if" statements seem to compile to something marginally faster than "switch" statements, for some reason. */ if (c < 0x20) { ILLEGALBYTE; } else if (c >= 0x20 && c <= 0x80) { /* For some reason or another, putting the following "if" statements after the above one results in about 4% faster code than putting them before it. */ if (c == '"') { goto string_end; } if (c == '\\') { HANDLE_ESCAPES (parser->end, start - 1); goto string_start; } * b++ = c; goto string_start; } else { /* Resort to switch statements for the UTF-8 stuff. This actually also contains statements to handle ASCII but they will never be executed. */ switch (c) { #define ADDBYTE * b = c; b++ #define startofutf8string start #include "utf8-byte-one.c" default: /* We have to give up, this byte is too mysterious for our weak minds. */ ILLEGALBYTE; } } string_end: if (STRINGEND) { STRINGFAIL (unexpected_end_of_input); } COPYBUFFER; return string; /* The rest of the UTF-8 stuff goes in here. 
*/ #include "utf8-next-byte.c" #undef ADDBYTE goto string_end; } #endif /* PERLING */ static SVPTR PREFIX (string) (json_parse_t * parser) { unsigned char c; #ifdef PERLING SV * string; STRLEN len; STRLEN prefixlen; #elif defined (TOKENING) json_token_t * string; int len; #else int len; #endif unsigned char * start; start = parser->end; len = 0; /* First of all, we examine the string to work out how long it is and to look for escapes. If we find them, we go to "contains_escapes" and go back and do all the hard work of converting the escapes into the right things. If we don't find any escapes, we just use "start" and "len" and copy the string from inside "input". This is a trick to increase the speed of processing. */ string_start: switch (NEXTBYTE) { case '"': goto string_end; case '\\': goto contains_escapes; #define ADDBYTE len++ #include "utf8-byte-one.c" /* Not a fall through. */ case BADBYTES: ILLEGALBYTE; } /* Parsing of the string ended due to a \0 byte flipping the "while" switch and we dropped into this section before reaching the string's end. */ ILLEGALBYTE; #include "utf8-next-byte.c" #undef ADDBYTE string_end: #ifdef PERLING /* Our string didn't contain any escape sequences, so we can just make a new SV * by copying the string from "start", the old position within the thing we're parsing to start + len. */ string = newSVpvn ((char *) start, len); #elif defined (TOKENING) string = json_token_new (parser, start - 1, start + len, json_token_string); #endif goto string_done; contains_escapes: #ifdef PERLING /* Use "perl_get_string" which keeps the buffer on the stack. Results in a minor speed increase. */ parser->end = start; prefixlen = (STRLEN) (parser->end - start); string = perl_get_string (parser, prefixlen); #elif defined (TOKENING) /* Don't use "len" here since it subtracts the escapes. */ parser->end = start; len = get_string (parser); string = json_token_new (parser, /* Location of first quote. */ start - 1, /* Location of last quote. 
*/ parser->end, json_token_string); #else parser->end = start; len = get_string (parser); #endif string_done: #ifdef PERLING if (parser->unicode || parser->force_unicode) { SvUTF8_on (string); parser->force_unicode = 0; } #endif #if defined (PERLING) || defined (TOKENING) return string; #else return; #endif } #define FAILLITERAL(c) \ parser->expected = XIN_LITERAL; \ parser->literal_char = c; \ parser->bad_beginning = start; \ parser->error = json_error_unexpected_character; \ parser->bad_type = json_literal; \ parser->bad_byte = parser->end - 1; \ failbadinput (parser) static SVPTR PREFIX (literal_true) (json_parse_t * parser) { unsigned char * start; start = parser->end - 1; if (* parser->end++ == 'r') { if (* parser->end++ == 'u') { if (* parser->end++ == 'e') { #ifdef PERLING if (parser->user_true) { return newSVsv (parser->user_true); } else if (parser->copy_literals) { return newSVsv (&PL_sv_yes); } else { return &PL_sv_yes; } #elif defined (TOKENING) return json_token_new (parser, start, parser->end - 1, json_token_literal); #else return; #endif } FAILLITERAL ('e'); } FAILLITERAL ('u'); } FAILLITERAL ('r'); } static SVPTR PREFIX (literal_false) (json_parse_t * parser) { unsigned char * start; start = parser->end - 1; if (* parser->end++ == 'a') { if (* parser->end++ == 'l') { if (* parser->end++ == 's') { if (* parser->end++ == 'e') { #ifdef PERLING if (parser->user_false) { return newSVsv (parser->user_false); } else if (parser->copy_literals) { return newSVsv (&PL_sv_no); } else { return &PL_sv_no; } #elif defined (TOKENING) return json_token_new (parser, start, parser->end - 1, json_token_literal); #else return; #endif } FAILLITERAL ('e'); } FAILLITERAL ('s'); } FAILLITERAL ('l'); } FAILLITERAL ('a'); } static SVPTR PREFIX (literal_null) (json_parse_t * parser) { unsigned char * start; start = parser->end - 1; if (* parser->end++ == 'u') { if (* parser->end++ == 'l') { if (* parser->end++ == 'l') { #ifdef PERLING if (parser->user_null) { return newSVsv 
(parser->user_null); } else if (parser->copy_literals) { return newSVsv (&PL_sv_undef); } else { SvREFCNT_inc (json_null); return json_null; } #elif defined (TOKENING) return json_token_new (parser, start, parser-> end - 1, json_token_literal); #else return; #endif } FAILLITERAL ('l'); } FAILLITERAL ('l'); } FAILLITERAL ('u'); } static SVPTR PREFIX (object) (json_parse_t * parser); /* Given one character, decide what to do next. This goes in the switch statement in both "object ()" and "array ()". */ #define PARSE(start,expected) \ \ case WHITESPACE: \ goto start; \ \ case '"': \ SETVALUE PREFIX (string) (parser); \ break; \ \ case '-': \ case DIGIT: \ parser->end_expected = expected; \ SETVALUE PREFIX (number) (parser); \ break; \ \ case '{': \ SETVALUE PREFIX (object) (parser); \ break; \ \ case '[': \ SETVALUE PREFIX (array) (parser); \ break; \ \ case 'f': \ SETVALUE PREFIX (literal_false) (parser); \ break; \ \ case 'n': \ SETVALUE PREFIX (literal_null) (parser); \ break; \ \ case 't': \ SETVALUE PREFIX (literal_true) (parser); \ break #define FAILARRAY(err) \ parser->bad_byte = parser->end - 1; \ parser->bad_type = json_array; \ parser->bad_beginning = start; \ parser->error = json_error_ ## err; \ failbadinput (parser) /* We have seen "[", so now deal with the contents of an array. At the end of this routine, "parser->end" is pointing one beyond the final "]" of the array. 
*/ static SVPTR PREFIX (array) (json_parse_t * parser) { unsigned char c; unsigned char * start; #ifdef PERLING AV * av; SV * value = & PL_sv_undef; #elif defined (TOKENING) json_token_t * av; json_token_t * prev; json_token_t * value; #endif start = parser->end - 1; #ifdef PERLING av = newAV (); #elif defined (TOKENING) av = json_token_new (parser, start, 0, json_token_array); prev = 0; #endif array_start: switch (NEXTBYTE) { PARSE (array_start, XARRAY_END); case ']': goto array_end; default: parser->expected = VALUE_START | XWHITESPACE | XARRAY_END; FAILARRAY (unexpected_character); } #ifdef PERLING av_push (av, value); #elif defined (TOKENING) prev = json_token_set_child (parser, av, value); #endif /* Accept either a comma or whitespace or the end of the array. */ array_middle: switch (NEXTBYTE) { case WHITESPACE: goto array_middle; case ',': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, parser->end - 1, json_token_comma); prev = json_token_set_next (prev, value); #endif goto array_next; case ']': /* Array with at least one element. */ goto array_end; default: parser->expected = XWHITESPACE | XCOMMA | XARRAY_END; FAILARRAY (unexpected_character); } array_next: switch (NEXTBYTE) { PARSE (array_next, XARRAY_END); default: parser->expected = VALUE_START | XWHITESPACE; FAILARRAY (unexpected_character); } #ifdef PERLING av_push (av, value); #elif defined (TOKENING) prev = json_token_set_next (prev, value); #endif goto array_middle; array_end: #ifdef PERLING return newRV_noinc ((SV *) av); #elif defined (TOKENING) /* We didn't know where the end was until now. */ json_token_set_end (parser, av, parser->end - 1); return av; #else return; #endif } #define FAILOBJECT(err) \ parser->bad_byte = parser->end - 1; \ parser->bad_type = json_object; \ parser->bad_beginning = start; \ parser->error = json_error_ ## err; \ failbadinput (parser) /* We have seen "{", so now deal with the contents of an object. 
At the end of this routine, "parser->end" is pointing one beyond the final "}" of the object. */ static SVPTR PREFIX (object) (json_parse_t * parser) { char c; #ifdef PERLING HV * hv; SV * value; /* This is set to -1 if we want a Unicode key. See "perldoc perlapi" under "hv_store". */ int uniflag; #elif defined (TOKENING) json_token_t * hv; json_token_t * value; json_token_t * prev; #endif string_t key; /* Start of parsing. */ unsigned char * start; start = parser->end - 1; #ifdef PERLING if (parser->unicode) { /* Keys are unicode. */ uniflag = -1; } else { /* Keys are not unicode. */ uniflag = 1; } hv = newHV (); #elif defined (TOKENING) hv = json_token_new (parser, start, 0, json_token_object); prev = 0; #endif hash_start: switch (NEXTBYTE) { case WHITESPACE: goto hash_start; case '}': goto hash_end; case '"': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, 0, json_token_string); /* We only come past the label "hash_start" once, so we don't need to check that there is not already a child. */ json_token_set_child (parser, hv, value); prev = value; #endif get_key_string (parser, & key); #ifdef TOKENING /* We didn't know where the end of the string was until now so we wait until after "get_key_string" to set the end. */ json_token_set_end (parser, value, parser->end - 1); #endif goto hash_next; default: parser->expected = XWHITESPACE | XSTRING_START | XOBJECT_END; FAILOBJECT (unexpected_character); } hash_middle: /* We are in the middle of a hash. We have seen a key:value pair, and now we're looking for either a comma and then another key-value pair, or a closing curly brace and the end of the hash. 
*/ switch (NEXTBYTE) { case WHITESPACE: goto hash_middle; case '}': goto hash_end; case ',': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, parser->end - 1, json_token_comma); prev = json_token_set_next (prev, value); #endif goto hash_key; default: parser->expected = XWHITESPACE | XCOMMA | XOBJECT_END; FAILOBJECT (unexpected_character); } hash_key: /* We're looking for a key in the hash, which is a string starting with a double quotation mark. */ switch (NEXTBYTE) { case WHITESPACE: goto hash_key; case '"': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, 0, json_token_string); prev = json_token_set_next (prev, value); #endif get_key_string (parser, & key); #ifdef TOKENING /* We didn't know where the end of the string was until now so we wait until after "get_key_string" to set the end. */ json_token_set_end (parser, value, parser->end - 1); #endif goto hash_next; default: parser->expected = XWHITESPACE | XSTRING_START; FAILOBJECT (unexpected_character); } hash_next: /* We've seen a key, now we're looking for a colon. */ switch (NEXTBYTE) { case WHITESPACE: goto hash_next; case ':': #ifdef TOKENING value = json_token_new (parser, parser->end - 1, parser->end - 1, json_token_colon); prev = json_token_set_next (prev, value); #endif goto hash_value; default: parser->expected = XWHITESPACE | XVALUE_SEPARATOR; FAILOBJECT (unexpected_character); } hash_value: /* We've seen a colon, now we're looking for a value, which can be anything at all, including another hash. Most of the cases are dealt with in the PARSE macro. */ switch (NEXTBYTE) { PARSE (hash_value, XOBJECT_END); default: parser->expected = XWHITESPACE | VALUE_START; FAILOBJECT (unexpected_character); } if (key.contains_escapes) { /* The key had something like "\n" in it, so we can't just copy the value but have to process it to remove the escapes. 
*/ int klen; klen = resolve_string (parser, & key); #ifdef PERLING key.start = parser->buffer; key.length = klen; #endif } #ifdef PERLING if (parser->detect_collisions) { /* Look in hv for an existing key with our values. */ SV ** sv_ptr; sv_ptr = hv_fetch (hv, (char *) key.start, key.length * uniflag, 0); if (sv_ptr) { parser->bad_byte = key.start; parser->bad_length = key.length; parser->bad_type = json_object; parser->bad_beginning = start; parser->error = json_error_name_is_not_unique; failbadinput (parser); } } (void) hv_store (hv, (char *) key.start, key.length * uniflag, value, 0); #endif #if defined(TOKENING) prev = json_token_set_next (prev, value); #endif goto hash_middle; hash_end: #ifdef PERLING return newRV_noinc ((SV *) hv); #elif defined (TOKENING) json_token_set_end (parser, hv, parser->end - 1); return hv; #else return; #endif } #undef PREFIX #undef SVPTR #undef SETVALUE #ifdef PERLING /* Set and delete user-defined literals. */ static void json_parse_delete_true (json_parse_t * parser) { if (parser->user_true) { SvREFCNT_dec (parser->user_true); parser->user_true = 0; } } static void json_parse_set_true (json_parse_t * parser, SV * user_true) { json_parse_delete_true (parser); if (! SvTRUE (user_true) && ! parser->no_warn_literals) { warn ("User-defined value for JSON true evaluates as false"); } if (parser->copy_literals && ! parser->no_warn_literals) { warn ("User-defined value overrules copy_literals"); } parser->user_true = user_true; SvREFCNT_inc (user_true); } static void json_parse_delete_false (json_parse_t * parser) { if (parser->user_false) { SvREFCNT_dec (parser->user_false); parser->user_false = 0; } } static void json_parse_set_false (json_parse_t * parser, SV * user_false) { json_parse_delete_false (parser); if (SvTRUE (user_false) && ! parser->no_warn_literals) { warn ("User-defined value for JSON false evaluates as true"); } if (parser->copy_literals && ! 
parser->no_warn_literals) { warn ("User-defined value overrules copy_literals"); } parser->user_false = user_false; SvREFCNT_inc (user_false); } static void json_parse_delete_null (json_parse_t * parser) { if (parser->user_null) { SvREFCNT_dec (parser->user_null); parser->user_null = 0; } } static void json_parse_set_null (json_parse_t * parser, SV * user_null) { if (parser->copy_literals && ! parser->no_warn_literals) { warn ("User-defined value overrules copy_literals"); } json_parse_delete_null (parser); parser->user_null = user_null; SvREFCNT_inc (user_null); } static void json_parse_free (json_parse_t * parser) { json_parse_delete_true (parser); json_parse_delete_false (parser); json_parse_delete_null (parser); Safefree (parser); } static void json_parse_copy_literals (json_parse_t * parser, SV * onoff) { if (! parser->no_warn_literals && (parser->user_true || parser->user_false || parser->user_null)) { warn ("User-defined value overrules copy_literals"); } parser->copy_literals = SvTRUE (onoff) ? 1 : 0; } #endif /* def PERLING */ JSON-Parse-0.56/Json3.xs000644 001751 001751 00000011375 13171627670 014232 0ustar00benben000000 000000 #include "EXTERN.h" #include "perl.h" #include #include "XSUB.h" /* TESTRANDOM should never be defined in the code released to CPAN. That this is not defined is tested in "xt/testrandom-invalid.t". */ //#define TESTRANDOM #ifdef TESTRANDOM #include #endif /* def TESTRANDOM */ /* A structure representing the "null" in JSON. Although we're now using PL_sv_yes and PL_sv_no, we don't use PL_sv_undef, because perldoc perlguts says it's a bad idea. */ static SV * json_null; /* Code starts here. 
*/ #include "unicode.h" #include "unicode.c" #include "json-common.c" #define PERLING #include "json-perl.c" #undef PERLING #define TOKENING #include "json-perl.c" #undef TOKENING #include "json-perl.c" #include "json-entry-points.c" #ifdef TESTRANDOM #include "json-random-test.c" #endif /* def TESTRANDOM */ #include "json-whitespace.c" #ifdef NOPERL #error "Cannot define NOPERL error when compiling Perl version" #endif /* def NOPERL */ typedef json_parse_t * JSON__Parse; typedef json_token_t * JSON__Tokenize; MODULE=JSON::Parse PACKAGE=JSON::Parse PROTOTYPES: DISABLE BOOT: { json_null = get_sv ("JSON::Parse::null", GV_ADD); SvREADONLY_on (json_null); } SV * parse_json (json) SV * json; CODE: RETVAL = parse (json); OUTPUT: RETVAL SV * parse_json_safer (json) SV * json; CODE: RETVAL = parse_safe (json); OUTPUT: RETVAL void assert_valid_json (json) SV * json; CODE: validate (json, 0); JSON::Parse new (char * class, ...) CODE: if (! class) { croak ("no class"); } Newxz (RETVAL, 1, json_parse_t); OUTPUT: RETVAL SV * run_internal (parser, json) JSON::Parse parser SV * json CODE: RETVAL = json_parse_run (parser, json); OUTPUT: RETVAL void check (parser, json) JSON::Parse parser SV * json CODE: check (parser, json); void DESTROY (parser) JSON::Parse parser; CODE: json_parse_free (parser); void set_true (parser, user_true) JSON::Parse parser; SV * user_true; CODE: json_parse_set_true (parser, user_true); void set_false (parser, user_false) JSON::Parse parser; SV * user_false; CODE: json_parse_set_false (parser, user_false); void set_null (parser, user_null) JSON::Parse parser; SV * user_null; CODE: json_parse_set_null (parser, user_null); void delete_true (parser) JSON::Parse parser; CODE: json_parse_delete_true (parser); void delete_false (parser) JSON::Parse parser; CODE: json_parse_delete_false (parser); void delete_null (parser) JSON::Parse parser; CODE: json_parse_delete_null (parser); void copy_literals (parser, onoff) JSON::Parse parser; SV * onoff; CODE: 
json_parse_copy_literals (parser, onoff); void no_warn_literals (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->no_warn_literals = SvTRUE (onoff) ? 1 : 0; void diagnostics_hash (parser, onoff) JSON::Parse parser; SV * onoff; CODE: #if PERL_VERSION > 12 parser->diagnostics_hash = SvTRUE (onoff) ? 1 : 0; #else warn ("diagnostics_hash () requires Perl 5.14 or later; this is 5.%d", PERL_VERSION); #endif void warn_only (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->warn_only = SvTRUE (onoff) ? 1 : 0; int get_warn_only (parser) JSON::Parse parser; CODE: if (parser->warn_only) { RETVAL = 1; } else { RETVAL = 0; } OUTPUT: RETVAL void detect_collisions (parser, onoff) JSON::Parse parser; SV * onoff; CODE: parser->detect_collisions = SvTRUE (onoff) ? 1 : 0; #ifdef TESTRANDOM int random_json () CODE: RETVAL = random_json (); OUTPUT: RETVAL #endif /* def TESTRANDOM */ MODULE=JSON::Parse PACKAGE=JSON::Tokenize JSON::Tokenize tokenize_json (json) SV * json; CODE: RETVAL = tokenize (json); RETVAL->blessed = 1; OUTPUT: RETVAL JSON::Tokenize tokenize_child (token) JSON::Tokenize token CODE: if (token->child) { RETVAL = token->child; RETVAL->blessed = 1; } OUTPUT: RETVAL JSON::Tokenize tokenize_next (token) JSON::Tokenize token CODE: if (token->next) { RETVAL = token->next; RETVAL->blessed = 1; } else { RETVAL = 0; } OUTPUT: RETVAL int tokenize_start (token) JSON::Tokenize token CODE: RETVAL = token->start; OUTPUT: RETVAL int tokenize_end (token) JSON::Tokenize token CODE: RETVAL = token->end; OUTPUT: RETVAL SV * tokenize_type (token) JSON::Tokenize token CODE: /* Only set this to the real value if everything is OK. 
*/ RETVAL = & PL_sv_undef; if (token->type > json_token_invalid && token->type < n_json_tokens) { RETVAL = newSVpv (token_names[token->type], 0); } else { warn ("Invalid JSON token type %d", token->type); } OUTPUT: RETVAL void DESTROY (token) JSON::Tokenize token CODE: tokenize_free (token); MODULE=JSON::Parse PACKAGE=JSON::Whitespace SV * strip_whitespace (tokens, json) JSON::Tokenize tokens; SV * json; CODE: RETVAL = strip_whitespace (tokens, json); OUTPUT: RETVAL JSON-Parse-0.56/script/000755 001751 001751 00000000000 13622411134 014142 5ustar00benben000000 000000 JSON-Parse-0.56/json-common.c000644 001751 001751 00000105137 13423036410 015247 0ustar00benben000000 000000 /* These things are common between the validation and the parsing routines. This is #included into "Json3.xs". */ /* The following matches bytes which are not allowed in JSON strings. "All Unicode characters may be placed within the quotation marks except for the characters that must be escaped: quotation mark, reverse solidus, and the control characters (U+0000 through U+001F)." - from section 2.5 of RFC 4627 */ #define BADBYTES \ '\0':case 0x01:case 0x02:case 0x03: \ case 0x04:case 0x05:case 0x06:case 0x07: \ case 0x08:case 0x09:case 0x0A:case 0x0B: \ case 0x0C:case 0x0D:case 0x0E:case 0x0F: \ case 0x10:case 0x11:case 0x12:case 0x13: \ case 0x14:case 0x15:case 0x16:case 0x17: \ case 0x18:case 0x19:case 0x1A:case 0x1B: \ case 0x1C:case 0x1D:case 0x1E:case 0x1F /* Match whitespace. Whitespace is as defined by the JSON standard, not by Perl. "Insignificant whitespace is allowed before or after any of the six structural characters. ws = *( %x20 / ; Space %x09 / ; Horizontal tab %x0A / ; Line feed or New line %x0D ; Carriage return )" From JSON RFC. */ #define WHITESPACE \ '\n': \ parser->line++; \ /* Fallthrough. */ \ case ' ': \ case '\t': \ case '\r' /* Match digits. 
*/ #define DIGIT \ '0': \ case '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9' /* Match digits from 1-9. This is handled differently because JSON disallows leading zeros in numbers. */ #define DIGIT19 \ '1': \ case '2': \ case '3': \ case '4': \ case '5': \ case '6': \ case '7': \ case '8': \ case '9' /* Hexadecimal, in upper and lower case. */ #define UHEX 'A': case 'B': case 'C': case 'D': case 'E': case 'F' #define LHEX 'a': case 'b': case 'c': case 'd': case 'e': case 'f' /* As of version 0.45 of JSON::Parse, most of the UTF-8 switches are now in "unicode.c", but the following one is JSON-specific. */ /* This excludes '"' and '\'. */ #define BYTE_20_7F \ 0x20: case 0x21:\ case 0x23: case 0x24: case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:\ case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: case 0x2F: case 0x30:\ case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:\ case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E:\ case 0x3F: case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45:\ case 0x46: case 0x47: case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:\ case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: case 0x52: case 0x53:\ case 0x54: case 0x55: case 0x56: case 0x57: case 0x58: case 0x59: case 0x5A:\ case 0x5B: case 0x5D: case 0x5E: case 0x5F: case 0x60: case 0x61:\ case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: case 0x68:\ case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:\ case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76:\ case 0x77: case 0x78: case 0x79: case 0x7A: case 0x7B: case 0x7C: case 0x7D:\ case 0x7E: case 0x7F /* A "string_t" is a pointer into the input, which lives in "parser->input". The "string_t" structure is used for copying strings when the string does not contain any escapes. 
When a string contains escapes, it is copied into "parser->buffer". */ typedef struct string { unsigned char * start; #ifdef NOPERL int length; #else /* def NOPERL */ STRLEN length; #endif /* def NOPERL */ /* The "contains_escapes" flag is set if there are backslash escapes in the string like "\r", so that it needs to be cleaned up before using it. That means we use "parser->buffer". This is to speed things up, by not doing the cleanup when it isn't necessary. */ unsigned contains_escapes : 1; } string_t; typedef enum { json_invalid, json_initial_state, json_string, json_number, json_literal, json_object, json_array, json_unicode_escape, json_overflow } json_type_t; const char * type_names[json_overflow] = { "invalid", "initial state", "string", "number", "literal", "object", "array", "unicode escape" }; /* The maximum value of bytes to check for. */ #define JSON3MAXBYTE 0x100 // uncomment this when running random test to terminal otherwise the // random characters will mess up the terminal. //#define JSON3MAXBYTE 0x80 #include "errors.c" /* Anything which could be the start of a value. */ #define VALUE_START (XARRAYOBJECTSTART|XSTRING_START|XDIGIT|XMINUS|XLITERAL) typedef struct parser { /* The length of "input". */ unsigned int length; /* The input. This is fixed at the beginning throughout parsing. */ unsigned char * input; /* The end-point of the parsing, the last parsed thing. This increments through "input". */ unsigned char * end; /* The last byte of "input", "parser->input + parser->length". This is used to detect overflows. */ unsigned char * last_byte; /* Allocated size of "buffer". */ int buffer_size; /* Buffer to stick strings into temporarily. */ unsigned char * buffer; /* Line number. */ int line; /* Where the beginning of the series of unfortunate events was. For example if we are parsing an array, this points to the "[" at the start of the array, or if we are parsing a string, this points to the byte after the '"' at the start of the string. 
*/ unsigned char * bad_beginning; /* The bad type itself. */ json_type_t bad_type; /* What we were expecting to see when the error occurred. */ int expected; /* The byte which caused the parser to fail. */ unsigned char * bad_byte; unsigned bad_length; /* The type of error encountered. */ json_error_t error; /* If we were parsing a literal and found a bad character, what were we expecting? */ unsigned char literal_char; /* The end expected. */ int end_expected; /* Number of mallocs. */ int n_mallocs; /* Bytes we accept. */ int valid_bytes[JSON3MAXBYTE]; /* Perl SV * pointers to copy for our true, false, and null values. */ void * user_true; void * user_false; void * user_null; /* If this is 1, we copy the literals into new SVs. */ unsigned int copy_literals : 1; /* If this is 1, we don't die on errors. */ unsigned int warn_only : 1; /* If this is 1, we check for hash collisions before inserting values. */ unsigned int detect_collisions : 1; /* Don't warn the user about non-false false and untrue true values, etc. */ unsigned int no_warn_literals : 1; /* Are we tokenizing the input? */ unsigned int tokenizing : 1; #ifdef TESTRANDOM /* Return point for longjmp. */ jmp_buf biscuit; char * last_error; #endif /* Unicode? */ unsigned int unicode : 1; /* Force unicode. This happens when we hit "\uxyzy". */ unsigned int force_unicode : 1; /* Top-level value? We need to know this for the case when we are parsing a number and suddenly meet a '\0' byte. If it's a top level value then we can assume that is just the end of the JSON, but if it's not a top-level value then that is an error, since the end array or end object at least are missing. */ unsigned int top_level_value : 1; /* Produce diagnostics as a hash rather than a string. */ unsigned int diagnostics_hash : 1; #ifdef TESTRANDOM /* This is true if we are testing with random bytes. 
*/ unsigned randomtest : 1; #endif /* def TESTRANDOM */ } json_parse_t; #ifndef NOPERL static SV * error_to_hash (json_parse_t * parser, char * error_as_string); #endif /* ndef NOPERL */ #ifdef __GNUC__ #define INLINE inline #else #define INLINE #endif /* def __GNUC__ */ /* The size of the buffer for printing errors. */ #define ERRORMSGBUFFERSIZE 0x1000 /* Declare all bad inputs as non-returning. */ #ifdef __GNUC__ #if 0 static void failbadinput_json (json_parse_t * parser) __attribute__ ((noreturn)); #endif /* 0 */ static void failbadinput (json_parse_t * parser) __attribute__ ((noreturn)); static INLINE void failbug (char * file, int line, json_parse_t * parser, const char * format, ...) __attribute__ ((noreturn)); #endif /* Assert failure handler. Coming here means there is a bug in the code rather than in the JSON input. We still send it to Perl via "croak". */ static INLINE void failbug (char * file, int line, json_parse_t * parser, const char * format, ...) { char buffer[ERRORMSGBUFFERSIZE]; va_list a; va_start (a, format); vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a); va_end (a); croak ("JSON::Parse: %s:%d: Internal error at line %d: %s", file, line, parser->line, buffer); } /* This is a test for whether the string has ended, which we use when we catch a zero byte in an unexpected part of the input. Here we use ">" rather than ">=" because "parser->end" is incremented by one after each access. See the NEXTBYTE macro. */ #define STRINGEND (parser->end > parser->last_byte) /* One of the types which demands a specific next byte. */ #define SPECIFIC(c) (((c) & XIN_LITERAL) || ((c) & XIN_SURROGATE_PAIR)) /* Make the list of valid bytes. 
*/ static void make_valid_bytes (json_parse_t * parser) { int i; for (i = 0; i < JSON3MAXBYTE; i++) { parser->valid_bytes[i] = 0; } for (i = 0; i < n_expectations; i++) { int X; X = 1<expected & X) { int j; for (j = 0; j < JSON3MAXBYTE; j++) { parser->valid_bytes[j] |= allowed[i][j]; } } } if (SPECIFIC (parser->expected)) { parser->valid_bytes[parser->literal_char] = 1; } } /* Repeated arguments to snprintf. */ #define SNEND buffer + string_end #define SNSIZE ERRORMSGBUFFERSIZE - string_end /* Disabled due to clash with Darwin compiler: http://www.cpantesters.org/cpan/report/7c69e0f0-70c0-11e3-95aa-bcf4d95af652 http://www.cpantesters.org/cpan/report/6cde36da-6fd1-11e3-946f-2b87da5af652 #define SNEND, SNSIZE buffer + string_end, ERRORMSGBUFFERSIZE - string_end */ #define EROVERFLOW \ if (string_end >= ERRORMSGBUFFERSIZE - 0x100) { \ failbug (__FILE__, __LINE__, parser, \ "Error string length is %d" \ " of maximum %d. Bailing out.", \ string_end, ERRORMSGBUFFERSIZE); \ } #if 0 /* Coming in to this routine, we have checked the error for validity and converted at failbadinput. If this is called directly the bug traps won't work. 
*/ static void failbadinput_json (json_parse_t * parser) { char buffer[ERRORMSGBUFFERSIZE]; int string_end; string_end = 0; string_end += snprintf (SNEND, SNSIZE, "{" "\"input length\":%d" ",\"bad type\":\"%s\"" ",\"error\":\"%s\"", parser->length, type_names[parser->bad_type], json_errors[parser->error]); EROVERFLOW; if (parser->bad_byte) { int position; position = (int) (parser->bad_byte - parser->input) + 1, string_end += snprintf (SNEND, SNSIZE, ",\"bad byte position\":%d" ",\"bad byte contents\":%d", position, * parser->bad_byte); EROVERFLOW; } if (parser->bad_beginning) { int bcstart; bcstart = (int) (parser->bad_beginning - parser->input) + 1; string_end += snprintf (SNEND, SNSIZE, ",\"start of broken component\":%d", bcstart); EROVERFLOW; } if (parser->error == json_error_unexpected_character) { int j; make_valid_bytes (parser); string_end += snprintf (SNEND, SNSIZE, ",\"valid bytes\":[%d", parser->valid_bytes[0]); EROVERFLOW; for (j = 1; j < JSON3MAXBYTE; j++) { string_end += snprintf (SNEND, SNSIZE, ",%d", parser->valid_bytes[j]); } EROVERFLOW; string_end += snprintf (SNEND, SNSIZE, "]"); EROVERFLOW; } string_end += snprintf (SNEND, SNSIZE, "}\n"); EROVERFLOW; croak (buffer); } #endif /* 0 */ static void failbadinput (json_parse_t * parser) { char buffer[ERRORMSGBUFFERSIZE]; int string_end; int i; int l; const char * format; /* If the error is "unexpected character", and we are at the end of the input, change to "unexpected end of input". This is probably triggered by reading a byte with value '\0', but we don't check the value of "* parser->bad_byte" in the following "if" statement, since it's an error to go past the expected end of the string regardless of whether the byte is '\0'. */ if (parser->error == json_error_unexpected_character && STRINGEND) { parser->error = json_error_unexpected_end_of_input; /* We don't care about what byte it was, we went past the end of the string, which is already a failure. 
*/ parser->bad_byte = 0; /* It trips an assertion if "parser->expected" is set for anything other than an "unexpected character" error. */ parser->expected = 0; } /* Array bounds check for error message. */ if (parser->error <= json_error_invalid && parser->error >= json_error_overflow) { failbug (__FILE__, __LINE__, parser, "Bad value for parser->error: %d\n", parser->error); } format = json_errors[parser->error]; l = strlen (format); if (l >= ERRORMSGBUFFERSIZE - 1) { l = ERRORMSGBUFFERSIZE - 1; } for (i = 0; i < l; i++) { buffer[i] = format[i]; } buffer[l] = '\0'; string_end = l; /* If we got an unexpected character somewhere, append the exact value of the character to the error message. */ if (parser->error == json_error_unexpected_character) { /* This contains the unexpected character itself, from the "parser->bad_byte" pointer. */ unsigned char bb; /* Make sure that we were told where the unexpected character was. Unlocated unexpected characters are a bug. */ if (! parser->bad_byte) { failbug (__FILE__, __LINE__, parser, "unexpected character error but " "parser->bad_byte is invalid"); } bb = * parser->bad_byte; /* We have to check what kind of character. For example printing '\0' with %c will just give a message which suddenly ends when printed to the terminal, and other control characters will be invisible. So display the character in a different way depending on whether it's printable or not. */ /* Don't use "isprint" because on Windows it seems to think that 0x80 is printable: http://www.cpantesters.org/cpan/report/d6438b68-6bf4-1014-8647-737bdb05e747 */ if (bb >= 0x20 && bb < 0x7F) { /* Printable character, print the character itself. */ string_end += snprintf (SNEND, SNSIZE, " '%c'", bb); EROVERFLOW; } else { /* Unprintable character, print its hexadecimal value. 
*/ string_end += snprintf (SNEND, SNSIZE, " 0x%02x", bb); EROVERFLOW; } } else if (parser->error == json_error_name_is_not_unique) { string_end += snprintf (SNEND, SNSIZE, ": \"%.*s\"", parser->bad_length, parser->bad_byte); } /* "parser->bad_type" contains what was being parsed when the error occurred. This should never be undefined. */ if (parser->bad_type <= json_invalid || parser->bad_type >= json_overflow) { failbug (__FILE__, __LINE__, parser, "parsing type set to invalid value %d in error message", parser->bad_type); } string_end += snprintf (SNEND, SNSIZE, " parsing %s", type_names[parser->bad_type]); EROVERFLOW; if (parser->bad_beginning) { int bad_byte; bad_byte = (parser->bad_beginning - parser->input) + 1; string_end += snprintf (SNEND, SNSIZE, " starting from byte %d", bad_byte); EROVERFLOW; } /* "parser->expected" is set for the "unexpected character" error and it tells the user what kind of input was expected. It contains various flags or'd together, so this goes through each possible flag and prints a message for it. */ if (parser->expected) { if (parser->error == json_error_unexpected_character) { int i; int joined; unsigned char bb; bb = * parser->bad_byte; string_end += snprintf (SNEND, SNSIZE, ": expecting "); EROVERFLOW; joined = 0; if (SPECIFIC (parser->expected)) { if (! parser->literal_char) { failbug (__FILE__, __LINE__, parser, "expected literal character unset"); } string_end += snprintf (SNEND, SNSIZE, "'%c'", parser->literal_char); EROVERFLOW; } for (i = 0; i < n_expectations; i++) { int X; X = 1<expected & X) { /* Check that this really is disallowed. 
*/ if (allowed[i][bb]) { failbug (__FILE__, __LINE__, parser, "mismatch parsing %s: got %X " "but it's allowed by %s (%d)", type_names[parser->bad_type], bb, input_expectation[i], i); } if (joined) { string_end += snprintf (SNEND, SNSIZE, " or "); EROVERFLOW; } string_end += snprintf (SNEND, SNSIZE, "%s", input_expectation[i]); EROVERFLOW; joined = 1; } } } else { failbug (__FILE__, __LINE__, parser, "'expected' is set but error %s != unexp. char", json_errors[parser->error]); } } else if (parser->error == json_error_unexpected_character) { failbug (__FILE__, __LINE__, parser, "unexpected character error for 0X%02X at byte %d " "with no expected value set", * parser->bad_byte, parser->bad_byte - parser->input); } #undef SNEND #undef SNSIZE #ifdef TESTRANDOM /* Go back to where we came from. */ if (parser->randomtest) { parser->last_error = buffer; make_valid_bytes (parser); longjmp (parser->biscuit, 1); } #endif /* def TESTRANDOM */ #ifndef NOPERL if (parser->diagnostics_hash) { #if PERL_VERSION > 12 croak_sv (error_to_hash (parser, buffer)); #endif /* PERL_VERSION > 12 */ } #endif /* ndef NOPERL */ if (parser->length > 0) { if (parser->end - parser->input > parser->length) { croak ("JSON error at line %d: %s", parser->line, buffer); } else if (parser->bad_byte) { croak ("JSON error at line %d, byte %d/%d: %s", parser->line, parser->bad_byte - parser->input + 1, parser->length, buffer); } else { croak ("JSON error at line %d: %s", parser->line, buffer); } } else { croak ("JSON error: %s", buffer); } } #undef SPECIFIC /* This is for failures not due to errors in the input or to bugs but to exhaustion of resources, i.e. out of memory, or file errors would go here if there were any C file opening things anywhere. */ static INLINE void failresources (json_parse_t * parser, const char * format, ...) 
{ char buffer[ERRORMSGBUFFERSIZE]; va_list a; va_start (a, format); vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a); va_end (a); croak ("Parsing failed at line %d, byte %d/%d: %s", parser->line, parser->end - parser->input, parser->length, buffer); } #undef ERRORMSGBUFFERSIZE /* Get more memory for "parser->buffer". */ static void expand_buffer (json_parse_t * parser, int length) { if (parser->buffer_size < 2 * length + 0x100) { parser->buffer_size = 2 * length + 0x100; if (parser->buffer) { Renew (parser->buffer, parser->buffer_size, unsigned char); } else { Newx (parser->buffer, parser->buffer_size, unsigned char); parser->n_mallocs++; } if (! parser->buffer) { failresources (parser, "out of memory"); } } } #define UNIFAIL(err) \ parser->bad_type = json_unicode_escape; \ parser->error = json_error_ ## err; \ failbadinput (parser) /* Parse the hex bit of a \uXYZA escape. */ static INLINE int parse_hex_bytes (json_parse_t * parser, unsigned char * p) { int k; int unicode; unicode = 0; for (k = 0; k < strlen ("ABCD"); k++) { unsigned char c; c = p[k]; switch (c) { case DIGIT: unicode = unicode * 16 + c - '0'; break; case UHEX: unicode = unicode * 16 + c - 'A' + 10; break; case LHEX: unicode = unicode * 16 + c - 'a' + 10; break; case '\0': if (p + k - parser->input >= parser->length) { UNIFAIL (unexpected_end_of_input); } break; default: parser->bad_byte = p + k; parser->expected = XHEXADECIMAL_CHARACTER; UNIFAIL (unexpected_character); } } return unicode; } /* STRINGFAIL applies for any kind of failure within a string, not just unexpected character errors. 
*/ #define STRINGFAIL(err) \ parser->error = json_error_ ## err; \ parser->bad_type = json_string; \ failbadinput (parser) #define FAILSURROGATEPAIR(c) \ parser->expected = XIN_SURROGATE_PAIR; \ parser->literal_char = c; \ parser->bad_beginning = start - 2; \ parser->error = json_error_unexpected_character; \ parser->bad_type = json_unicode_escape; \ parser->bad_byte = p - 1; \ failbadinput (parser) static INLINE unsigned char * do_unicode_escape (json_parse_t * parser, unsigned char * p, unsigned char ** b_ptr) { int unicode; unsigned int plus; unsigned char * start; start = p; unicode = parse_hex_bytes (parser, p); p += 4; plus = ucs2_to_utf8 (unicode, *b_ptr); if (plus == UTF8_BAD_LEADING_BYTE || plus == UTF8_BAD_CONTINUATION_BYTE) { failbug (__FILE__, __LINE__, parser, "Failed to parse unicode input %.4s", start); } else if (plus == UNICODE_SURROGATE_PAIR) { int unicode2; int plus2; if (parser->last_byte - p < 6) { parser->bad_beginning = start - 2; parser->bad_type = json_unicode_escape; parser->error = json_error_unexpected_end_of_input; failbadinput (parser); } if (*p++ == '\\') { if (*p++ == 'u') { unicode2 = parse_hex_bytes (parser, p); p += 4; plus2 = surrogate_to_utf8 (unicode, unicode2, * b_ptr); if (plus2 <= 0) { if (plus2 == UNICODE_NOT_SURROGATE_PAIR) { parser->bad_byte = 0; parser->bad_beginning = p - 4; UNIFAIL (not_surrogate_pair); } else { failbug (__FILE__, __LINE__, parser, "unhandled error %d from surrogate_to_utf8", plus2); } } * b_ptr += plus2; goto end; } else { FAILSURROGATEPAIR ('u'); } } else { FAILSURROGATEPAIR ('\\'); } } else if (plus <= 0) { failbug (__FILE__, __LINE__, parser, "unhandled error code %d while decoding unicode escape", plus); } * b_ptr += plus; end: if (unicode >= 0x80 && ! parser->unicode) { /* Force the UTF-8 flag on for this string. */ parser->force_unicode = 1; } return p; } /* Handle backslash escapes. We can't use the NEXTBYTE macro here for the reasons outlined below. 
*/ #if 0 /* I expected a switch statement to compile to faster code, but it doesn't seem to. */ #define HANDLE_ESCAPES(p,start) \ switch (c = * ((p)++)) { \ \ case '\\': \ case '/': \ case '"': \ *b++ = c; \ break; \ \ case 'b': \ *b++ = '\b'; \ break; \ \ case 'f': \ *b++ = '\f'; \ break; \ \ case 'n': \ *b++ = '\n'; \ break; \ \ case 'r': \ *b++ = '\r'; \ break; \ \ case 't': \ *b++ = '\t'; \ break; \ \ case 'u': \ p = do_unicode_escape (parser, p, & b); \ break; \ \ default: \ parser->bad_beginning = start; \ parser->bad_byte = p - 1; \ parser->expected = XESCAPE; \ STRINGFAIL (unexpected_character); \ } #else /* This is identical to the above macro, but it uses if statements rather than a switch statement. Using the Clang compiler, this results in about 2.5% faster code, for some reason or another. */ #define HANDLE_ESCAPES(p,start) \ c = * ((p)++); \ if (c == '\\' || c == '/' || c == '"') { \ *b++ = c; \ } \ else if (c == 'b') { \ *b++ = '\b'; \ } \ else if (c == 'f') { \ *b++ = '\f'; \ } \ else if (c == 'n') { \ *b++ = '\n'; \ } \ else if (c == 'r') { \ *b++ = '\r'; \ } \ else if (c == 't') { \ *b++ = '\t'; \ } \ else if (c == 'u') { \ p = do_unicode_escape (parser, p, & b); \ } \ else { \ parser->bad_beginning = start; \ parser->bad_byte = p - 1; \ parser->expected = XESCAPE; \ STRINGFAIL (unexpected_character); \ } #endif /* Resolve "s" by converting escapes into the appropriate things. Put the result into "parser->buffer". The return value is the length of the string. */ static INLINE int resolve_string (json_parse_t * parser, string_t * s) { /* The pointer where we copy the string. This points into "parser->buffer". */ unsigned char * b; /* "p" is the pointer into "parser->input", using "s->start" to get the start point. We don't use "parser->end" for this job because "resolve_string" is called only after the value of the object is resolved. E.g. 
if the object goes like {"hot":{"potatoes":"tomatoes"}} then this routine is called first for "potatoes" and then for "hot" as each sub-element of the hashes is resolved. We don't want to mess around with the value of "parser->end", which is always pointing to one after the last byte viewed. */ unsigned char * p; p = s->start; /* Ensure we have enough memory to fit the string. */ expand_buffer (parser, s->length); b = parser->buffer; while (p - s->start < s->length) { unsigned char c; c = *p++; if (c == '\\') { HANDLE_ESCAPES (p, s->start - 1); } else { *b++ = c; } } /* This is the length of the string in bytes. */ return b - parser->buffer; } #define NEXTBYTE (c = *parser->end++) /* Get an object key value and put it into "key". Check for escapes. */ static INLINE void get_key_string (json_parse_t * parser, string_t * key) { unsigned char c; int i; key->start = parser->end; key->contains_escapes = 0; key_string_next: switch (NEXTBYTE) { case '"': /* Go on eating bytes until we find a ". */ break; case '\\': /* Mark this string as containing escapes. */ key->contains_escapes = 1; switch (NEXTBYTE) { case '\\': case '/': case '"': case 'b': case 'f': case 'n': case 'r': case 't': /* Eat another byte. */ goto key_string_next; case 'u': /* i counts the bytes, from 0 to 3. */ i = 0; unitunes: switch (NEXTBYTE) { case DIGIT: case UHEX: case LHEX: i++; if (i >= strlen ("ABCD")) { goto key_string_next; } else { goto unitunes; } /* not a fall through, we always "goto" above. */ default: parser->bad_beginning = parser->end - 1 - i; parser->expected = XHEXADECIMAL_CHARACTER; parser->bad_byte = parser->end - 1; UNIFAIL (unexpected_character); } /* not a fall through, we either UNIFAIL or goto above. */ default: parser->bad_beginning = key->start - 1; parser->expected = XESCAPE; parser->bad_byte = parser->end - 1; STRINGFAIL (unexpected_character); } /* Not a fall through, we never arrive here. 
*/ case BADBYTES: parser->bad_beginning = key->start - 1; parser->expected = XSTRINGCHAR; parser->bad_byte = parser->end - 1; STRINGFAIL (unexpected_character); /* Not a fall through, STRINGFAIL does not return. */ #define ADDBYTE #define string_start key_string_next #define startofutf8string (key->start) #include "utf8-byte-one.c" /* Not a fall through. */ default: parser->bad_beginning = key->start - 1; parser->expected = XSTRINGCHAR; parser->bad_byte = parser->end - 1; STRINGFAIL (unexpected_character); } key->length = parser->end - key->start - 1; return; #include "utf8-next-byte.c" #undef startofutf8string #undef string_start #undef ADDBYTE } /* "start - 1" puts the start on the " rather than after it. "start" is usually after the quote because the quote is eaten on the way here. */ #define ILLEGALBYTE \ parser->bad_beginning = start - 1; \ parser->bad_byte = parser->end - 1; \ parser->expected = XSTRINGCHAR; \ STRINGFAIL (unexpected_character) /* Resolve the string pointed to by "parser->end" into "parser->buffer". The return value is the length of the string. This is only called if the string has \ escapes in it. */ static INLINE int get_string (json_parse_t * parser) { unsigned char * b; unsigned char c; unsigned char * start; start = parser->end; if (! parser->buffer) { expand_buffer (parser, 0x1000); } b = parser->buffer; string_start: if (b - parser->buffer >= parser->buffer_size - 0x100) { /* Save our offset in parser->buffer, because "realloc" is called by "expand_buffer", and "b" may no longer point to a meaningful location. */ int size = b - parser->buffer; expand_buffer (parser, 2 * parser->buffer_size); b = parser->buffer + size; } switch (NEXTBYTE) { case '"': goto string_end; break; case '\\': HANDLE_ESCAPES (parser->end, start - 1); goto string_start; #define ADDBYTE (* b++ = c) #define startofutf8string start #include "utf8-byte-one.c" /* Not a fall through. 
*/ default: /* fall through */ case BADBYTES: ILLEGALBYTE; } if (STRINGEND) { STRINGFAIL (unexpected_end_of_input); } string_end: return b - parser->buffer; #include "utf8-next-byte.c" #undef ADDBYTE goto string_end; } static void parser_free (json_parse_t * parser) { if (parser->buffer) { Safefree (parser->buffer); parser->n_mallocs--; } /* There is a discrepancy between the number of things used and the number freed. */ if (parser->n_mallocs != 0) { /* The tokenizing parser is freed before the tokens themselves are freed. Whether or not the tokens are freed correctly can be checked in "tokenize_free" in "json-entry-points.c". */ if (! parser->tokenizing) { fprintf (stderr, "%s:%d: %d pieces of unfreed memory remain.\n", __FILE__, __LINE__, parser->n_mallocs); } } parser->buffer = 0; parser->buffer_size = 0; } typedef enum json_token_type { json_token_invalid, json_token_number, json_token_string, json_token_key, json_token_literal, json_token_comma, json_token_colon, json_token_object, json_token_array, n_json_tokens } json_token_type_t; const char * token_names[n_json_tokens] = { "invalid", "number", "string", "key", "literal", "comma", "colon", "object", "array" }; typedef struct json_token json_token_t; struct json_token { json_token_t * child; json_token_t * next; unsigned int start; unsigned int end; json_token_type_t type; unsigned int parent; unsigned blessed : 1; }; #define JSON_TOKEN_PARENT_INVALID 0 /* "start" is the first character of the thing. "end" is the last character of the thing. If the thing only takes one character then "start == end" should be true. */ static json_token_t * json_token_new (json_parse_t * parser, unsigned char * start, unsigned char * end, json_token_type_t type) { json_token_t * new; /* Check the token in various ways. 
*/ switch (type) { case json_token_string: case json_token_key: if (* start != '"') { if (end) { failbug (__FILE__, __LINE__, parser, "no quotes at start of string '%.*s'", end - start, start); } else { failbug (__FILE__, __LINE__, parser, "no quotes at start of string '%.10s'", start); } } if (end && * end != '"') { failbug (__FILE__, __LINE__, parser, "'%c' is not a quote at end of string '%.*s'", * end, end - start, start); } break; case json_token_number: if (* start - '0' > 9 && * start != '-') { failbug (__FILE__, __LINE__, parser, "bad character %c at start of number", * start); } if (* end - '0' > 9) { failbug (__FILE__, __LINE__, parser, "bad character %c at end of number", * end); } break; case json_token_object: if (* start != '{' || (end && * end != '}')) { failbug (__FILE__, __LINE__, parser, "no { or } in object %.*s: char %X", end ? end - start : strlen ((char *) start), start, * start); } break; case json_token_array: if (* start != '[' || (end && * end != ']')) { failbug (__FILE__, __LINE__, parser, "no [ or ] in array"); } break; case json_token_comma: if (end - start != 0 || * start != ',') { failbug (__FILE__, __LINE__, parser, "not a comma %.*s", end - start); } break; case json_token_colon: if (end - start != 0 || * start != ':') { failbug (__FILE__, __LINE__, parser, "not a colon %.*s", end - start); } break; case json_token_literal: break; default: croak ("%s:%d: bad type %d\n", __FILE__, __LINE__, type); } Newx (new, 1, json_token_t); // static int nnew; // nnew++; // fprintf (stderr, "New %d %p\n", nnew, new); parser->n_mallocs++; #if 0 fprintf (stderr, "%s:%d: parser->n_mallocs = %d\n", __FILE__, __LINE__, parser->n_mallocs); #endif /* 0 */ new->start = start - parser->input; if (end) { new->end = end - parser->input + 1; } else { new->end = 0; } new->type = type; new->parent = JSON_TOKEN_PARENT_INVALID; new->child = 0; new->next = 0; return new; } static void json_token_set_end (json_parse_t * parser, json_token_t * jt, unsigned char * 
end) { if (jt->end != 0) { int offset = (int) (end - parser->input); failbug (__FILE__, __LINE__, parser, "attempt to set end as %d is now %d\n", offset, jt->end); } switch (jt->type) { case json_token_string: case json_token_key: if (* end != '"') { failbug (__FILE__, __LINE__, parser, "no quotes at end of string"); } break; case json_token_object: if (* end != '}') { failbug (__FILE__, __LINE__, parser, "no } at end of object"); } break; case json_token_array: if (* end != ']') { failbug (__FILE__, __LINE__, parser, "no ] at end of array"); } break; default: failbug (__FILE__, __LINE__, parser, "set end for unknown type %d", jt->type); break; } jt->end = end - parser->input + 1; } static json_token_t * json_token_set_child (json_parse_t * parser, json_token_t * parent, json_token_t * child) { switch (parent->type) { case json_token_object: case json_token_array: break; default: failbug (__FILE__, __LINE__, parser, "bad parent type %d\n", parent->type); } parent->child = child; return child; } static json_token_t * json_token_set_next (json_token_t * prev, json_token_t * next) { prev->next = next; return next; } JSON-Parse-0.56/unicode.h000444 001751 001751 00000011153 13622411125 014434 0ustar00benben000000 000000 /* This file was generated by the following command: /home/ben/software/install/bin/cfunctions unicode.c */ #ifndef CFH_UNICODE_H #define CFH_UNICODE_H #line 8 "unicode.c" #define UTF8_MAX_LENGTH 5 #define UNICODE_MAXIMUM 0x10ffff #define UNICODE_UTF8_4 0x1fffff #define UNICODE_OK 0 #define UTF8_BAD_LEADING_BYTE -1 #define UNICODE_SURROGATE_PAIR -2 #define UNICODE_NOT_SURROGATE_PAIR -3 #define UTF8_BAD_CONTINUATION_BYTE -4 #define UNICODE_EMPTY_INPUT -5 #define UTF8_NON_SHORTEST -6 #define UNICODE_TOO_BIG -7 #define UNICODE_NOT_CHARACTER -8 #define UTF8_VALID 1 #define UTF8_INVALID 0 #line 98 "unicode.c" extern const uint8_t utf8_sequence_len[]; #line 103 "unicode.c" int32_t utf8_bytes (uint8_t c); #line 124 "unicode.c" int32_t utf8_no_checks (const 
uint8_t* input, const uint8_t** end_ptr); #line 160 "unicode.c" int32_t utf8_to_ucs2 (const uint8_t* input, const uint8_t** end_ptr); #line 250 "unicode.c" int32_t ucs2_to_utf8 (int32_t ucs2, uint8_t* utf8); #line 295 "unicode.c" int32_t unicode_to_surrogates (int32_t unicode, int32_t* hi_ptr, int32_t* lo_ptr); #line 314 "unicode.c" int32_t surrogates_to_unicode (int32_t hi, int32_t lo); #line 337 "unicode.c" int32_t surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t* utf8); #line 350 "unicode.c" int32_t unicode_chars_to_bytes (const uint8_t* utf8, int32_t n_chars); #line 370 "unicode.c" int32_t unicode_count_chars_fast (const uint8_t* utf8); #line 392 "unicode.c" int32_t unicode_count_chars (const uint8_t* utf8); #line 415 "unicode.c" #define BYTE_80_8F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F #define BYTE_80_9F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F #define BYTE_80_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: 
case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_90_BF \ 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \ case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \ case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \ case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \ case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \ case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \ case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_A0_BF \ 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \ case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: \ case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \ case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: \ case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_C2_DF \ 0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \ case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \ case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \ case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \ case 0xDE: case 0xDF #define BYTE_E1_EC \ 0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \ case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC #define BYTE_EE_EF \ 0xEE: case 0xEF #define BYTE_F1_F3 \ 0xF1: case 0xF2: case 0xF3 #line 479 "unicode.c" int32_t valid_utf8 (const uint8_t* input, int32_t input_length); #line 616 "unicode.c" int32_t trim_to_utf8_start (uint8_t** ptr); #line 639 "unicode.c" const char* unicode_code_to_error (int32_t code); #endif /* CFH_UNICODE_H */ JSON-Parse-0.56/MANIFEST.SKIP000644 001751 001751 00000002355 13063734776 014564 0ustar00benben000000 000000 (build|make-pod|clean|make-errors|run)\.pl blib/.* .gitignore 
.git/.* Makefile$ pm_to_blib MYMETA..* .*\.bak ^Json3-[0-9.]/$ ^Json3-[0-9.]\.tar\.gz$ ^Json3.(?:c|o|bs)$ ^peek\.pl$ ^test-empty-string\.pl$ ^xt/.* ^lib/JSON/Parse\.pod\.tmpl$ ^Parse\.bs$ # Can download from github if needed. benchmarks # Spurious build guff ^makeitfile$ # Corefiles, a.out sometimes get into the distribution. ^.*\.core$ ^a\.out$ # There aren't any files I plan to distribute with these file types. ^[^/]*\.pl$ ^[^/]*\.txt$ # Don't put the "torture test" code into the CPAN distribution. It # contains bad stuff like "setjmp" which is incompatible with many # people's operating systems. People who want this kind of stuff can # get it from the repository. ^Json3-random-test\.c$ ^(?:json-)?random-test\.c$ ^random-test$ ^randomjson\.pl$ # Don't add the UTF-8 test code to the repo: ^utf8test/.*$ # Makefile for C random stuff ^randmakefile$ # Not necessary or useful for CPAN ^FILES$ # Design document directory, no use to put on CPAN. ^doc/.*$ # ^examples/.*-out\.txt$ # Experimental unfinished module should not go to CPAN ^lib/JSON/Whitespace\.pm$ # Template for POD ^lib/JSON/Tokenize\.pod\.tmpl$ # Obsolete files ^obsolete/.*$ # Local variables: # comment-start: "#" # End: JSON-Parse-0.56/lib/000755 001751 001751 00000000000 13622411134 013404 5ustar00benben000000 000000 JSON-Parse-0.56/utf8-byte-one.c000644 001751 001751 00000000723 12255446465 015432 0ustar00benben000000 000000 case BYTE_20_7F: ADDBYTE; goto string_start; case BYTE_C2_DF: ADDBYTE; goto byte_last_80_bf; case 0xE0: ADDBYTE; goto byte23_a0_bf; case BYTE_E1_EC: ADDBYTE; goto byte_penultimate_80_bf; case 0xED: ADDBYTE; goto byte23_80_9f; case BYTE_EE_EF: ADDBYTE; goto byte_penultimate_80_bf; case 0xF0: ADDBYTE; goto byte24_90_bf; case BYTE_F1_F3: ADDBYTE; goto byte24_80_bf; case 0xF4: ADDBYTE; goto byte24_80_8f; JSON-Parse-0.56/README000644 001751 001751 00000006506 13622411131 013522 0ustar00benben000000 000000 _ ____ ___ _ _ ____ | / ___| / _ \| \ | | _ _ | _ \ __ _ _ __ ___ ___ _ | \___ \| | 
| | \| | (_|_) | |_) / _` | '__/ __|/ _ \ | |_| |___) | |_| | |\ | _ _ | __/ (_| | | \__ \ __/ \___/|____/ \___/|_| \_| (_|_) |_| \__,_|_| |___/\___| This is the README for JSON::Parse version 0.56. JSON::Parse is a "module" for the Perl computer programming language, a library of computer code to install on a computer. This document contains four sections: 1. About - what the module does 2. Documentation - how to learn more about the module 3. Installation - how to install this module on a computer 4. Help - what to do if you get stuck ----------------------------------------------------------------------------- 1. ABOUT JSON::Parse - Read JSON into a Perl variable A module for parsing JSON. (JSON means "JavaScript Object Notation" and it is specified in "RFC 7159".) JSON::Parse offers the function "parse_json", which takes a string containing JSON, and returns an equivalent Perl structure. It also offers validation of JSON via "valid_json", which returns true or false depending on whether the JSON is correct or not, and "assert_valid_json", which produces a descriptive fatal error if the JSON is invalid. A function "json_file_to_perl" reads JSON from a file, and there is a safer version of "parse_json" called "parse_json_safe" which doesn't throw exceptions. For special cases of parsing, there are also methods "new" and "run", which create a JSON parsing object and run it on text. See "METHODS". JSON::Parse accepts only UTF-8 as input. See "UTF-8 only" and "Handling of Unicode". ----------------------------------------------------------------------------- 2. DOCUMENTATION You can read the documentation for the module online at the following website: * http://metacpan.org/release/JSON-Parse (This link goes to the latest version of the module.) After installing the module, you can read the documentation on your computer using perldoc JSON::Parse ----------------------------------------------------------------------------- 3. 
INSTALLATION This module requires Perl version 5.8.9 or later. To install the module from CPAN, use cpan JSON::Parse If you have the App::cpanminus installer, you may prefer cpanm JSON::Parse To install the module from the source file, JSON-Parse-0.56.tar.gz, follow this sequence of commands: tar xfz JSON-Parse-0.56.tar.gz cd JSON-Parse-0.56 perl Makefile.PL make make install If you want to test the module before installing it, use "make test" after "make" and before "make install". ----------------------------------------------------------------------------- 4. HELP To get help with the module, you can email the author, Ben Bullock, at . If you think there is a problem in the module, you can report a bug at , or if you want to alter the source code of JSON::Parse, try the public repository on github at . ----------------------------------------------------------------------------- This README was written on Mon Feb 17 13:11:05 2020. ----------------------------------------------------------------------------- JSON-Parse-0.56/errors.c000444 001751 001751 00000035561 13622411131 014323 0ustar00benben000000 000000 typedef enum { json_error_invalid, json_error_unexpected_character, json_error_unexpected_end_of_input, json_error_not_surrogate_pair, json_error_empty_input, json_error_name_is_not_unique, json_error_overflow } json_error_t; const char * json_errors[json_error_overflow] = { "Invalid", "Unexpected character", "Unexpected end of input", "Not surrogate pair", "Empty input", "Name is not unique", }; enum expectation { xwhitespace, xcomma, xvalue_separator, xobject_end, xarray_end, xhexadecimal_character, xstring_start, xdigit, xdot, xminus, xplus, xexponential, xarrayobjectstart, xescape, xstringchar, xliteral, xin_literal, xin_surrogate_pair, xbytes_80_8f, xbytes_80_9f, xbytes_80_bf, xbytes_90_bf, xbytes_a0_bf, n_expectations }; #define XWHITESPACE (1<bad_beginning = startofutf8string - 1; \ parser->bad_type = json_string; \ parser->bad_byte = parser->end - 
1; \ parser->expected = want; \ parser->error = json_error_unexpected_character; \ failbadinput (parser) byte_last_80_bf: switch (NEXTBYTE) { case BYTE_80_BF: ADDBYTE; goto string_start; default: FAILUTF8 (XBYTES_80_BF); } byte_penultimate_80_bf: switch (NEXTBYTE) { case BYTE_80_BF: ADDBYTE; goto byte_last_80_bf; default: FAILUTF8 (XBYTES_80_BF); } byte24_90_bf: switch (NEXTBYTE) { case BYTE_90_BF: ADDBYTE; goto byte_penultimate_80_bf; default: FAILUTF8 (XBYTES_90_BF); } byte23_80_9f: switch (NEXTBYTE) { case BYTE_80_9F: ADDBYTE; goto byte_last_80_bf; default: FAILUTF8 (XBYTES_80_9F); } byte23_a0_bf: switch (NEXTBYTE) { case BYTE_A0_BF: ADDBYTE; goto byte_last_80_bf; default: FAILUTF8 (XBYTES_A0_BF); } byte24_80_bf: switch (NEXTBYTE) { case BYTE_80_BF: ADDBYTE; goto byte_penultimate_80_bf; default: FAILUTF8 (XBYTES_80_BF); } byte24_80_8f: switch (NEXTBYTE) { case BYTE_80_8F: ADDBYTE; goto byte_penultimate_80_bf; default: FAILUTF8 (XBYTES_80_8F); } JSON-Parse-0.56/json-entry-points.c000644 001751 001751 00000016467 13423032117 016441 0ustar00benben000000 000000 /* Empty input was provided. */ static void fail_empty (json_parse_t * parser) { parser->bad_type = json_initial_state; parser->error = json_error_empty_input; failbadinput (parser); } /* Check for stray non-whitespace after the end and free memory. */ static void check_end (json_parse_t * parser) { int c; end: switch (NEXTBYTE) { case WHITESPACE: goto end; case '\0': parser_free (parser); return; default: parser->bad_type = json_initial_state; parser->bad_byte = parser->end - 1; parser->expected = XWHITESPACE; parser->error = json_error_unexpected_character; failbadinput (parser); } } #define ENTRYDECL \ /* Our collection of bits and pieces. */ \ \ json_parse_t parser_o = {0}; \ json_parse_t * parser = & parser_o #ifndef NOPERL /* Set up "parser" with the string from "json". 
*/ static void getstring (SV * json, json_parse_t * parser) { STRLEN length; parser->input = (unsigned char *) SvPV (json, length); parser->end = parser->input; parser->length = (unsigned int) length; parser->unicode = SvUTF8 (json) ? 1 : 0; } #endif /* ndef NOPERL */ #define SETUPPARSER \ parser->line = 1; \ parser->last_byte = parser->input + parser->length /* Error to throw if there is a character other than whitespace, "[" or "{" at the start of the JSON. */ #define BADCHAR \ parser->bad_byte = parser->end - 1; \ parser->bad_type = json_initial_state; \ parser->expected = XARRAYOBJECTSTART | VALUE_START | XWHITESPACE; \ parser->error = json_error_unexpected_character; \ failbadinput (parser) #ifndef NOPERL static SV * json_parse_run (json_parse_t * parser, SV * json) { /* The currently-parsed character. */ char c; /* The returned object. */ SV * r = & PL_sv_undef; getstring (json, parser); if (parser->length == 0) { fail_empty (parser); } SETUPPARSER; parse_start: switch (NEXTBYTE) { case '{': r = object (parser); break; case '[': r = array (parser); break; case '-': case '0': case DIGIT19: parser->top_level_value = 1; r = number (parser); break; case '"': parser->top_level_value = 1; r = string (parser); break; case 't': parser->top_level_value = 1; r = literal_true (parser); break; case 'f': parser->top_level_value = 1; r = literal_false (parser); break; case 'n': parser->top_level_value = 1; r = literal_null (parser); break; case WHITESPACE: goto parse_start; case '\0': /* We have an empty string. */ fail_empty (parser); default: BADCHAR; } check_end (parser); return r; } /* This is the entry point for non-object parsing. */ SV * parse (SV * json) { /* Make our own parser object on the stack. */ ENTRYDECL; /* Run it. */ return json_parse_run (parser, json); } /* This is the entry point for "safe" non-object parsing. */ SV * parse_safe (SV * json) { /* Make our own parser object on the stack. 
*/ ENTRYDECL; parser_o.detect_collisions = 1; parser_o.copy_literals = 1; parser_o.warn_only = 1; parser_o.diagnostics_hash = 1; /* Run it. */ return json_parse_run (parser, json); } #endif /* ndef NOPERL */ /* Validation without Perl structures. */ static void c_validate (json_parse_t * parser) { /* The currently-parsed character. */ char c; /* If the string is empty, throw an exception. */ if (parser->length == 0) { fail_empty (parser); } SETUPPARSER; validate_start: switch (NEXTBYTE) { case '{': valid_object (parser); break; case '[': valid_array (parser); break; case '-': case '0': case DIGIT19: parser->top_level_value = 1; valid_number (parser); break; case '"': parser->top_level_value = 1; valid_string (parser); break; case 't': parser->top_level_value = 1; valid_literal_true (parser); break; case 'f': parser->top_level_value = 1; valid_literal_false (parser); break; case 'n': parser->top_level_value = 1; valid_literal_null (parser); break; case WHITESPACE: goto validate_start; default: BADCHAR; } check_end (parser); } static INLINE void print_tokens (json_token_t * t) { printf ("Start: %d End: %d: Type: %s\n", t->start, t->end, token_names[t->type]); if (t->child) { printf ("Children:\n"); print_tokens (t->child); } if (t->next) { printf ("Next:\n"); print_tokens (t->next); } } #ifndef NOPERL static json_token_t * c_tokenize (json_parse_t * parser) { /* The currently-parsed character. */ char c; json_token_t * r; SETUPPARSER; tokenize_start: switch (NEXTBYTE) { case '{': r = tokenize_object (parser); break; case '[': r = tokenize_array (parser); break; case WHITESPACE: goto tokenize_start; default: BADCHAR; } check_end (parser); #if 0 printf ("TOKENS:\n"); print_tokens (r); #endif /* 0 */ return r; } static void tokenize_free (json_token_t * token) { json_token_t * next; static int nfree; next = token->child; if (next) { if (! next->blessed) { tokenize_free (next); } token->child = 0; } next = token->next; if (next) { if (! 
next->blessed) { tokenize_free (next); } token->next = 0; } if (! token->blessed) { //nfree++; //fprintf (stderr, "Free %d %p\n", nfree, token); Safefree (token); } } /* This is the entry point for validation. */ static void validate (SV * json, unsigned int flags) { ENTRYDECL; getstring (json, parser); if (parser->length == 0) { fail_empty (parser); } c_validate (& parser_o); } static void check (json_parse_t * parser, SV * json) { getstring (json, parser); c_validate (parser); } static json_token_t * tokenize (SV * json) { ENTRYDECL; getstring (json, parser); /* Mark this parser as being used for tokenizing to bypass the checks for memory leaks when the parser is freed. */ parser_o.tokenizing = 1; return c_tokenize (& parser_o); } /* Make a hash containing a diagnostic error from the parser. */ static SV * error_to_hash (json_parse_t * parser, char * error_as_string) { HV * error; error = newHV (); #ifdef HK #warn "Redefinition of macro HK" #endif /* def HK */ #undef HK #define HK(x, val) (void) hv_store (error, x, strlen (x), val, 0) HK("length", newSViv (parser->length)); HK("bad type", newSVpv (type_names[parser->bad_type], 0)); HK("error", newSVpv (json_errors[parser->error], 0)); HK("error as string", newSVpv (error_as_string, 0)); if (parser->bad_byte) { int position; position = (int) (parser->bad_byte - parser->input) + 1; HK("bad byte position", newSViv (position)); HK("bad byte contents", newSViv (* parser->bad_byte)); } if (parser->bad_beginning) { int bcstart; bcstart = (int) (parser->bad_beginning - parser->input) + 1; HK("start of broken component", newSViv (bcstart)); } if (parser->error == json_error_unexpected_character) { int j; AV * valid_bytes; valid_bytes = newAV (); make_valid_bytes (parser); for (j = 0; j < JSON3MAXBYTE; j++) { av_push (valid_bytes, newSViv (parser->valid_bytes[j])); } HK("valid bytes", newRV_inc ((SV *) valid_bytes)); } return newRV_inc ((SV *) error); #undef HK } #endif /* ndef NOPERL */ JSON-Parse-0.56/Changes000644 
001751 001751 00000010403 13616666602 014146 0ustar00benben000000 000000 0.56 2020-02-06 * Allow build on Solaris and SunOS 0.55_02 2019-01-24 * For testing of Daxim segfault through CPAN testers 0.55 2017-10-21 * Versions synchronised * Memory problem on Cygwin addressed 0.54 2017-10-20 * Refuse to support Solaris 0.53 2017-10-18 * Windows build fix 0.52 2017-10-18 * Removed unnecessary test 0.50 2017-10-18 * tokenize_text method added to JSON::Tokenize 0.49 2016-11-26 * Behaviour for undefined, whitespace-only inputs was altered * JSON Parsing Test Suite compliance 0.48 2016-11-20 * Add "check" method for the benefit of JSON::Repair 0.47 2016-11-16 * Bug fixes for Perl 12 and earlier regarding diagnostics_hash method 0.46 2016-11-16 * Remove experimental feature $json_diagnostics * Add diagnostics_hash method as a replacement for $json_diagnostics * Better warnings for parse_json_safe * Add the examples directory files to the CPAN distribution * Documentation rewrites 0.45 2016-11-15 * Remove non-functioning JSON::Whitespace module from CPAN distribution 0.44 2016-11-13 * Add a primitive interface to JSON::Tokenize * Document the JSON::Tokenize interface 0.43 2016-11-05 * Remove validjson dependence on Path::Tiny. * Produce better outputs from validjson script 0.42 2016-08-22 * Long string copying bug fixed. 0.41 2016-06-14 * Implement "warn_only" * Documentation clarifications 0.40 2016-05-22 * Compilation warning, possible memory leaks (SHLOMIF) 0.39 2015-11-18 * Fix for 64 bit * Changed error message documentation to machine-readable 0.38 2015-10-30 * Add object interface for more flexible parsing. * Add "parse_json_safe" with no exceptions and hash collision detection. * Add choice to copy literals. * Better documentation of changes and examples 0.37 2015-10-29 * Remove $JSON::Parse::true and false, use &PL_sv_yes and &PL_sv_no instead. 0.36 2015-10-24 * Document and install "validjson" script. * Document the "$JSON::Parse::json_diagnostics" global variable. 
0.35 2015-10-19 * Minor documentation changes 0.34 2015-10-19 * Document key collisions * Extend SEE ALSO section on other JSON modules on CPAN. 0.33 2015-10-09 * Memory leak plug * Small speed improvements * Minor bug about commas after bare values * Documentation rewritten to clarify changes in 0.32 and testing. 0.32 2015-07-29 * Update to accept bare values, per RFC 7159. 0.31 2014-12-07 * Make sure to cast to STRLEN in newSVpv(n) calls. * Remove reference to deleted Google Group from documentation. 0.30 2014-03-23 * Prototypes removed from functions * Documentation rewrites * Changes to compile on Darwin OS * Use "assert_valid_json" in DIAGNOSTICS section of pod. 0.29 2013-12-25 * UTF-8 validation in strings * Number, object, array parsing improvements * Fewer error messages & documented (see DIAGNOSTICS in pod) * Testing done with random input to root out problems (see TESTING in pod) 0.28 2013-12-12 * Change function name from "validate_json" to "assert_valid_json" (old name is still accepted). * More tests * Style of error messages changed for "end of input" errors. * More error checking for invalid \u escapes. 0.27 2013-12-11 * More thorough checking of input for format errors (missing commas etc.) * Check for bytes 0x00 - 0x1F in strings and throw error if present * Don't upgrade \u0000 - \u0080 to character strings * Documentation rewrites 0.26 2013-12-10 * Remove core file from distribution. 0.25 2013-12-10 * Fix bug where missing commas were not detected. * Now parses \uXXXX constructions. * "valid_json" now back to being a fast validator. * Add new "validate_json", a fast, exception throwing validator. * Surrogate pairs now handled (tentative) * Upgrade \uXXXX to character string. 0.24 2013-11-27 * Update "Changes" to conform to CPAN::Changes::Spec 0.23 2013-09-15 0.22 2013-08-26 * Typos and minor details in documentation. * Changes to compile on Microsoft C compiler. 0.21 2013-08-07 * Changes documented. 
0.20 2013-08-06 * From-scratch rewrite of module. 0.19 2013-03-15 0.18 2013-03-12 0.17 2013-03-10 0.16 2013-03-08 0.15 2013-03-08 0.14 2012-06-12 0.13 2012-06-12 0.12 2012-06-12 0.11 2012-04-13 0.10 2012-03-17 0.09 2012-03-16 0.07 2011-06-09 0.06 2011-06-02 0.05 2011-05-31 * Changed name from JSON::Argo to JSON::Parse 0.04 2011-05-29 0.03 2011-05-25 0.02 2011-05-25 0.01 2011-05-23 JSON-Parse-0.56/MANIFEST000644 001751 001751 00000002221 13622411134 013764 0ustar00benben000000 000000 Changes errors.c examples/array.pl examples/assert.pl examples/bad-utf8.pl examples/bad-utf8.txt examples/chr.pl examples/collide.pl examples/ebi.pl examples/first-bit.pl examples/hash.pl examples/json-tiny-round-trip-demo.pl examples/kani.pl examples/key-collision.pl examples/long-number.pl examples/sasori.pl examples/synopsis.pl examples/tokenize-synopsis.pl examples/true-subs.pl examples/unicode-details.pl json-common.c json-entry-points.c json-perl.c json-whitespace.c Json3.xs lib/JSON/Parse.pm lib/JSON/Parse.pod lib/JSON/Tokenize.pm lib/JSON/Tokenize.pod Makefile.PL MANIFEST This list of files MANIFEST.SKIP README script/validjson t/array.t t/bugzilla-2049.t t/collision.t t/JSON-Parse.t t/json-tokenize.t t/Json3.t t/kolmorogov42-1.t t/numbers.t t/object.t t/perl-monks-1165399.t t/read-file.t t/rfc7159.t t/syntax-error-1.json t/syntax.t t/test-empty-string.t t/test.json t/unicode.t t/utf8.t t/valid-json.t t/whitespace.t typemap unicode.c unicode.h utf8-byte-one.c utf8-next-byte.c META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) JSON-Parse-0.56/unicode.c000444 001751 001751 00000074634 13622411125 014444 0ustar00benben000000 000000 /* This file is a Unicode library in the programming language C which deals with conversions to and from the UTF-8 format. 
*/ /* Author: Ben Bullock , Repository: https://github.com/benkasminbullock/unicode-c */ #include #include #include "unicode.h" #ifdef HEADER /* _ _ _ _ | | (_)_ __ ___ (_) |_ ___ | | | | '_ ` _ \| | __/ __| | |___| | | | | | | | |_\__ \ |_____|_|_| |_| |_|_|\__|___/ */ /* The maximum number of bytes we need to contain any Unicode code point as UTF-8 as a C string. This length includes one trailing nul byte. */ #define UTF8_MAX_LENGTH 5 /* The maximum possible value of a Unicode code point. See http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */ #define UNICODE_MAXIMUM 0x10ffff /* The maximum possible value which will fit into four bytes of UTF-8. This is larger than UNICODE_MAXIMUM. */ #define UNICODE_UTF8_4 0x1fffff /* ____ _ _ | _ \ ___| |_ _ _ _ __ _ __ __ ____ _| |_ _ ___ ___ | |_) / _ \ __| | | | '__| '_ \ \ \ / / _` | | | | |/ _ \/ __| | _ < __/ |_| |_| | | | | | | \ V / (_| | | |_| | __/\__ \ |_| \_\___|\__|\__,_|_| |_| |_| \_/ \__,_|_|\__,_|\___||___/ */ /* All of the functions in this library return an "int32_t". Negative values are used to indicate errors. */ /* This return value indicates the successful completion of a routine which doesn't use the return value to communicate data back to the caller. */ #define UNICODE_OK 0 /* This return value means that the leading byte of a UTF-8 sequence was not valid. */ #define UTF8_BAD_LEADING_BYTE -1 /* This return value means the caller attempted to turn a code point for a surrogate pair into UTF-8. */ #define UNICODE_SURROGATE_PAIR -2 /* This return value means that code points which did not form a surrogate pair were tried to be converted into a code point as if they were a surrogate pair. */ #define UNICODE_NOT_SURROGATE_PAIR -3 /* This return value means that input which was supposed to be UTF-8 encoded contained an invalid continuation byte. If the leading byte of a UTF-8 sequence is not valid, UTF8_BAD_LEADING_BYTE is returned instead of this. 
*/ #define UTF8_BAD_CONTINUATION_BYTE -4 /* This return value indicates a zero byte was found in a string which was supposed to contain UTF-8 bytes. It is returned only by the functions which are documented as not allowing zero bytes. */ #define UNICODE_EMPTY_INPUT -5 /* This return value indicates that UTF-8 bytes were not in the shortest possible form. See http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8. */ #define UTF8_NON_SHORTEST -6 /* This return value indicates that there was an attempt to convert a code point which was greater than UNICODE_MAXIMUM or UNICODE_UTF8_4 into UTF-8 bytes. */ #define UNICODE_TOO_BIG -7 /* This return value indicates that the Unicode code-point ended with either 0xFFFF or 0xFFFE, meaning it cannot be used as a character code point. */ #define UNICODE_NOT_CHARACTER -8 /* This return value indicates that the UTF-8 is valid. */ #define UTF8_VALID 1 /* This return value indicates that the UTF-8 is not valid. */ #define UTF8_INVALID 0 #endif /* def HEADER */ /* This table contains the length of a sequence which begins with the byte given. A value of zero indicates that the byte can not begin a UTF-8 sequence. 
*/ /* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */ const uint8_t utf8_sequence_len[0x100] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */ 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */ 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */ }; /* This function returns the number of bytes of UTF-8 a sequence starting with byte "c" will become, either 1 (c = 0000xxxx), 2 (c = 110xxxxx), 3 (c = 1110xxxx), or 4 (c = 111100xx or c = 11110100). If "c" is not a valid UTF-8 first byte, the value UTF8_BAD_LEADING_BYTE is returned. */ int32_t utf8_bytes (uint8_t c) { int32_t r; r = utf8_sequence_len[c]; if (r == 0) { return UTF8_BAD_LEADING_BYTE; } return r; } /* This macro converts four bytes of UTF-8 into the corresponding code point. */ #define FOUR(x) \ (((int32_t) (x[0] & 0x07)) << 18) \ | (((int32_t) (x[1] & 0x3F)) << 12) \ | (((int32_t) (x[2] & 0x3F)) << 6) \ | (((int32_t) (x[3] & 0x3F))) /* Try to convert "input" from UTF-8 to UCS-2, and return a value even if the input is partly broken. This checks the first byte of the input, but it doesn't check the subsequent bytes. 
*/ int32_t utf8_no_checks (const uint8_t * input, const uint8_t ** end_ptr) { uint8_t c; c = input[0]; switch (utf8_sequence_len[c]) { case 1: * end_ptr = input + 1; return c; case 2: * end_ptr = input + 2; return (c & 0x1F) << 6 | (input[1] & 0x3F); case 3: * end_ptr = input + 3; return (c & 0x0F) << 12 | (input[1] & 0x3F) << 6 | (input[2] & 0x3F); case 4: * end_ptr = input + 4; return FOUR (input); case 0: /* fall through */ default: return UTF8_BAD_LEADING_BYTE; } } /* This function converts UTF-8 encoded bytes in "input" into the equivalent Unicode code point. The return value is the Unicode code point corresponding to the UTF-8 character in "input" if successful, and a negative number if not successful. "*end_ptr" is set to the next character after the read character on success. "*end_ptr" is set to the start of input on failure. "end_ptr" may not be null. If the first byte of "input" is zero, UNICODE_EMPTY_INPUT is returned. If the first byte of "input" is not valid UTF-8, UTF8_BAD_LEADING_BYTE is returned. If the second or later bytes of "input" are not valid UTF-8, UTF8_BAD_CONTINUATION_BYTE is returned. If the UTF-8 is not in the shortest possible form, the error UTF8_NON_SHORTEST is returned. If the value extrapolated from "input" is greater than UNICODE_MAXIMUM, UNICODE_TOO_BIG is returned. If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE, UNICODE_NOT_CHARACTER is returned. */ int32_t utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr) { uint8_t c; uint8_t l; *end_ptr = input; c = input[0]; if (c == 0) { return UNICODE_EMPTY_INPUT; } l = utf8_sequence_len[c]; if (l == 1) { * end_ptr = input + 1; return c; } if (l == 2) { /* Two byte case. */ if (input[1] < 0x80 || input[1] > 0xBF) { return UTF8_BAD_CONTINUATION_BYTE; } if (c <= 0xC1) { return UTF8_NON_SHORTEST; } * end_ptr = input + 2; return ((int32_t) (c & 0x1F) << 6) | ((int32_t) (input[1] & 0x3F)); } if (l == 3) { /* Three byte case. 
*/ if (input[1] < 0x80 || input[1] > 0xBF || input[2] < 0x80 || input[2] > 0xBF) { return UTF8_BAD_CONTINUATION_BYTE; } if (c == 0xe0 && input[1] < 0xa0) { /* We don't need to check the value of input[2], because the if statement above this one already guarantees that it is 10xxxxxx. */ return UTF8_NON_SHORTEST; } * end_ptr = input + 3; return ((int32_t) (c & 0x0F)) << 12 | ((int32_t) (input[1] & 0x3F)) << 6 | ((int32_t) (input[2] & 0x3F)); } if (l == 4) { /* Four byte case. */ uint8_t d; uint8_t e; uint8_t f; int32_t v; d = input[1]; e = input[2]; f = input[3]; if (/* c must be 11110xxx. */ c >= 0xf8 || /* d, e, f must be 10xxxxxx. */ d < 0x80 || d >= 0xC0 || e < 0x80 || e >= 0xC0 || f < 0x80 || f >= 0xC0) { return UTF8_BAD_CONTINUATION_BYTE; } if (c == 0xf0 && d < 0x90) { /* We don't need to check the values of e and d, because the if statement above this one already guarantees that e and d are 10xxxxxx. */ return UTF8_NON_SHORTEST; } /* Calculate the code point. */ v = FOUR (input); /* Greater than U+10FFFF */ if (v > UNICODE_MAXIMUM) { return UNICODE_TOO_BIG; } /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ if ((v & 0xffff) >= 0xfffe) { return UNICODE_NOT_CHARACTER; } * end_ptr = input + 4; return v; } return UTF8_BAD_LEADING_BYTE; } #define UNI_SUR_HIGH_START 0xD800 #define UNI_SUR_HIGH_END 0xDBFF #define UNI_SUR_LOW_START 0xDC00 #define UNI_SUR_LOW_END 0xDFFF /* Input: a Unicode code point, "ucs2". Output: UTF-8 characters in buffer "utf8". Return value: the number of bytes written into "utf8", or a negative number if there was an error. If the value of "ucs2" is invalid because of being in the surrogate pair range from 0xD800 to 0xDFFF, the return value is UNICODE_SURROGATE_PAIR, else if the value is too big to fit into four bytes of UTF-8, UNICODE_UTF8_4, the return value is UNICODE_TOO_BIG. However, it does not insist on ucs2 being less than UNICODE_MAXIMUM, so the user needs to check that "ucs2" is a valid code point. 
It also does not check for invalid characters, such as 0xFFFF. This adds a zero byte to the end of the string. It assumes that the buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to write to, without checking. */ int32_t ucs2_to_utf8 (int32_t ucs2, uint8_t * utf8) { if (ucs2 < 0x80) { utf8[0] = ucs2; utf8[1] = '\0'; return 1; } if (ucs2 < 0x800) { utf8[0] = (ucs2 >> 6) | 0xC0; utf8[1] = (ucs2 & 0x3F) | 0x80; utf8[2] = '\0'; return 2; } if (ucs2 < 0xFFFF) { utf8[0] = ((ucs2 >> 12) ) | 0xE0; utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80; utf8[2] = ((ucs2 ) & 0x3F) | 0x80; utf8[3] = '\0'; if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) { /* Ill-formed. */ return UNICODE_SURROGATE_PAIR; } return 3; } if (ucs2 <= UNICODE_UTF8_4) { /* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */ utf8[0] = 0xF0 | (ucs2 >> 18); utf8[1] = 0x80 | ((ucs2 >> 12) & 0x3F); utf8[2] = 0x80 | ((ucs2 >> 6) & 0x3F); utf8[3] = 0x80 | ((ucs2 & 0x3F)); utf8[4] = '\0'; return 4; } return UNICODE_TOO_BIG; } /* For shifting by 10 bits. */ #define TEN_BITS 10 #define HALF_BASE 0x0010000UL /* 0b1111111111 */ #define LOW_TEN_BITS 0x3FF /* This converts the Unicode code point in "unicode" into a surrogate pair, and returns the two parts in "* hi_ptr" and "* lo_ptr". Return value: If "unicode" does not need to be a surrogate pair, the error UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr" and "*lo_ptr" are undefined. If the conversion is successful, UNICODE_OK is returned. */ int32_t unicode_to_surrogates (int32_t unicode, int32_t * hi_ptr, int32_t * lo_ptr) { int32_t hi = UNI_SUR_HIGH_START; int32_t lo = UNI_SUR_LOW_START; if (unicode < HALF_BASE) { /* Doesn't need to be a surrogate pair. */ return UNICODE_NOT_SURROGATE_PAIR; } unicode -= HALF_BASE; hi |= ((unicode >> TEN_BITS) & LOW_TEN_BITS); lo |= ((unicode) & LOW_TEN_BITS); * hi_ptr = hi; * lo_ptr = lo; return UNICODE_OK; } /* Convert a surrogate pair in "hi" and "lo" to a single Unicode value. 
The return value is the Unicode value. If the return value is negative, an error has occurred. If "hi" and "lo" do not form a surrogate pair, the error value UNICODE_NOT_SURROGATE_PAIR is returned. https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */ int32_t surrogates_to_unicode (int32_t hi, int32_t lo) { int32_t u; if (hi < UNI_SUR_HIGH_START || hi > UNI_SUR_HIGH_END || lo < UNI_SUR_LOW_START || lo > UNI_SUR_LOW_END) { return UNICODE_NOT_SURROGATE_PAIR; } u = ((hi - UNI_SUR_HIGH_START) << TEN_BITS) + (lo - UNI_SUR_LOW_START) + HALF_BASE; return u; } #undef UNI_SUR_HIGH_START #undef UNI_SUR_HIGH_END #undef UNI_SUR_LOW_START #undef UNI_SUR_LOW_END #undef TEN_BITS #undef HALF_BASE #undef LOW_TEN_BITS /* Convert the surrogate pair in "hi" and "lo" to UTF-8 in "utf8". This calls "surrogates_to_unicode" and "ucs2_to_utf8", thus it can return the same errors as them, and has the same restriction on "utf8" as "ucs2_to_utf8". */ int32_t surrogate_to_utf8 (int32_t hi, int32_t lo, uint8_t * utf8) { int32_t C; C = surrogates_to_unicode (hi, lo); if (C < 0) { return C; } return ucs2_to_utf8 (C, utf8); } /* Given a nul-terminated string "utf8" and a number of Unicode characters "n_chars", return the number of bytes into "utf8" at which the end of the characters occurs. A negative value indicates some kind of error. If "utf8" contains a zero byte, the return value is UNICODE_EMPTY_INPUT. This may also return any of the error values of "utf8_to_ucs2". */ int32_t unicode_chars_to_bytes (const uint8_t * utf8, int32_t n_chars) { int32_t i; const uint8_t * p = utf8; int32_t len = strlen ((const char *) utf8); if (len == 0 && n_chars != 0) { return UNICODE_EMPTY_INPUT; } for (i = 0; i < n_chars; i++) { int32_t ucs2 = utf8_to_ucs2 (p, & p); if (ucs2 < 0) { return ucs2; } } return p - utf8; } /* Like unicode_count_chars, but without error checks or validation of the input. This only checks the first byte of each UTF-8 sequence. 
It may return UTF8_BAD_LEADING_BYTE if the first byte is invalid. */ int32_t unicode_count_chars_fast (const uint8_t * utf8) { int32_t chars; const uint8_t * p; chars = 0; p = utf8; while (*p) { int32_t len; len = utf8_sequence_len[*p]; if (len == 0) { /* The first byte of a UTF-8 sequence is bad, so return this, not BAD_UTF8. */ return UTF8_BAD_LEADING_BYTE; } p += len; chars++; } return chars; } /* Given a nul-terminated string "utf8", return the total number of Unicode characters it contains. Return value If an error occurs, this may return UTF8_BAD_LEADING_BYTE or any of the errors of "utf8_to_ucs2". */ int32_t unicode_count_chars (const uint8_t * utf8) { int32_t chars = 0; const uint8_t * p = utf8; int32_t len = strlen ((const char *) utf8); if (len == 0) { return 0; } while (p - utf8 < len) { int32_t ucs2; ucs2 = utf8_to_ucs2 (p, & p); if (ucs2 < 0) { /* Return the error from utf8_to_ucs2. */ return ucs2; } chars++; if (*p == '\0') { return chars; } } /* Cannot be reached in practice, since strlen indicates the null byte. */ return UTF8_BAD_LEADING_BYTE; } #ifdef HEADER /* These are intended for use in switch statements, for example switch (c) { case BYTE_80_8F: do_something; They originally come from the Json3 project. 
*/ #define BYTE_80_8F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F #define BYTE_80_9F \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F #define BYTE_80_BF \ 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: \ case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: \ case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: \ case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: \ case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: \ case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: \ case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: \ case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: \ case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: \ case 0xBF #define BYTE_90_BF \ 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: \ case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: \ case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: \ case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: \ case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: \ case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: \ case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_A0_BF \ 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: \ case 0xA7: case 0xA8: case 0xA9: case 
0xAA: case 0xAB: case 0xAC: case 0xAD: \ case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: \ case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: \ case 0xBC: case 0xBD: case 0xBE: case 0xBF #define BYTE_C2_DF \ 0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: \ case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: \ case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: \ case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: \ case 0xDE: case 0xDF #define BYTE_E1_EC \ 0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: \ case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC #define BYTE_EE_EF \ 0xEE: case 0xEF #define BYTE_F1_F3 \ 0xF1: case 0xF2: case 0xF3 #endif /* def HEADER */ #define UNICODEADDBYTE i++ #define UNICODEFAILUTF8(want) return UTF8_INVALID #define UNICODENEXTBYTE c = input[i] /* Given "input" and "input_length", validate "input" byte by byte up to "input_length". The return value may be UTF8_VALID or UTF8_INVALID. */ int32_t valid_utf8 (const uint8_t * input, int32_t input_length) { int32_t i; uint8_t c; i = 0; string_start: i++; if (i >= input_length) { return UTF8_VALID; } /* Set c separately here since we use a range comparison before the switch statement. */ c = input[i]; /* Admit all bytes <= 0x80. 
*/ if (c <= 0x80) { goto string_start; } switch (c) { case BYTE_C2_DF: UNICODEADDBYTE; goto byte_last_80_bf; case 0xE0: UNICODEADDBYTE; goto byte23_a0_bf; case BYTE_E1_EC: UNICODEADDBYTE; goto byte_penultimate_80_bf; case 0xED: UNICODEADDBYTE; goto byte23_80_9f; case BYTE_EE_EF: UNICODEADDBYTE; goto byte_penultimate_80_bf; case 0xF0: UNICODEADDBYTE; goto byte24_90_bf; case BYTE_F1_F3: UNICODEADDBYTE; goto byte24_80_bf; case 0xF4: UNICODEADDBYTE; goto byte24_80_8f; } byte_last_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_BF: UNICODEADDBYTE; goto string_start; default: UNICODEFAILUTF8 (XBYTES_80_BF); } byte_penultimate_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_BF: UNICODEADDBYTE; goto byte_last_80_bf; default: UNICODEFAILUTF8 (XBYTES_80_BF); } byte24_90_bf: switch (UNICODENEXTBYTE) { case BYTE_90_BF: UNICODEADDBYTE; goto byte_penultimate_80_bf; default: UNICODEFAILUTF8 (XBYTES_90_BF); } byte23_80_9f: switch (UNICODENEXTBYTE) { case BYTE_80_9F: UNICODEADDBYTE; goto byte_last_80_bf; default: UNICODEFAILUTF8 (XBYTES_80_9F); } byte23_a0_bf: switch (UNICODENEXTBYTE) { case BYTE_A0_BF: UNICODEADDBYTE; goto byte_last_80_bf; default: UNICODEFAILUTF8 (XBYTES_A0_BF); } byte24_80_bf: switch (UNICODENEXTBYTE) { case BYTE_80_BF: UNICODEADDBYTE; goto byte_penultimate_80_bf; default: UNICODEFAILUTF8 (XBYTES_80_BF); } byte24_80_8f: switch (UNICODENEXTBYTE) { case BYTE_80_8F: UNICODEADDBYTE; goto byte_penultimate_80_bf; default: UNICODEFAILUTF8 (XBYTES_80_8F); } } /* Make "* ptr" point to the start of the first UTF-8 character after its initial value. This assumes that there are at least four bytes which can be read, and that "* ptr" points to valid UTF-8. If "** ptr" does not have its top bit set, 00xx_xxxx, this does not change the value of "* ptr", and it returns UNICODE_OK. If "** ptr" has its top two bits set, 11xx_xxxx, this does not change the value of "* ptr" and it returns UNICODE_OK. 
If "**ptr" has its top bit set but its second-to-top bit unset, 10xx_xxxx, so it is the second, third, or fourth byte of a multibyte sequence, "* ptr" is incremented until either "** ptr" is a valid first byte of a UTF-8 sequence, or too many bytes have passed for it to be valid UTF-8. If too many bytes have passed, UTF8_BAD_CONTINUATION_BYTE is returned and "*ptr" is left unchanged. If a valid UTF-8 first byte was found, either 11xx_xxxx or 00xx_xxxx, UNICODE_OK is returned, and "*ptr" is set to the address of the valid byte. Nul bytes (bytes containing zero) are considered valid. This does not check for invalid UTF-8 bytes such as 0xFE and 0xFF. */ int32_t trim_to_utf8_start (uint8_t ** ptr) { uint8_t * p = *ptr; uint8_t c; int32_t i; /* 0xC0 = 1100_0000. */ c = *p & 0xC0; if (c == 0xC0 || c == 0x00) { return UNICODE_OK; } for (i = 0; i < UTF8_MAX_LENGTH - 1; i++) { c = p[i]; if ((c & 0x80) != 0x80 || (c & 0x40) != 0) { * ptr = p + i; return UNICODE_OK; } } return UTF8_BAD_CONTINUATION_BYTE; } /* Given a return value "code" which is negative or zero, return a string which describes what the return value means. Positive non-zero return values never indicate errors or statuses in this library. 
*/ const char * unicode_code_to_error (int32_t code) { switch (code) { case UTF8_BAD_LEADING_BYTE: return "The leading byte of a UTF-8 sequence was invalid"; case UTF8_BAD_CONTINUATION_BYTE: return "A continuation byte of a UTF-8 sequence was invalid"; case UNICODE_SURROGATE_PAIR: return "A surrogate pair code point could not be converted to UTF-8"; case UNICODE_NOT_SURROGATE_PAIR: return "Input code points did not form a surrogate pair"; case UNICODE_OK: return "Successful completion"; case UNICODE_TOO_BIG: return "A code point was beyond limits"; case UNICODE_NOT_CHARACTER: return "A number ending in hex FFFF or FFFE is not valid Unicode"; case UTF8_NON_SHORTEST: return "A UTF-8 input was not in the shortest form"; case UNICODE_EMPTY_INPUT: return "A byte with value zero was found in UTF-8 input"; default: return "Unknown/invalid error code"; } } /* _____ _ |_ _|__ ___| |_ ___ | |/ _ \/ __| __/ __| | | __/\__ \ |_\__ \ |_|\___||___/\__|___/ */ /* Below this is code for testing which is not normally compiled. Use "make test" to compile the testing version. */ #ifdef TEST #include #include #include "c-tap-test.h" static const uint8_t * utf8 = (uint8_t *) "漢数字ÔÕÖX"; #define BUFFSIZE 0x100 static void test_ucs2_to_utf8 () { /* Buffer to print utf8 out into. */ uint8_t buffer[BUFFSIZE]; /* Offset into buffer. 
*/ uint8_t * offset; const uint8_t * start = utf8; offset = buffer; while (1) { int32_t unicode; int32_t bytes; const uint8_t * end; unicode = utf8_to_ucs2 (start, & end); if (unicode == UNICODE_EMPTY_INPUT) { break; } if (unicode < 0) { fprintf (stderr, "%s:%d: unexpected error %s converting unicode.\n", __FILE__, __LINE__, unicode_code_to_error (unicode)); // exit ok in test exit (EXIT_FAILURE); } bytes = ucs2_to_utf8 (unicode, offset); TAP_TEST_MSG (bytes > 0, "no bad conversion"); TAP_TEST_MSG (strncmp ((const char *) offset, (const char *) start, bytes) == 0, "round trip OK for %X (%d bytes)", unicode, bytes); start = end; offset += bytes; if (offset - buffer >= BUFFSIZE) { fprintf (stderr, "%s:%d: out of space in buffer.\n", __FILE__, __LINE__); // exit ok exit (EXIT_FAILURE); } } * offset = '\0'; TAP_TEST_MSG (strcmp ((const char *) buffer, (const char *) utf8) == 0, "input %s resulted in identical output %s", utf8, buffer); } static void test_invalid_utf8 () { uint8_t invalid_utf8[UTF8_MAX_LENGTH]; int32_t unicode; int32_t valid; const uint8_t * end; snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1, "%c%c%c", 0xe8, 0xe4, 0xe5); unicode = utf8_to_ucs2 (invalid_utf8, & end); TAP_TEST_MSG (unicode == UTF8_BAD_CONTINUATION_BYTE, "invalid UTF-8 gives incorrect result"); valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8)); TAP_TEST_MSG (valid == UTF8_INVALID, "Invalid UTF-8 fails valid_utf8"); } static void test_surrogate_pairs () { int32_t status; int32_t hi; int32_t lo; int32_t rt; /* This is the wide character space, which does not require representation as a surrogate pair. */ int32_t nogood = 0x3000; /* Two examples from the Wikipedia article on UTF-16 https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. 
*/ int32_t wikipedia_1 = 0x10437; int32_t wikipedia_2 = 0x24b62; /* An example from the JSON RFC http://rfc7159.net/rfc7159#rfc.section.7 */ int32_t json_spec = 0x1D11E; status = unicode_to_surrogates (nogood, & hi, & lo); TAP_TEST_MSG (status == UNICODE_NOT_SURROGATE_PAIR, "low value to surrogate pair breaker returns error"); status = unicode_to_surrogates (wikipedia_1, & hi, & lo); TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1); TAP_TEST_MSG (hi == 0xD801, "Got expected %X == 0xD801", hi); TAP_TEST_MSG (lo == 0xDC37, "Got expected %X == 0xDC37", lo); rt = surrogates_to_unicode (hi, lo); TAP_TEST_MSG (rt == wikipedia_1, "Round trip %X == initial %X", rt, wikipedia_1); status = unicode_to_surrogates (wikipedia_2, & hi, & lo); TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", wikipedia_1); TAP_TEST_MSG (hi == 0xD852, "Got expected %X == 0xD852", hi); TAP_TEST_MSG (lo == 0xDF62, "Got expected %X == 0xDF62", lo); rt = surrogates_to_unicode (hi, lo); TAP_TEST_MSG (rt == wikipedia_2, "Round trip %X == initial %X", rt, wikipedia_2); status = unicode_to_surrogates (json_spec, & hi, & lo); TAP_TEST_MSG (status == UNICODE_OK, "Ok with %X", json_spec); TAP_TEST_MSG (hi == 0xD834, "Got expected %X == 0xD834", hi); TAP_TEST_MSG (lo == 0xDd1e, "Got expected %X == 0xDD1e", lo); rt = surrogates_to_unicode (hi, lo); TAP_TEST_MSG (rt == json_spec, "Round trip %X == initial %X", rt, json_spec); } /* Test sending various bytes into "utf8_bytes" and seeing whether the return value is what we expected. */ static void test_utf8_bytes () { struct tub { int32_t first; int32_t expect; } tests[] = { {'a', 1}, {0xb0, UTF8_BAD_LEADING_BYTE}, {0xc2, 2}, {0xff, UTF8_BAD_LEADING_BYTE}, }; int32_t n_tests = sizeof (tests) / sizeof (struct tub); int32_t i; for (i = 0; i < n_tests; i++) { /* Expected bytes. 
*/ int32_t xbytes; int32_t firstbyte; firstbyte = tests[i].first; xbytes = utf8_bytes (firstbyte); TAP_TEST_MSG (xbytes == tests[i].expect, "Got %d (%d) with input %d", xbytes, tests[i].expect, firstbyte); } } /* Test the conversion from utf-8 to ucs-2 (UTF-16). */ static void test_utf8_to_ucs2 () { const uint8_t * start = utf8; while (*start) { int32_t unicode; const uint8_t * end; unicode = utf8_to_ucs2 (start, & end); TAP_TEST_MSG (unicode > 0, "no bad value at %s", start); printf ("# %s is %04X, length is %d\n", start, unicode, end - start); start = end; } } /* Test counting of unicode characters. */ static void test_unicode_count_chars () { int32_t cc; cc = unicode_count_chars (utf8); TAP_TEST_MSG (cc == 7, "unicode_count_chars gets seven characters for utf8"); cc = unicode_count_chars_fast (utf8); TAP_TEST_MSG (cc == 7, "unicode_count_chars_fast gets seven characters for utf8"); } static void test_valid_utf8 () { int32_t valid; valid = valid_utf8 (utf8, strlen ((const char *) utf8)); TAP_TEST_MSG (valid == UTF8_VALID, "Valid UTF-8 passes valid_utf8"); } static void test_trim_to_utf8_start () { int32_t status; uint8_t * p; /* Invalid UTF-8. */ uint8_t bad[] = {0x99, 0x99, 0x99, 0x99, 0x99, 0x99}; /* Valid UTF-8. 
*/ uint8_t good[] = "化苦"; uint8_t good2[] = "化abc"; p = bad; status = trim_to_utf8_start (& p); TAP_TEST_MSG (status == UTF8_BAD_CONTINUATION_BYTE, "Non-UTF-8 causes error"); TAP_TEST_MSG (p == bad, "Did not change pointer"); p = good + 1; status = trim_to_utf8_start (& p); TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result"); TAP_TEST_MSG (p != good + 1, "Moved p"); TAP_TEST_MSG (p == good + 3, "Moved p to the right position"); p = good2 + 1; status = trim_to_utf8_start (& p); TAP_TEST_MSG (status == UNICODE_OK, "Got TAP_TEST_MSG result"); TAP_TEST_MSG (p != good2 + 1, "Moved p"); TAP_TEST_MSG (p == good2 + 3, "Moved p to the right position"); } static void test_constants () { TAP_TEST (UNICODE_UTF8_4 > UNICODE_MAXIMUM); } int main () { test_utf8_to_ucs2 (); test_ucs2_to_utf8 (); test_invalid_utf8 (); test_unicode_count_chars (); test_surrogate_pairs (); test_utf8_bytes (); test_valid_utf8 (); test_trim_to_utf8_start (); test_constants (); TAP_PLAN; } #endif /* def TEST */ JSON-Parse-0.56/META.yml000644 001751 001751 00000001407 13622411134 014111 0ustar00benben000000 000000 --- abstract: 'Read JSON into a Perl variable' author: - 'Ben Bullock ' build_requires: ExtUtils::MakeMaker: '0' configure_requires: ExtUtils::MakeMaker: '0' dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: JSON-Parse no_index: directory: - t - inc requires: Carp: '0' perl: '5.008009' resources: bugtracker: https://github.com/benkasminbullock/JSON-Parse/issues repository: git://github.com/benkasminbullock/JSON-Parse.git version: '0.56' x_contributors: - 'Shlomi Fish ' - kolmogorov42 x_serialization_backend: 'CPAN::Meta::YAML version 0.018' JSON-Parse-0.56/META.json000644 001751 001751 00000002526 13622411134 014264 0ustar00benben000000 000000 { "abstract" : "Read JSON into a Perl variable", "author" : [ "Ben Bullock " ], 
"dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "JSON-Parse", "no_index" : { "directory" : [ "t", "inc" ] }, "prereqs" : { "build" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "runtime" : { "requires" : { "Carp" : "0", "perl" : "5.008009" } } }, "release_status" : "stable", "resources" : { "bugtracker" : { "web" : "https://github.com/benkasminbullock/JSON-Parse/issues" }, "repository" : { "type" : "git", "url" : "git://github.com/benkasminbullock/JSON-Parse.git", "web" : "https://github.com/benkasminbullock/JSON-Parse" } }, "version" : "0.56", "x_contributors" : [ "Shlomi Fish ", "kolmogorov42" ], "x_serialization_backend" : "JSON::PP version 2.97001" } JSON-Parse-0.56/lib/JSON/000755 001751 001751 00000000000 13622411134 014155 5ustar00benben000000 000000 JSON-Parse-0.56/lib/JSON/Parse.pod000644 001751 001751 00000156140 13622411131 015737 0ustar00benben000000 000000 =pod =encoding UTF-8 =head1 NAME JSON::Parse - Read JSON into a Perl variable =head1 SYNOPSIS use JSON::Parse 'parse_json'; my $json = '["golden", "fleece"]'; my $perl = parse_json ($json); # Same effect as $perl = ['golden', 'fleece']; Convert JSON into Perl. =head1 VERSION This documents version 0.56 of JSON::Parse corresponding to L released on Mon Feb 17 13:10:15 2020 +0900. =head1 DESCRIPTION A module for parsing JSON. (JSON means "JavaScript Object Notation" and it is specified in L.) JSON::Parse offers the function L, which takes a string containing JSON, and returns an equivalent Perl structure. It also offers validation of JSON via L, which returns true or false depending on whether the JSON is correct or not, and L, which produces a descriptive fatal error if the JSON is invalid. 
A function L reads JSON from a file, and there is a safer version of L called L which doesn't throw exceptions. For special cases of parsing, there are also methods L and L, which create a JSON parsing object and run it on text. See L. JSON::Parse accepts only UTF-8 as input. See L and L. =head1 FUNCTIONS =head2 parse_json use JSON::Parse 'parse_json'; my $perl = parse_json ('{"x":1, "y":2}'); This function converts JSON into a Perl structure, either an array reference, a hash reference, or a scalar. If the first argument does not contain a complete valid JSON text, is the undefined value, an empty string, or a string containing only whitespace C throws a fatal error ("dies"). If the argument contains valid JSON, the return value is either a hash reference, an array reference, or a scalar. If the input JSON text is a serialized object, a hash reference is returned: use JSON::Parse ':all'; my $perl = parse_json ('{"a":1, "b":2}'); print ref $perl, "\n"; produces output HASH (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/hash.pl> in the distribution.) If the input JSON text is a serialized array, an array reference is returned: use JSON::Parse ':all'; my $perl = parse_json ('["a", "b", "c"]'); print ref $perl, "\n"; produces output ARRAY (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/array.pl> in the distribution.) Otherwise a Perl scalar is returned. The behaviour of allowing a scalar was added in version 0.32 of this module. This brings it into line with the new specification for JSON. The behaviour of disallowing empty inputs was changed in version 0.49. This makes it conform to the L. The function L offers a version of this function with various safety features enabled. 
=head2 json_file_to_perl use JSON::Parse 'json_file_to_perl'; my $p = json_file_to_perl ('filename'); This is exactly the same as L except that it reads the JSON from the specified file rather than a scalar. The file must be in the UTF-8 encoding, and is opened as a character file using C<:encoding(UTF-8)> (see L and L for details). The output is marked as character strings. This is a convenience function written in Perl. You may prefer to read the file yourself using another module if you need faster performance. =head2 valid_json use JSON::Parse 'valid_json'; if (valid_json ($json)) { # do something } C returns I<1> if its argument is valid JSON and I<0> if not. It runs several times faster than L. This gain in speed is obtained because it discards the input data after reading it, rather than storing it into Perl variables. This does not supply the actual errors which caused invalidity. Use L to get error messages when the JSON is invalid. This cannot detect key collisions in the JSON since it does not store values. See L for more on this module's handling of non-unique names in the JSON. =head2 assert_valid_json use JSON::Parse 'assert_valid_json'; eval { assert_valid_json ('["xyz":"b"]'); }; if ($@) { print "Your JSON was invalid: $@\n"; } # Prints "Unexpected character ':' parsing array" produces output Your JSON was invalid: JSON error at line 1, byte 7/11: Unexpected character ':' parsing array starting from byte 1: expecting whitespace: '\n', '\r', '\t', ' ' or comma: ',' or end of array: ']' at /usr/home/ben/projects/Json3/examples/assert.pl line 6. (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/assert.pl> in the distribution.) This is the underlying function for L. It runs at the same speed, but it throws an error if the JSON is wrong, rather than returning 1 or 0. See L for the error format, which is identical to L. This cannot detect key collisions in the JSON since it does not store values. 
See L for more on this module's handling of non-unique names in the JSON. The behaviour of disallowing empty inputs was changed in version 0.49. This makes it conform to the L, and also makes it give identical results to L. =head2 parse_json_safe This is almost the same thing as L, but has the following differences: =over =item Does not throw exceptions If the JSON is invalid, a warning is printed and the undefined value is returned, as if calling L like this: eval { $out = parse_json ($json); }; if ($@) { carp $@; $out = undef; } =item Detects key collisions This switches on L, so that if the JSON contains non-unique names, a warning is printed and the undefined value is returned. See L for an explanation of what a key collision is. =item Booleans are not read-only This switches on L so that JSON true, false and null values are copied. These values can be modified, but they will not be converted back into C and C by L. =item Errors are reported by carp Parsing errors are reported by L, so the error line number refers to the caller's line. =back As the name implies, this is meant to be a "safety-first" version of L. This function does not pass all of the tests of the L, because it creates an error for duplicate keys in objects, which is legal JSON. See F for details. This function was added in version 0.38. =head1 OLD INTERFACE The following alternative function names are accepted. These are the names used for the functions in old versions of this module. These names are not deprecated and will never be removed from the module. =head2 json_to_perl This is exactly the same function as L. =head2 validate_json This is exactly the same function as L. =head1 Mapping from JSON to Perl JSON elements are mapped to Perl as follows: =head2 JSON numbers JSON numbers become Perl numbers, either integers or double-precision floating point numbers, or possibly strings containing the number if parsing of a number by the usual methods fails somehow. 
JSON does not allow leading zeros, like I<0123>, or leading plus signs, like I<+100>, in numbers, so these cause an L error. JSON also does not allow numbers of the form I<1.>, but it does allow things like I<0e0> or I<1E999999>. As far as possible these are accepted by JSON::Parse. =head2 JSON strings JSON strings become Perl strings. The JSON escape characters such as C<\t> for the tab character (see section 2.5 of L) are mapped to the equivalent ASCII character. =head3 Handling of Unicode Inputs must be in the UTF-8 format. See L. If the input to L is marked as Unicode characters, the output strings will be marked as Unicode characters. If the input is not marked as Unicode characters, the output strings will not be marked as Unicode characters. Thus, use JSON::Parse ':all'; # The scalar $sasori looks like Unicode to Perl use utf8; my $sasori = '["蠍"]'; my $p = parse_json ($sasori); print utf8::is_utf8 ($p->[0]); # Prints 1. but use JSON::Parse ':all'; # The scalar $ebi does not look like Unicode to Perl no utf8; my $ebi = '["海老"]'; my $p = parse_json ($ebi); print utf8::is_utf8 ($p->[0]); # Prints nothing. Escapes of the form \uXXXX (see page three of L) are mapped to ASCII if XXXX is less than 0x80, or to UTF-8 if XXXX is greater than or equal to 0x80. Strings containing \uXXXX escapes greater than 0x80 are also upgraded to character strings, regardless of whether the input is a character string or a byte string, thus regardless of whether Perl thinks the input string is Unicode, escapes like \u87f9 are converted into the equivalent UTF-8 bytes and the particular string in which they occur is marked as a character string: use JSON::Parse ':all'; no utf8; # 蟹 my $kani = '["\u87f9"]'; my $p = parse_json ($kani); print "It's marked as a character string" if utf8::is_utf8 ($p->[0]); # Prints "It's marked as a character string" because it's upgraded # regardless of the input string's flags. 
This is modelled on the behaviour of Perl's C: no utf8; my $kani = '87f9'; print "hex is character string\n" if utf8::is_utf8 ($kani); # prints nothing $kani = chr (hex ($kani)); print "chr makes it a character string\n" if utf8::is_utf8 ($kani); # prints "chr makes it a character string" However, JSON::Parse also upgrades the remaining part of the string into a character string, even when it's not marked as a character string. For example, use JSON::Parse ':all'; use Unicode::UTF8 'decode_utf8'; no utf8; my $highbytes = "か"; my $not_utf8 = "$highbytes\\u3042"; my $test = "{\"a\":\"$not_utf8\"}"; my $out = parse_json ($test); # JSON::Parse does something unusual here in promoting the first part # of the string into UTF-8. print "JSON::Parse gives this: ", $out->{a}, "\n"; # Perl cannot assume that $highbytes is in UTF-8, so it has to just # turn the initial characters into garbage. my $add_chr = $highbytes . chr (0x3042); print "Perl's output is like this: ", $add_chr, "\n"; # In fact JSON::Parse's behaviour is equivalent to this: my $equiv = decode_utf8 ($highbytes) . chr (0x3042); print "JSON::Parse did something like this: ", $equiv, "\n"; # With character strings switched on, Perl and JSON::Parse do the same # thing. use utf8; my $is_utf8 = "か"; my $test2 = "{\"a\":\"$is_utf8\\u3042\"}"; my $out2 = parse_json ($test2); print "JSON::Parse: ", $out2->{a}, "\n"; my $add_chr2 = $is_utf8 . chr (0x3042); print "Native Perl: ", $add_chr2, "\n"; produces output JSON::Parse gives this: かあ Perl's output is like this: かあ JSON::Parse did something like this: かあ JSON::Parse: かあ Native Perl: かあ (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/unicode-details.pl> in the distribution.) Although in general the above would be an unsafe practice, JSON::Parse can do things this way because JSON is a text-only, Unicode-only format. 
To ensure that invalid inputs are never upgraded, JSON::Parse checks each input byte to make sure that it forms UTF-8. See also L. Doing things this way, rather than the way that Perl does it, was one of the original motivations for writing this module. See also L. Surrogate pairs in the form C<\uD834\uDD1E> are also handled. If the second half of the surrogate pair is missing, an L or L error is thrown. If the second half of the surrogate pair is present but contains an impossible value, a L error is thrown. =head2 JSON arrays JSON arrays become Perl array references. The elements of the Perl array are in the same order as they appear in the JSON. Thus my $p = parse_json ('["monday", "tuesday", "wednesday"]'); has the same result as a Perl declaration of the form my $p = [ 'monday', 'tuesday', 'wednesday' ]; =head2 JSON objects JSON objects become Perl hashes. The members of the JSON object become key and value pairs in the Perl hash. The string part of each object member becomes the key of the Perl hash. The value part of each member is mapped to the value of the Perl hash. Thus my $j = < ['blue', 'black'], tuesday => ['grey', 'heart attack'], friday => 'Gotta get down on Friday', }; =head3 Key collisions A key collision is something like the following. use JSON::Parse qw/parse_json parse_json_safe/; my $j = '{"a":1, "a":2}'; my $p = parse_json ($j); print "Ambiguous key 'a' is ", $p->{a}, "\n"; my $q = parse_json_safe ($j); produces output JSON::Parse::parse_json_safe: JSON error at line 1, byte 10/14: Name is not unique: "a" parsing object starting from byte 1 at /usr/home/ben/projects/Json3/examples/key-collision.pl line 8. Ambiguous key 'a' is 2 (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/key-collision.pl> in the distribution.) Here the key "a" could be either 1 or 2. As seen in the example, L overwrites the first value with the second value. L halts and prints a warning. 
If you use L you can switch key collision on and off with the L method. The rationale for L not to give warnings is that Perl doesn't give information about collisions when storing into hash values, and checking for collisions for every key will degrade performance for the sake of an unlikely occurrence. The JSON specification says "The names within an object SHOULD be unique." (see L, page 5), although it's not a requirement. For performance, L and L do not store hash keys, thus they cannot detect this variety of problem. =head2 Literals =head3 null L maps the JSON null literal to a readonly scalar C<$JSON::Parse::null> which evaluates to C. L maps the JSON literal to the undefined value. If you use a parser created with L, you can choose either of these behaviours with L, or you can tell JSON::Parse to put your own value in place of nulls using the L method. =head3 true L maps the JSON true literal to a readonly scalar which evaluates to C<1>. L maps the JSON literal to the value 1. If you use a parser created with L, you can choose either of these behaviours with L, or you can tell JSON::Parse to put your own value in place of trues using the L method. =head3 false L maps the JSON false literal to a readonly scalar which evaluates to the empty string, or to zero in a numeric context. (This behaviour changed from version 0.36 to 0.37. In versions up to 0.36, the false literal was mapped to a readonly scalar which evaluated to 0 only.) L maps the JSON literal to a similar scalar without the readonly constraints. If you use a parser created with L, you can choose either of these behaviours with L, or you can tell JSON::Parse to put your own value in place of falses using the L method. =head3 Round trips and compatibility The Perl versions of literals produced by L will be converted back to JSON literals if you use L. However, JSON::Parse's literals are incompatible with the other CPAN JSON modules. 
For compatibility with other CPAN modules, create a JSON::Parse object with L, and set JSON::Parse's literals with L, L, and L. This example demonstrates round-trip compatibility using L version 0.54: use JSON::Tiny '0.54', qw(decode_json encode_json); use JSON::Parse; use JSON::Create; my $cream = '{"clapton":true,"hendrix":false,"bruce":true,"fripp":false}'; my $jp = JSON::Parse->new (); my $jc = JSON::Create->new (); print "First do a round-trip of our modules:\n\n"; print $jc->run ($jp->run ($cream)), "\n\n"; print "Now do a round-trip of JSON::Tiny:\n\n"; print encode_json (decode_json ($cream)), "\n\n"; print "First, incompatible mode:\n\n"; print 'tiny(parse): ', encode_json ($jp->run ($cream)), "\n"; print 'create(tiny): ', $jc->run (decode_json ($cream)), "\n\n"; $jp->set_true (JSON::Tiny::true); $jp->set_false (JSON::Tiny::false); print "Compatibility with JSON::Parse:\n\n"; print 'tiny(parse):', encode_json ($jp->run ($cream)), "\n\n"; $jc->bool ('JSON::Tiny::_Bool'); print "Compatibility with JSON::Create:\n\n"; print 'create(tiny):', $jc->run (decode_json ($cream)), "\n\n"; print "JSON::Parse and JSON::Create are still compatible too:\n\n"; print $jc->run ($jp->run ($cream)), "\n"; produces output First do a round-trip of our modules: {"hendrix":false,"clapton":true,"fripp":false,"bruce":true} Now do a round-trip of JSON::Tiny: {"bruce":true,"clapton":true,"fripp":false,"hendrix":false} First, incompatible mode: tiny(parse): {"bruce":1,"clapton":1,"fripp":"","hendrix":""} create(tiny): {"fripp":0,"bruce":1,"clapton":1,"hendrix":0} Compatibility with JSON::Parse: tiny(parse):{"bruce":true,"clapton":true,"fripp":false,"hendrix":false} Compatibility with JSON::Create: create(tiny):{"hendrix":false,"fripp":false,"bruce":true,"clapton":true} JSON::Parse and JSON::Create are still compatible too: {"fripp":false,"bruce":true,"clapton":true,"hendrix":false} (This example is included as 
L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/json-tiny-round-trip-demo.pl> in the distribution.) Most of the other CPAN modules use similar methods to L, so the above example can easily be adapted. See also L for various examples. =head3 Modifying the values L maps all the literals to read-only values. Because of this, attempting to modifying the boolean values in the hash reference returned by L will cause "Modification of a read-only value attempted" errors: my $in = '{"hocus":true,"pocus":false,"focus":null}'; my $p = json_parse ($in); $p->{hocus} = 99; # "Modification of a read-only value attempted" error occurs Since the hash values are read-only scalars, C<< $p->{hocus} = 99 >> is like this: undef = 99; If you need to modify the returned hash reference, then delete the value first: my $in = '{"hocus":true,"pocus":false,"focus":null}'; my $p = json_parse ($in); delete $p->{pocus}; $p->{pocus} = 99; # OK Similarly with array references, delete the value before altering: my $in = '[true,false,null]'; my $q = json_parse ($in); delete $q->[1]; $q->[1] = 'magic'; Note that the return values from parsing bare literals are not read-only scalars, so my $true = JSON::Parse::json_parse ('true'); $true = 99; produces no error. This is because Perl copies the scalar. =head1 METHODS If you need to parse JSON and you are not satisfied with the parsing options offered by L and L, you can create a JSON parsing object with L and set various options on the object, then use it with L. These options include the ability to copy JSON literals with L, switch off fatal errors with L, detect key collisions in objects with L, and set the JSON literals to user defined values with the methods described under L. These methods only work on an object created with L; they do not affect the behaviour of L or L. =head2 new my $jp = JSON::Parse->new (); Create a new JSON::Parse object. This method was added in version 0.38. 
=head2 run my $out = $jp->run ($json); This does the same thing as L, except its behaviour can be modified using the methods below. This method was added in version 0.38. =head2 check eval { $jp->check ($json); }; This does the same thing as L, except its behaviour can be modified using the methods below. Only the L method will actually affect this. This method was added in version 0.48, for the benefit of L. =head2 copy_literals $jp->copy_literals (1); With a true value, copy JSON literal values (C, C, and C) into new Perl scalar values, and don't put read-only values into the output. With a false value, use read-only scalars: $jp->copy_literals (0); The C behaviour is the behaviour of L. The C behaviour is the behaviour of L. If the user also sets user-defined literals with L, L and L, that takes precedence over this. This method was added in version 0.38. =head2 warn_only $jp->warn_only (1); Warn, don't die, on error. Failed parsing returns the undefined value, C, and prints a warning. This can be switched off again using any false value: $jp->warn_only (''); This method was documented in version 0.38, but only implemented in version 0.41. =head2 detect_collisions $jp->detect_collisions (1); This switches on a check for hash key collisions (non-unique names in JSON objects). If a collision is found, an error message L is printed, which also gives the non-unique name and the byte position where the start of the colliding string was found: use JSON::Parse; my $jp = JSON::Parse->new (); $jp->detect_collisions (1); eval { $jp->run ('{"animals":{"cat":"moggy","cat":"feline","cat":"neko"}}'); }; print "$@\n" if $@; produces output JSON error at line 1, byte 28/55: Name is not unique: "cat" parsing object starting from byte 12 at /usr/home/ben/projects/Json3/blib/lib/JSON/Parse.pm line 103. (This example is included as L|https://fastapi.metacpan.org/source/BKB/JSON-Parse-0.56/examples/collide.pl> in the distribution.) The C behaviour is the behaviour of L. 
The C behaviour is the behaviour of L. This method was added in version 0.38. =head2 diagnostics_hash $jp->diagnostics_hash (1); This changes diagnostics produced by errors from a simple string into a hash reference containing various fields. This is experimental and subject to change. This is incompatible with L. This replaces the previous experimental global variable C<$json_diagnostics>, which was removed from the module. The hash keys and values are identical to those provided in the object returned by C<$json_diagnostics>, with the addition of a key C which returns the usual error. This requires Perl version 5.14 or later. This method was added in version 0.46. =head2 Methods for manipulating literals These methods alter what is written into the Perl structure when the parser sees a literal value, C, C or C in the input JSON. This number of methods is unfortunately necessary, since it's possible that a user might want to set_false (undef) to set false values to turn into undefs. $jp->set_false (undef); Thus, we cannot use a single function C<< $jp->false (undef) >> to cover both setting and deleting of values. These methods were added in version 0.38. =head3 set_true $jp->set_true ("Yes, that is so true"); Supply a scalar to be used in place of the JSON C literal. This example puts the string "Yes, that is so true" into the hash or array when we hit a "true" literal, rather than the default read-only scalar: use JSON::Parse; my $json = '{"yes":true,"no":false}'; my $jp = JSON::Parse->new (); $jp->set_true ('Yes, that is so true'); my $out = $jp->run ($json); print $out->{yes}, "\n"; prints Yes, that is so true To override the previous value, call it again with a new value. To delete the value and revert to the default behaviour, use L. If you give this a value which is not "true", as in Perl will evaluate it as a false in an if statement, it prints a warning L. You can switch this warning off with L. This method was added in version 0.38. 
=head3 delete_true $jp->delete_true (); Delete the user-defined true value. See L. This method is "safe" in that it has absolutely no effect if no user-defined value is in place. It does not return a value. This method was added in version 0.38. =head3 set_false $jp->set_false (JSON::PP::Boolean::false); Supply a scalar to be used in place of the JSON C literal. In the above example, when we hit a "false" literal, we put C in the output, similar to L and other CPAN modules like L or L. To override the previous value, call it again with a new value. To delete the value and revert to the default behaviour, use L. If you give this a value which is not "false", as in Perl will evaluate it as a false in an if statement, it prints a warning L. You can switch this warning off with L. This method was added in version 0.38. =head3 delete_false $jp->delete_false (); Delete the user-defined false value. See L. This method is "safe" in that it has absolutely no effect if no user-defined value is in place. It does not return a value. This method was added in version 0.38. =head3 set_null $jp->set_null (0); Supply a scalar to be used in place of the JSON C literal. To override the previous value, call it again with a new value. To delete the value and revert to the default behaviour, use L. This method was added in version 0.38. =head3 delete_null $jp->delete_null (); Delete the user-defined null value. See L. This method is "safe" in that it has absolutely no effect if no user-defined value is in place. It does not return a value. This method was added in version 0.38. =head3 no_warn_literals $jp->no_warn_literals (1); Use a true value to switch off warnings about setting boolean values to contradictory things. For example if you want to set the JSON C literal to turn into the string "false", $jp->no_warn_literals (1); $jp->set_false ("false"); See also L. This also switches off the warning L. This method was added in version 0.38. 
=head1 RESTRICTIONS This module imposes the following restrictions on its input. =over =item JSON only JSON::Parse is a strict parser. It only accepts input which exactly meets the criteria of L. That means, for example, JSON::Parse does not accept single quotes (') instead of double quotes ("), or numbers with leading zeros, like 0123. JSON::Parse does not accept control characters (0x00 - 0x1F) in strings, missing commas between array or hash elements like C<["a" "b"]>, or trailing commas like C<["a","b","c",]>. It also does not accept trailing non-whitespace, like the second "]" in C<["a"]]>. =item No incremental parsing JSON::Parse does not parse incrementally. It only parses fully-formed JSON strings which include all opening and closing brackets. This is an inherent part of the design of the module. Incremental parsing in the style of L would (obviously) require some kind of callback structure to deal with the elements of the partially digested structures, but JSON::Parse was never designed to do this; it merely converts what it sees into a Perl structure. Claims to offer incremental JSON parsing in other modules' documentation should be diligently verified. =item UTF-8 only Although JSON may come in various encodings of Unicode, JSON::Parse only parses the UTF-8 format. If input is in a different Unicode encoding than UTF-8, convert the input before handing it to this module. For example, for the UTF-16 format, use Encode 'decode'; my $input_utf8 = decode ('UTF-16', $input); my $perl = parse_json ($input_utf8); or, for a file, use C<:encoding> (see L and L): open my $input, "<:encoding(UTF-16)", 'some-json-file'; JSON::Parse does not determine the nature of the octet stream, as described in part 3 of L. This restriction to UTF-8 applies regardless of whether Perl thinks that the input string is a character string or a byte string. Non-UTF-8 input will cause an L error. =back =head1 DIAGNOSTICS L does not produce error messages. 
L and L die on encountering invalid input. L uses L to pass error messages as warnings. Error messages have the line number, and the byte number where appropriate, of the input which caused the problem. The line number is formed simply by counting the number of "\n" (linefeed, ASCII 0x0A) characters in the whitespace part of the JSON. In L and L, parsing errors are fatal, so to continue after an error occurs, put the parsing into an C<eval> block: my $p; eval { $p = parse_json ($j); }; if ($@) { # handle error } The following error messages are produced: =head2 Unexpected character An unexpected character (byte) was encountered in the input. For example, when looking at the beginning of a string supposedly containing JSON, if the module encounters a plus sign, it will give an error like this: assert_valid_json ('+'); gives output JSON error at line 1, byte 1/1: Unexpected character '+' parsing initial state: expecting whitespace: '\n', '\r', '\t', ' ' or start of string: '"' or digit: '0-9' or minus: '-' or start of an array or object: '{', '[' or start of literal: 't', 'f', 'n' The message always includes a list of what characters are allowed. If there is some recognizable structure being parsed, the error message will include its starting point in the form "starting from byte n": assert_valid_json ('{"this":"\a"}'); gives output JSON error at line 1, byte 11/13: Unexpected character 'a' parsing string starting from byte 9: expecting escape: '\', '/', '"', 'b', 'f', 'n', 'r', 't', 'u' A feature of JSON is that parsing it requires only one byte to be examined at a time. Thus almost all parsing problems can be handled using the "Unexpected character" error type, including spelling errors in literals: assert_valid_json ('[true,folse]'); gives output JSON error at line 1, byte 8/12: Unexpected character 'o' parsing literal starting from byte 7: expecting 'a' and the missing second half of a surrogate pair: assert_valid_json ('["\udc00? 
<-- should be a second half here"]'); gives output JSON error at line 1, byte 9/44: Unexpected character '?' parsing unicode escape starting from byte 3: expecting '\' All kinds of errors can occur parsing numbers, for example a missing fraction, assert_valid_json ('[1.e9]'); gives output JSON error at line 1, byte 4/6: Unexpected character 'e' parsing number starting from byte 2: expecting digit: '0-9' and a leading zero, assert_valid_json ('[0123]'); gives output JSON error at line 1, byte 3/6: Unexpected character '1' parsing number starting from byte 2: expecting whitespace: '\n', '\r', '\t', ' ' or comma: ',' or end of array: ']' or dot: '.' or exponential sign: 'e', 'E' The error message is this complicated because all of the following are valid here: whitespace: C<[0 ]>; comma: C<[0,1]>, end of array: C<[0]>, dot: C<[0.1]>, or exponential: C<[0e0]>. These are all handled by this error. Thus the error messages are a little confusing as diagnostics. Versions of this module prior to 0.29 gave more informative messages like "leading zero in number". (The messages weren't documented.) The reason to change over to the single message was because it makes the parsing code simpler, and because the testing code described in L makes use of the internals of this error to check that the error message produced actually does correspond to the invalid and valid bytes allowed by the parser, at the exact byte given. This is a bytewise error, thus for example if a miscoded UTF-8 appears in the input, an error message saying what bytes would be valid at that point will be printed. no utf8; use JSON::Parse 'assert_valid_json'; # Error in first byte: my $bad_utf8_1 = chr (hex ("81")); eval { assert_valid_json ("[\"$bad_utf8_1\"]"); }; print "$@\n"; # Error in third byte: my $bad_utf8_2 = chr (hex ('e2')) . chr (hex ('9C')) . 
'b'; eval { assert_valid_json ("[\"$bad_utf8_2\"]"); }; print "$@\n"; prints JSON error at line 1, byte 3/5: Unexpected character 0x81 parsing string starting from byte 2: expecting printable ASCII or first byte of UTF-8: '\x20-\x7f', '\xC2-\xF4' at examples/bad-utf8.pl line 10. JSON error at line 1, byte 5/7: Unexpected character 'b' parsing string starting from byte 2: expecting bytes in range 80-bf: '\x80-\xbf' at examples/bad-utf8.pl line 16. =head2 Unexpected end of input The end of the string was encountered before the end of whatever was being parsed was. For example, if a quote is missing from the end of the string, it will give an error like this: assert_valid_json ('{"first":"Suzuki","second":"Murakami","third":"Asada}'); gives output JSON error at line 1: Unexpected end of input parsing string starting from byte 47 =head2 Not surrogate pair While parsing a string, a surrogate pair was encountered. While trying to turn this into UTF-8, the second half of the surrogate pair turned out to be an invalid value. assert_valid_json ('["\uDC00\uABCD"]'); gives output JSON error at line 1: Not surrogate pair parsing unicode escape starting from byte 11 =head2 Empty input This error occurs for an input which is an empty (no length or whitespace only) or an undefined value. assert_valid_json (''); gives output JSON error: Empty input parsing initial state Prior to version 0.49, this error was produced by L only, but it is now also produced by L. See L. =head2 Name is not unique This error occurs when parsing JSON when the user has chosen L. For example an input like my $p = JSON::Parse->new (); $p->detect_collisions (1); $p->run ('{"hocus":1,"pocus":2,"hocus":3}'); gives output JSON error at line 1, byte 23/31: Name is not unique: "hocus" parsing object starting from byte 1 at blib/lib/JSON/Parse.pm line 109. where the JSON object has two keys with the same name, C. The terminology "name is not unique" is from the JSON specification. 
=head2 Contradictory values for "true" and "false" =head3 User-defined value for JSON false evaluates as true This happens if you set JSON false to map to a true value: $jp->set_false (1); To switch off this warning, use L. This warning was added in version 0.38. =head3 User-defined value for JSON true evaluates as false This happens if you set JSON true to map to a false value: $jp->set_true (undef); To switch off this warning, use L. This warning was added in version 0.38. =head2 User-defined value overrules copy_literals This warning is given if you set up literals with L then you also set up your own true, false, or null values with L, L, or L. This warning was added in version 0.38. =head1 PERFORMANCE On the author's computer, the module's speed of parsing is approximately the same as L, with small variations depending on the type of input. For validation, L is faster than any other module known to the author, and up to ten times faster than JSON::XS. Some special types of input, such as floating point numbers containing an exponential part, like "1e09", seem to be about two or three times faster to parse with this module than with L. In JSON::Parse, parsing of exponentials is done by the system's C function, but JSON::XS contains its own parser for exponentials, so these results may be system-dependent. At the moment the main place JSON::XS wins over JSON::Parse is in strings containing escape characters, where JSON::XS is about 10% faster on the module author's computer and compiler. As of version 0.33, despite some progress in improving JSON::Parse, I haven't been able to fully work out the reason behind the better speed. There is some benchmarking code in the github repository under the directory "benchmarks" for those wishing to test these claims. The script L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/benchmarks/bench> is an adaptation of the similar script in the L distribution. 
The script L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/benchmarks/pub-bench.pl> runs the benchmarks and prints them out as POD. The following benchmark tests used version 0.47 of JSON::Parse and version 3.03 of JSON::XS on Perl Version 18.2, compiled with Clang version 3.4.1 on FreeBSD 10.3. The files in the "benchmarks" directory of JSON::Parse. "short.json" and "long.json" are the benchmarks used by JSON::XS. =over =item short.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| JP::valid | 776722.963 | 0.0000129 | JSON::Parse | 285326.803 | 0.0000350 | JSON::XS | 257319.264 | 0.0000389 | --------------+------------+------------+ =item long.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| JP::valid | 13985.675 | 0.0007150 | JSON::Parse | 5128.138 | 0.0019500 | JSON::XS | 5919.977 | 0.0016892 | --------------+------------+------------+ =item words-array.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| JP::valid | 285326.803 | 0.0000350 | JSON::Parse | 32589.775 | 0.0003068 | JSON::XS | 32263.877 | 0.0003099 | --------------+------------+------------+ =item exp.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| JP::valid | 128266.177 | 0.0000780 | JSON::Parse | 52626.148 | 0.0001900 | JSON::XS | 19849.995 | 0.0005038 | --------------+------------+------------+ =item literals.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| JP::valid | 313007.761 | 0.0000319 | JSON::Parse | 47180.022 | 0.0002120 | JSON::XS | 28826.832 | 0.0003469 | 
--------------+------------+------------+ =item cpantesters.json Repetitions: 10 x 100 = 1000 --------------+------------+------------+ module | 1/min | min | --------------|------------|------------| JP::valid | 1398.241 | 0.0071518 | JSON::Parse | 211.734 | 0.0472291 | JSON::XS | 215.100 | 0.0464900 | --------------+------------+------------+ =back =head1 SEE ALSO =over =item RFC 7159 JSON is specified in L. =item json.org L is the website for JSON, authored by Douglas Crockford. =item JSON::Create L is a companion module to JSON::Parse by the same author. As of version 0.08, I'm using it everywhere, but it should still be considered to be in a testing stage. Please feel free to try it out. =item JSON::Tokenize L is part of the JSON::Parse distribution, a tokenizer which reduces a JSON string to tokens. This makes the JSON::Parse tokenizer available to people who want to write their own JSON parsers. =item JSON::Repair L is an example module which demonstrates using JSON::Parse to apply some kinds of heuristics to repair "relaxed JSON" or otherwise broken JSON into compliant JSON. =back =head2 Other CPAN modules for parsing and producing JSON =over =item Reading and writing JSON =over =item L This calls on either L or L. =item L This is part of the Perl core, installed when you install Perl. "PP" stands for "Pure Perl", which means it is in Perl-only without the XS (C-based) parsing. This is slower but may be necessary if you cannot install modules requiring a C compiler. =item L This is an all-purpose JSON module in XS, which means it requires a C compiler to install. =item L This is a fork of L related to a disagreement about how to report bugs. Please see the module for details. =item L "Does what I want" module. =item L Wraps a C library called yajl. =item L Relies on L. =item L Based on L. =item L Takes advantage of a similarity between YAML (yet another markup language) and JSON to provide a JSON parser/producer using L. =item L Relies on L. 
=item L Uses the JSON library from Glib, a library of C functions for the Linux GNOME desktop project. =item L Part of the L standalone web framework, "pure Perl" JSON reader/writer. As of version 6.25 of Mojolicious, this actually depends on L. =item L This is a fork of L. =item L Slurp a JSON file into a data structure, and the reverse. It relies on L. =back =item Special-purpose modules =over =item L and L C is a special-purpose module for parsing JSON objects which have key collisions (something like C<{"a":1,"a":2}>) within objects. (JSON::Parse's handling of key collisions is discussed in L in this document.) =item L This module offers C and C literals similar to JSON. =item L For one-liners. =item L Convert JSON data to other formats. =item L This module generates JSON, colorized with ANSI escape sequences. =item L Configuration files in JSON =item L Automatically change a JSON string when a data structure changes. =item L Create JSON under memory limitations. =item L Extract parts of a JSON string. =item L Include JSON in a Perl program. =item L Search nested hashref/arrayref structures using JSONPath. =back =item Test-related modules =over =item L This offers a way to compare two different JSON strings to see if they refer to the same object. As of version 0.11, it relies on L. =item L JSON Test Utility. As of version 0.02, it relies on L. =item L Compare JSON with L. As of version 0.03, it relies on L. =back =item Type-related modules These untangle numbers, strings, and booleans into JSON types. =over =item L =item L =item L =item L =back =item Combination modules These modules rely on more than one back-end module. =over =item L A module which combines L, L, and L. The original L combines L and L, so this prioritizes L. =item L This module combines L, L versions one and two, and L. =item L A "combination module" which supports two different interfaces of L. However, JSON::XS is now onto version 3. =item L This pulls in L instead of L. 
=back =item JSON extensions These modules extend JSON with comments and other things. =over =item L "An extension of JSON that allows for better human-readability". =item L "Relaxed JSON with a little bit of YAML" =item L "A relaxed and easy diffable JSON variant" =back =item Other modules =over =item L =item L =item L =item L =item L =item L =item L =item L =item L =item L and L =item L =item L =item L =item L =back =back =head1 SCRIPT A script "validjson" is supplied with the module. This runs L on its inputs, so run it like this. validjson *.json The default behaviour is to just do nothing if the input is valid. For invalid input it prints what the problem is: validjson ids.go ids.go: JSON error at line 1, byte 1/7588: Unexpected character '/' parsing initial state: expecting whitespace: '\n', '\r', '\t', ' ' or start of string: '"' or digit: '0-9' or minus: '-' or start of an array or object: '{', '[' or start of literal: 't', 'f', 'n'. If you need confirmation, use its --verbose option: validjson -v *.json atoms.json is valid JSON. ids.json is valid JSON. kanjidic.json is valid JSON. linedecomps.json is valid JSON. radkfile-radicals.json is valid JSON. =head1 TEST RESULTS The CPAN testers results are at the usual place. The ActiveState test results are at L. =head1 DEPENDENCIES =over =item L =back =head1 EXPORTS The module exports nothing by default. Functions L, L, L, L and L, as well as the old function names L and L, can be exported on request. All of the functions can be exported using the tag ':all': use JSON::Parse ':all'; =head1 TESTING =head2 Internal testing code The module incorporates extensive testing related to the production of error messages and validation of input. Some of the testing code is supplied with the module in the F subdirectory of the distribution. More extensive testing code is in the git repository. This is not supplied in the CPAN distribution. 
A script, L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/randomjson.pl>, generates a set number of bytes of random JSON and checks that the module's bytewise validation of input is correct. It does this by taking a valid fragment, then adding each possible byte from 0 to 255 to see whether the module correctly identifies it as valid or invalid at that point, then randomly picking one of the valid bytes and adding it to the fragment and continuing the process until a complete valid JSON input is formed. The module has undergone about a billion repetitions of this test. This setup relies on a C file, L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/json-random-test.c>, which isn't in the CPAN distribution, and it also requires L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/Json3.xs> to be edited to make the macro C true (uncomment line 7 of the file). The testing code uses C setjmp/longjmp, so it's not guaranteed to work on all operating systems and is commented out for CPAN releases. A pure C version called L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/random-test.c> also exists. This applies exactly the same tests, and requires no Perl at all. If you're interested in testing your own JSON parser, the outputs generated by L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/randomjson.pl> are quite a good place to start. The default is to produce UTF-8 output, which looks pretty horrible since it tends to produce long strings of UTF-8 garbage. (This is because it chooses randomly from 256 bytes and the end-of-string marker C<"> has only a 1/256 chance of being chosen, so the strings tend to get long and messy). 
You can mess with the internals of JSON::Parse by setting MAXBYTE in F to 0x80, recompiling (you can ignore the compiler warnings), and running F again to get just ASCII random JSON things. This breaks the UTF-8 functionality of JSON::Parse, so please don't install that version. =head2 JSON Parsing Test Suite Version 0.48 passed all but two of the yes/no tests of the L. The first failure was that L did not mark L as invalid JSON, and the second was that L did not mark L as invalid json. The tests also revealed an inconsistency between L and L, which was reporting the completely empty file as invalid. Running these tests also revealed several bugs in the script L. All of these errors were amended in version 0.49. I attempted to include the JSON Parsing Test Suite tests in the module's tests, but some of the files (like 100,000 open arrays) actually L, so they're not really suitable for distribution. The tests are found, however, in the repository under L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/xt/jpts.t> and the subdirectory L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/xt/jpts>, so if you are interested in the results, please copy that and try it. There is also a test for the L script as L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/xt/validjson.t> in the repository. These are author tests, so you may need to install extra modules to run them. These author tests are run automatically before any code is uploaded to CPAN. =head1 HISTORY See L|https://github.com/benkasminbullock/JSON-Parse/blob/c00e1e8b7dfc7958de6700700ee20582f81b56a6/Changes> in the distribution for a full list of changes. This module started out under the name C. It was originally a way to escape from having to use the other JSON modules on CPAN. The biggest issue that I had with the other modules was the way that Unicode was handled. 
Insisting on the pure Perl method of dealing with JSON strings, which are required to be in Unicode anyway, seems to me little more than superstition, something like telling programmers not to step on cracks in the pavement. This module completely bypasses that. See L for the details of how this module differs from the other modules. The reason it only parsed JSON was that when I started this I didn't know the Perl extension language XS very well (I still don't know it very well), and I was not confident about making a JSON producer, so it only parsed JSON, which was the main job I needed to do. It originally used lex and yacc in the form of flex and bison, since discarded. I also found out that someone else had a JSON parser called Argo in Java, so to save confusion I dropped the name JSON::Argo and renamed this JSON::Parse, keeping the version numbers continuous. The module has since been completely rewritten, twice, mostly in an attempt to improve performance, after I found that JSON::XS was much faster than the original JSON::Parse. (The first rewrite of the module was not released to CPAN, this is the second one, which explains why some files have names like F). I also hoped to make something useful which wasn't in any existing CPAN module by offering the high-speed validator, L. I also rewrote the module due to some bugs I found, for example up to version 0.09 it was failing to accept whitespace after an object key string, so a JSON input of the form C<{ "x" : "y" }>, with whitespace between the C<"x"> and the colon, C<:>, would cause it to fail. That was one big reason I created the random testing regime described in L above. I believe that the module is now compliant with the JSON specification. After starting JSON::Create, I realised that some edge case handling in JSON::Parse needed to be improved. This resulted in the addition of the hash collision and literal-overriding methods introduced in versions 0.37 and 0.38 of this module. 
Version 0.42 fixed a very serious bug where long strings could overflow an internal buffer, and could cause a segmentation fault. Version 0.48 removed an experimental feature called C<$json_diagnostics> which made the module's errors be produced in JSON format, and replaced it with the current L method, for the benefit of L. Version 0.49 brought the module into conformance with the L. Version 0.54 removed support for the Solaris operating system. =head1 ACKNOWLEDGEMENTS Shlomi Fish (SHLOMIF) fixed some memory leaks in version 0.40. kolmogorov42 (https://github.com/kolmogorov42) reported a very serious bug which led to version 0.42. =head1 AUTHOR Ben Bullock, =head1 COPYRIGHT & LICENCE This package and associated files are copyright (C) 2013-2020 Ben Bullock. You can use, copy, modify and redistribute this package and associated files under the Perl Artistic Licence or the GNU General Public Licence. JSON-Parse-0.56/lib/JSON/Parse.pm000644 001751 001751 00000003472 13616666630 015613 0ustar00benben000000 000000 package JSON::Parse; require Exporter; @ISA = qw(Exporter); @EXPORT_OK = qw/ assert_valid_json json_file_to_perl json_to_perl parse_json parse_json_safe valid_json validate_json /; %EXPORT_TAGS = ( all => \@EXPORT_OK, ); use warnings; use strict; use Carp; our $VERSION = '0.56'; require XSLoader; XSLoader::load (__PACKAGE__, $VERSION); # Experimental, return a string of JSON as the error. our $json_diagnostics; # JSON "null" value. Although we're now using PL_sv_yes and PL_sv_no, # we don't use PL_sv_undef, because perldoc perlguts says it's a bad # idea. our $null; sub parse_json_safe { my $p; eval { $p = parse_json_safer (@_); }; if ($@) { my $error = $@; if (ref $error eq 'HASH') { my $error_as_string = $error->{"error as string"}; carp "JSON::Parse::parse_json_safe: $error_as_string"; } else { $error =~ s/at\s\S+\.pm\s+line\s+[0-9]+\.\s*$//; carp "JSON::Parse::parse_json_safe: $error"; } return undef; } return $p; } # Old names of subroutines. 
sub json_to_perl { goto &parse_json; } sub validate_json { goto &assert_valid_json; } sub valid_json { my ($json) = @_; if (! $json) { return 0; } eval { assert_valid_json (@_); }; if ($@) { return 0; } return 1; } sub json_file_to_perl { my ($file_name) = @_; my $json = ''; open my $in, "<:encoding(utf8)", $file_name or croak "Error opening $file_name: $!"; while (<$in>) { $json .= $_; } close $in or croak $!; return parse_json ($json); } sub run { my ($parser, $json) = @_; if ($parser->get_warn_only ()) { my $out; eval { $out = $parser->run_internal ($json); }; if ($@) { warn "$@"; } return $out; } else { return $parser->run_internal ($json); } } 1; JSON-Parse-0.56/lib/JSON/Tokenize.pm000644 001751 001751 00000001646 13622410776 016325 0ustar00benben000000 000000 package JSON::Tokenize; use warnings; use strict; require Exporter; our @ISA = qw(Exporter); use JSON::Parse; our @EXPORT_OK = qw/tokenize_json tokenize_start tokenize_next tokenize_start tokenize_end tokenize_type tokenize_child tokenize_text/; our %EXPORT_TAGS = ('all' => \@EXPORT_OK); use Carp; our $VERSION = '0.56'; sub tokenize_text { my ($input, $token) = @_; if (! $input || ! $token) { croak "tokenize_text requires input string and JSON::Tokenize object"; } my $start = tokenize_start ($token); my $length = tokenize_end ($token) - $start; my $text; if (utf8::is_utf8 ($input)) { # $start and $length refer to bytes, so we need to convert # $input into bytes. my $copy = $input; utf8::encode ($copy); $text = substr ($copy, $start, $length); # Make the output utf8-flagged. 
utf8::decode ($text); } else { $text = substr ($input, $start, $length); } return $text; } 1; JSON-Parse-0.56/lib/JSON/Tokenize.pod000644 001751 001751 00000006714 13622411131 016456 0ustar00benben000000 000000 =encoding UTF-8 =head1 NAME JSON::Tokenize - tokenize a string containing JSON =head1 SYNOPSIS use JSON::Tokenize ':all'; my $input = '{"tuttie":["fruity", true, 100]}'; my $token = tokenize_json ($input); print_tokens ($token, 0); sub print_tokens { my ($token, $depth) = @_; while ($token) { my $start = tokenize_start ($token); my $end = tokenize_end ($token); my $type = tokenize_type ($token); print " " x $depth; my $value = substr ($input, $start, $end - $start); print ">>$value<< has type $type\n"; my $child = tokenize_child ($token); if ($child) { print_tokens ($child, $depth+1); } my $next = tokenize_next ($token); $token = $next; } } This outputs >>{"tuttie":["fruity", true, 100]}<< has type object >>"tuttie"<< has type string >>:<< has type colon >>["fruity", true, 100]<< has type array >>"fruity"<< has type string >>,<< has type comma >>true<< has type literal >>,<< has type comma >>100<< has type number =head1 VERSION This documents version 0.56 of JSON::Tokenize corresponding to L released on Mon Feb 17 13:10:15 2020 +0900. =head1 DESCRIPTION This is a module for tokenizing a JSON string. It breaks the string into individual tokens without creating any Perl structures. Thus it can be used for tasks such as picking out or searching through parts of a large JSON structure without storing each part of the entire structure as individual Perl variables in memory. This module is an experimental part of L and its interface is likely to change. The tokenizing functions are currently written in a very primitive way. =head1 FUNCTIONS =head2 tokenize_json my $token = tokenize_json ($json); =head2 tokenize_next my $next = tokenize_next ($token); Walk the tree of tokens. =head2 tokenize_child my $child = tokenize_child ($child); Walk the tree of tokens. 
=head2 tokenize_start my $start = tokenize_start ($token); Get the start of the token as a byte offset from the start of the string. Note this is a byte offset not a character offset. =head2 tokenize_end my $end = tokenize_end ($token); Get the end of the token as a byte offset from the start of the string. Note this is a byte offset not a character offset. =head2 tokenize_type my $type = tokenize_type ($token); Get the type of the token as a string. The possible return values are "invalid", "initial state", "string", "number", "literal", "object", "array", "unicode escape" =head2 tokenize_text my $text = tokenize_text ($json, $token); Given a token C<$token> from this parsing and the JSON in C<$json>, return the text which corresponds to the token. This is a convenience function written in Perl which uses L and L and C to get the string from C<$json>. =head1 AUTHOR Ben Bullock, =head1 COPYRIGHT & LICENCE This package and associated files are copyright (C) 2016-2020 Ben Bullock. You can use, copy, modify and redistribute this package and associated files under the Perl Artistic Licence or the GNU General Public Licence. JSON-Parse-0.56/script/validjson000755 001751 001751 00000002167 13016170166 016073 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use lib '/home/ben/projects/Json3/blib/lib'; use lib '/home/ben/projects/Json3/blib/arch'; use JSON::Parse 'assert_valid_json'; use Getopt::Long; my $ok = GetOptions ( "verbose" => \my $verbose, "help" => \my $help, ); if (! 
$ok || $help) { usage (); exit; } for my $file (@ARGV) { eval { open my $in, "<:raw", $file or die "Can't open '$file': $!"; my $text = ''; while (my $line = <$in>) { $text .= $line; } close $in or die $!; assert_valid_json ($text); }; if ($@) { my $error = $@; $error =~ s/\n+$//; if ($error !~ /\Q$file/) { $error = "$file: $error"; } if ($error =~ /validjson line [0-9]+\.$/) { $error =~ s/\sat\s\S+\sline.*$/\./; } print "$error\n"; } else { if ($verbose) { print "'$file' is valid JSON.\n"; } } } sub usage { print <{a}, "\n"; my $q = parse_json_safe ($j); JSON-Parse-0.56/examples/collide.pl000644 001751 001751 00000000363 12614672521 016437 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse; my $jp = JSON::Parse->new (); $jp->detect_collisions (1); eval { $jp->run ('{"animals":{"cat":"moggy","cat":"feline","cat":"neko"}}'); }; print "$@\n" if $@; JSON-Parse-0.56/examples/synopsis.pl000644 001751 001751 00000000320 12255715005 016700 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse 'parse_json'; my $json = '["golden", "fleece"]'; my $perl = parse_json ($json); # Same effect as $perl = ['golden', 'fleece']; JSON-Parse-0.56/examples/sasori.pl000644 001751 001751 00000000364 12255464222 016323 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse ':all'; # The scalar $sasori looks like Unicode to Perl use utf8; my $sasori = '["蠍"]'; my $p = parse_json ($sasori); print utf8::is_utf8 ($p->[0]); # Prints 1. 
JSON-Parse-0.56/examples/first-bit.pl000644 001751 001751 00000001504 12440454762 016727 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse 'assert_valid_json'; my $json = <>$value<< has type $type\n"; my $child = tokenize_child ($token); if ($child) { print_tokens ($child, $depth+1); } my $next = tokenize_next ($token); $token = $next; } } JSON-Parse-0.56/examples/unicode-details.pl000644 001751 001751 00000002144 13016247667 020102 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse ':all'; use Unicode::UTF8 'decode_utf8'; binmode STDOUT, ":encoding(utf8)"; no utf8; my $highbytes = "か"; my $not_utf8 = "$highbytes\\u3042"; my $test = "{\"a\":\"$not_utf8\"}"; my $out = parse_json ($test); # JSON::Parse does something unusual here in promoting the first part # of the string into UTF-8. print "JSON::Parse gives this: ", $out->{a}, "\n"; # Perl cannot assume that $highbytes is in UTF-8, so it has to just # turn the initial characters into garbage. my $add_chr = $highbytes . chr (0x3042); print "Perl's output is like this: ", $add_chr, "\n"; # In fact JSON::Parse's behaviour is equivalent to this: my $equiv = decode_utf8 ($highbytes) . chr (0x3042); print "JSON::Parse did something like this: ", $equiv, "\n"; # With character strings switched on, Perl and JSON::Parse do the same # thing. use utf8; my $is_utf8 = "か"; my $test2 = "{\"a\":\"$is_utf8\\u3042\"}"; my $out2 = parse_json ($test2); print "JSON::Parse: ", $out2->{a}, "\n"; my $add_chr2 = $is_utf8 . chr (0x3042); print "Native Perl: ", $add_chr2, "\n"; JSON-Parse-0.56/examples/ebi.pl000644 001751 001751 00000000373 12255464231 015562 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse ':all'; # The scalar $ebi does not look like Unicode to Perl no utf8; my $ebi = '["海老"]'; my $p = parse_json ($ebi); print utf8::is_utf8 ($p->[0]); # Prints nothing. 
JSON-Parse-0.56/examples/json-tiny-round-trip-demo.pl000644 001751 001751 00000002065 13016223334 021771 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Tiny '0.54', qw(decode_json encode_json); use JSON::Parse; use JSON::Create; my $cream = '{"clapton":true,"hendrix":false,"bruce":true,"fripp":false}'; my $jp = JSON::Parse->new (); my $jc = JSON::Create->new (); print "First do a round-trip of our modules:\n\n"; print $jc->run ($jp->run ($cream)), "\n\n"; print "Now do a round-trip of JSON::Tiny:\n\n"; print encode_json (decode_json ($cream)), "\n\n"; print "First, incompatible mode:\n\n"; print 'tiny(parse): ', encode_json ($jp->run ($cream)), "\n"; print 'create(tiny): ', $jc->run (decode_json ($cream)), "\n\n"; $jp->set_true (JSON::Tiny::true); $jp->set_false (JSON::Tiny::false); print "Compatibility with JSON::Parse:\n\n"; print 'tiny(parse):', encode_json ($jp->run ($cream)), "\n\n"; $jc->bool ('JSON::Tiny::_Bool'); print "Compatibility with JSON::Create:\n\n"; print 'create(tiny):', $jc->run (decode_json ($cream)), "\n\n"; print "JSON::Parse and JSON::Create are still compatible too:\n\n"; print $jc->run ($jp->run ($cream)), "\n"; JSON-Parse-0.56/examples/chr.pl000644 001751 001751 00000000463 12256176567 015614 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; no utf8; my $kani = '87f9'; print "hex is character string\n" if utf8::is_utf8 ($kani); # prints nothing $kani = chr (hex ($kani)); print "chr makes it a character string\n" if utf8::is_utf8 ($kani); # prints "chr makes it a character string" JSON-Parse-0.56/examples/bad-utf8.txt000644 001751 001751 00000000570 12257216151 016636 0ustar00benben000000 000000 JSON error at line 1, byte 3/5: Unexpected character 0x81 parsing string starting from byte 2: expecting printable ASCII or first byte of UTF-8: '\x20-\x7f', '\xC2-\xF4' at examples/bad-utf8.pl line 10. 
JSON error at line 1, byte 5/7: Unexpected character 'b' parsing string starting from byte 2: expecting bytes in range 80-bf: '\x80-\xbf' at examples/bad-utf8.pl line 16. JSON-Parse-0.56/examples/kani.pl000644 001751 001751 00000000517 12256175564 015756 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse ':all'; no utf8; # 蟹 my $kani = '["\u87f9"]'; my $p = parse_json ($kani); print "It's marked as a character string" if utf8::is_utf8 ($p->[0]); # Prints "It's marked as a character string" because it's upgraded # regardless of the input string's flags. JSON-Parse-0.56/examples/long-number.pl000644 001751 001751 00000000334 12253505156 017245 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse ':all'; my $long_number = '[12345678901234567890123456789012345678901234567890]'; my $out = parse_json ($long_number); print "$out->[0]\n"; JSON-Parse-0.56/examples/hash.pl000644 001751 001751 00000000232 13012716014 015727 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse ':all'; my $perl = parse_json ('{"a":1, "b":2}'); print ref $perl, "\n"; JSON-Parse-0.56/examples/true-subs.pl000644 001751 001751 00000000363 12614670614 016756 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use JSON::Parse; my $json = '{"yes":true,"no":false}'; my $jp = JSON::Parse->new (); $jp->set_true ('Yes, that is so true'); my $out = $jp->run ($json); print $out->{yes}, "\n"; JSON-Parse-0.56/t/whitespace.t000644 001751 001751 00000000450 12253504436 015430 0ustar00benben000000 000000 use warnings; use strict; use Test::More tests => 2; use JSON::Parse qw/valid_json json_to_perl/; my $json = <builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; binmode STDOUT, ":encoding(utf8)"; binmode STDERR, ":encoding(utf8)"; use 
JSON::Parse 'valid_json'; use JSON::Tokenize; use JSON::Tokenize ':all'; my $input = '{"tuttie":["fruity", true, 100]}'; ok (valid_json ($input)); my $token = tokenize_json ($input); is (tokenize_type ($token), 'object'); my $child = tokenize_child ($token); is (tokenize_type ($child), "string"); is (tokenize_text ($input, $child), '"tuttie"'); my $next = tokenize_next ($child); is (tokenize_type ($next), "colon"); is (tokenize_start ($next), 9, "start at 9"); is (tokenize_text ($input, $next), ":"); my $nnext = tokenize_next ($next); is (tokenize_text ($input, $nnext), '["fruity", true, 100]'); use utf8; my $utf8input = '{"くそ":"くらえ"}'; ok (valid_json ($utf8input), "valid input"); my $tokenutf8 = tokenize_json ($utf8input); my $childutf8 = tokenize_child ($tokenutf8); is (tokenize_type ($childutf8), "string", "is a string"); is (tokenize_text ($utf8input, $childutf8), '"くそ"'); done_testing (); JSON-Parse-0.56/t/Json3.t000644 001751 001751 00000006114 12614343043 014267 0ustar00benben000000 000000 # This is a test for module Json3. use warnings; use strict; use Test::More; use JSON::Parse 'parse_json'; # Empty array. my $p = parse_json ('[]'); ok ($p); is (ref $p, 'ARRAY'); is (scalar @$p, 0); # Empty hash. my $o = parse_json ('{}'); ok ($o); is (ref $o, 'HASH'); is (scalar keys %$o, 0); # Array with one element. my $a1 = parse_json ('[1]'); ok ($a1); is (ref $a1, 'ARRAY'); is (scalar @$a1, 1); is ($a1->[0], 1, "Got value 1"); # Array of integer numbers. my $ai = parse_json ('[1,12,123,1234,12345,123456,1234567,12345678]'); ok ($ai); is (ref $ai, 'ARRAY'); is (scalar @$ai, 8); is_deeply ($ai, [1,12,123,1234,12345,123456,1234567,12345678]); # Object with one pair of elements, a number as value. my $o1 = parse_json ('{"a":1}'); ok ($o1); is (ref $o1, 'HASH', "Got a hash"); is (scalar keys %$o1, 1); ok (defined ($o1->{a}), "Key for 'a' is defined"); is ($o1->{a}, 1, "Value for 'a' is one"); # Object with one pair of elements, a string as value. 
my $o2 = parse_json ('{"william":"shakespeare"}'); ok ($o2, "got a value"); is (ref $o2, 'HASH', "Got a hash"); is (scalar keys %$o2, 1, "Right no of keys"); ok (defined ($o2->{william}), "Got key william"); is ($o2->{william}, 'shakespeare', "Got right value for william"); # Object with a lot of whitespace. my $w = <{Kash})); is ($ow->{Funky}, 'Gibbon'); # Array of floating point numbers my $af = parse_json ('[0.001, 2.5e4, 3e-12]'); ok ($af); is (ref $af, 'ARRAY'); is (scalar @$af, 3); my $eps = 1e-3; cmp_ok (abs ($af->[0] - 0.001), '<', 0.001 * $eps); cmp_ok (abs ($af->[1] - 2.5e4), '<', 2.5e4 * $eps); cmp_ok (abs ($af->[2] - 3e-12), '<', 3e-12 * $eps); # Nested hash my $on2 = parse_json ('{"gust":{"breeze":"wind"}}'); ok ($on2); is (ref $on2, 'HASH'); is (scalar keys %$on2, 1); is_deeply ($on2, {gust => {breeze => 'wind'}}, "Nested hash depth 2"); # Nested hash my $on4 = parse_json ('{"gusty":{"breezy":{"monkey":{"flat":"hog"}},"miserable":"dawson"}}'); ok ($on4); is (ref $on4, 'HASH'); is (scalar keys %$on4, 1); is_deeply ($on4, {gusty => {breezy => {monkey => {flat => 'hog'}}, miserable => 'dawson'}}, "Nested hash depth 4"); # Array of things with escapes my $escjson = '["\\t", "bubbles\n", "\u1234", "\nmonkey\n", "milky\tmoggy", "mocha\tmoggy\n"]'; my $aesc = parse_json ($escjson); # Test one by one. 
is ($aesc->[0], "\t"); is ($aesc->[1], "bubbles\n"); ok (utf8::is_utf8 ($aesc->[2]), "Unicode switched on for character escapes"); is ($aesc->[3], "\nmonkey\n"); is ($aesc->[4], "milky\tmoggy"); is ($aesc->[5], "mocha\tmoggy\n"); my $ao = parse_json ('[{"baby":"chops"}, {"starsky":"hutch"}]'); ok ($ao, "Got JSON"); is (ref $ao, 'ARRAY'); is_deeply ($ao, [{baby => 'chops'}, {starsky => 'hutch'}]); # Literals my $at = parse_json ('[true]'); ok ($at); is ($at->[0], 1); my $afalse = parse_json ('[false]'); ok ($afalse, "got false value"); is ($afalse->[0], '', "is empty string"); done_testing (); # Local variables: # mode: perl # End: JSON-Parse-0.56/t/unicode.t000644 001751 001751 00000004610 12615070023 014714 0ustar00benben000000 000000 use warnings; use strict; use Test::More; use JSON::Parse qw/json_to_perl valid_json parse_json/; use utf8; binmode STDOUT, ":utf8"; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; my $m = '{"骪":"\u9aaa"}'; ok (valid_json ($m), "Valid good JSON"); my $ar = json_to_perl ($m); ok (defined $ar, "Unicode \\uXXXX parsed"); is ($ar->{骪}, '骪', "Unicode \\uXXXX parsed correctly"); note ("keys = ", keys %$ar); # Here the second unicode piece of the string is added to switch on # the UTF-8 flag inside Perl and get the required invalidity. my $badunicode = '["\uD800", "バター"]'; ok (! valid_json ($badunicode), "$badunicode is invalid"); # This is what the documentation says will happen. However, I'm not # sure this is correct or what the user expects to happen. my $okunicode = '["\uD800"]'; ok (! valid_json ($okunicode), "$okunicode is valid"); my $surpair = '["\uD834\uDD1E"]'; my $spo; eval { $spo = parse_json ($surpair); }; ok (! 
$@, "parsed surrogate pairs"); is (ord ($spo->[0]), 0x1D11E, "g-clef surrogate pair"); use utf8; my $surpair_force_utf8 = '["\uD834\uDD1E麻婆茄子"]'; my $spo_force_utf8; eval { $spo_force_utf8 = parse_json ($surpair); }; ok (! $@, "parsed surrogate pairs"); is (ord ($spo_force_utf8->[0]), 0x1D11E, "g-clef surrogate pair"); use utf8; my $scorpion = '["蠍"]'; my $p1 = parse_json ($scorpion); ok (utf8::is_utf8 ($p1->[0]), "UTF-8 survives"); no utf8; my $ebi = '["蠍"]'; my $p2 = parse_json ($ebi); ok (! utf8::is_utf8 ($p2->[0]), "Not UTF-8 not marked as UTF-8"); no utf8; # 蟹 my $kani = '["\u87f9", "蟹", "\u87f9猿"]'; my $p = parse_json ($kani); ok (utf8::is_utf8 ($p->[0]), "kani upgraded regardless"); ok (! utf8::is_utf8 ($p->[1]), "input string not upgraded, even though it's UTF-8"); ok (utf8::is_utf8 ($p->[2]), "upgrade this too"); is (length ($p->[2]), 2, "length is two by magic"); ok (! valid_json ('["\uDE8C "]'), "invalid \uDE8C + space"); # Test of the strangely-named "surrogate pairs". my $jc = JSON::Parse->new (); my $wikipedia_1 = '"\ud801\udc37"'; my $out_1 = $jc->run ($wikipedia_1); is ($out_1, "\x{10437}"); my $wikipedia_2 = '"\ud852\udf62"'; my $out_2 = $jc->run ($wikipedia_2); is ($out_2, "\x{24b62}"); my $json_spec = '"\ud834\udd1e"'; my $out_3 = $jc->run ($json_spec); is ($out_3, "\x{1D11E}"); done_testing (); JSON-Parse-0.56/t/utf8.t000644 001751 001751 00000037274 12255475524 014207 0ustar00benben000000 000000 use warnings; use strict; use Test::More; use JSON::Parse ':all'; # Test valid JSON. no utf8; ok (valid_json ('["蟹"]')); ok (valid_json ('{"動物":"像"}')); my $bad_cont = sprintf ('["%c"]', 0x80); ok (! 
valid_json ($bad_cont)); eval { assert_valid_json ($bad_cont); }; like ($@, qr/Unexpected character 0x80 parsing string/); use utf8; ok (valid_json ('["蟹"]')); ok (valid_json ('{"動物":"像"}')); # From UTF-8 SAMPLER ok (valid_json ('["¥ · £ · € · $ · ¢ · ₡ · ₢ · ₣ · ₤ · ₥ · ₦ · ₧ · ₨ · ₩ · ₪ · ₫ · ₭ · ₮ · ₯ · ₹"]')); ok (valid_json ('["ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ\nᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ\nᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬"]')); ok (valid_json ('["Τη γλώσσα μου έδωσαν ελληνική\nτο σπίτι φτωχικό στις αμμουδιές του Ομήρου.\nΜονάχη έγνοια η γλώσσα μου στις αμμουδιές του Ομήρου.\nαπό το Άξιον Εστί\nτου Οδυσσέα Ελύτη"]')); ok (valid_json ('["Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ\nτὸ σπίτι φτωχικὸ στὶς ἀμμουδιὲς τοῦ Ὁμήρου.\nΜονάχη ἔγνοια ἡ γλῶσσα μου στὶς ἀμμουδιὲς τοῦ Ὁμήρου.\nἀπὸ τὸ Ἄξιον ἐστί\nτοῦ Ὀδυσσέα Ἐλύτη"]')); ok (valid_json ('["ვეპხის ტყაოსანი შოთა რუსთაველი\nღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა."]')); my $glassjson = <<'EOF'; {"Sanskrit":"काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥", "Sanskrit (standard transcription)":"kācaṃ śaknomyattum; nopahinasti mām.", "Classical Greek":"ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει.", "Greek (monotonic)":"Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα.", "Greek (polytonic)":"Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα. ", "Etruscan":"(NEEDED)", "Latin":"Vitrum edere possum; mihi non nocet.", "Old French":"Je puis mangier del voirre. Ne me nuit.", "French":"Je peux manger du verre, ça ne me fait pas mal.", "Provençal / Occitan":"Pòdi manjar de veire, me nafrariá pas.", "Québécois":"J'peux manger d'la vitre, ça m'fa pas mal.", "Walloon":"Dji pou magnî do vêre, çoula m' freut nén må. ", "Champenois":"(NEEDED) ", "Lorrain":"(NEEDED)", "Picard":"Ch'peux mingi du verre, cha m'foé mie n'ma. 
", "Corsican/Corsu":"(NEEDED) ", "Jèrriais":"(NEEDED)", "Kreyòl Ayisyen (Haitï)":"Mwen kap manje vè, li pa blese'm.", "Basque":"Kristala jan dezaket, ez dit minik ematen.", "Catalan / Català":"Puc menjar vidre, que no em fa mal.", "Spanish":"Puedo comer vidrio, no me hace daño.", "Aragonés":"Puedo minchar beire, no me'n fa mal . ", "Aranés":"(NEEDED) ", "Mallorquín":"(NEEDED)", "Galician":"Eu podo xantar cristais e non cortarme.", "European Portuguese":"Posso comer vidro, não me faz mal.", "Brazilian Portuguese (8)":"Posso comer vidro, não me machuca.", "Caboverdiano/Kabuverdianu (Cape Verde)":"M' podê cumê vidru, ca ta maguâ-m'.", "Papiamentu":"Ami por kome glas anto e no ta hasimi daño.", "Italian":"Posso mangiare il vetro e non mi fa male.", "Milanese":"Sôn bôn de magnà el véder, el me fa minga mal.", "Roman":"Me posso magna' er vetro, e nun me fa male.", "Napoletano":"M' pozz magna' o'vetr, e nun m' fa mal.", "Venetian":"Mi posso magnare el vetro, no'l me fa mae.", "Zeneise (Genovese)":"Pòsso mangiâ o veddro e o no me fà mâ.", "Sicilian":"Puotsu mangiari u vitru, nun mi fa mali. ", "Campinadese (Sardinia)":"(NEEDED) ", "Lugudorese (Sardinia)":"(NEEDED)", "Romansch (Grischun)":"Jau sai mangiar vaider, senza che quai fa donn a mai. ", "Romany / Tsigane":"(NEEDED)", "Romanian":"Pot să mănânc sticlă și ea nu mă rănește.", "Esperanto":"Mi povas manĝi vitron, ĝi ne damaĝas min. ", "Pictish":"(NEEDED) ", "Breton":"(NEEDED)", "Cornish":"Mý a yl dybry gwéder hag éf ny wra ow ankenya.", "Welsh":"Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi.", "Manx Gaelic":"Foddym gee glonney agh cha jean eh gortaghey mee.", "Old Irish (Ogham)":"᚛᚛ᚉᚑᚅᚔᚉᚉᚔᚋ ᚔᚈᚔ ᚍᚂᚐᚅᚑ ᚅᚔᚋᚌᚓᚅᚐ᚜", "Old Irish (Latin)":"Con·iccim ithi nglano. Ním·géna.", "Irish":"Is féidir liom gloinne a ithe. 
Ní dhéanann sí dochar ar bith dom.", "Ulster Gaelic":"Ithim-sa gloine agus ní miste damh é.", "Scottish Gaelic":"S urrainn dhomh gloinne ithe; cha ghoirtich i mi.", "Anglo-Saxon (Runes)":"ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬", "Anglo-Saxon (Latin)":"Ic mæg glæs eotan ond hit ne hearmiað me.", "Middle English":"Ich canne glas eten and hit hirtiþ me nouȝt.", "English":"I can eat glass and it doesn't hurt me.", "English (IPA)":"[aɪ kæn iːt glɑːs ænd ɪt dɐz nɒt hɜːt miː] (Received Pronunciation)", "English (Braille)":"⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑", "Jamaican":"Mi kian niam glas han i neba hot mi.", "Lalland Scots / Doric":"Ah can eat gless, it disnae hurt us. ", "Glaswegian":"(NEEDED)", "Gothic (4)":"𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸.", "Old Norse (Runes)":"ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ", "Old Norse (Latin)":"Ek get etið gler án þess að verða sár.", "Norsk / Norwegian (Nynorsk)":"Eg kan eta glas utan å skada meg.", "Norsk / Norwegian (Bokmål)":"Jeg kan spise glass uten å skade meg.", "Føroyskt / Faroese":"Eg kann eta glas, skaðaleysur.", "Íslenska / Icelandic":"Ég get etið gler án þess að meiða mig.", "Svenska / Swedish":"Jag kan äta glas utan att skada mig.", "Dansk / Danish":"Jeg kan spise glas, det gør ikke ondt på mig.", "Sønderjysk":"Æ ka æe glass uhen at det go mæ naue.", "Frysk / Frisian":"Ik kin glês ite, it docht me net sear.", "Nederlands / Dutch":"Ik kan glas eten, het doet mij geen kwaad.", "Kirchröadsj/Bôchesserplat":"Iech ken glaas èèse, mer 't deet miech jing pieng.", "Afrikaans":"Ek kan glas eet, maar dit doen my nie skade nie.", "Lëtzebuergescht / Luxemburgish":"Ech kan Glas iessen, daat deet mir nët wei.", "Deutsch / German":"Ich kann Glas essen, ohne mir zu schaden.", "Ruhrdeutsch":"Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut.", "Langenfelder Platt":"Isch kann Jlaas kimmeln, uuhne datt mich datt weh dääd.", "Lausitzer Mundart (\"Lusatian\")":"Ich koann Gloos assn und doas dudd merr ni wii.", 
"Odenwälderisch":"Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud.", "Sächsisch / Saxon":"'sch kann Glos essn, ohne dass'sch mer wehtue.", "Pfälzisch":"Isch konn Glass fresse ohne dasses mer ebbes ausmache dud.", "Schwäbisch / Swabian":"I kå Glas frässa, ond des macht mr nix!", "Deutsch (Voralberg)":"I ka glas eassa, ohne dass mar weh tuat.", "Bayrisch / Bavarian":"I koh Glos esa, und es duard ma ned wei.", "Allemannisch":"I kaun Gloos essen, es tuat ma ned weh.", "Schwyzerdütsch (Zürich)":"Ich chan Glaas ässe, das schadt mir nöd.", "Schwyzerdütsch (Luzern)":"Ech cha Glâs ässe, das schadt mer ned. ", "Plautdietsch":"(NEEDED)", "Hungarian":"Meg tudom enni az üveget, nem lesz tőle bajom.", "Suomi / Finnish":"Voin syödä lasia, se ei vahingoita minua.", "Sami (Northern)":"Sáhtán borrat lása, dat ii leat bávččas.", "Erzian":"Мон ярсан суликадо, ды зыян эйстэнзэ а ули.", "Northern Karelian":"Mie voin syvvä lasie ta minla ei ole kipie.", "Southern Karelian":"Minä voin syvvä st'oklua dai minule ei ole kibie. ", "Vepsian":"(NEEDED) ", "Votian":"(NEEDED) ", "Livonian":"(NEEDED)", "Estonian":"Ma võin klaasi süüa, see ei tee mulle midagi.", "Latvian":"Es varu ēst stiklu, tas man nekaitē.", "Lithuanian":"Aš galiu valgyti stiklą ir jis manęs nežeidžia ", "Old Prussian":"(NEEDED) ", "Sorbian (Wendish)":"(NEEDED)", "Czech":"Mohu jíst sklo, neublíží mi.", "Slovak":"Môžem jesť sklo. 
Nezraní ma.", "Polska / Polish":"Mogę jeść szkło i mi nie szkodzi.", "Slovenian":"Lahko jem steklo, ne da bi mi škodovalo.", "Croatian":"Ja mogu jesti staklo i ne boli me.", "Serbian (Latin)":"Ja mogu da jedem staklo.", "Serbian (Cyrillic)":"Ја могу да једем стакло.", "Macedonian":"Можам да јадам стакло, а не ме штета.", "Russian":"Я могу есть стекло, оно мне не вредит.", "Belarusian (Cyrillic)":"Я магу есці шкло, яно мне не шкодзіць.", "Belarusian (Lacinka)":"Ja mahu jeści škło, jano mne ne škodzić.", "Ukrainian":"Я можу їсти скло, і воно мені не зашкодить.", "Bulgarian":"Мога да ям стъкло, то не ми вреди.", "Georgian":"მინას ვჭამ და არა მტკივა.", "Armenian":"Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։", "Albanian":"Unë mund të ha qelq dhe nuk më gjen gjë.", "Turkish":"Cam yiyebilirim, bana zararı dokunmaz.", "Turkish (Ottoman)":"جام ييه بلورم بڭا ضررى طوقونمز", "Bangla / Bengali":"আমি কাঁচ খেতে পারি, তাতে আমার কোনো ক্ষতি হয় না।", "Marathi":"मी काच खाऊ शकतो, मला ते दुखत नाही.", "Kannada":"ನನಗೆ ಹಾನಿ ಆಗದೆ, ನಾನು ಗಜನ್ನು ತಿನಬಹುದು", "Hindi":"मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती.", "Tamil":"நான் கண்ணாடி சாப்பிடுவேன், அதனால் எனக்கு ஒரு கேடும் வராது.", "Telugu":"నేను గాజు తినగలను మరియు అలా చేసినా నాకు ఏమి ఇబ్బంది లేదు", "Sinhalese":"මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ.", "Urdu(3)":"میں کانچ کھا سکتا ہوں اور مجھے تکلیف نہیں ہوتی ۔", "Pashto(3)":"زه شيشه خوړلې شم، هغه ما نه خوږوي", "Farsi / Persian(3)":".من می توانم بدونِ احساس درد شيشه بخورم", "Arabic(3)":"أنا قادر على أكل الزجاج و هذا لا يؤلمني. ", "Aramaic":"(NEEDED)", "Maltese":"Nista' niekol il-ħġieġ u ma jagħmilli xejn.", "Hebrew(3)":"אני יכול לאכול זכוכית וזה לא מזיק לי.", "Yiddish(3)":"איך קען עסן גלאָז און עס טוט מיר נישט װײ. 
", "Judeo-Arabic":"(NEEDED) ", "Ladino":"(NEEDED) ", "Gǝʼǝz":"(NEEDED) ", "Amharic":"(NEEDED)", "Twi":"Metumi awe tumpan, ɜnyɜ me hwee.", "Hausa (Latin)":"Inā iya taunar gilāshi kuma in gamā lāfiyā.", "Hausa (Ajami) (2)":"إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا", "Yoruba(4)":"Mo lè je̩ dígí, kò ní pa mí lára.", "Lingala":"Nakokí kolíya biténi bya milungi, ekosála ngáí mabé tɛ́.", "(Ki)Swahili":"Naweza kula bilauri na sikunyui.", "Malay":"Saya boleh makan kaca dan ia tidak mencederakan saya.", "Tagalog":"Kaya kong kumain nang bubog at hindi ako masaktan.", "Chamorro":"Siña yo' chumocho krestat, ti ha na'lalamen yo'.", "Fijian":"Au rawa ni kana iloilo, ia au sega ni vakacacani kina.", "Javanese":"Aku isa mangan beling tanpa lara.", "Burmese":"က္ယ္ဝန္‌တော္‌၊က္ယ္ဝန္‌မ မ္ယက္‌စားနုိင္‌သည္‌။ ၎က္ရောင္‌့ ထိခုိက္‌မ္ဟု မရ္ဟိပာ။ (9)", "Vietnamese (quốc ngữ)":"Tôi có thể ăn thủy tinh mà không hại gì.", "Vietnamese (nôm) (4)":"些 𣎏 世 咹 水 晶 𦓡 空 𣎏 害 咦", "Khmer":"ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ", "Lao":"ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ.", "Thai":"ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ", "Mongolian (Cyrillic)":"Би шил идэй чадна, надад хортой биш", "Mongolian (Classic) (5)":"ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ ", "Dzongkha":"(NEEDED)", "Nepali":"म काँच खान सक्छू र मलाई केहि नी हुन्‍न् ।", "Tibetan":"ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།", "Chinese":"我能吞下玻璃而不伤身体。", "Chinese (Traditional)":"我能吞下玻璃而不傷身體。", "Taiwanese(6)":"Góa ē-tàng chia̍h po-lê, mā bē tio̍h-siong.", "Japanese":"私はガラスを食べられます。それは私を傷つけません。", "Korean":"나는 유리를 먹을 수 있어요. 그래도 아프지 않아요", "Bislama":"Mi save kakae glas, hemi no save katem mi.", "Hawaiian":"Hiki iaʻu ke ʻai i ke aniani; ʻaʻole nō lā au e ʻeha.", "Marquesan":"E koʻana e kai i te karahi, mea ʻā, ʻaʻe hauhau.", "Inuktitut (10)":"ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ", "Chinook Jargon":"Naika məkmək kakshət labutay, pi weyk ukuk munk-sik nay.", "Navajo":"Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da. 
", "Cherokee (and Cree, Chickasaw, Cree, Micmac, Ojibwa, Lakota, Náhuatl, Quechua, Aymara, and other American languages)":"(NEEDED) ", "Garifuna":"(NEEDED) ", "Gullah":"(NEEDED)", "Lojban":"mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi", "Nórdicg":"Ljœr ye caudran créneþ ý jor cẃran." } EOF assert_valid_json ($glassjson); ok (valid_json ($glassjson)); no utf8; # Markus Kuhn validation file for my $c (0xc0..0xf4) { my $badinitial = sprintf ("%c ", $c); ok (! valid_json ($badinitial), "first byte $c, second byte space invalid"); } my @overlong = split /\n/, (qq/ # 4.1 Examples of an overlong ASCII character c0 af e0 80 af f0 80 80 af # 4.2 Maximum overlong sequences c1 bf e0 9f bf f0 8f bf bf # 4.3 Overlong representation of the NUL character c0 80 e0 80 80 f0 80 80 80 # 5.1 Single UTF-16 surrogates ed a0 80 ed ad bf ed ae 80 ed af bf ed b0 80 ed be 80 ed bf bf # 5.2 Paired UTF-16 surrogates ed a0 80 ed b0 80 ed a0 80 ed bf bf ed ad bf ed b0 80 ed ad bf ed bf bf ed ae 80 ed b0 80 ed ae 80 ed bf bf ed af bf ed b0 80 ed af bf ed bf bf /); for my $overlong (@overlong) { if ($overlong =~ /^#/) { next; } my @bytes = split / /, $overlong; my $bad = join ('', map {sprintf "%c", hex ($_)} @bytes); ok (! valid_json ($bad), "$overlong invalid"); } done_testing (); exit; JSON-Parse-0.56/t/test.json000644 001751 001751 00000010564 12131136564 014766 0ustar00benben000000 000000 { "pod" : "NAME Algorithm::NGram SYNPOSIS use Algorithm::NGram; my $ng = Algorithm::NGram->new(ngram_width => 3); # use trigrams # feed in text $ng->add_text($text1); # analyze $text1 $ng->add_text($text2); # analyze $text2 # feed in arbitrary sequence of tokens $ng->add_start_token; $ng->add_tokens(qw/token1 token2 token3/); $ng->add_end_token; my $output = $ng->generate_text; DESCRIPTION This is a module for analyzing token sequences with n-grams. You can use it to parse a block of text, or feed in your own tokens. It can generate new sequences of tokens from what has been fed in. 
EXPORT None. METHODS new Create a new n-gram analyzer instance. Options: ngram_width This is the \"window size\" of how many tokens the analyzer will keep track of. A ngram_width of two will make a bigram, a ngram_width of three will make a trigram, etc... ngram_width Returns token window size (e.g. the \"n\" in n-gram) token_table Returns n-gram table add_text Splits a block of text up by whitespace and processes each word as a token. Automatically calls \"add_start_token()\" at the beginning of the text and \"add_end_token()\" at the end. add_tokens Adds an arbitrary list of tokens. add_start_token Adds the \"start token.\" This is useful because you often will want to mark the beginnings and ends of a token sequence so that when generating your output the generator will know what tokens start a sequence and when to end. add_end_token Adds the \"end token.\" See \"add_start_token()\". analyze Generates an n-gram frequency table. Returns a hashref of *N => tokens => count*, where N is the number of tokens (will be from 2 to ngram_width). You will not normally need to call this unless you want to get the n-gram frequency table. generate_text After feeding in text tokens, this will return a new block of text based on whatever text was added. generate Generates a new sequence of tokens based on whatever tokens have previously been fed in. next_tok Given a list of tokens, will pick a possible token to come next. token_lookup Returns a hashref of the counts of tokens that follow a sequence of tokens. token_key Serializes a sequence of tokens for use as a key into the n-gram table. You will not normally need to call this. 
serialize Returns the tokens and n-gram (if one has been generated) in a string deserialize($string) Deserializes a string and returns an \"Algorithm::NGram\" instance SEE ALSO Text::Ngram, Text::Ngrams AUTHOR Mischa Spiegelmock, COPYRIGHT AND LICENSE Copyright 2007 by Mischa Spiegelmock This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. ", "date" : "2011-09-23T05:08:06", "status" : "latest", "author" : "REVMISCHA", "directory" : false, "maturity" : "released", "indexed" : true, "documentation" : "Algorithm::NGram", "id" : "rEbfZ9plgUX_e6ySf4u3d5ZGrhE", "module" : [ { "indexed" : true, "authorized" : true, "version" : "0.9", "name" : "Algorithm::NGram", "version_numified" : 0.9 } ], "authorized" : true, "pod_lines" : [ [ 19, 51 ], [ 92, 5 ], [ 98, 5 ], [ 104, 7 ], [ 130, 5 ], [ 143, 8 ], [ 157, 5 ], [ 168, 8 ], [ 211, 6 ], [ 225, 6 ], [ 256, 5 ], [ 283, 5 ], [ 301, 6 ], [ 313, 5 ], [ 331, 5 ], [ 363, 18 ] ], "version" : "0.9", "binary" : false, "name" : "NGram.pm", "version_numified" : 0.9, "path" : "lib/Algorithm/NGram.pm", "release" : "Algorithm-NGram-0.9", "description" : "This is a module for analyzing token sequences with n-grams. You can use it to parse a block of text, or feed in your own tokens. It can generate new sequences of tokens from what has been fed in.", "distribution" : "Algorithm-NGram", "stat" : { "uid" : 500, "mtime" : 1316754239, "mode" : 33188, "size" : 7715, "gid" : 500 }, "level" : 2, "sloc" : 143, "slop" : 94, "mime" : "text/x-script.perl-module" } JSON-Parse-0.56/t/test-empty-string.t000644 001751 001751 00000000515 12253504436 016715 0ustar00benben000000 000000 # This tests for an old bug where empty strings didn't work properly. 
use warnings; use strict; use JSON::Parse 'parse_json'; use Test::More; my $json = parse_json ('{"buggles":"","bibbles":""}'); is ($json->{buggles}, ''); is ($json->{bibbles}, ''); $json->{buggles} .= "chuggles"; is ($json->{bibbles}, ''); done_testing (); exit; JSON-Parse-0.56/t/JSON-Parse.t000644 001751 001751 00000007133 13016165672 015125 0ustar00benben000000 000000 use warnings; use strict; use Test::More; use JSON::Parse qw/parse_json valid_json parse_json_safe/; use utf8; binmode STDOUT, ":utf8"; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; #binmode STDOUT, ":utf8"; my $jason = <<'EOF'; {"bog":"log","frog":[1,2,3],"guff":{"x":"y","z":"monkey","t":[0,1,2.3,4,59999]}} EOF my $x = parse_json ($jason); note ($x->{guff}->{t}->[2]); cmp_ok (abs ($x->{guff}->{t}->[2] - 2.3), '<', 0.00001, "Two point three"); my $xs = parse_json_safe ($jason); note ($xs->{guff}->{t}->[2]); cmp_ok (abs ($xs->{guff}->{t}->[2] - 2.3), '<', 0.00001, "Two point three"); my $fleece = '{"凄い":"技", "tickle":"baby"}'; my $y = parse_json ($fleece); ok ($y->{tickle} eq 'baby', "Parse hash"); my $ys = parse_json_safe ($fleece); ok ($ys->{tickle} eq 'baby', "Parse hash"); ok (valid_json ($fleece), "Valid OK JSON"); my $argonauts = '{"medea":{"magic":true,"nice":false}}'; my $z = parse_json ($argonauts); ok ($z->{medea}->{magic}, "Parse true literal."); ok (! ($z->{medea}->{nice}), "Parse false literal."); my $zs = parse_json_safe ($argonauts); ok ($zs->{medea}->{magic}, "Parse true literal."); ok (! ($zs->{medea}->{nice}), "Parse false literal."); ok (valid_json ($argonauts), "Valid OK JSON"); # Test that empty inputs result in an undefined return value, and no # error message. eval { my $Q = parse_json (''); }; ok ($@, "Empty string makes error"); ok ($@ =~ /empty input/i, "Empty input error for empty input"); eval { # Switch off uninitialized value warning for this test. 
no warnings; my $R = parse_json (undef); }; ok ($@, "Empty string makes error"); ok ($@ =~ /empty input/i, "Empty input error for empty input"); eval { my $S = parse_json (' '); }; ok ($@, "Empty string makes error"); ok ($@ =~ /empty input/i, "Empty input error for empty input"); my $n; eval { $n = '{"骪":"\u9aaa"'; my $nar = parse_json ($n); }; ok ($@, "found error"); { my $warning; local $SIG{__WARN__} = sub { $warning = $_[0]; }; eval { $n = '{"骪":"\u9aaa"'; my $nar = parse_json_safe ($n); }; ok (! $@, "no exception with parse_json_safe"); unlike ($warning, qr/\n.+/, "no newlines in middle of error"); like ($warning, qr/JSON-Parse\.t/, "right file name for error"); } ok (! valid_json ($n), "! Not valid missing end }"); my $bad1 = '"bad":"city"}'; $@ = undef; eval { parse_json ($bad1); }; ok ($@, "found error in '$bad1'"); #like ($@, qr/stray characters/, "Error message as expected"); my $notjson = 'this is not lexable'; $@ = undef; eval { parse_json ($notjson); }; ok ($@, "Got error message"); #like ($@, qr/stray characters/i, "unlexable message $@ OK"); ok (! valid_json ($notjson), "Not valid bad json"); my $wi =<{address}->{postalCode} eq '10021', "Test a value $xi->{address}->{postalCode}"); ok (valid_json ($wi), "Validate"); my $perl_a = parse_json ('["a", "b", "c"]'); ok (ref $perl_a eq 'ARRAY', "json array to perl array"); my $perl_b = parse_json ('{"a":1, "b":2}'); ok (ref $perl_b eq 'HASH', "json object to perl hash"); done_testing (); exit; JSON-Parse-0.56/t/syntax.t000644 001751 001751 00000000455 13050303352 014615 0ustar00benben000000 000000 # This is a test for a false syntax error produced by this module on # legitimate input. use warnings; use strict; use FindBin '$Bin'; use Test::More; use JSON::Parse 'json_file_to_perl'; eval { my $json = json_file_to_perl ("$Bin/syntax-error-1.json"); }; note ($@); ok (! 
$@); done_testing (); JSON-Parse-0.56/t/perl-monks-1165399.t000644 001751 001751 00000001504 13012464667 016230 0ustar00benben000000 000000 # http://perlmonks.org/?node_id=1165399 # https://github.com/benkasminbullock/JSON-Parse/issues/34 use warnings; use strict; use utf8; use FindBin '$Bin'; use Test::More; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; binmode STDOUT, ":encoding(utf8)"; binmode STDERR, ":encoding(utf8)"; use Data::Dumper; use JSON::Parse; my $j = JSON::Parse->new(); # no complain, no effect: $j->warn_only(1); # legal json: eval { my $pl = $j->run('{"k":"v"}'); }; ok (! $@); # illegal json, the following statement dies: my $warning; $SIG{__WARN__} = sub { $warning = "@_" }; eval { my $pl = $j->run('illegal json'); }; ok (! $@, "No fatal error"); ok ($warning, "Got warning"); undef $SIG{__WARN__}; done_testing (); JSON-Parse-0.56/t/array.t000644 001751 001751 00000001605 12256004614 014411 0ustar00benben000000 000000 use warnings; use strict; use Test::More; use JSON::Parse ':all'; # This was a bug with uninitialized memory. 
my $array = '[0,0,0,0,0,0,0,0,0,1,1,0,0,1]'; my $parray = eval ($array); my $jarray = parse_json ($array); is_deeply ($jarray, $parray); my $array2 = '[0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]'; my $parray2 = eval ($array2); my $jarray2 = parse_json ($array2); is_deeply ($jarray2, $parray2); done_testing (); exit; JSON-Parse-0.56/t/read-file.t000644 001751 001751 00000000366 13050303352 015120 0ustar00benben000000 000000 #!/home/ben/software/install/bin/perl use warnings; use strict; use Test::More; use FindBin '$Bin'; use JSON::Parse 'json_file_to_perl'; my $p = json_file_to_perl ("$Bin/test.json"); ok ($p->{distribution} eq 'Algorithm-NGram'); done_testing (); JSON-Parse-0.56/t/syntax-error-1.json000644 001751 001751 00000000050 12253504436 016611 0ustar00benben000000 000000 { "version_numified" : 8e-06 } JSON-Parse-0.56/t/collision.t000644 001751 001751 00000002067 12614664057 015304 0ustar00benben000000 000000 use warnings; use strict; use utf8; use FindBin '$Bin'; use Test::More; use Encode 'decode_utf8'; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; binmode STDOUT, ":encoding(utf8)"; binmode STDERR, ":encoding(utf8)"; use JSON::Parse 'parse_json'; my $j = '{"a":1, "a":2}'; my $p = parse_json ($j); cmp_ok ($p->{a}, '==', 2, "Test documented hash key collision behaviour"); my $j2 = '{"a":1, "a":2, "a":3, "a":4, "a":5, "a":6, "a":7, "a":8, "a":9, "a":10}'; my $p2 = parse_json ($j2); cmp_ok 
($p2->{a}, '==', 10, "Test documented hash key collision behaviour"); my $focus = '{"hocus":10,"pocus":20,"hocus":30,"focus":40}'; my $jp = JSON::Parse->new (); $jp->detect_collisions (1); eval { $jp->run ($focus); }; ok ($@); like ($@, qr/"hocus"/); my $yodi = '{"ほかす":10,"ぽかす":20,"ほかす":30,"ふぉかす":40}'; eval { $jp->run ($yodi); }; ok ($@); my $error = decode_utf8 ($@); like ($error, qr/"ほかす"/); note ($error); done_testing (); JSON-Parse-0.56/t/rfc7159.t000644 001751 001751 00000003303 12556116403 014373 0ustar00benben000000 000000 use warnings; use strict; use Test::More; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; binmode STDOUT, ":encoding(utf8)"; binmode STDERR, ":encoding(utf8)"; use JSON::Parse qw/parse_json valid_json/; my $stringonly = '"this"'; my $j; eval { $j = parse_json ($stringonly); }; ok (! $@, "no errors parsing rfc7159 json"); is ($j, 'this', "Got correct value as well"); ok (valid_json ($stringonly), "And it's valid json too"); my $numberonly = '3.14'; my $j2; eval { $j2 = parse_json ($numberonly); }; ok (! $@, "no errors parsing rfc7159 json"); cmp_ok (abs ($j2 - $numberonly), '<', 0.0001, "got number back"); ok (valid_json ($numberonly), "And it's valid JSON too"); my $numberonly2 = '0.14'; my $jx; eval { $jx = parse_json ($numberonly2); }; ok (! $@, "no errors parsing rfc7159 json $numberonly2"); cmp_ok (abs ($jx - ($numberonly2 + 0.0)), '<', 0.0001, "got number back $numberonly2"); ok (valid_json ($numberonly2), "And it's valid JSON too"); my $numberws = ' 5.55e10 '; ok (valid_json ($numberws), "$numberws validated"); my $literalws = ' true '; ok (valid_json ($literalws), "'$literalws' validates"); my $j3; eval { $j3 = parse_json ($literalws); }; ok (! 
$@, "no errors parsing '$literalws'"); ok ($j3, "'$literalws' gives a true value"); is ($j3, 1, "'$literalws' is equal to one"); my $literal = 'null'; ok (valid_json ($literal), "'$literal' validates"); my $j4; eval { $j4 = parse_json ($literal); }; ok (! $@, "no errors parsing '$literal'"); ok (! $j4, "bare literal null is false value"); ok (! defined ($j4), "bare literal null is undefined"); done_testing (); JSON-Parse-0.56/t/kolmorogov42-1.t000644 001751 001751 00000010714 13012464667 016007 0ustar00benben000000 000000 use warnings; use strict; use utf8; use FindBin '$Bin'; use Test::More; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; binmode STDOUT, ":encoding(utf8)"; binmode STDERR, ":encoding(utf8)"; use JSON::Parse 'parse_json'; my $input = <<'EOF'; {"date":1468338282,"text":"asasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasas
d555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdae\nrt432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432f
ghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasd555sdaert432fghyuiopasasddsasasasdasdasdasdasdasdasdasdassdfsdfsdfs"} EOF my $p; eval { $p = parse_json ($input); }; ok (! $@); ok ($p); done_testing (); JSON-Parse-0.56/t/bugzilla-2049.t000644 001751 001751 00000000306 13176242603 015501 0ustar00benben000000 000000 use warnings; use strict; use Test::More; use JSON::Parse 'json_file_to_perl'; eval { my $type = ''; my $tri2file = json_file_to_perl ('$type-tri2file.txt'); }; ok ($@); note ($@); done_testing (); JSON-Parse-0.56/t/numbers.t000644 001751 001751 00000002760 12254541431 014752 0ustar00benben000000 000000 use warnings; use strict; use JSON::Parse qw/json_to_perl parse_json/; use Test::More; my $p; # This was causing some problems with the new grammar / lexer. my $jeplus = '[1.9e+9]'; eval { $p = json_to_perl ($jeplus); }; #note ($@); ok (! 
$@, "Parsed $jeplus OK"); my $j = <{integer}, 100), "Got 100 for integer"); ok (compare ($p->{decimal} , 1.5), "Got 1.5 for decimal"); ok (compare ($p->{exponent} , 100), "Got 100 for exponent"); ok (compare ($p->{"exponent-"} , 19/1000), "got 19/1000 for exponent-"); ok (compare ($p->{"exponent+"} , 1_900_000_000), "got 1_900_000_000 for exponent+"); ok (compare ($p->{fraction} , 0.01), "got 0.01 for fraction"); my $q = @{json_to_perl ('[0.12345]')}[0]; ok (compare ($q, '0.12345'), "Got 0.12345"); # Illegal numbers eval { json_to_perl ('[0...111]'); }; ok ($@, "Don't accept 0...111"); eval { json_to_perl ('[0111]'); }; like ($@, qr/unexpected character/i, "Error for leading zero"); my $long_number = '12345678901234567890123456789012345678901234567890'; my $out = parse_json ("[$long_number]"); is ($out->[0], $long_number); done_testing; exit; sub compare { my ($x, $y) = @_; my $error = 0.00001; if (abs ($x - $y) < $error) { return 1; } print "$x and $y are not equal.\n"; return; } JSON-Parse-0.56/t/object.t000644 001751 001751 00000016207 13013213015 014532 0ustar00benben000000 000000 # Test the new "object" behaviour. # This tests: # * Copy literals, don't use read-only scalars. 
# * User-defined booleans # ** Correct object name in user-defined booleans # ** Copy literals and user-defined booleans interplay # ** Deletion of user-defined booleans # * Detect hash collisions use warnings; use strict; use utf8; use FindBin '$Bin'; use Test::More; my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; binmode STDOUT, ":encoding(utf8)"; binmode STDERR, ":encoding(utf8)"; use JSON::Parse; # ____ _ _ _ _ # / ___|___ _ __ _ _ | (_) |_ ___ _ __ __ _| |___ # | | / _ \| '_ \| | | | | | | __/ _ \ '__/ _` | / __| # | |__| (_) | |_) | |_| | | | | || __/ | | (_| | \__ \ # \____\___/| .__/ \__, | |_|_|\__\___|_| \__,_|_|___/ # |_| |___/ # my $jp = JSON::Parse->new (); $jp->copy_literals (1); my $stuff = '{"hocus":true,"pocus":false,"focus":null}'; my $out = $jp->run ($stuff); eval { $out->{pocus} = "bad city"; }; ok (! $@, "Can modify literals without error"); $jp->copy_literals (0); my $stuff2 = '{"hocus":true,"pocus":false,"focus":null}'; my $out2 = $jp->run ($stuff); eval { $out2->{pocus} = "bad city"; }; ok ($@, "Can't modify literals without error"); note ($@); # User-defined booleans package Ba::Bi::Bu::Be::Bo; # https://metacpan.org/source/MAKAMAKA/JSON-PP-2.27300/lib/JSON/PP.pm#L1390 $Ba::Bi::Bu::Be::Bo::true = do { bless \(my $dummy = 1), "JSON::PP::Boolean" }; $Ba::Bi::Bu::Be::Bo::false = do { bless \(my $dummy = 0), "JSON::PP::Boolean" }; $Ba::Bi::Bu::Be::Bo::null = do { bless \(my $dummy), "JSON::PP::Boolean" }; sub true {$Ba::Bi::Bu::Be::Bo::true;} sub false {$Ba::Bi::Bu::Be::Bo::false;} sub null {$Ba::Bi::Bu::Be::Bo::null;} 1; package main; # $jpub = j-son p-arser with u-ser b-ooleans my $jpub = JSON::Parse->new (); my $jpub1 = $jpub->run ($stuff); eval { $jpub1->{hocus} = "bad city"; }; ok ($@, "got error altering literals with default JSON::Parse object"); # Use the same things all the people on CPAN do, switching off the # warnings. 
$jpub->set_true ($Ba::Bi::Bu::Be::Bo::true); $jpub->no_warn_literals (1); $jpub->set_false ($Ba::Bi::Bu::Be::Bo::false); $jpub->no_warn_literals (0); $jpub->set_null ($Ba::Bi::Bu::Be::Bo::null); my $jpub2 = $jpub->run ($stuff); eval { $jpub2->{hocus} = "bad city"; }; ok (! $@, "Values are not read-only with user-defined true/false values"); my $jpub3 = $jpub->run ($stuff); like (ref $jpub3->{hocus}, qr/JSON::PP::Boolean/, "true value correct type"); like (ref $jpub3->{pocus}, qr/JSON::PP::Boolean/, "false value correct type"); like (ref $jpub3->{focus}, qr/JSON::PP::Boolean/, "null value correct type"); # Now test the same thing after switching on copy_literals. $jpub->no_warn_literals (1); $jpub->copy_literals (1); $jpub->no_warn_literals (0); my $jpub4 = $jpub->run ($stuff); like (ref $jpub4->{hocus}, qr/JSON::PP::Boolean/, "true value correct type even with copy-literals"); like (ref $jpub4->{pocus}, qr/JSON::PP::Boolean/, "false value correct type even with copy-literals"); like (ref $jpub4->{focus}, qr/JSON::PP::Boolean/, "null value correct type even with copy-literals"); # Now delete all our user-defined booleans $jpub->delete_true (); $jpub->delete_false (); $jpub->delete_null (); # Test the objects have gone. my $jpub5 = $jpub->run ($stuff); unlike (ref $jpub5->{hocus}, qr/JSON::PP::Boolean/, "User true deleted"); unlike (ref $jpub5->{pocus}, qr/JSON::PP::Boolean/, "User false deleted"); unlike (ref $jpub5->{focus}, qr/JSON::PP::Boolean/, "User null deleted"); # Now test that copy-literals is still working. my $jpub6 = $jpub->run ($stuff); eval { $jpub6->{hocus} = "bad city"; }; ok (! $@, "Values are not read-only, copy literals still works"); # Finally switch off copy-literals and check that things are back to # the default behaviour. 
$jpub->copy_literals (0); my $jpub7 = $jpub->run ($stuff); unlike (ref $jpub7->{hocus}, qr/JSON::PP::Boolean/, "User true deleted"); unlike (ref $jpub7->{pocus}, qr/JSON::PP::Boolean/, "User false deleted"); unlike (ref $jpub7->{focus}, qr/JSON::PP::Boolean/, "User null deleted"); eval { $jpub7->{hocus} = "bad city"; }; ok ($@, "Values are read-only again"); # Check it's the right error, "Modification of a readonly value". like ($@, qr/Modification/, "Error message looks good"); note ($@); # Check that this doesn't make a warning, we want the user to be able # to set "null" to "undef". my $warning; $SIG{__WARN__} = sub { $warning = "@_"; }; $jpub->set_true (undef); ok ($warning, "Warning on setting true to non-true value"); $jpub->set_true (0); ok ($warning, "Warning on setting true to non-true value"); $jpub->set_true (''); ok ($warning, "Warning on setting true to non-true value"); $warning = undef; $jpub->set_false (undef); ok (! $warning, "no warning when setting user-defined false"); $warning = undef; $jpub->set_false (0); ok (! $warning, "no warning when setting user-defined false"); $warning = undef; $jpub->set_false (''); ok (! $warning, "no warning when setting user-defined false"); $warning = undef; # https://www.youtube.com/watch?v=g4ouPGGLI6Q $jpub->set_false ('Yodeadodoyodeadodoyodeadodoyodeadodoyodeadodoyodeadodoyo-bab-baaaaa Ahhhhhh-aaahhhh-aaaaaa-aaaaAAA! Ohhhhhh-ooohhh-oooooo-oooOOO!'); ok ($warning, "warning when setting user-defined false to a true value"); note ($warning); $warning = undef; $jpub->set_null (undef); $jpub->set_null (0); $jpub->set_null (''); ok (! 
$warning, "no warning when setting user-defined null"); # ____ _ _ _ _ _ _ # | _ \ ___| |_ ___ ___| |_ ___ ___ | | (_)___(_) ___ _ __ ___ # | | | |/ _ \ __/ _ \/ __| __| / __/ _ \| | | / __| |/ _ \| '_ \/ __| # | |_| | __/ || __/ (__| |_ | (_| (_) | | | \__ \ | (_) | | | \__ \ # |____/ \___|\__\___|\___|\__| \___\___/|_|_|_|___/_|\___/|_| |_|___/ # # my $stuff3 = '{"hocus":1,"pocus":2,"hocus":3,"focus":4}'; my $jp3 = JSON::Parse->new (); eval { $jp3->run ($stuff3); }; ok (!$@, "Did not detect collision in default setting"); $jp3->detect_collisions (1); eval { $jp3->run ($stuff3); }; ok ($@, "Detected collision"); note ($@); $jp3->detect_collisions (0); eval { $jp3->run ($stuff3); }; ok (!$@, "Did not detect collision after reset to 0"); SKIP: { eval "require 5.14;"; if ($@) { skip "diagnostics_hash requires perl 5.14 or later", 1; } # ____ _ _ _ _ _ # | _ \(_) __ _ __ _ _ __ ___ ___| |_(_) ___ | |__ __ _ ___| |__ # | | | | |/ _` |/ _` | '_ \ / _ \/ __| __| |/ __| | '_ \ / _` / __| '_ \ # | |_| | | (_| | (_| | | | | (_) \__ \ |_| | (__ | | | | (_| \__ \ | | | # |____/|_|\__,_|\__, |_| |_|\___/|___/\__|_|\___| |_| |_|\__,_|___/_| |_| # |___/ my $jp4 = JSON::Parse->new (); $jp4->diagnostics_hash (1); eval { $jp4->run ("{{{{{"); }; ok (ref $@ eq 'HASH', "Got hash diagnostics"); }; done_testing (); JSON-Parse-0.56/t/valid-json.t000644 001751 001751 00000015526 12614126344 015353 0ustar00benben000000 000000 # This test is meant to exercise all the possible ways that parsing # can fail, and also check that correct, yet weird or stupid inputs # are not marked as invalid. use warnings; use strict; use Test::More; use JSON::Parse qw/valid_json validate_json/; # https://github.com/benkasminbullock/JSON-Parse/issues/2 my $fdegir1 = <<'EOF'; { "gav": { "groupId": "mygroup", "artifactId": "myartifact" "version": "1.0" } } EOF ok (! 
valid_json ($fdegir1)); eval { validate_json ($fdegir1); }; ok ($@, "validate_json dies"); like ($@, qr/line 5/i, "line number OK"); my $empty = ' '; run_fail_like ($empty, qr/unexpected end of input/i); my $undef = undef; ok (! valid_json ($undef)); eval { no warnings 'uninitialized'; validate_json ($undef); use warnings 'uninitialized'; }; ok ($@, "undef input dies"); like ($@, qr/empty input/i, "flagged as empty input"); # ____ _ # / ___|___ _ __ ___ _ __ ___ __ _ ___ __ _ _ __ __| | # | | / _ \| '_ ` _ \| '_ ` _ \ / _` / __| / _` | '_ \ / _` | # | |__| (_) | | | | | | | | | | | (_| \__ \ | (_| | | | | (_| | # \____\___/|_| |_| |_|_| |_| |_|\__,_|___/ \__,_|_| |_|\__,_| # # _ # ___ ___ | | ___ _ __ ___ # / __/ _ \| |/ _ \| '_ \/ __| # | (_| (_) | | (_) | | | \__ \ # \___\___/|_|\___/|_| |_|___/ # # Test comma and colon parsing. my $unknown_character = qr/unexpected character/i; my $bad_comma_1 = '{,"bad":"bad"}'; run_fail_like ($bad_comma_1, $unknown_character); my $bad_comma_array = '[,"bad","bad"]'; run_fail_like ($bad_comma_array, $unknown_character); my $bad_comma_2 = '{"bad",:"bad"}'; run_fail_like ($bad_comma_2, $unknown_character); my $bad_comma_3 = '{"bad":,"bad"}'; run_fail_like ($bad_comma_3, $unknown_character); my $bad_comma_4 = '{"bad":"bad",}'; run_fail_like ($bad_comma_4, $unknown_character); my $bad_comma_5 = '["bad","bad",]'; run_fail_like ($bad_comma_5, $unknown_character); my $no_comma_array = '["bad" "bad"]'; run_fail_like ($no_comma_array, $unknown_character); # Single-element array OK run_ok ('["bad"]'); # Empty array OK run_ok ('[]'); # Empty object OK run_ok ('{}'); # Check the checking of final junk my $too_many_end_braces = '{"bad":"bad"}}'; run_fail_like ($too_many_end_braces, $unknown_character); my $too_many_end_brackets = '["bad","bad"]]'; run_fail_like ($too_many_end_brackets, $unknown_character); run_fail_like ('{"bad":"forgot the end quotes}', qr/end of input/i); # Bug in "get_key_string" found by randomtest run_fail_like 
("[\"\0]", $unknown_character); # See what happens when we send a string with a null byte. my $contains_null = '["' . "pupparoon\0\0 baba". '"]'; run_fail_like ($contains_null, qr/unexpected.*0x00/i); # See what happens when we send a string with a disallowed byte. my $contains_junk = '["' . chr (07) . '"]'; run_fail_like ($contains_junk, qr/unexpected.*0x07/i); my $contains_escaped_null = '["\u0000"]'; run_ok ($contains_escaped_null); my $contains_escaped_junk = '["\u0007"]'; run_ok ($contains_escaped_junk); # Don't fail on pointless whitespace. my $contains_silly_whitespace = <