pax_global_header00006660000000000000000000000064147334140560014521gustar00rootroot0000000000000052 comment=c6fe0c376aac38f095c744ef12c5337e193b3579 infgen-3.5/000077500000000000000000000000001473341405600126365ustar00rootroot00000000000000infgen-3.5/.gitignore000066400000000000000000000000211473341405600146170ustar00rootroot00000000000000infgen .DS_Store infgen-3.5/LICENSE000066400000000000000000000016351473341405600136500ustar00rootroot00000000000000 Copyright (C) 2005-2022 Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the author be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Mark Adler madler@alumni.caltech.edu infgen-3.5/README.md000066400000000000000000000017741473341405600141260ustar00rootroot00000000000000Synopsis -------- _infgen_ is a deflate stream disassembler. It will read a gzip, zlib, or raw deflate stream, and output a readable description of the contents. Motivation ---------- _infgen_ permits the examination of deflate compressed data for instructional purposes, to see how the data is compressed, and for debugging deflate compressors. Installation ------------ Simply compile `infgen.c`, and provide the compressed data to stdin. The disassembled output will be written to stdout. 
The zlib library needs to be linked for CRC-32 functionality that is used on PNG input. Test ---- gzip < infgen.c | ./infgen will display the disassembled result of compressing the _infgen_ source code. Use: infgen -h to see the command options. Documentation ------------- A list of all of the command options and detailed technical documentation can be found in the comments at the start of [infgen.c](infgen.c) License ------- This code is under the zlib license, found in the source file and LICENSE. infgen-3.5/infgen.c000066400000000000000000002454001473341405600142550ustar00rootroot00000000000000/* infgen version 3.5, 26 December 2024 Copyright (C) 2005-2024 Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the author be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Mark Adler madler@alumni.caltech.edu */ /* Read a zlib, gzip, png, or raw deflate stream and write a defgen-compatible or simple binary encoded stream representing that input to stdout. This is based on the puff.c code to decompress deflate streams. Note that neither the zlib nor the gzip trailer is checked against the uncompressed data -- only the fact that the trailer is present is checked. 
Usage: infgen [-[d[d]][m][i][s][c[c]][q[q]][r][b[b]]] < foo.gz > foo.def or: infgen [-[d[d]][m][i][s][c[c]][q[q]][r][b[b]]] foo.gz > foo.def where foo.gz is a gzip file (it could have been a zlib, png, or raw deflate stream as well), and foo.def is a defgen description of the file or stream, which is in a readable text format (unless -b is used). For png files, the output is a description of the zlib stream extracted from the IDAT chunks. The description includes the literal/length and distance code lengths for dynamic blocks. The -d (dynamic) option generates directives to exactly reconstruct the dynamic block headers. With -d, the code lengths are still included, but now as comments instead of directives. The -dd option is the same as -d, but with the bit sequences for each item shown as a comment after the item. The -m (match) option shows the copied data after each match. The -i (info) option generates additional directives for gzip or zlib headers that permit their exact reconstruction. For png files, -i will show chunk types and lengths as comments, but not the contents, other than IDAT chunks. The -s (statistics) option writes out comments with statistics for each deflate block and totals at the end. The -c (color) option prints the output with different standard terminal colors for the different components, on supporting terminals. -cc uses the high-intensity colors instead of the standard colors. With -dd, Huffman code bits are gray in the comment field, to discriminate the code bits from the extra bits in lens and match directives. The -q (quiet) option suppresses the dynamic block code lengths, whether as directives or as comments. The -qq (really quiet) option suppresses the output of all deflate stream descriptions, leaving only the header and trailer information. However if -qq is used with -s, the statistics information on the deflate stream is still included. 
The -r (raw) option forces the interpretation of the input as a raw deflate stream, for those cases where the start of a raw stream happens to mimic one of the other headers. The -b (binary) option writes a compact binary format instead of the defgen format. In that case, all other options except -r are ignored. The -bb option includes compressed-data bit counts in the output. Examples: infgen foo.gz show the disassembled gzip/deflate stream infgen -q foo.gz show only the block data contents infgen -s foo.gz include statistics on the blocks and streams infgen -id foo.gz show the contents of wrapper and block headers infgen -ddm foo.gz show dynamic headers, bits, and matched strings infgen -iddcc foo.gz let's see it all in technicolor infgen -b foo.gz > foo.bin write the compact binary format instead of text Both the defgen and the compact binary formats are described below. */ /* defgen format: Content: The defgen format consists of lines of comments and directives. Each line is terminated by a single new line character '\n', though it may be written as "\r\n" on systems with end-of-line conversions. defgen accepts either. The directives are used to construct (or reconstruct) a deflate stream. Each directive is a word at the start of the line, possibly followed by parameters. Comments: defgen lines whose first character is an exclamation mark ('!') are comments, and are ignored by defgen. Blank lines are also ignored. All other lines are directives. If an exclamation mark appears after a directive and not following a single quote, then it and the characters after it are a comment and are ignored. infgen-generated informational comments are described below. Headers and trailers: The "gzip" directive writes a gzip header. It is optionally preceded by directives bearing information to be contained in the gzip header: "name", "comment", "extra", "text", "time", "xfl", "os", and "hcrc".
The "name", "comment", and "extra" directives use the same parameter format as "data" and "literal" described below, and may be repeated over multiple lines for long content. For "name" and "comment", the parameters do not include the terminating zero. For example: name 'linux-3.1.6.tar The "time", "os", and "xfl" (extra flags) directives each have a single numeric parameter. "os" and "xfl" have a parameter in the range of 0..255, and "time" is in the range of 0..2^32-1. infgen adds a comment after the time parameter with the local time zone interpretation of that value. If "os" is not present, it is taken to be 3 (for Unix). If "xfl" or "time" is not present, the value is taken to be zero. "text" and "hcrc" have no parameter. "text" sets the text flag. hcrc signals a two-byte CRC of the header. The "crc" directive writes the CRC-32 of the uncompressed data in little-endian order. The "length" directive writes the length of the uncompressed data, modulo 2^32, in little-endian order. The combination of a CRC and a length in that order is the gzip trailer. Either or both can optionally have a numeric parameter in the range 0..2^32-1 which would be used in place of the value derived from the data. infgen does not write those parameters. The "zlib" directive writes a zlib header. The zlib directive has an optional numeric parameter which is the log-base-2 of the window size, in the range 8..15. If there is no parameter, 15 is assumed. zlib may be preceded by the "level" directive, which has one parameter: the compression level used when compressing in the range 0..3. If level is not present, it is taken to be 2. zlib may also be preceded by a "dict" directive with the dictionary id as the numeric parameter, in the range 0..2^32-1. The "adler" directive writes the Adler-32 checksum of the uncompressed data in big-endian order. This is the zlib trailer.
adler may optionally have a numeric parameter in the range 0..2^32-1 that is used in place of the actual Adler-32 checksum of the data. Deflate blocks: Deflate data between zlib or gzip headers and trailers, or raw deflate data, consists of a series of deflate blocks. They are begun by the block type directives: "stored", "fixed", or "dynamic", and all end with "end" after the contents of the block. The last block has the directive "last" on its own line before the block type directive. The "stored" directive has an optional parameter which is the data that fills in the dummy bits to get to a byte boundary. If the parameter is not present, those bits are assumed to be zero. An additional "block3" block type indicates the illegal bit pattern for a fourth block type. Block headers: Fixed blocks have no header, and proceed immediately to the data after the fixed directive. A stored block header has as many bits as needed to go to the next byte boundary (see the "stored" parameter above), followed by four bytes of block length information. There is no directive for the length of the stored block, as it is implied by the amount of data up to the next end directive. A dynamic block has a header that describes the Huffman codes used to represent the literal/length and distance codes in the block. That description is itself compressed with a third code. The dynamic header is represented in one of two ways, or not at all. If there is no description of the header, then the block data can be used to construct an optimal set of Huffman codes for the contained symbols, and an optimum way to encode them in the header. In that case, the data immediately follows the dynamic directive. The first explicit way to describe the header is to list the number of bits in each literal/length and distance code. This is done with the "litlen" and "dist" directives. Each directive has two numerical parameters: the symbol index and the number of bits. E.g. "litlen 40 9" or "dist 16 5". 
In this case, dynamic is followed by all of the litlen directives, which is followed by all the dist directives. The litlen symbol must be in the range 0..285, and dist symbol must be in the range 0..29. The number of bits for both must be in the range 1..15. Only the symbols coded are listed. The header description is complete upon encountering the first "literal", "match", or "end". The second, more explicit way to describe the header is to list the actual contents of the header, from which the code lengths are derived. This is done with the directives "count", "code", "lens", "repeat", and "zeros". count has two parameters: the number of length code lengths (257..286) and the number of distance code lengths (1..30). code has two numerical parameters, the symbol index (0..18) and the number of bits for that symbol (1..7). lens has any number of parameters in 0..15, where each is the length of the corresponding literal/length or distance code. A zero length means that that symbol has no code and does not appear in the block. repeat and zeros each have one parameter which is the number of times to repeat a bit length. repeat repeats the most recent length 3..6 times. zeros repeats zeros 3..138 times. dynamic is followed by all of the code directives, and then by the lens directives, with repeat and zeros directives mixed in. The header description is complete upon encountering the first "literal", "match", or "end". Data: All compressed data is represented using the directives: "data", "literal", and "match", and optionally "copy". "data", "literal", and "copy" have the same parameters, directly representing bytes of data. "data" may be used only in stored blocks and "literal" may be used only in fixed or dynamic blocks. If the -m option is given to infgen, then "copy" shows the data copied after each match. "copy" provides redundant information, as it can be derived from the previously decompressed data and each match length and distance.
"copy" directives are ignored by defgen. The parameters of data, literal, and copy are a series of decimal numbers separated by spaces, followed by a string of printable characters. Each decimal number is in the range 0..255, and represents one byte of data. The string is a single quote, followed by any number of characters in the range 32..126. A single quote may appear within the string meaning a single quote in the data -- it does not end the string. The string is ended by the end of line or any other character not in the range 32..126. To append a comment to a line with a string, a tab ('\t') can end the string, which may then be followed by blank space and an exclamation mark for the comment. Either the numbers or the string are optional -- the directive's parameters may have only numbers or only a string. match has two numerical parameters. The first is the length of the match, in 3..258. The second is the distance back, in 1..32768. The data and the current block ends with an "end" directive. The "end" of a block that was started with "last" marks the end of the deflate stream. If that last block does not end at a bit boundary, the "bound" directive has a single numeric parameter with the fill bits, where those bits would be shifted up to fill in the last byte. If bound is not present, the bits up to the byte boundary are filled with zeros. infgen outputs the bound directive only when the fill bits are not all zeros. infgen comments: infgen starts with a comment line indicating the version of infgen that generated the defgen format output. E.g. "! infgen 3.5 output". infgen inserts an empty comment, a line with just an exclamation mark, before each header, deflate block, and trailers. If the -d option is used, then the litlen and dist directives are written as comments. E.g. "! litlen 40 9". 
If the -dd option is used, then each deflate stream element, other than stored bytes, is appended to each directive as a comment with a series of bit sequences shown as 0's and 1's. In this case literals are always one per line. The bits in each sequence are shown from most significant to least significant, as they appeared in the compressed data. For directives with multiple components, e.g. Huffman codes and extra bits, each component is shown as one bit sequence with the components separated by spaces. The sequences are shown in reverse order. In that way, if the spaces are removed, the bits are in the order they are pulled from the compressed data, reading right to left. So in: match 18 680 ! 10100111 1011 1 1101011 1101011 is the Huffman code and 1 is the extra bit for length 18. Then 1011 is the Huffman code and 10100111 are the extra bits for distance 680. If these bits happened to start at a byte boundary, then the first byte would be 11101011 or 0xeb, then second byte would be 01111011 or 0x7b. The third byte would have the low nybble 1010, or 0xa. With the -s option, infgen will generate statistics comments, all of which begin with "! stats ". There are statistics for each deflate block, and summary statistics after the last deflate block. The statistics comments are as follows: "! stats table n:m" gives the total number of bytes and bits in the dynamic block header, not including the three block identifier bits. For example, "! stats table 58:6" indicating 58 bytes and 6 bits = 470 bits. "! stats literals x.x bits each (n/m)" follows a fixed or dynamic block and gives the average number of bits per literal, the total number of bits for the literals in the block, and the number of literals in the block. For example, "! stats literals 5.7 bits each (3793/664)". If the block has no literals, then "! stats literals none" will be written. "! 
stats matches x.x% (n x x.x)" follows a fixed or dynamic block and gives the percentage of the uncompressed bytes in the block that came from matches, the number of matches in the block, and the average match length. For example, "! stats matches 82.6% (183 x 17.2)". If the block has no matches, then "! stats matches none" will be written. "! stats stored length n" follows each stored block and gives the number of uncompressed bytes in the stored block, which does not include the stored header. For example: "! stats stored length 838" is a stored block with 838 bytes. "! stats inout n:m (i) j k" follows any block and gives the total number of bytes and bits in the block, including the three-bit block identifier, the total number of symbols in the block (a literal and a match each count as one symbol), the number of uncompressed bytes generated by the block, and the maximum reach of the distances to data before the block. For example, "! stats inout 1889:4 (1906) 3810 -1718" is a block with 1889 bytes and 4 bits, 1906 symbols, 3810 uncompressed bytes, and maximum reach of 1718 bytes before the block by a match in the block. If the block does not reach before itself, the reach value is zero. After the last deflate block, total statistics are output. They all begin with "! stats total ". The block input and output amounts are summed for example as: "! stats total inout 93232233:0 (55120762) 454563840", with the same format as "! stats inout", except without the reach. "! stats total block average 34162.3 uncompressed" states for example that the average number of uncompressed bytes per block was 34162.3. Similarly "! stats total block average 4142.5 symbols" states that there were 4142.5 symbols on average per block. "! stats total literals 6.9 bits each" states that there were 6.9 bits used on average per literal. Lastly the matches are summed: "! stats total matches 95.2% (33314520 x 13.0)" with the same format as "! stats matches". 
*/ /* Compact binary format (-b) deflate content description (gzip and zlib headers and trailers are ignored): 0..0x7f: high byte of distance-1, followed by low byte of distance-1, followed by length-3 (three bytes total) 0x80..0xfe: literals 0..0x7e 0xff: prefix byte, followed by ... 0, 1: stored block (1 = last), followed by leftover bits (one byte) 2, 3: fixed block (3 = last) 4, 5: dynamic block (5 = last), then header terminated by a 0 byte 6, 7: invalid block (7 = last) 8: end of deflate stream, followed by leftover bits (one byte) 9..0x7e: reserved (not used) 0x7f..0xff: literals 0x7f..0xff dynamic block header: The binary dynamic block header description is terminated by a zero, and does not contain any zeros before that, in order to simplify decoding when the header is not of interest. The raw header is described, in order to permit exact reconstruction if desired. The header is this sequence of bytes: nlen - 256 number of length codes minus 256 (1..30, meaning 257..286) ndist number of distance codes (1..30) ncode number of code length codes (4..19) ncode * ncode bytes follow: len+1 code length plus one (1..8, meaning 0..7) opcodes * enough opcodes follow to describe nlen + ndist codes opcode each byte is 1..16 for lengths 0..15, or 17..20 to repeat the last length 3..6 times, or 21..156 to repeat zeros 3..138 times 0 a zero byte terminates the header description Literals are coded on average to 1.5 bytes, though often less since low literals are more common. Length-distance pairs are coded as three bytes. The coded form will be approximately 20% to 40% larger than the compressed form. --- Extensions to -b format when the -bb option is given --- The leftover bits after a stored block header or the end of the stream have a 1 bit above them so that the number of leftover bits can be determined. For example 0x80 means seven 0 bits, and 0x01 means no leftover bits.
A variable-length unsigned integer is represented in little-endian order with seven bits in each byte and the high bit set, except for the last byte which has the high bit clear. The last byte cannot be zero unless the value being represented is 0. If the value is 1 or more, then there are no zero bytes in the representation. The bit counts below are written as variable- length unsigned integers with values assured to be greater than zero. 0xff: prefix byte, followed by ... 9: total number of bits in the preceding block - 9 number of bits in the header - 2 number of bits in the literal codes + 1 (0 + 1 for stored) number of bits in the match codes + 1 (0 + 1 for stored) a terminating 0 byte 10..0x3f: reserved (not used) -- assume these are followed by a zero- terminated sequence of bytes, like 4, 5, and 9 above (this permits compatible future extensions) 0x40..0x7e: reserved (not used) -- assume these are followed by nothing */ /* Version history: 1.0 20 Jan 2005 First version 1.1 27 Feb 2005 Clean up for distribution 1.2 27 Feb 2005 Remove comments for non-existent return code Check for distances too far back and issue warning 1.3 23 Jul 2006 Provide option to turn off dynamic trees Add option for statistics comments in output Process concatenated streams Show the gzip file name if present Replace cryptic error codes with descriptive messages Correct error messages for incomplete deflate stream 1.4 21 Mar 2007 Add -d option for showing the raw dynamic block header information as it comes in, as comments (for checking initial gzip/deflate fragments for sensibility) Allow multiple options after the initial dash 1.5 9 Jan 2008 Treat no symbol for end-of-block as an error Fix error in use of error message table (inferr[]) 1.6 12 Apr 2008 Add stored block length comment for -s option 1.7 25 Jul 2008 Add some diagnostic information to distance too far back Synchronize stdout and stderr for error messages 1.8 5 Dec 2008 Fix output header to match version Add -r (raw) 
option to ignore faux zlib/gzip headers Check distance too far back vs. zlib header window size 1.9 9 Jun 2009 Add hack to avoid MSDOS end-of-line conversions Avoid VC compiler warning 2.0 4 Nov 2011 Change fprintf to fputs to avoid warning Add block statistics on literals Allow bad zlib header method to proceed as possible raw Fix incorrect lenlen and distlen comments with -d 2.1 13 Jan 2013 Use uintmax_t for counts instead of unsigned long Fix bug: show block end stats only when -s specified Make the inout comment format a tad more readable Fix stored length stat comment to start with stats Add -q (quiet) option to not output literals and matches Add maximum reach before current block to stats inout Add -i (info) and extra, name, and comment gzip directives Check for ungetc() failure (only one guaranteed) Normally put out only litlen and dist for dynamic header Put out codes, lenlen, distlen, and repeat for -d For -d, still write litlen and dist, but as comments Delete extraneous code comments for -d Have repeat directive use -1 to indicate copy last length Remove extraneous symbol index from lenlen and distlen Replace repeat directive with repeat and zeros directives Add window size to zlib directive, if not 15 Add level and dict directives for zlib headers Add extensive comments on the infgen output format 2.2 10 Feb 2013 Don't show gzip header extra directive if -i not given Don't show zlib header info directives if -i not given Note hcrc directive in format description Add "text" directive for that bit in gzip header flags Add "count" directive for dynamic headers Change "lenlen" and "distlen" directives to just "lens" Check for invalid code length codes in dynamic blocks Change "static" to "fixed" to be consistent with RFC 1951 Add a compact binary output format (-b) Support an input path on the command line Detect when input is from tty, show help in that case Change options -n to -q, and -q to -qq Build struct state in main() Add local time description as 
a comment in time directive 2.3 18 Jul 2015 Distinguish incomplete from oversubscribed codes Use symbols for error codes Move all if statement actions to next line Show version in help 2.4 2 Jan 2017 Fix erroneous declaration of i/o error on devices 2.5 24 Jul 2021 Set window size from zlib header Add -dd option to show the bit sequences for each item 2.6 22 Aug 2021 Fix bug in binary (-b) output for repeats and zeros 2.7 7 Jan 2022 Fix bit ordering in comments with the -dd option 2.8 9 Jan 2022 Fix bug for gzip header extra field when -i not given Add annotations in comments for gzip extra sub-fields Discriminate non-binary comment with brackets 3.0 10 Aug 2022 Update to zlib license 3.1 19 Jul 2023 Detect and extract the zlib data from PNG files 3.2 26 Jul 2023 Check PNG chunk CRCs 3.3 20 Jun 2024 Add -bb option to include bit counts in binary output 3.4 9 Dec 2024 Add -m option to show data copied by each match Leave input and output pipes open when done 3.5 26 Dec 2024 Add -c, -cc options to colorize textual output */ #define IG_VERSION "3.5" #include // putc(), getc(), ungetc(), fputs(), fflush(), // fopen(), fclose(), fprintf(), vfprintf(), // fread(), fwrite(), stdout, stderr, FILE, EOF #include // exit() #include // strerror(), memcmp(), memset() #include // errno #include // time_t, gmtime(), asctime() #include // va_list, va_start(), va_end() #include // intmax_t, PRIuMAX #include // jmp_buf, setjmp(), longjmp() #include // isatty() #include "zlib.h" // crc32(), get_crc_table() #if defined(MSDOS) || defined(OS2) || defined(_WIN32) || defined(__CYGWIN__) # include # include # define SET_BINARY_MODE(file) _setmode(_fileno(file), O_BINARY) #else # define SET_BINARY_MODE(file) #endif #define local static /* * infgen() return codes: * * 1: available deflate data did not terminate * 0: successful inflate * -1: invalid block type (type == 3) * -2: stored block length did not match one's complement * -3: dynamic block code description: too many length or distance 
codes * -4: dynamic block code description: code lengths codes oversubscribed * -5: dynamic block code description: code lengths codes incomplete * -6: dynamic block code description: repeat lengths with no first length * -7: dynamic block code description: repeat more than specified lengths * -8: dynamic block code description: literal/length code oversubscribed * -9: dynamic block code description: literal/length code incomplete * -10: dynamic block code description: distance code oversubscribed * -11: dynamic block code description: distance code incomplete * -12: dynamic block code description: missing end-of-block code * -13: invalid literal/length or distance code in fixed or dynamic block */ // infgen() return code symbols. #define IG_INCOMPLETE 1 #define IG_OK 0 #define IG_BLOCK_TYPE_ERR -1 #define IG_STORED_LENGTH_ERR -2 #define IG_TOO_MANY_CODES_ERR -3 #define IG_CODE_LENGTHS_CODE_OVER_ERR -4 #define IG_CODE_LENGTHS_CODE_UNDER_ERR -5 #define IG_REPEAT_NO_FIRST_ERR -6 #define IG_REPEAT_TOO_MANY_ERR -7 #define IG_LITLEN_CODE_OVER_ERR -8 #define IG_LITLEN_CODE_UNDER_ERR -9 #define IG_DIST_CODE_OVER_ERR -10 #define IG_DIST_CODE_UNDER_ERR -11 #define IG_NO_END_CODE_ERR -12 #define IG_BAD_CODE_ERR -13 // infgen() negative return code messages. local const char *inferr[] = { /* -1 */ "invalid block type (3)", /* -2 */ "stored block length complement mismatch", /* -3 */ "too many length or distance codes", /* -4 */ "code lengths code is oversubscribed", /* -5 */ "code lengths code is incomplete", /* -6 */ "length repeat with no first length", /* -7 */ "repeat more lengths than available", /* -8 */ "literal/length code is oversubscribed", /* -9 */ "literal/length code is incomplete", /* -10 */ "distance code is oversubscribed", /* -11 */ "distance code is incomplete", /* -12 */ "missing end-of-block code", /* -13 */ "invalid code" }; #define IG_ERRS (sizeof(inferr)/sizeof(char *)) // Maximums for allocations and loops. 
It is not useful to change these -- // they are fixed by the deflate format. #define MAXBITS 15 // maximum bits in a code #define MAXLCODES 286 // maximum number of literal/length codes #define MAXDCODES 30 // maximum number of distance codes #define MAXCODES (MAXLCODES+MAXDCODES) // maximum codes lengths to read #define FIXLCODES 288 // number of fixed literal/length codes #define MAXDIST 32768 // maximum match distance #define MAXSEQS 20 // maximum number of bit groups to save // infgen() input and output state. struct state { // Output state. int binary; // true to write compact binary format int info; // true to write informational comments int data; // true to output literals and matches int tree; // true to output dynamic tree int draw; // true to output dynamic descriptor int copy; // true to output match data int stats; // true to output statistics int color; // 0 no color, 1 standard, >1 high intensity int col; // state within data line unsigned max; // maximum distance (bytes so far) unsigned win; // window size from zlib header or 32K FILE *out; // output file // Input state. int bitcnt; // number of bits in bit buffer int bitbuf; // bit buffer long chunk; // bytes left in this png chunk, or -1 uint32_t crc; // running ~CRC of current chunk data z_crc_t const *table; // CRC table for one-byte updates FILE *in; // input file int seqs; // number of bit sequences saved short seq[MAXSEQS]; // bits in each sequence char len[MAXSEQS]; // length of each sequence in bits // Current block statistics. unsigned reach; // maximum distance before current block unsigned headlen; // bits in block header uintmax_t blockin; // bits in for current block uintmax_t blockout; // bytes out for current block uintmax_t symbols; // number of symbols (or stored bytes) uintmax_t matches; // number of matches uintmax_t matchlen; // total length of matches uintmax_t litbits; // number of bits in literals uintmax_t matbits; // number of bits in matches // Total statistics. 
uintmax_t blocks; // total number of deflate blocks uintmax_t inbits; // total deflate bits in uintmax_t outbytes; // total uncompressed bytes out uintmax_t symbnum; // total number of symbols uintmax_t matchnum; // total number of matches uintmax_t matchtot; // total length of matches uintmax_t littot; // total bits in literals // Input limit error return state for bits() and decode(). jmp_buf env; // Uncompressed data, if needed. int reap; // true to collect uncompressed data FILE *put; // file to write uncompressed data to, or NULL size_t next; // next index in window[] uint8_t window[MAXDIST]; // sliding uncompressed data window }; // Return a string to set or reset the terminal output color. This uses the // xterm-256 escape codes. enum hue { RESET = -1, BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, GRAY }; local char const *color(enum hue h, struct state *s) { if (s->color == 0) return ""; if (s->color > 1 && h >= BLACK && h <= WHITE) // High intensity. h += 8; switch ((int)h) { case 0: return "\033[38;5;0m"; case 1: return "\033[38;5;1m"; case 2: return "\033[38;5;2m"; case 3: return "\033[38;5;3m"; case 4: return "\033[38;5;4m"; case 5: return "\033[38;5;5m"; case 6: return "\033[38;5;6m"; case 7: return "\033[38;5;7m"; case 8: return "\033[38;5;8m"; case 9: return "\033[38;5;9m"; case 10: return "\033[38;5;10m"; case 11: return "\033[38;5;11m"; case 12: return "\033[38;5;12m"; case 13: return "\033[38;5;13m"; case 14: return "\033[38;5;14m"; case 15: return "\033[38;5;15m"; default: return "\033[0m"; // reset } } #define KEY CYAN // color for keywords #define ARG GREEN // color for numeric arguments #define TEXT YELLOW // color for string literals #define CODE GRAY // color for Huffman codes in comments #define ERROR RED // color for error messages #define WARN MAGENTA // color for warning messages // Comments are in the default color, except for Huffman codes. // Print an error message and exit. 
Return a value to use in an expression, // even though the function will never return. local inline int bail(struct state *s, char *fmt, ...) { fflush(s->out); fprintf(stderr, "%sinfgen error: ", color(ERROR, s)); va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, "\n%s", color(RESET, s)); exit(1); return 0; } // Print a warning to stderr. local inline void warn(struct state *s, char *fmt, ...) { fflush(s->out); fprintf(stderr, "%sinfgen warning: ", color(WARN, s)); va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, "\n%s", color(RESET, s)); } #define LINELEN 79 // target line length for data and literal commands #define SEQCOL 24 // column in which to start bit sequence comments // Go to column SEQCOL using tabs. local void seqtab(struct state *s) { s->col = abs(s->col); putc('\t', s->out); // at least one tab to end literal string s->col = (s->col & ~7) + 8; while (s->col + 8 <= SEQCOL) { putc('\t', s->out); s->col += 8; } while (s->col < SEQCOL) { putc(' ', s->out); s->col++; } } // Write the bits that composed the last item, as a comment starting in column // SEQCOL. This assumes that tab stops are at multiples of eight. local inline void putbits(struct state *s) { fputs(color(RESET, s), s->out); if (s->draw > 1) { // Start a comment at column SEQCOL. seqtab(s); putc('!', s->out); // Write the sequences in reverse order, since they were read from // bottom up. In each sequence, write the most to least significant // bit, i.e. the usual order. Show Huffman codes in gray. while (s->seqs) { s->seqs--; short seq = s->seq[s->seqs]; int len = s->len[s->seqs]; if (len) { putc(' ', s->out); if (len < 0) fputs(color(CODE, s), s->out); int n = abs(len); do { fputc('0' + ((seq >> --n) & 1), s->out); } while (n); if (len < 0) fputs(color(RESET, s), s->out); } } } // End the comment (if any) and the line. 
putc('\n', s->out); s->col = 0; } // Write token at the start of a line and val as a character or decimal value, // continuing the line. Keep the line length reasonable and using string // literals whenever possible. If seq is true and s->draw > 1, also display the // sequences of bits that led to this value. local inline void putval(int val, char *token, int seq, struct state *s) { // seq is true to show bits. seq = seq && s->draw > 1; // New line if too long or decimal after string. if (s->col == 0 || abs(s->col) > LINELEN - 4 || (s->col < 0 && (val < 0x20 || val > 0x7e)) || seq) { if (s->col) putc('\n', s->out); fputs(color(KEY, s), s->out); s->col = fprintf(s->out, "%s", token); fputs(color(ARG, s), s->out); } // String literal (already range-checked above). if (s->col < 0) { putc(val, s->out); s->col--; } // New string literal (mark with negative lit). else if (val >= 0x20 && val <= 0x7e) { fprintf(s->out, " '%s%c", color(TEXT, s), val); s->col += 3; s->col = -s->col; } // Decimal literal. else s->col += fprintf(s->out, " %u", val); // Append a comment with the sequences of bits, if requested. if (seq) putbits(s); } // Return the first byte of the next IDAT chunk, or EOF if there are no more // non-empty IDAT chunks. Set s->chunk to the number of remaining bytes in the // chunk. local int idat(struct state *s) { for (;;) { uint8_t head[13]; // preceding CRC + next length and type head[12] = 0; size_t got = fread(head, 1, 12, s->in); if (got >= 4) { // check CRC uint32_t crc = head[3] + ((uint32_t)head[2] << 8) + ((uint32_t)head[1] << 16) + ((uint32_t)head[0] << 24); if (crc != ~s->crc) warn(s, "corrupt PNG"); } if (got < 12) { if (got != 4) warn(s, "invalid PNG structure"); s->chunk = 0; return EOF; } // Get the chunk length. s->chunk = head[7] + ((long)head[6] << 8) + ((long)head[5] << 16) + ((long)head[4] << 24); if (s->info) { // Show the chunk information. if (s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } fprintf(s->out, "! 
PNG %s (%ld)\n", head + 8, s->chunk); } // Initialize the CRC with the chunk type. s->crc = ~crc32(crc32(0, Z_NULL, 0), head + 8, 4); if (s->chunk == 0) // Even if this is an IDAT, an empty one is useless. Get the next // chunk. continue; if (memcmp(head + 8, "IDAT", 4) == 0) { // Found an IDAT chunk -- return the first byte. s->chunk--; return getc(s->in); } // Skip over the non-IDAT chunk data, updating the CRC. The chunk CRC // will remain to be read. uint8_t junk[8192]; // read buffer for a non-seekable skip do { long get = s->chunk > (long)sizeof(junk) ? sizeof(junk) : s->chunk; long got = fread(junk, 1, get, s->in); s->crc = ~crc32(~s->crc, junk, got); s->chunk -= got; if (got != get) { warn(s, "invalid PNG structure"); return EOF; } } while (s->chunk); } } // Return the next byte of the deflate data, or EOF on end of input. For png // files, this will read deflate data from each IDAT chunk until it is // exhausted, and then will look for the next IDAT chunk. The chunk CRC is // updated. local inline int get(struct state *s) { if (s->chunk == -1) return getc(s->in); int ch = s->chunk-- ? getc(s->in) : idat(s); if (ch == EOF) return ch; s->crc = (s->crc >> 8) ^ s->table[(s->crc ^ ch) & 0xff]; return ch; } // Return need bits from the input stream. This always leaves less than // eight bits in the buffer. bits() works properly for need == 0. // // Format notes: // // - Bits are stored in bytes from the least significant bit to the most // significant bit. Therefore bits are dropped from the bottom of the bit // buffer, using shift right, and new bytes are appended to the top of the // bit buffer, using shift left. local inline int bits(struct state *s, int need) { // Load at least need bits into val. long val = s->bitbuf; while (s->bitcnt < need) { int next = get(s); if (next == EOF) longjmp(s->env, 1); // out of input val |= (long)(next) << s->bitcnt; // load eight bits s->bitcnt += 8; } // Drop need bits and update buffer, always with 0..7 bits left. 
Leave need // bits in val. s->bitbuf = (int)(val >> need); s->bitcnt -= need; s->blockin += need; val &= (1L << need) - 1; // Save bit sequence. if (s->draw > 1 && s->seqs < MAXSEQS) { s->seq[s->seqs] = val; s->len[s->seqs] = need; s->seqs++; } // Return need bits. return (int)val; } // Show and accumulate statistics at end of block. local void end(struct state *s) { if (s->stats) fprintf(s->out, "! stats inout %" PRIuMAX ":%" PRIuMAX " (%" PRIuMAX ") %" PRIuMAX " %s%u\n", s->blockin >> 3, s->blockin & 7, s->symbols, s->blockout, s->reach ? "-" : "", s->reach); s->blocks++; s->inbits += s->blockin; s->outbytes += s->blockout; s->symbnum += s->symbols; } // Process a stored block. local int stored(struct state *s) { // Discard leftover bits from current byte (assumes s->bitcnt < 8). (void)bits(s, s->bitcnt); if (s->draw > 1) { s->col = 0; putbits(s); } // Get length and check against its one's complement. unsigned len = bits(s, 16); unsigned cmp = bits(s, 16); if (len != (~cmp & 0xffff)) return IG_STORED_LENGTH_ERR; // didn't match complement! if (s->stats) { if (s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } fprintf(s->out, "! stats stored length %u\n", len); } // Update max distance. if (s->max < s->win) { if (len > s->win - s->max) s->max = s->win; else s->max += len; } // Copy len bytes from in to out. s->headlen = s->blockin; while (len--) { int octet = get(s); s->blockin += 8; if (octet == EOF) return IG_INCOMPLETE; // not enough input if (s->reap) { s->window[s->next++] = octet; s->next &= MAXDIST-1; if (s->next == 0 && s->put != NULL) fwrite(s->window, 1, MAXDIST, s->put); } if (s->binary) { if (octet < 0x7f) putc(octet + 0x80, s->out); else { putc(0xff, s->out); putc(octet, s->out); } } if (s->data) putval(octet, "data", 0, s); s->blockout++; s->symbols++; } // Done with a valid stored block. 
if (s->data) { if (s->col) { putc('\n', s->out); s->col = 0; } fprintf(s->out, "%send\n%s", color(KEY, s), color(RESET, s)); } if (s->stats) end(s); return IG_OK; } // Huffman code decoding tables. count[1..MAXBITS] is the number of symbols of // each length, which for a canonical code are stepped through in order. // symbol[] are the symbol values in canonical order, where the number of // entries is the sum of the counts in count[]. The decoding process can be // seen in the function decode() below. struct huffman { short *count; // number of symbols of each length short *symbol; // canonically ordered symbols }; // Decode a code from the stream s using Huffman table h. Return the symbol or // a negative value if there is an error. If all of the lengths are zero, i.e. // an empty code, or if the code is incomplete and an invalid code is received, // then IG_BAD_CODE_ERR is returned after reading MAXBITS bits. local inline int decode(struct state *s, struct huffman *h) { int bitbuf = s->bitbuf; // bits to decode from the input int left = s->bitcnt; // number of bits in bitbuf int len = 1; // length of code in consideration int code = 0; // len bits pulled from bitbuf int first = 0; // first code of length len int index = 0; // index of that code in the symbol table short *next = h->count + 1; // pointer to number of codes of next length for (;;) { while (left--) { code |= bitbuf & 1; bitbuf >>= 1; int count = *next++; if (code < first + count) { // This code is length len. Save bit sequence. if (s->draw > 1 && s->seqs < MAXSEQS) { // Reverse the code for showing in the comment. int rev = 0; for (int i = 0; i < len; i++) rev = (rev << 1) | ((code >> i) & 1); s->seq[s->seqs] = rev; s->len[s->seqs] = -len; // note as a Huffman code s->seqs++; } // Update state. s->bitbuf = bitbuf; s->bitcnt = (s->bitcnt - len) & 7; s->blockin += len; // Return symbol. return h->symbol[index + (code - first)]; } // Update to find a code of the next length. 
index += count; first += count; first <<= 1; code <<= 1; len++; } // Need to load more bits from the input into bitbuf. left = (MAXBITS+1) - len; if (left == 0) break; bitbuf = get(s); if (bitbuf == EOF) longjmp(s->env, 1); // out of input if (left > 8) left = 8; } return IG_BAD_CODE_ERR; // ran out of codes } // Given the list of code lengths length[0..n-1] representing a canonical // Huffman code for n symbols, construct the tables required to decode those // codes. Those tables are the number of codes of each length, and the symbols // sorted by length, retaining their original order within each length. The // return value is zero for a complete code set, negative for an over- // subscribed code set, and positive for an incomplete code set. The tables can // be used if the return value is zero or positive, but they cannot be used if // the return value is negative. If the return value is zero, it is not // possible for decode() using that table to return an error--any stream of // enough bits will resolve to a symbol. If the return value is positive, then // it is possible for decode() using that table to return an error for received // codes past the end of the incomplete lengths. // // Not used by decode(), but used for error checking, h->count[0] is the number // of the n symbols not in the code. So n - h->count[0] is the number of codes. // This is useful for checking for incomplete codes that have more than one // symbol, which is an error in a dynamic block. // // Assumption: for all i in 0..n-1, 0 <= length[i] <= MAXBITS // // This is assured by the construction of the length arrays in dynamic() and // fixed() and is not verified by construct(). local int construct(struct huffman *h, short *length, int n) { // Count the number of codes of each length. for (int len = 0; len <= MAXBITS; len++) h->count[len] = 0; for (int symbol = 0; symbol < n; symbol++) (h->count[length[symbol]])++; // assumes lengths are within bounds if (h->count[0] == n) // no codes! 
return 0; // complete, but decode() will fail // Check for an over-subscribed or incomplete set of lengths. int left = 1; // one possible code of zero length for (int len = 1; len <= MAXBITS; len++) { left <<= 1; // one more bit, double codes left left -= h->count[len]; // deduct count from possible codes if (left < 0) return left; // over-subscribed--return negative } // left > 0 means incomplete // Generate offsets into symbol table for each length for sorting. short offs[MAXBITS+1]; // offsets in symbol table for each length offs[1] = 0; for (int len = 1; len < MAXBITS; len++) offs[len + 1] = offs[len] + h->count[len]; // Put symbols in table sorted by length, by symbol order within each // length. for (int symbol = 0; symbol < n; symbol++) if (length[symbol] != 0) h->symbol[offs[length[symbol]]++] = symbol; // Return zero for complete set, positive for incomplete set. return left; } // Decode literal/length and distance codes until an end-of-block code. local int codes(struct state *s, struct huffman *lencode, struct huffman *distcode) { static const short lens[29] = { // size base for length codes 257..285 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258}; static const short lext[29] = { // extra bits for length codes 257..285 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}; static const short dists[30] = { // offset base for distance codes 0..29 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577}; static const short dext[30] = { // extra bits for distance codes 0..29 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; // Decode literals and length/distance pairs. 
int symbol; do { uintmax_t beg = s->blockin; symbol = decode(s, lencode); s->symbols++; if (symbol < 0) return symbol; // invalid symbol if (symbol < 256) { // literal: symbol is the byte // Write out the literal. if (s->reap) { s->window[s->next++] = symbol; s->next &= MAXDIST-1; if (s->next == 0 && s->put != NULL) fwrite(s->window, 1, MAXDIST, s->put); } if (s->binary) { if (symbol < 0x7f) putc(symbol + 0x80, s->out); else { putc(0xff, s->out); putc(symbol, s->out); } } if (s->data) putval(symbol, "literal", 1, s); s->blockout += 1; if (s->max < s->win) s->max++; s->litbits += s->blockin - beg; } else if (symbol > 256) { // length // Get and compute length. if (symbol >= MAXLCODES) return IG_BAD_CODE_ERR; // invalid fixed code symbol -= 257; int len = lens[symbol] + bits(s, lext[symbol]); // Get distance. symbol = decode(s, distcode); if (symbol < 0) return symbol; // invalid symbol unsigned dist = dists[symbol] + bits(s, dext[symbol]); // Check distance and write match. if (s->reap) { int n = len; do { s->window[s->next] = s->window[(s->next - dist) & (MAXDIST-1)]; s->next = (s->next + 1) & (MAXDIST-1); if (s->next == 0 && s->put != NULL) fwrite(s->window, 1, MAXDIST, s->put); } while (--n); } if (s->binary) { putc((dist - 1) >> 8, s->out); putc(dist - 1, s->out); putc(len - 3, s->out); } if (s->data) { if (s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } fprintf(s->out, "%smatch %s", color(KEY, s), color(ARG, s)); s->col = 6 + fprintf(s->out, "%d %u", len, dist); putbits(s); if (s->copy) { // Show the data copied by the match. size_t i = (s->next - len) & (MAXDIST-1); do { putval(s->window[i++], "copy", 0, s); i &= MAXDIST-1; } while (i != s->next); if (s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } } } if (dist > s->max) { warn(s, "distance too far back (%u/%u)", dist, s->max); s->max = MAXDIST; // issue warning only once } // Update state for match. 
if (dist > s->blockout) { dist -= s->blockout; if (dist > s->reach) s->reach = dist; } s->blockout += len; s->matches++; s->matchlen += len; if (s->max < s->win) { if (len > (int)(s->win - s->max)) s->max = s->win; else s->max += len; } s->matbits += s->blockin - beg; } } while (symbol != 256); // end of block symbol s->symbols--; // Write end of block code. if (s->data) { if (s->col) { putc('\n', s->out); s->col = 0; } fprintf(s->out, "%send", color(KEY, s)); s->col = 3; putbits(s); } if (s->stats) { if (s->symbols != s->matches) fprintf(s->out, "! stats literals %.1f bits each (%" PRIuMAX "/%" PRIuMAX ")\n", s->litbits / (double)(s->symbols - s->matches), s->litbits, s->symbols - s->matches); else fputs("! stats literals none\n", s->out); s->littot += s->litbits; if (s->matches) { fprintf(s->out, "! stats matches %.1f%% (%" PRIuMAX " x %.1f)\n", 100 * (s->matchlen / (double)(s->blockout)), s->matches, s->matchlen / (double)(s->matches)); s->matchnum += s->matches; s->matchtot += s->matchlen; } else fputs("! stats matches none\n", s->out); end(s); } // Done with a valid fixed or dynamic block. return IG_OK; } // Process a fixed codes block. local int fixed(struct state *s) { static short lencnt[MAXBITS+1], lensym[FIXLCODES]; static short distcnt[MAXBITS+1], distsym[MAXDCODES]; static struct huffman lencode = {lencnt, lensym}; static struct huffman distcode = {distcnt, distsym}; // Build fixed Huffman tables if first call (not thread safe). static int virgin = 1; if (virgin) { int symbol; short lengths[FIXLCODES]; // Literal/length table. for (symbol = 0; symbol < 144; symbol++) lengths[symbol] = 8; for (; symbol < 256; symbol++) lengths[symbol] = 9; for (; symbol < 280; symbol++) lengths[symbol] = 7; for (; symbol < FIXLCODES; symbol++) lengths[symbol] = 8; construct(&lencode, lengths, FIXLCODES); // Distance table. for (symbol = 0; symbol < MAXDCODES; symbol++) lengths[symbol] = 5; construct(&distcode, lengths, MAXDCODES); // Do this just once. 
virgin = 0; } // Decode data until end-of-block code. s->headlen = s->blockin; return codes(s, &lencode, &distcode); } // Process a dynamic codes block. local int dynamic(struct state *s) { // Get number of lengths in each table, check lengths. if (s->data && s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } int nlen = bits(s, 5) + 257; int ndist = bits(s, 5) + 1; int ncode = bits(s, 4) + 4; if (nlen > MAXLCODES || ndist > MAXDCODES) return IG_TOO_MANY_CODES_ERR; // bad counts if (s->binary) { putc(nlen - 256, s->out); putc(ndist, s->out); putc(ncode, s->out); } if (s->draw) { fprintf(s->out, "%scount %s", color(KEY, s), color(ARG, s)); s->col = 6 + fprintf(s->out, "%d %d %d", nlen, ndist, ncode); putbits(s); } // Read code length code lengths (really), missing lengths are zero. short lengths[MAXCODES]; // descriptor code lengths static const short order[19] = // permutation of code length codes {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; int index; for (index = 0; index < ncode; index++) { int len = bits(s, 3); lengths[order[index]] = len; if (s->binary) putc(len + 1, s->out); if (s->draw && len) { fprintf(s->out, "%scode %s", color(KEY, s), color(ARG, s)); s->col = 5 + fprintf(s->out, "%d %d", order[index], len); putbits(s); } } for (; index < 19; index++) lengths[order[index]] = 0; // Build Huffman table for code lengths codes (use lencode temporarily). short lencnt[MAXBITS+1], lensym[MAXLCODES]; // lencode memory struct huffman lencode = {lencnt, lensym}; // length code int err = construct(&lencode, lengths, 19); if (err < 0) return IG_CODE_LENGTHS_CODE_OVER_ERR; // oversubscribed else if (err > 0) return IG_CODE_LENGTHS_CODE_UNDER_ERR; // incomplete // Read length/literal and distance code length tables. 
index = 0; while (index < nlen + ndist) { int symbol = decode(s, &lencode); if (symbol < 0) return symbol; // invalid symbol if (symbol < 16) { // length in 0..15 if (s->binary) putc(symbol + 1, s->out); if (s->draw) putval(symbol, "lens", 1, s); lengths[index++] = symbol; } else { // repeat instruction int len = -1; // assume repeating zeros if (symbol == 16) { // repeat last length 3..6 times if (index == 0) return IG_REPEAT_NO_FIRST_ERR; // no last length! len = lengths[index - 1]; // last length symbol = 3 + bits(s, 2); } else if (symbol == 17) // repeat zero 3..10 times symbol = 3 + bits(s, 3); else // == 18, repeat zero 11..138 times symbol = 11 + bits(s, 7); if (index + symbol > nlen + ndist) return IG_REPEAT_TOO_MANY_ERR; // too many lengths! if (s->binary) putc(symbol + (len == -1 ? 18 : 14), s->out); if (s->draw) { if (s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } fputs(color(KEY, s), s->out); s->col = fprintf(s->out, "%s ", len == -1 ? "zeros" : "repeat"); fputs(color(ARG, s), s->out); s->col += fprintf(s->out, "%d", symbol); putbits(s); } if (len == -1) len = 0; while (symbol--) // repeat last or zero symbol times lengths[index++] = len; } } if (s->binary) putc(0, s->out); if (s->draw && s->col) { fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; } if (s->stats) fprintf(s->out, "! stats table %" PRIuMAX ":%" PRIuMAX "\n", (s->blockin - 3) >> 3, (s->blockin - 3) & 7); // Write literal/length and distance code lengths. if (s->tree) { for (index = 0; index < nlen; index++) if (lengths[index] != 0) fprintf(s->out, "%slitlen %s%d %d\n%s", s->draw ? "! " : color(KEY, s), s->draw ? "" : color(ARG, s), index, lengths[index], s->draw ? "" : color(RESET, s)); for (; index < nlen + ndist; index++) if (lengths[index] != 0) fprintf(s->out, "%sdist %s%d %d\n%s", s->draw ? "! " : color(KEY, s), s->draw ? "" : color(ARG, s), index - nlen, lengths[index], s->draw ? "" : color(RESET, s)); } // Check for end-of-block code -- there better be one! 
if (lengths[256] == 0) return IG_NO_END_CODE_ERR; // Build Huffman table for literal/length codes. err = construct(&lencode, lengths, nlen); if (err < 0) return IG_LITLEN_CODE_OVER_ERR; else if (err > 0 && nlen - lencode.count[0] != 1) return IG_LITLEN_CODE_UNDER_ERR; // incomplete with one code ok // Build Huffman table for distance codes. short distcnt[MAXBITS+1], distsym[MAXDCODES]; // distcode memory struct huffman distcode = {distcnt, distsym}; // distance code err = construct(&distcode, lengths + nlen, ndist); if (err < 0) return IG_DIST_CODE_OVER_ERR; else if (err > 0 && ndist - distcode.count[0] != 1) return IG_DIST_CODE_UNDER_ERR; // incomplete with one code ok // Decode data until end-of-block code. s->headlen = s->blockin; return codes(s, &lencode, &distcode); } // Write val as a variable-length integer to out. local void putvar(uintmax_t u, FILE *out) { while (u > 0x7f) { putc((u & 0x7f) | 0x80, out); u >>= 7; } putc(u, out); } // Inflate in to out, writing a defgen description of the input stream. On // success, the return value of infgen() is IG_OK (0). If there is an error in // the source data, i.e. it is not in the deflate format, then a negative value // is returned. If there is not enough input available, then IG_INCOMPLETE is // returned. // // infgen()'s return codes are documented near the top of this source file. local int infgen(struct state *s) { // Initialize input state. s->bitcnt = 0; s->bitbuf = 0; s->seqs = 0; // Initialize output state. s->col = 0; s->max = 0; // Initialize sliding window. if (s->reap) { s->next = 0; memset(s->window, 0, MAXDIST); // copy zeros for distance too far back } // Initialize statistics. s->blocks = 0; s->inbits = 0; s->outbytes = 0; s->symbnum = 0; s->matchnum = 0; s->matchtot = 0; s->littot = 0; // Return if bits() or decode() tries to read past available input. 
int err = 0; if (setjmp(s->env) != 0) // if came back here via longjmp() err = IG_INCOMPLETE; // then skip do-loop, return error else { // Process blocks until last block or error. int last; do { if (s->data) fputs("!\n", s->out); s->reach = 0; s->blockin = 0; s->blockout = 0; s->symbols = 0; s->matches = 0; s->matchlen = 0; s->litbits = 0; s->matbits = 0; last = bits(s, 1); // one if last block if (s->data && last) { fprintf(s->out, "%slast", color(KEY, s)); s->col = 4; putbits(s); } int type = bits(s, 2); // block type 0..3 if (s->binary) { putc(0xff, s->out); putc((type << 1) + last, s->out); } switch (type) { case 0: if (s->binary) putc(s->bitbuf + (s->binary > 1 ? 1 << s->bitcnt : 0), s->out); if (s->data) { fputs(color(KEY, s), s->out); s->col = fprintf(s->out, "stored"); if (s->bitbuf) { fputs(color(ARG, s), s->out); s->col += fprintf(s->out, " %d", s->bitbuf); } putbits(s); } err = stored(s); break; case 1: if (s->data) { fprintf(s->out, "%sfixed", color(KEY, s)); s->col = 5; putbits(s); } err = fixed(s); break; case 2: if (s->data) { fprintf(s->out, "%sdynamic", color(KEY, s)); s->col = 7; putbits(s); } err = dynamic(s); break; default: // 3 if (s->data) { fprintf(s->out, "%sblock3", color(KEY, s)); s->col = 6; putbits(s); } err = IG_BLOCK_TYPE_ERR; } if (err != IG_OK) break; // return with error if (s->binary > 1) { putc(0xff, s->out); putc(9, s->out); putvar(s->blockin - 9, s->out); putvar(s->headlen - 2, s->out); putvar(s->litbits + 1, s->out); putvar(s->matbits + 1, s->out); putc(0, s->out); } } while (!last); } // Write out what's left in the window. if (s->reap && s->put != NULL) { fwrite(s->window, 1, s->next, s->put); s->next = 0; } // Finish off dangling literal line. if (s->data && s->col) fprintf(s->out, "\n%s", color(RESET, s)); s->col = 0; // Write the leftovers information. if (s->binary) { putc(0xff, s->out); putc(8, s->out); putc(s->bitbuf + (s->binary > 1 ? 
1 << s->bitcnt : 0), s->out); } if (s->data && s->bitcnt && s->bitbuf) { fprintf(s->out, "%sbound %s", color(KEY, s), color(ARG, s)); s->col = 6 + fprintf(s->out, "%d", s->bitbuf); } if (s->draw > 1 && s->bitcnt && s->seqs < MAXSEQS) { s->seq[s->seqs] = s->bitbuf; s->len[s->seqs] = s->bitcnt; s->seqs++; putbits(s); } else if (s->data && s->bitcnt && s->bitbuf) fprintf(s->out, "\n%s", color(RESET, s)); // Write final statistics. if (s->stats) { fprintf(s->out, "! stats total inout %" PRIuMAX ":%" PRIuMAX " (%" PRIuMAX ") %" PRIuMAX "\n", s->inbits >> 3, s->inbits & 7, s->symbnum, s->outbytes); fprintf(s->out, "! stats total block average %.1f uncompressed\n", s->outbytes / (double)s->blocks); fprintf(s->out, "! stats total block average %.1f symbols\n", s->symbnum / (double)s->blocks); fprintf(s->out, "! stats total literals %.1f bits each\n", s->littot / (double)(s->symbnum - s->matchnum)); if (s->matchnum) fprintf(s->out, "! stats total matches %.1f%% (%" PRIuMAX " x %.1f)\n", 100 * (s->matchtot / (double)(s->outbytes)), s->matchnum, s->matchtot / (double)(s->matchnum)); else fputs("! stats total no matches\n", s->out); } // Return error state. return err; } // Provide help for the command options. 
local void help(void) { fputs( "\n" "infgen " IG_VERSION "\n" "Usage:\n" "\n" " infgen [-d[d]misc[c]q[q]rb[b]] input_path > output_path\n" " infgen [-d[d]misc[c]q[q]rb[b]] < input_path > output_path\n" "\n" " -d Write raw dynamic header (code lengths in comments)\n" " -dd Also show the bits for each element displayed\n" " -m Show copied data after each match\n" " -i Include detailed gzip / zlib header descriptions\n" " -s Include deflate block statistics (as comments)\n" " -c Color the output components (if terminal supports)\n" " -cc Use high-intensity instead of standard colors\n" " -q Do not write dynamic code lengths (comments or not)\n" " -qq Do not write deflate stream description at all\n" " -r Assume raw deflate data -- do not look for headers\n" " -b Write compact binary format (only -r honored)\n" " -bb Write compact binary format with bit counts\n" "\n", stderr); } // Get the next byte of input, or abort if none. #define NEXT(in) ((n = getc(in)) != EOF ? n : (s.col ? putc('\n', s.out) : 0, \ bail(&s, "unexpected end of input"))) // Read a gzip, zlib, or raw deflate stream from stdin or a provided path, and // write a defgen description of the stream to stdout. int main(int argc, char **argv) { // Process command line options. 
char *path = NULL; int head = 1; int wrap = 1; struct state s; s.info = 0; s.binary = 0; s.data = 1; s.copy = 0; s.tree = 1; s.draw = 0; s.stats = 0; s.color = 0; s.win = MAXDIST; s.chunk = -1; while (--argc) { char *arg = *++argv; if (*arg++ != '-') { if (path != NULL) bail(&s, "only one input file permitted (%s)", arg - 1); path = arg - 1; continue; } while (*arg) switch (*arg++) { case 'i': s.info = 1; break; case 'b': s.binary++; break; case 'q': if (s.tree) s.tree = 0; else s.data = 0; break; case 'd': s.draw++; break; case 'm': s.copy = 1; break; case 's': s.stats = 1; break; case 'c': s.color++; break; case 'r': head = 0; break; case 'h': help(); return 0; default: bail(&s, "invalid option '%c' (type infgen for help)", *--arg); } } if (s.data == 0) s.draw = 0; // Set input and output. if (path == NULL) { if (isatty(0)) { help(); return 0; } errno = 0; // isatty(0) false leaves errno as ENOTTY s.in = stdin; SET_BINARY_MODE(s.in); } else { s.in = fopen(path, "rb"); if (s.in == NULL) bail(&s, "could not open input file %s", path); } s.out = stdout; if (s.binary) { wrap = s.info = s.data = s.tree = s.draw = s.stats = 0; SET_BINARY_MODE(s.out); } s.col = 0; s.put = NULL; // decompressed output for verification // s.put = fopen("infgen.dat", "wb"); s.reap = s.copy || s.put != NULL; // Say what wrote this. if (wrap) fputs("! infgen " IG_VERSION " output\n", s.out); // Process concatenated streams. int ret; do { // Skip header, if any, save header type as trailer size. 
ret = getc(s.in); int n = getc(s.in); unsigned val = ((unsigned)ret << 8) + (unsigned)n; int trail; if (ret == EOF) { // nothing after the last stream, or empty file ret = 0; break; } else if (head && n != EOF && val == 0x1f8b) { // gzip header if (wrap) fputs("!\n", s.out); if (NEXT(s.in) != 8) bail(&s, "unknown gzip compression method %d", n); ret = NEXT(s.in); if (ret & 0xe0) bail(&s, "reserved gzip flags set (%02x)", ret); if (s.info && (ret & 1)) fprintf(s.out, "%stext\n%s", color(KEY, &s), color(RESET, &s)); unsigned long num = NEXT(s.in); num += NEXT(s.in) << 8; num += NEXT(s.in) << 16; num += NEXT(s.in) << 24; if (s.info && num) { time_t t = num; fprintf(s.out, "%stime %s", color(KEY, &s), color(ARG, &s)); s.col = 5 + fprintf(s.out, "%lu", num); fputs(color(RESET, &s), s.out); seqtab(&s); char at[64]; strncpy(at, asctime(gmtime(&t)), sizeof(at) - 1); at[sizeof(at) - 1] = 0; char *end = at + strlen(at) - 1; if (*end == '\n') *end = 0; fprintf(s.out, "! [UTC %s]\n", at); s.col = 0; } val = NEXT(s.in); if (s.info && val) fprintf(s.out, "%sxfl %s%u\n%s", color(KEY, &s), color(ARG, &s), val, color(RESET, &s)); val = NEXT(s.in); if (s.info && val != 3) fprintf(s.out, "%sos %s%u\n%s", color(KEY, &s), color(ARG, &s), val, color(RESET, &s)); if (ret & 4) { // extra field val = NEXT(s.in); val += NEXT(s.in) << 8; if (val == 0) { if (s.info) fprintf(s.out, "%sextra %s'\n%s", color(KEY, &s), color(ARG, &s), color(RESET, &s)); } else { unsigned sub = 0; // offset within sub-field char id[3] = {0}; // sub-field ID unsigned len = 0; // sub-field content length int ok = 1; // false if sub-fields invalid do { NEXT(s.in); if (s.info) { putval(n, "extra", 0, &s); if (ok) { if (sub < 2) // sub-field ID byte id[sub] = n; else if (sub == 2) // low byte of sub-field content length len = n; else if (sub == 3) { // high byte of sub-field content length len += (unsigned)n << 8; if (len < val) { // sub-field fits in extra field seqtab(&s); fprintf(s.out, "%s! 
[id='%s' len=%u]\n", color(RESET, &s), id, len); s.col = 0; if (len == 0) { sub = 0; continue; } } else // sub-field doesn't fit -- invalid ok = 0; } else { // sub-field content if (--len == 0) { if (s.col) { fprintf(s.out, "\n%s", color(RESET, &s)); s.col = 0; } sub = 0; continue; } } sub++; } } } while (--val); if (s.info && (!ok || (sub > 0 && sub < 4) || len)) { // invalid sub-field structure if (s.col) { fprintf(s.out, "\n%s", color(RESET, &s)); s.col = 0; } seqtab(&s); fputs("! [invalid sub-field structure]\n", s.out); s.col = 0; } } if (s.info && s.col) { fprintf(s.out, "\n%s", color(RESET, &s)); s.col = 0; } } if (ret & 8) { // file name if (NEXT(s.in) == 0) { if (s.info) fprintf(s.out, "%sname %s'\n%s", color(KEY, &s), color(ARG, &s), color(RESET, &s)); } else do { if (s.info) putval(n, "name", 0, &s); } while (NEXT(s.in) != 0); if (s.info && s.col) { fprintf(s.out, "\n%s", color(RESET, &s)); s.col = 0; } } if (ret & 16) { // comment field if (NEXT(s.in) == 0) { if (s.info) fprintf(s.out, "%scomment %s'\n%s", color(KEY, &s), color(ARG, &s), color(RESET, &s)); } else do { if (s.info) putval(n, "comment", 0, &s); } while (NEXT(s.in) != 0); if (s.info && s.col) { fprintf(s.out, "\n%s", color(RESET, &s)); s.col = 0; } } if (ret & 2) { // header CRC NEXT(s.in); NEXT(s.in); if (s.info) fprintf(s.out, "%shcrc\n%s", color(KEY, &s), color(RESET, &s)); } trail = 8; if (wrap) fprintf(s.out, "%sgzip\n%s", color(KEY, &s), color(RESET, &s)); } else if (head && n != EOF && val == (137 << 8) + 'P') { // png file. Verify the remainder of "PNG". if (NEXT(s.in) != 'N' || NEXT(s.in) != 'G') bail(&s, "invalid PNG header"); if (s.info) fputs("!\n", s.out); // Now we are four bytes before the start of first png chunk. We // set those four bytes of header to be checked as if they are the // CRC of a preceding chunk. s.crc = ~0x0d0a1a0a; s.table = get_crc_table(); // Get what should be a zlib header. 
s.chunk = 0; ret = get(&s); n = get(&s); val = ((unsigned)ret << 8) + (unsigned)n; if (n == EOF || val % 31 || (ret & 0xf) != 8 || (ret >> 4) > 7) bail(&s, "invalid zlib header in IDAT"); goto zlib; } else if (head && n != EOF && val % 31 == 0 && (ret & 0xf) == 8 && (ret >> 4) < 8) { // zlib header. zlib: if (wrap) fputs("!\n", s.out); if (s.info && (val & 0xe0) != 0x80) // compression level fprintf(s.out, "%slevel %s%d\n%s", color(KEY, &s), color(ARG, &s), (val >> 6) & 3, color(RESET, &s)); if (val & 0x20) { // preset dictionary if (s.chunk != -1) bail(&s, "preset dictionary not valid in PNG"); unsigned long num = NEXT(s.in); num = (num << 8) + NEXT(s.in); num = (num << 8) + NEXT(s.in); num = (num << 8) + NEXT(s.in); if (s.info) fprintf(s.out, "%sdict %s%lu\n%s", color(KEY, &s), color(ARG, &s), num, color(RESET, &s)); } ret = (ret >> 4) + 8; s.win = 1U << ret; // set window size from header trail = 4; if (s.info && ret != 15) fprintf(s.out, "%szlib %s%d\n%s", color(KEY, &s), color(ARG, &s), ret, color(RESET, &s)); else if (wrap) fprintf(s.out, "%szlib\n%s", color(KEY, &s), color(RESET, &s)); } else { // Raw deflate data, put non-header bytes back (assumes two ok). ungetc(n, s.in); ret = ungetc(ret, s.in); // this should work, but ... if (ret == EOF) // only one ungetc() guaranteed bail(&s, "could not ungetc() a second time (!)"); trail = 0; } // Process compressed data to produce a defgen description. ret = infgen(&s); // Check return value and trailer size. if (ret > 0) warn(&s, "incomplete deflate data"); else if (ret < 0) warn(&s, "invalid deflate data -- %s", -ret > 0 && -ret <= (int)IG_ERRS ? inferr[-1 - ret] : "unknown"); else { n = 0; while (n < trail && get(&s) != EOF) n++; if (n < trail) { warn(&s, "incomplete %s trailer", trail == 4 ? "zlib" : "gzip"); ret = 2; } } // Write defgen trailer (note: trailer is not validated). 
if (ret == 0 && wrap) { if (trail == 4) fprintf(s.out, "!\n%sadler\n%s", color(KEY, &s), color(RESET, &s)); else if (trail == 8) fprintf(s.out, "!\n%slength\n%s", color(KEY, &s), color(RESET, &s)); } if (s.chunk != -1) { // Parse remainder of PNG file. fputs("!\n", s.out); if (s.chunk || get(&s) != EOF) warn(&s, "invalid PNG file structure"); break; } } while (ret == 0); // Done. if (s.put != NULL) fclose(s.put); fflush(s.out); if (path != NULL) fclose(s.in); if ((ferror(s.in) || ferror(s.out)) && errno) bail(&s, "i/o error: %s", strerror(errno)); return ret; } infgen-3.5/infstats.c000066400000000000000000000407641473341405600146500ustar00rootroot00000000000000/* infstats version 1.0, 21 June 2024 Copyright (C) 2005-2024 Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the author be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Mark Adler madler@alumni.caltech.edu */ // Take as input the binary format generated by infgen -b or infgen -bb. // Generate and show statistics on the compressed and uncompressed data. // // First the number of deflate streams are shown. The subsequent statistics are // across all of the streams, as if they were a single stream. The statistics // are broken up into static block, fixed block, and dynamic block groups. 
//
// For each block type, the output shows the number of those blocks, the number
// of compressed bytes(bits), the number of uncompressed bytes, and the
// compression ratio of input bytes divided by output bytes. For fixed and
// dynamic blocks that is followed by the same thing, but broken out into
// literals ("lits") and matches ("mats"). bytes(bits) is the number of whole
// bytes followed by the remaining number of bits in 0..7.
//
// The subsequent lines for the fixed and dynamic block types are statistics
// for the bits per block ("bits"), the bits in the header ("head") for dynamic
// blocks only, the number of symbols per block ("syms"), the distribution of
// match lengths ("lens"), and the uncompressed bytes per block ("data"). Each
// line of statistics is the mean ("μ ="), standard deviation ("σ ="), and
// [minimum..maximum].
//
// Information about the compressed bits will not be included if infgen -b was
// used instead of infgen -bb.

#include <stdio.h>      // FILE, fprintf(), putc(), getc(), size_t
#include <stdint.h>     // uintmax_t, uint8_t, UINTMAX_MAX
#include <math.h>       // sqrt()
#include <locale.h>     // setlocale()

// Compile with -DTEST to write the uncompressed data to stdout, instead of
// the statistics.
#ifdef TEST
// Window for writing decompressed output.
static char window[32768];
static size_t pos = 0;
#endif

// Accumulated statistics of a given value.
typedef struct {
    uintmax_t num;      // count of values
    uintmax_t sum;      // sum of values
    double sqr;         // sum of the values squared
    uintmax_t min;      // minimum value
    uintmax_t max;      // maximum value
} tally_t;

// Initialize t to the empty tally: no values, zero sums, and min/max set so
// that the first value tallied replaces both of them.
static void tally_init(tally_t *t) {
    t->num = 0;
    t->sum = 0;
    t->sqr = 0.;
    t->min = UINTMAX_MAX;       // largest value, so any value is <= min
    t->max = 0;
}

// Add value to the tally t, updating the count, the sum, the sum of squares,
// the minimum, and the maximum.
static void tally(tally_t *t, uintmax_t value) {
    t->num++;
    t->sum += value;
    // Squares are accumulated in floating point to avoid integer overflow.
    t->sqr += value * (double)value;
    if (value < t->min)
        t->min = value;
    if (value > t->max)
        t->max = value;
}

// Show the tally mean, standard deviation, min, and max on out, with the
// prefix pre.
static void show_tally(FILE *out, char const *pre, tally_t const *t) { if (t->num == 1) fprintf(out, "%s%'ju\n", pre, t->sum); else if (t->num > 1) fprintf(out, "%sμ = %'.1f, σ = %'.1f, in [%'ju..%'ju]\n", pre, t->sum / (double)t->num, sqrt((t->sqr - (t->sum * (double)t->sum) / t->num) / (t->num - 1)), t->min, t->max); } typedef struct { // Temporary block-local statistics. uintmax_t lit; // number of literals in current block uintmax_t mat; // number of matches in current block tally_t writ; // data bytes per symbol stats for current block // infgen -bb statistics. int detail; // true if infgen -bb was used tally_t bits[3]; // bits in stats per block type tally_t head[3]; // header bits stats per block type uintmax_t litb[3]; // number of literal bits for fixed and dynamic uintmax_t matb[3]; // number of match bits for fixed and dynamic // Statistics valid for both infgen -b and infgen -bb. uintmax_t lits[3]; // number of literals per block type uintmax_t mats[3]; // number of matches per block type tally_t syms[3]; // number of symbols stats per block type tally_t lens[3]; // match length stats per block type tally_t data[3]; // bytes of uncompressed data stats per block type uintmax_t streams; // number of deflate streams } stat_t; // Initialize the statistics in s. static void init(stat_t *s) { s->detail = 0; for (int i = 0; i < 3; i++) { tally_init(&s->bits[i]); tally_init(&s->head[i]); s->litb[i] = 0; s->matb[i] = 0; s->lits[i] = 0; s->mats[i] = 0; tally_init(&s->syms[i]); tally_init(&s->lens[i]); tally_init(&s->data[i]); } s->streams = 0; } // Show the statistics in s on out. static void show_stats(FILE *out, stat_t *s) { static const char *name[] = {"stored", "fixed", "dynamic"}; fprintf(out, "%'ju stream%s\n", s->streams, s->streams == 1 ? "" : "s"); for (int i = 0; i < 3; i++) if (s->data[i].num != 0) { fprintf(out, "%'ju %s block%s: ", s->data[i].num, name[i], s->data[i].num == 1 ? 
"" : "s"); if (s->detail) fprintf(out, "%'ju(%ju) -> ", s->bits[i].sum >> 3, s->bits[i].sum & 7); fprintf(out, "%'ju", s->data[i].sum); if (s->detail) fprintf(out, " (%.3f)", s->bits[i].sum / (8 * (double)s->data[i].sum)); putc('\n', out); if (i > 0) { fprintf(out, " lits: "); if (s->detail) fprintf(out, "%'ju(%ju) -> ", s->litb[i] >> 3, s->litb[i] & 7); fprintf(out, "%'ju", s->lits[i]); if (s->detail) fprintf(out, " (%.3f)", s->litb[i] / (8 * (double)s->lits[i])); putc('\n', out); fprintf(out, " mats: "); if (s->detail) fprintf(out, "%'ju(%ju) -> ", s->matb[i] >> 3, s->matb[i] & 7); fprintf(out, "%'ju", s->lens[i].sum); if (s->detail) fprintf(out, " (%.3f)", s->matb[i] / (8 * (double)(s->lens[i].sum))); putc('\n', out); show_tally(out, " bits: ", &s->bits[i]); } if (i == 2) show_tally(out, " head: ", &s->head[i]); if (i > 0) { show_tally(out, " syms: ", &s->syms[i]); show_tally(out, " lens: ", &s->lens[i]); } show_tally(out, " data: ", &s->data[i]); } } // At the start of a new block. type == 0 is stored, 1, is fixed, 2 is dynamic. // last is 1 if this block is the last block. static void block(int type, int last, stat_t *s) { (void)type; (void)last; s->lit = 0; s->mat = 0; tally_init(&s->writ); } // head[] is the description of the dynamic block header. Verify the integrity // of the description, returning 0 on success or 1 on failure. (This does not // check for the completeness of the resulting Huffman codes.) static int dynamic(uint8_t *head, stat_t *s) { (void)s; if (head[0] < 1 || head[0] > 30 || head[1] < 1 || head[1] > 30 || head[2] < 4 || head[2] > 19) return 1; int i = 3, k = head[2]; do { if (head[i] < 1 || head[i] > 8) return 1; i++; } while (--k); k = 256 + head[0] + head[1]; do { if (head[i] < 0) return 1; else if (head[i] <= 16) k--; else if (head[i] <= 20) k -= head[i] - 14; else if (head[i] <= 156) k -= head[i] - 18; else return 1; i++; } while (k > 0); if (k < 0 || head[i] != 0) return 1; return 0; } // Literal byte in 0..255. 
static void literal(int lit, stat_t *s) {
#ifdef TEST
    // Write the byte out, and remember it in the circular window for matches.
    (void)s;
    putc(lit, stdout);
    window[pos++] = lit;
    pos &= 0x7fff;
#else
    // Count the literal, which is one symbol producing one byte.
    (void)lit;
    s->lit++;
    tally(&s->writ, 1);
#endif
}

// Match of previous bytes dist bytes back (1..32768) of length len (3..258).
// type is 1 for a fixed block or 2 for a dynamic block, and is used to index
// the per-block-type statistics in s, so it must be in 0..2.
static void match(unsigned dist, int len, int type, stat_t *s) {
#ifdef TEST
    // Copy len bytes from dist back in the window, one at a time so that
    // overlapping copies replicate correctly.
    (void)type;
    do {
        literal(window[(pos - dist) & 0x7fff], s);
    } while (--len);
#else
    // Count the match, which is one symbol producing len bytes.
    (void)dist;
    s->mat++;
    tally(&s->writ, len);
    tally(&s->lens[type], len);
#endif
}

// Update bit counts at the end of a block (if -bb was used). bits is the total
// bits in the block, head the bits in the block header, litb the bits used for
// literals, and matb the bits used for matches. Calling this marks the
// statistics as detailed.
static void counts(uintmax_t bits, uintmax_t head, uintmax_t litb,
                   uintmax_t matb, int type, int last, stat_t *s) {
    (void)last;
    s->detail = 1;
    tally(&s->bits[type], bits);
    tally(&s->head[type], head);
    s->litb[type] += litb;
    s->matb[type] += matb;
}

// At the end of a deflate block of type type. last is 1 if this is the last
// block in the stream. The block-local counts in s are folded into the
// per-block-type statistics.
static void end(int type, int last, stat_t *s) {
    if (last)
        s->streams++;
    s->lits[type] += s->lit;
    s->mats[type] += s->mat;
    tally(&s->syms[type], s->writ.num);
    tally(&s->data[type], s->writ.sum);
}

// Read a variable-length integer from beg[0..end-beg-1]. Each byte provides
// seven bits, least significant first, with the high bit set on every byte but
// the last. Return the integer in *val. Return a pointer to what follows the
// integer, or NULL if the input ends before the variable-length integer does,
// or if the integer has more bytes than can be placed in a uintmax_t.
static inline uint8_t *getvar(uintmax_t *val, uint8_t *beg, uint8_t *end) {
    *val = 0;
    uintmax_t octet;
    int bits = 0;
    for (;;) {
        // Shifting by the width of the type or more is undefined behavior, so
        // reject over-long integers. (uint8_t exists, so bytes are 8 bits.)
        if (beg >= end || bits >= (int)(8 * sizeof(uintmax_t)))
            return NULL;
        octet = *beg++;
        if ((octet & 0x80) == 0)
            break;
        *val |= (octet & 0x7f) << bits;
        bits += 7;
    }
    *val |= octet << bits;
    return beg;
}

// Declare and get variable-length integer name from next[0..end-next-1],
// updating next. This returns 6 from the enclosing function on failure, so it
// can only be used in a function that returns an int error code.
#define GET(name, next, end) \
    uintmax_t name; \
    next = getvar(&name, next, end); \
    if (next == NULL) \
        return 6;           // invalid bit counts

// Parse the output of infgen -b or -bb from in. The input is expected to
// represent a series of complete deflate streams.
Call block() for the start // of each new block or each end of a deflate stream, dynamic() with the // description of a dynamic block header, literal() for each literal byte, and // match() for each match. Return 0 on success, 1 if the input ended // prematurely, or >1 if invalid input is encountered. static int parse(FILE *in, stat_t *s) { // State. enum { TOP, // next byte is distance high, low literal, or prefix LOW, // next byte is distance low LEN, // next byte is length BLK, // next byte is block type or end, or high literal STO, // next byte is stored leftover bits DYN, // next byte continues the dynamic header description BIT, // next byte continues the bit counts UNK, // next byte continues an unknown zero-terminated block END // next byte is stream-end leftover bits } st = TOP; // State transitions: // TOP -> TOP, LOW, or BLK // LOW -> LEN // LEN -> TOP // BLK -> TOP, STO, DYN, or END, and may set last // STO -> TOP // DYN -> DYN or TOP // BIT -> BIT or TOP // UNK -> UNK or TOP // END -> TOP to start over, sets last to zero uint8_t info[512]; // bytes after 4, 5, or 9..63 unsigned dist, have; int type = -1, last = 0, ch; while ((ch = getc(in)) != EOF) { // Parse one byte of input at a time, in the context of the state. switch (st) { case TOP: if (ch < 0x80) { // First byte of a match. dist = ch << 8; st = LOW; } else if (ch == 0xff) // Prefix for the next level of instructions. st = BLK; else { // Low literal. if (type == -1) return 2; // literal outside of a block literal(ch - 0x80, s); } break; case LOW: // Second byte of a match. dist |= ch; st = LEN; break; case LEN: // Third and final byte of a match. match(dist + 1, ch + 3, type, s); st = TOP; break; case BLK: have = 0; if (ch < 6) { // Start of a deflate block. if (last) return 3; // another block after the last block if (type != -1) end(type, last, s); type = ch >> 1; last = ch & 1; block(type, last, s); } else if (type == -1) return 4; // code outside of a block if (ch < 2) // Stored block. 
st = STO; else if (ch < 4) // Fixed block. st = TOP; else if (ch < 6) // Dynamic block. st = DYN; else if (ch == 8) // End of stream. st = END; else if (ch == 9) // Bit counts. st = BIT; else if (ch < 0x40) // Unknown instruction with following information. Skip over // the zero-terminated information. st = UNK; else if (ch < 0x7f) // Unknown instruction with nothing after it. st = TOP; else { // High literal. literal(ch, s); st = TOP; } break; case STO: // Ignore the leftover bits. st = TOP; break; case DYN: case BIT: case UNK: if (have < sizeof(info)) info[have++] = ch; if (ch == 0) { // Zero-terminated block of information is complete. if (st == DYN) { // Dynamic block header description. if (info[have - 1] != 0 || dynamic(info, s)) return 5; // invalid description } else if (st == BIT) { // Bit counts at the end of the block. if (info[have - 1] != 0) return 6; // invalid bit counts // Get the bit counts. GET() will return 6 on error. uint8_t *next = info; uint8_t *end = next + have; GET(bits, next, end); GET(head, next, end); GET(litb, next, end); GET(matb, next, end); if (next != end - 1) return 6; // invalid bit counts // Update the statistics. counts(bits + 9, head + 2, litb - 1, matb - 1, type, last, s); } st = TOP; } break; case END: if (last != 1) return 7; // ended in a non-last block end(type, last, s); type = -1; // reset to start a new deflate stream last = 0; st = TOP; break; } } return type == -1 ? 0 : 1; // 1 -> input ended prematurely } // Parse infgen -b or -bb output from stdin. int main(void) { // Enable thousands separated by commas with single quote in format. setlocale(LC_NUMERIC, ""); // Process the input and gather statistics. stat_t s; init(&s); int ret = parse(stdin, &s); if (ret) fprintf(stderr, "** %s input (%d)\n", ret == 1 ? "premature end of" : "invalid", ret); #ifndef TEST else // Show the statistics. show_stats(stdout, &s); #endif return ret != 0; }