![]() |
Unishox
A hybrid encoder for Short Unicode Strings
|
Main code of Unishox2 Compression and Decompression library. More...
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>
#include <limits.h>
#include "unishox2.h"
Macros | |
| #define | USX_TEMPLATES (const char *[]) {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0} |
| Commonly occurring templates (ISO Date/Time, ISO Date, US Phone number, ISO Time, Unused) | |
| #define | NICE_LEN 5 |
| Minimum length to consider as repeating sequence. | |
| #define | RPT_CODE ((2 << 5) + 26) |
| Set (USX_NUM - 2) and vertical code (26) for encoding repeating letters. | |
| #define | TERM_CODE ((2 << 5) + 27) |
| Set (USX_NUM - 2) and vertical code (27) for encoding terminator. | |
| #define | LF_CODE ((1 << 5) + 7) |
| Set (USX_SYM - 1) and vertical code (7) for encoding Line feed \n. | |
| #define | CRLF_CODE ((1 << 5) + 8) |
| Set (USX_SYM - 1) and vertical code (8) for encoding \r\n. | |
| #define | CR_CODE ((1 << 5) + 22) |
| Set (USX_SYM - 1) and vertical code (22) for encoding \r. | |
| #define | TAB_CODE ((1 << 5) + 14) |
| Set (USX_SYM - 1) and vertical code (14) for encoding \t. | |
| #define | NUM_SPC_CODE ((2 << 5) + 17) |
| Set (USX_NUM - 2) and vertical code (17) for space character when it appears in USX_NUM state. | |
| #define | UNI_STATE_SPL_CODE 0xF8 |
| Code for special code (11111) when state=USX_DELTA. | |
| #define | UNI_STATE_SPL_CODE_LEN 5 |
| Length of Code for special code when state=USX_DELTA. | |
| #define | UNI_STATE_SW_CODE 0x80 |
| Code for switch code when state=USX_DELTA. | |
| #define | UNI_STATE_SW_CODE_LEN 2 |
| Length of Code for Switch code when state=USX_DELTA. | |
| #define | SW_CODE 0 |
| Switch code in USX_ALPHA and USX_NUM 00. | |
| #define | SW_CODE_LEN 2 |
| Length of Switch code. | |
| #define | TERM_BYTE_PRESET_1 0 |
| Terminator bit sequence for Preset 1. Length varies depending on state as per following macros. | |
| #define | TERM_BYTE_PRESET_1_LEN_LOWER 6 |
| Length of Terminator bit sequence when state is lower. | |
| #define | TERM_BYTE_PRESET_1_LEN_UPPER 4 |
| Length of Terminator bit sequence when state is upper. | |
| #define | USX_OFFSET_94 33 |
| Offset at which usx_code_94 starts. | |
| #define | SAFE_APPEND_BITS(exp) |
| This is a safe call to append_bits() making sure it does not write past olen. | |
| #define | SAFE_APPEND_BITS2(olen, exp) |
| Macro used in the main compress function so that if the output len exceeds given maximum length (olen) it can exit. | |
| #define | SECTION_COUNT 5 |
| The list of vertical codes is split into 5 sections. Used by readVCodeIdx() | |
| #define | DEC_OUTPUT_CHAR(out, olen, ol, c) |
| Macro to ensure that the decoder does not append more than olen bytes to out. | |
| #define | DEC_OUTPUT_CHARS(olen, exp) |
| Macro to ensure that the decoder does not append more than olen bytes to out. | |
Typedefs | |
| typedef unsigned char | uint8_t |
| uint8_t is unsigned char | |
Functions | |
| void | init_coder () |
| int | append_bits (char *out, int olen, int ol, uint8_t code, int clen) |
| int | append_switch_code (char *out, int olen, int ol, uint8_t state) |
| Appends switch code to out depending on the state (USX_DELTA or other) | |
| int | append_code (char *out, int olen, int ol, uint8_t code, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[]) |
| Appends given horizontal and vertical code bits to out. | |
| int | encodeCount (char *out, int olen, int ol, int count) |
| Encodes given count to out. | |
| int | encodeUnicode (char *out, int olen, int ol, int32_t code, int32_t prev_code) |
| Encodes the unicode code point given by code to out. prev_code is used to calculate the delta. | |
| int32_t | readUTF8 (const char *in, int len, int l, int *utf8len) |
| Reads UTF-8 character from in. Also returns the number of bytes occupied by the UTF-8 character in utf8len. | |
| int | matchOccurance (const char *in, int len, int l, char *out, int olen, int *ol, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[]) |
| int | matchLine (const char *in, int len, int l, char *out, int olen, int *ol, struct us_lnk_lst *prev_lines, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[]) |
| uint8_t | getBaseCode (char ch) |
| char | getNibbleType (char ch) |
| int | append_nibble_escape (char *out, int olen, int ol, uint8_t state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[]) |
| Starts coding of nibble sets. | |
| long | min_of (long c, long i) |
| Returns minimum value of two longs. | |
| int | append_final_bits (char *const out, const int olen, int ol, const uint8_t state, const uint8_t is_all_upper, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[]) |
| Appends the terminator code depending on the state, preset and whether full terminator needs to be encoded to out or not. | |
| int | unishox2_compress_lines (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines) |
| int | unishox2_compress (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[]) |
| int | unishox2_compress_simple (const char *in, int len, char *out) |
| int | readBit (const char *in, int bit_no) |
| int | read8bitCode (const char *in, int len, int bit_no) |
| int | readVCodeIdx (const char *in, int len, int *bit_no_p) |
| int | readHCodeIdx (const char *in, int len, int *bit_no_p, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[]) |
| int | getStepCodeIdx (const char *in, int len, int *bit_no_p, int limit) |
| Returns the position of step code (0, 10, 110, etc.) encountered in the stream. | |
| int32_t | getNumFromBits (const char *in, int len, int bit_no, int count) |
| Reads specified number of bits and builds the corresponding integer. | |
| int32_t | readCount (const char *in, int *bit_no_p, int len) |
| Decodes the count from the given bit stream at in. Also updates bit_no_p. | |
| int32_t | readUnicode (const char *in, int *bit_no_p, int len) |
| int | writeUTF8 (char *out, int olen, int ol, int uni) |
| Write given unicode code point to out as a UTF-8 sequence. | |
| int | decodeRepeat (const char *in, int len, char *out, int olen, int ol, int *bit_no, struct us_lnk_lst *prev_lines) |
| Decode repeating sequence and appends to out. | |
| char | getHexChar (int32_t nibble, int hex_type) |
| Returns hex character corresponding to the 4 bit nibble. | |
| int | unishox2_decompress_lines (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines) |
| int | unishox2_decompress (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[]) |
| int | unishox2_decompress_simple (const char *in, int len, char *out) |
Variables | |
| const char * | USX_FREQ_SEQ_DFLT [] = {"\": \"", "\": ", "</", "=\"", "\":\"", "://"} |
| Default frequently occurring sequences. When composition of text is known beforehand, the other sequences in this section can be used to achieve more compression. | |
| const char * | USX_FREQ_SEQ_TXT [] = {" the ", " and ", "tion", " with", "ing", "ment"} |
| Frequently occurring sequences in text content. | |
| const char * | USX_FREQ_SEQ_URL [] = {"https://", "www.", ".com", "http://", ".org", ".net"} |
| Frequently occurring sequences in URL content. | |
| const char * | USX_FREQ_SEQ_JSON [] = {"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"} |
| Frequently occurring sequences in JSON content. | |
| const char * | USX_FREQ_SEQ_HTML [] = {"</", "=\"", "div", "href", "class", "<p>"} |
| Frequently occurring sequences in HTML content. | |
| const char * | USX_FREQ_SEQ_XML [] = {"</", "=\"", "\">", "<?xml version=\"1.0\"", "xmlns:", "://"} |
| Frequently occurring sequences in XML content. | |
| const char * | USX_TEMPLATES [] = {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0} |
| uint8_t | usx_sets [][28] |
| This 2D array has the characters for the sets USX_ALPHA, USX_SYM and USX_NUM. Where a character cannot fit into a uint8_t, 0 is used and handled in code. | |
| uint8_t | usx_code_94 [94] |
| uint8_t | usx_vcodes [] |
| Vertical codes starting from the MSB. | |
| uint8_t | usx_vcode_lens [] |
| Length of each vertical code. | |
| uint8_t | usx_freq_codes [] = {(1 << 5) + 25, (1 << 5) + 26, (1 << 5) + 27, (2 << 5) + 23, (2 << 5) + 24, (2 << 5) + 25} |
| Vertical Codes and Set number for frequent sequences in sets USX_SYM and USX_NUM. First 3 bits indicate set (USX_SYM/USX_NUM) and rest are vcode positions. | |
| const int | UTF8_MASK [] = {0xE0, 0xF0, 0xF8} |
| Not used. | |
| const int | UTF8_PREFIX [] = {0xC0, 0xE0, 0xF0} |
| Not used. | |
| uint8_t | is_inited = 0 |
| global to indicate whether initialization is complete or not | |
| unsigned int | usx_mask [] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF} |
| Mask for retrieving each code to be encoded according to its length. | |
| const uint8_t | count_bit_lens [5] = {2, 4, 7, 11, 16} |
| Length of bits used to represent count for each level. | |
| const int32_t | count_adder [5] = {4, 20, 148, 2196, 67732} |
| Cumulative counts represented at each level. | |
| const uint8_t | count_codes [] = {0x01, 0x82, 0xC3, 0xE4, 0xF4} |
| Codes used to specify the level that the count belongs to. | |
| const uint8_t | uni_bit_len [5] = {6, 12, 14, 16, 21} |
| Length of bits used to represent delta code for each level. | |
| const int32_t | uni_adder [5] = {0, 64, 4160, 20544, 86080} |
| Cumulative delta codes represented at each level. | |
| uint8_t | usx_vsections [] = {0x7F, 0xBF, 0xDF, 0xEF, 0xFF} |
| Used by readVCodeIdx() for finding the section under which the code read using read8bitCode() falls. | |
| uint8_t | usx_vsection_pos [] = {0, 4, 8, 12, 20} |
| Used by readVCodeIdx() for finding the section vertical position offset. | |
| uint8_t | usx_vsection_mask [] = {0x7F, 0x3F, 0x1F, 0x0F, 0x0F} |
| Used by readVCodeIdx() for masking the code read by read8bitCode() | |
| uint8_t | usx_vsection_shift [] = {5, 4, 3, 1, 0} |
| Used by readVCodeIdx() for shifting the code read by read8bitCode() to obtain the vpos. | |
| uint8_t | usx_vcode_lookup [36] |
| uint8_t | len_masks [] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF} |
Main code of Unishox2 Compression and Decompression library.
This file implements the code for the Unishox API function
defined in unishox2.h
| #define DEC_OUTPUT_CHAR | ( | out, | |
| olen, | |||
| ol, | |||
| c ) |
Macro to ensure that the decoder does not append more than olen bytes to out.
| #define DEC_OUTPUT_CHARS | ( | olen, | |
| exp ) |
Macro to ensure that the decoder does not append more than olen bytes to out.
| #define SAFE_APPEND_BITS | ( | exp | ) |
This is a safe call to append_bits() making sure it does not write past olen.
| #define SAFE_APPEND_BITS2 | ( | olen, | |
| exp ) |
Macro used in the main compress function so that if the output len exceeds given maximum length (olen) it can exit.
| anonymous enum |
Enum indicating nibble type - USX_NIB_NUM means ch is a number '0' to '9',
USX_NIB_HEX_LOWER means ch is between 'a' to 'f',
USX_NIB_HEX_UPPER means ch is between 'A' to 'F'
| int append_bits | ( | char * | out, |
| int | olen, | ||
| int | ol, | ||
| uint8_t | code, | ||
| int | clen ) |
Appends specified number of bits to the output (out)
If maximum limit (olen) is reached, -1 is returned
Otherwise clen bits in code are appended to out starting with MSB
| uint8_t getBaseCode | ( | char | ch | ) |
Returns 4 bit code assuming ch falls between '0' to '9',
'A' to 'F' or 'a' to 'f'
| char getNibbleType | ( | char | ch | ) |
Returns the nibble type of ch: USX_NIB_NUM when ch is between '0' to '9',
USX_NIB_HEX_LOWER when ch is between 'a' to 'f' or USX_NIB_HEX_UPPER when ch is between 'A' to 'F'
| void init_coder | ( | ) |
Fills the usx_code_94 94 letter array based on sets of characters at usx_sets
For each element in usx_code_94, the first 3 (most significant) bits indicate the set (USX_ALPHA / USX_SYM / USX_NUM)
and the rest 5 bits indicate the vertical position in the corresponding set
| int matchLine | ( | const char * | in, |
| int | len, | ||
| int | l, | ||
| char * | out, | ||
| int | olen, | ||
| int * | ol, | ||
| struct us_lnk_lst * | prev_lines, | ||
| uint8_t * | state, | ||
| const uint8_t | usx_hcodes[], | ||
| const uint8_t | usx_hcode_lens[] ) |
This is used only when encoding a string array. Finds the longest matching sequence from the previous array element to the beginning of the string array.
If a match is found and it is longer than NICE_LEN, it is encoded as a repeating sequence to out
This is also used for Unicode strings
This is a crude implementation that is not optimized. Assuming only short strings
are encoded, this is not much of an issue.
| int matchOccurance | ( | const char * | in, |
| int | len, | ||
| int | l, | ||
| char * | out, | ||
| int | olen, | ||
| int * | ol, | ||
| uint8_t * | state, | ||
| const uint8_t | usx_hcodes[], | ||
| const uint8_t | usx_hcode_lens[] ) |
Finds the longest matching sequence from the beginning of the string.
If a match is found and it is longer than NICE_LEN, it is encoded as a repeating sequence to out
This is also used for Unicode strings
This is a crude implementation that is not optimized. Assuming only short strings
are encoded, this is not much of an issue.
| int readHCodeIdx | ( | const char * | in, |
| int | len, | ||
| int * | bit_no_p, | ||
| const uint8_t | usx_hcodes[], | ||
| const uint8_t | usx_hcode_lens[] ) |
Decodes the horizontal code from the given bitstream at in
depending on the hcodes defined using usx_hcodes and usx_hcode_lens
Returns the horizontal code index or 99 if match could not be found.
Also updates bit_no_p with the number of bits used by the horizontal code.
| int32_t readUnicode | ( | const char * | in, |
| int * | bit_no_p, | ||
| int | len ) |
Decodes the Unicode codepoint from the given bit stream at in. Also updates bit_no_p
When the step code is 5, reads the next step code to find out the special code.
| int readVCodeIdx | ( | const char * | in, |
| int | len, | ||
| int * | bit_no_p ) |
Decodes the vertical code from the given bitstream at in
This is designed to use less memory using a 36 uint8_t buffer
compared to using a 256 uint8_t buffer to decode the next 8 bits read by read8bitCode()
by splitting the list of vertical codes.
Decoder is designed for using less memory, not speed.
Returns the vertical code index or 99 if match could not be found.
Also updates bit_no_p with the number of bits used by the vertical code.
| uint8_t len_masks[] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF} |
Mask for retrieving each code to be decoded according to its length
Same as usx_mask so redundant
| uint8_t usx_code_94[94] |
Stores position of letter in usx_sets. First 3 bits - position in usx_hcodes; next 5 bits - position in usx_vcodes.
| uint8_t usx_sets[][28] |
This 2D array has the characters for the sets USX_ALPHA, USX_SYM and USX_NUM. Where a character cannot fit into a uint8_t, 0 is used and handled in code.
| uint8_t usx_vcode_lens[] |
Length of each vertical code.
| uint8_t usx_vcode_lookup[36] |
Vertical decoder lookup table - 3 bits code len, 5 bits vertical pos. The code len stored is one less than the actual length, as 8 cannot be accommodated in 3 bits.
| uint8_t usx_vcodes[] |
Vertical codes starting from the MSB.