Unishox
A hybrid encoder for Short Unicode Strings
Loading...
Searching...
No Matches
unishox2.c File Reference

Main code of Unishox2 Compression and Decompression library. More...

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>
#include <limits.h>
#include "unishox2.h"

Macros

#define USX_TEMPLATES   (const char *[]) {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0}
 Commonly occurring templates (ISO Date/Time, ISO Date, US Phone number, ISO Time, Unused)
#define NICE_LEN   5
 Minimum length to consider as repeating sequence.
#define RPT_CODE   ((2 << 5) + 26)
 Set (USX_NUM - 2) and vertical code (26) for encoding repeating letters.
#define TERM_CODE   ((2 << 5) + 27)
 Set (USX_NUM - 2) and vertical code (27) for encoding terminator.
#define LF_CODE   ((1 << 5) + 7)
 Set (USX_SYM - 1) and vertical code (7) for encoding Line feed \n.
#define CRLF_CODE   ((1 << 5) + 8)
 Set (USX_SYM - 1) and vertical code (8) for encoding \r\n.
#define CR_CODE   ((1 << 5) + 22)
 Set (USX_SYM - 1) and vertical code (22) for encoding \r.
#define TAB_CODE   ((1 << 5) + 14)
 Set (USX_SYM - 1) and vertical code (14) for encoding \t.
#define NUM_SPC_CODE   ((2 << 5) + 17)
 Set (USX_NUM - 2) and vertical code (17) for space character when it appears in USX_NUM state.
#define UNI_STATE_SPL_CODE   0xF8
 Code for special code (11111) when state=USX_DELTA.
#define UNI_STATE_SPL_CODE_LEN   5
 Length of Code for special code when state=USX_DELTA.
#define UNI_STATE_SW_CODE   0x80
 Code for switch code when state=USX_DELTA.
#define UNI_STATE_SW_CODE_LEN   2
 Length of Code for Switch code when state=USX_DELTA.
#define SW_CODE   0
 Switch code (bits 00) in USX_ALPHA and USX_NUM states.
#define SW_CODE_LEN   2
 Length of Switch code.
#define TERM_BYTE_PRESET_1   0
 Terminator bit sequence for Preset 1. Length varies depending on state as per following macros.
#define TERM_BYTE_PRESET_1_LEN_LOWER   6
 Length of Terminator bit sequence when state is lower.
#define TERM_BYTE_PRESET_1_LEN_UPPER   4
 Length of Terminator bit sequence when state is upper.
#define USX_OFFSET_94   33
 Offset at which usx_code_94 starts.
#define SAFE_APPEND_BITS(exp)
 This is a safe call to append_bits() making sure it does not write past olen.
#define SAFE_APPEND_BITS2(olen, exp)
 Macro used in the main compress function so that if the output len exceeds given maximum length (olen) it can exit.
#define SECTION_COUNT   5
 The list of vertical codes is split into 5 sections. Used by readVCodeIdx().
#define DEC_OUTPUT_CHAR(out, olen, ol, c)
 Macro to ensure that the decoder does not append more than olen bytes to out.
#define DEC_OUTPUT_CHARS(olen, exp)
 Macro to ensure that the decoder does not append more than olen bytes to out.

Typedefs

typedef unsigned char uint8_t
 uint8_t is unsigned char

Enumerations

enum  {
  USX_ALPHA = 0 , USX_SYM , USX_NUM , USX_DICT ,
  USX_DELTA , USX_NUM_TEMP
}
 possible horizontal sets and states
enum  { USX_NIB_NUM = 0 , USX_NIB_HEX_LOWER , USX_NIB_HEX_UPPER , USX_NIB_NOT }

Functions

void init_coder ()
int append_bits (char *out, int olen, int ol, uint8_t code, int clen)
int append_switch_code (char *out, int olen, int ol, uint8_t state)
 Appends switch code to out depending on the state (USX_DELTA or other)
int append_code (char *out, int olen, int ol, uint8_t code, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
 Appends given horizontal and vertical code bits to out.
int encodeCount (char *out, int olen, int ol, int count)
 Encodes given count to out.
int encodeUnicode (char *out, int olen, int ol, int32_t code, int32_t prev_code)
 Encodes the unicode code point given by code to out. prev_code is used to calculate the delta.
int32_t readUTF8 (const char *in, int len, int l, int *utf8len)
 Reads UTF-8 character from in. Also returns the number of bytes occupied by the UTF-8 character in utf8len.
int matchOccurance (const char *in, int len, int l, char *out, int olen, int *ol, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
int matchLine (const char *in, int len, int l, char *out, int olen, int *ol, struct us_lnk_lst *prev_lines, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
uint8_t getBaseCode (char ch)
char getNibbleType (char ch)
int append_nibble_escape (char *out, int olen, int ol, uint8_t state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
 Starts coding of nibble sets.
long min_of (long c, long i)
 Returns minimum value of two longs.
int append_final_bits (char *const out, const int olen, int ol, const uint8_t state, const uint8_t is_all_upper, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
 Appends the terminator code to out, depending on the state, the preset and whether the full terminator needs to be encoded or not.
int unishox2_compress_lines (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
int unishox2_compress (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
int unishox2_compress_simple (const char *in, int len, char *out)
int readBit (const char *in, int bit_no)
int read8bitCode (const char *in, int len, int bit_no)
int readVCodeIdx (const char *in, int len, int *bit_no_p)
int readHCodeIdx (const char *in, int len, int *bit_no_p, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
int getStepCodeIdx (const char *in, int len, int *bit_no_p, int limit)
 Returns the position of step code (0, 10, 110, etc.) encountered in the stream.
int32_t getNumFromBits (const char *in, int len, int bit_no, int count)
 Reads specified number of bits and builds the corresponding integer.
int32_t readCount (const char *in, int *bit_no_p, int len)
 Decodes the count from the given bit stream at in. Also updates bit_no_p.
int32_t readUnicode (const char *in, int *bit_no_p, int len)
int writeUTF8 (char *out, int olen, int ol, int uni)
 Write given unicode code point to out as a UTF-8 sequence.
int decodeRepeat (const char *in, int len, char *out, int olen, int ol, int *bit_no, struct us_lnk_lst *prev_lines)
 Decode repeating sequence and appends to out.
char getHexChar (int32_t nibble, int hex_type)
 Returns hex character corresponding to the 4 bit nibble.
int unishox2_decompress_lines (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
int unishox2_decompress (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
int unishox2_decompress_simple (const char *in, int len, char *out)

Variables

const char * USX_FREQ_SEQ_DFLT [] = {"\": \"", "\": ", "</", "=\"", "\":\"", "://"}
 Default frequently occurring sequences. When the composition of the text is known beforehand, the other sequences in this section can be used to achieve more compression.
const char * USX_FREQ_SEQ_TXT [] = {" the ", " and ", "tion", " with", "ing", "ment"}
 Frequently occurring sequences in text content.
const char * USX_FREQ_SEQ_URL [] = {"https://", "www.", ".com", "http://", ".org", ".net"}
 Frequently occurring sequences in URL content.
const char * USX_FREQ_SEQ_JSON [] = {"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"}
 Frequently occurring sequences in JSON content.
const char * USX_FREQ_SEQ_HTML [] = {"</", "=\"", "div", "href", "class", "<p>"}
 Frequently occurring sequences in HTML content.
const char * USX_FREQ_SEQ_XML [] = {"</", "=\"", "\">", "<?xml version=\"1.0\"", "xmlns:", "://"}
 Frequently occurring sequences in XML content.
const char * USX_TEMPLATES [] = {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0}
uint8_t usx_sets [][28]
 This 2D array has the characters for the sets USX_ALPHA, USX_SYM and USX_NUM. Where a character cannot fit into a uint8_t, 0 is used and handled in code.
uint8_t usx_code_94 [94]
uint8_t usx_vcodes []
 Vertical codes starting from the MSB.
uint8_t usx_vcode_lens []
 Length of each vertical code.
uint8_t usx_freq_codes [] = {(1 << 5) + 25, (1 << 5) + 26, (1 << 5) + 27, (2 << 5) + 23, (2 << 5) + 24, (2 << 5) + 25}
 Vertical Codes and Set number for frequent sequences in sets USX_SYM and USX_NUM. First 3 bits indicate set (USX_SYM/USX_NUM) and rest are vcode positions.
const int UTF8_MASK [] = {0xE0, 0xF0, 0xF8}
 Not used.
const int UTF8_PREFIX [] = {0xC0, 0xE0, 0xF0}
 Not used.
uint8_t is_inited = 0
 global to indicate whether initialization is complete or not
unsigned int usx_mask [] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF}
 Mask for retrieving each code to be encoded according to its length.
const uint8_t count_bit_lens [5] = {2, 4, 7, 11, 16}
 Length of bits used to represent count for each level.
const int32_t count_adder [5] = {4, 20, 148, 2196, 67732}
 Cumulative counts represented at each level.
const uint8_t count_codes [] = {0x01, 0x82, 0xC3, 0xE4, 0xF4}
 Codes used to specify the level that the count belongs to.
const uint8_t uni_bit_len [5] = {6, 12, 14, 16, 21}
 Length of bits used to represent delta code for each level.
const int32_t uni_adder [5] = {0, 64, 4160, 20544, 86080}
 Cumulative delta codes represented at each level.
uint8_t usx_vsections [] = {0x7F, 0xBF, 0xDF, 0xEF, 0xFF}
 Used by readVCodeIdx() for finding the section under which the code read using read8bitCode() falls.
uint8_t usx_vsection_pos [] = {0, 4, 8, 12, 20}
 Used by readVCodeIdx() for finding the section vertical position offset.
uint8_t usx_vsection_mask [] = {0x7F, 0x3F, 0x1F, 0x0F, 0x0F}
 Used by readVCodeIdx() for masking the code read by read8bitCode()
uint8_t usx_vsection_shift [] = {5, 4, 3, 1, 0}
 Used by readVCodeIdx() for shifting the code read by read8bitCode() to obtain the vpos.
uint8_t usx_vcode_lookup [36]
uint8_t len_masks [] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF}

Detailed Description

Main code of Unishox2 Compression and Decompression library.

Author
Arundale Ramanathan, James Z. M. Gao

This file implements the code for the Unishox API function
defined in unishox2.h

Macro Definition Documentation

◆ DEC_OUTPUT_CHAR

#define DEC_OUTPUT_CHAR ( out,
olen,
ol,
c )
Value:
do { \
char *const obuf = (out); \
const int oidx = (ol); \
const int limit = (olen); \
if (limit <= oidx) return limit + 1; \
else if (oidx < 0) return 0; \
else obuf[oidx] = (c); \
} while (0)

Macro to ensure that the decoder does not append more than olen bytes to out.

◆ DEC_OUTPUT_CHARS

#define DEC_OUTPUT_CHARS ( olen,
exp )
Value:
do { \
const int newidx = (exp); \
const int limit = (olen); \
if (newidx > limit) return limit + 1; \
} while (0)

Macro to ensure that the decoder does not append more than olen bytes to out.

◆ SAFE_APPEND_BITS

#define SAFE_APPEND_BITS ( exp)
Value:
do { \
const int newidx = (exp); \
if (newidx < 0) return newidx; \
} while (0)

This is a safe call to append_bits() making sure it does not write past olen.

◆ SAFE_APPEND_BITS2

#define SAFE_APPEND_BITS2 ( olen,
exp )
Value:
do { \
const int newidx = (exp); \
const int __olen = (olen); \
if (newidx < 0) return __olen >= 0 ? __olen + 1 : (1 - __olen) * 4; \
} while (0)

Macro used in the main compress function so that if the output len exceeds given maximum length (olen) it can exit.

Enumeration Type Documentation

◆ anonymous enum

anonymous enum

Enum indicating nibble type - USX_NIB_NUM means ch is a number '0' to '9',
USX_NIB_HEX_LOWER means ch is between 'a' to 'f',
USX_NIB_HEX_UPPER means ch is between 'A' to 'F'

Function Documentation

◆ append_bits()

int append_bits ( char * out,
int olen,
int ol,
uint8_t code,
int clen )

Appends specified number of bits to the output (out)
If maximum limit (olen) is reached, -1 is returned
Otherwise clen bits in code are appended to out starting with MSB

◆ getBaseCode()

uint8_t getBaseCode ( char ch)

Returns 4 bit code assuming ch falls between '0' to '9',
'A' to 'F' or 'a' to 'f'

◆ getNibbleType()

char getNibbleType ( char ch)

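Returns the nibble type of ch: USX_NIB_NUM if ch is between '0' to '9',
USX_NIB_HEX_LOWER if between 'a' to 'f', USX_NIB_HEX_UPPER if between 'A' to 'F'
(presumably USX_NIB_NOT otherwise - confirm against the implementation)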

◆ init_coder()

void init_coder ( )

Fills the usx_code_94 94 letter array based on sets of characters at usx_sets
For each element in usx_code_94, first 3 msb bits is set (USX_ALPHA / USX_SYM / USX_NUM)
and the rest 5 bits indicate the vertical position in the corresponding set

◆ matchLine()

int matchLine ( const char * in,
int len,
int l,
char * out,
int olen,
int * ol,
struct us_lnk_lst * prev_lines,
uint8_t * state,
const uint8_t usx_hcodes[],
const uint8_t usx_hcode_lens[] )

This is used only when encoding a string array. Finds the longest matching sequence from the previous array element to the beginning of the string array.
If a match is found and it is longer than NICE_LEN, it is encoded as a repeating sequence to out
This is also used for Unicode strings
This is a crude implementation that is not optimized. Assuming only short strings
are encoded, this is not much of an issue.

◆ matchOccurance()

int matchOccurance ( const char * in,
int len,
int l,
char * out,
int olen,
int * ol,
uint8_t * state,
const uint8_t usx_hcodes[],
const uint8_t usx_hcode_lens[] )

Finds the longest matching sequence from the beginning of the string.
If a match is found and it is longer than NICE_LEN, it is encoded as a repeating sequence to out
This is also used for Unicode strings
This is a crude implementation that is not optimized. Assuming only short strings
are encoded, this is not much of an issue.

◆ readHCodeIdx()

int readHCodeIdx ( const char * in,
int len,
int * bit_no_p,
const uint8_t usx_hcodes[],
const uint8_t usx_hcode_lens[] )

Decodes the horizontal code from the given bitstream at in
depending on the hcodes defined using usx_hcodes and usx_hcode_lens
Returns the horizontal code index or 99 if match could not be found.
Also updates bit_no_p with the number of bits used by the horizontal code.

◆ readUnicode()

int32_t readUnicode ( const char * in,
int * bit_no_p,
int len )

Decodes the Unicode codepoint from the given bit stream at in. Also updates bit_no_p
When the step code is 5, reads the next step code to find out the special code.

◆ readVCodeIdx()

int readVCodeIdx ( const char * in,
int len,
int * bit_no_p )

Decodes the vertical code from the given bitstream at in
This is designed to use less memory using a 36 uint8_t buffer
compared to using a 256 uint8_t buffer to decode the next 8 bits read by read8bitCode()
by splitting the list of vertical codes.
Decoder is designed for using less memory, not speed.
Returns the vertical code index or 99 if match could not be found.
Also updates bit_no_p with the number of bits used by the vertical code.

Variable Documentation

◆ len_masks

uint8_t len_masks[] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF}

Mask for retrieving each code to be decoded according to its length
Same as usx_mask so redundant

◆ usx_code_94

uint8_t usx_code_94[94]

Stores position of letter in usx_sets. First 3 bits - position in usx_hcodes. Next 5 bits - position in usx_vcodes.

◆ usx_sets

uint8_t usx_sets[][28]
Initial value:
= {{ 0, ' ', 'e', 't', 'a', 'o', 'i', 'n',
's', 'r', 'l', 'c', 'd', 'h', 'u', 'p', 'm', 'b',
'g', 'w', 'f', 'y', 'v', 'k', 'q', 'j', 'x', 'z'},
{'"', '{', '}', '_', '<', '>', ':', '\n',
0, '[', ']', '\\', ';', '\'', '\t', '@', '*', '&',
'?', '!', '^', '|', '\r', '~', '`', 0, 0, 0},
{ 0, ',', '.', '0', '1', '9', '2', '5', '-',
'/', '3', '4', '6', '7', '8', '(', ')', ' ',
'=', '+', '$', '%', '#', 0, 0, 0, 0, 0}}

This 2D array has the characters for the sets USX_ALPHA, USX_SYM and USX_NUM. Where a character cannot fit into a uint8_t, 0 is used and handled in code.

◆ usx_vcode_lens

uint8_t usx_vcode_lens[]
Initial value:
= { 2, 3, 3, 4, 4, 4, 4,
4, 5, 5, 6, 6, 6, 7,
7, 7, 7, 7, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8 }

Length of each vertical code.

◆ usx_vcode_lookup

uint8_t usx_vcode_lookup[36]
Initial value:
= {
(1 << 5) + 0, (1 << 5) + 0, (2 << 5) + 1, (2 << 5) + 2,
(3 << 5) + 3, (3 << 5) + 4, (3 << 5) + 5, (3 << 5) + 6,
(3 << 5) + 7, (3 << 5) + 7, (4 << 5) + 8, (4 << 5) + 9,
(5 << 5) + 10, (5 << 5) + 10, (5 << 5) + 11, (5 << 5) + 11,
(5 << 5) + 12, (5 << 5) + 12, (6 << 5) + 13, (6 << 5) + 14,
(6 << 5) + 15, (6 << 5) + 15, (6 << 5) + 16, (6 << 5) + 16,
(6 << 5) + 17, (6 << 5) + 17, (7 << 5) + 18, (7 << 5) + 19,
(7 << 5) + 20, (7 << 5) + 21, (7 << 5) + 22, (7 << 5) + 23,
(7 << 5) + 24, (7 << 5) + 25, (7 << 5) + 26, (7 << 5) + 27
}

Vertical decoder lookup table - 3 bits code len, 5 bits vertical pos. Code len is stored as one less, as 8 cannot be accommodated in 3 bits.

◆ usx_vcodes

uint8_t usx_vcodes[]
Initial value:
= { 0x00, 0x40, 0x60, 0x80, 0x90, 0xA0, 0xB0,
0xC0, 0xD0, 0xD8, 0xE0, 0xE4, 0xE8, 0xEC,
0xEE, 0xF0, 0xF2, 0xF4, 0xF6, 0xF7, 0xF8,
0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }

Vertical codes starting from the MSB.