Main code of Unishox2 Compression and Decompression library. More...

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>
#include <limits.h>
#include "unishox2.h"

Macros
#define	USX_TEMPLATES (const char *[]) {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0}
	Commonly occurring templates (ISO Date/Time, ISO Date, US Phone number, ISO Time, Unused)
#define	NICE_LEN 5
	Minimum length to consider as repeating sequence.
#define	RPT_CODE ((2 << 5) + 26)
	Set (USX_NUM - 2) and vertical code (26) for encoding repeating letters.
#define	TERM_CODE ((2 << 5) + 27)
	Set (USX_NUM - 2) and vertical code (27) for encoding terminator.
#define	LF_CODE ((1 << 5) + 7)
	Set (USX_SYM - 1) and vertical code (7) for encoding Line feed \n.
#define	CRLF_CODE ((1 << 5) + 8)
	Set (USX_SYM - 1) and vertical code (8) for encoding \r\n.
#define	CR_CODE ((1 << 5) + 22)
	Set (USX_SYM - 1) and vertical code (22) for encoding \r.
#define	TAB_CODE ((1 << 5) + 14)
	Set (USX_SYM - 1) and vertical code (14) for encoding \t.
#define	NUM_SPC_CODE ((2 << 5) + 17)
	Set (USX_NUM - 2) and vertical code (17) for space character when it appears in USX_NUM state.
#define	UNI_STATE_SPL_CODE 0xF8
	Code for special code (11111) when state=USX_DELTA.
#define	UNI_STATE_SPL_CODE_LEN 5
	Length of Code for special code when state=USX_DELTA.
#define	UNI_STATE_SW_CODE 0x80
	Code for switch code when state=USX_DELTA.
#define	UNI_STATE_SW_CODE_LEN 2
	Length of Code for Switch code when state=USX_DELTA.
#define	SW_CODE 0
	Switch code in USX_ALPHA and USX_NUM 00.
#define	SW_CODE_LEN 2
	Length of Switch code.
#define	TERM_BYTE_PRESET_1 0
	Terminator bit sequence for Preset 1. Length varies depending on state as per following macros.
#define	TERM_BYTE_PRESET_1_LEN_LOWER 6
	Length of Terminator bit sequence when state is lower.
#define	TERM_BYTE_PRESET_1_LEN_UPPER 4
	Length of Terminator bit sequence when state is upper.
#define	USX_OFFSET_94 33
	Offset at which usx_code_94 starts.
#define	SAFE_APPEND_BITS(exp)
	This is a safe call to append_bits() making sure it does not write past olen.
#define	SAFE_APPEND_BITS2(olen, exp)
	Macro used in the main compress function so that if the output len exceeds given maximum length (olen) it can exit.
#define	SECTION_COUNT 5
	The list of vertical codes is split into 5 sections. Used by readVCodeIdx().
#define	DEC_OUTPUT_CHAR(out, olen, ol, c)
	Macro to ensure that the decoder does not append more than olen bytes to out.
#define	DEC_OUTPUT_CHARS(olen, exp)
	Macro to ensure that the decoder does not append more than olen bytes to out.

Typedefs
typedef unsigned char	uint8_t
	uint8_t is unsigned char

Enumerations
enum	{ USX_ALPHA = 0 , USX_SYM , USX_NUM , USX_DICT , USX_DELTA , USX_NUM_TEMP }
	possible horizontal sets and states
enum	{ USX_NIB_NUM = 0 , USX_NIB_HEX_LOWER , USX_NIB_HEX_UPPER , USX_NIB_NOT }

Functions
void	init_coder ()
int	append_bits (char *out, int olen, int ol, uint8_t code, int clen)
int	append_switch_code (char *out, int olen, int ol, uint8_t state)
	Appends switch code to out depending on the state (USX_DELTA or other)
int	append_code (char *out, int olen, int ol, uint8_t code, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
	Appends given horizontal and vertical code bits to out.
int	encodeCount (char *out, int olen, int ol, int count)
	Encodes given count to out.
int	encodeUnicode (char *out, int olen, int ol, int32_t code, int32_t prev_code)
	Encodes the unicode code point given by code to out. prev_code is used to calculate the delta.
int32_t	readUTF8 (const char *in, int len, int l, int *utf8len)
	Reads UTF-8 character from in. Also returns the number of bytes occupied by the UTF-8 character in utf8len.
int	matchOccurance (const char *in, int len, int l, char *out, int olen, int *ol, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
int	matchLine (const char *in, int len, int l, char *out, int olen, int *ol, struct us_lnk_lst *prev_lines, uint8_t *state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
uint8_t	getBaseCode (char ch)
char	getNibbleType (char ch)
int	append_nibble_escape (char *out, int olen, int ol, uint8_t state, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
	Starts coding of nibble sets.
long	min_of (long c, long i)
	Returns minimum value of two longs.
int	append_final_bits (char *const out, const int olen, int ol, const uint8_t state, const uint8_t is_all_upper, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
	Appends the terminator code depending on the state, preset and whether full terminator needs to be encoded to out or not.
int	unishox2_compress_lines (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
int	unishox2_compress (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
int	unishox2_compress_simple (const char *in, int len, char *out)
int	readBit (const char *in, int bit_no)
int	read8bitCode (const char *in, int len, int bit_no)
int	readVCodeIdx (const char *in, int len, int *bit_no_p)
int	readHCodeIdx (const char *in, int len, int *bit_no_p, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[])
int	getStepCodeIdx (const char *in, int len, int *bit_no_p, int limit)
	Returns the position of step code (0, 10, 110, etc.) encountered in the stream.
int32_t	getNumFromBits (const char *in, int len, int bit_no, int count)
	Reads specified number of bits and builds the corresponding integer.
int32_t	readCount (const char *in, int *bit_no_p, int len)
	Decodes the count from the given bit stream at in. Also updates bit_no_p.
int32_t	readUnicode (const char *in, int *bit_no_p, int len)
int	writeUTF8 (char *out, int olen, int ol, int uni)
	Write given unicode code point to out as a UTF-8 sequence.
int	decodeRepeat (const char *in, int len, char *out, int olen, int ol, int *bit_no, struct us_lnk_lst *prev_lines)
	Decode repeating sequence and appends to out.
char	getHexChar (int32_t nibble, int hex_type)
	Returns hex character corresponding to the 4 bit nibble.
int	unishox2_decompress_lines (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
int	unishox2_decompress (const char *in, int len, char *out, const uint8_t usx_hcodes[], const uint8_t usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
int	unishox2_decompress_simple (const char *in, int len, char *out)

Variables
const char *	USX_FREQ_SEQ_DFLT [] = {"\": \"", "\": ", "</", "=\"", "\":\"", "://"}
	Default frequently occurring sequences. When the composition of the text is known beforehand, the other sequences in this section can be used to achieve more compression.
const char *	USX_FREQ_SEQ_TXT [] = {" the ", " and ", "tion", " with", "ing", "ment"}
	Frequently occurring sequences in text content.
const char *	USX_FREQ_SEQ_URL [] = {"https://", "www.", ".com", "http://", ".org", ".net"}
	Frequently occurring sequences in URL content.
const char *	USX_FREQ_SEQ_JSON [] = {"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"}
	Frequently occurring sequences in JSON content.
const char *	USX_FREQ_SEQ_HTML [] = {"</", "=\"", "div", "href", "class", "<p>"}
	Frequently occurring sequences in HTML content.
const char *	USX_FREQ_SEQ_XML [] = {"</", "=\"", "\">", "<?xml version=\"1.0\"", "xmlns:", "://"}
	Frequently occurring sequences in XML content.
const char *	USX_TEMPLATES [] = {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0}
uint8_t	usx_sets [][28]
	This 2D array has the characters for the sets USX_ALPHA, USX_SYM and USX_NUM. Where a character cannot fit into a uint8_t, 0 is used and handled in code.
uint8_t	usx_code_94 [94]
uint8_t	usx_vcodes []
	Vertical codes starting from the MSB.
uint8_t	usx_vcode_lens []
	Length of each vertical code.
uint8_t	usx_freq_codes [] = {(1 << 5) + 25, (1 << 5) + 26, (1 << 5) + 27, (2 << 5) + 23, (2 << 5) + 24, (2 << 5) + 25}
	Vertical Codes and Set number for frequent sequences in sets USX_SYM and USX_NUM. First 3 bits indicate set (USX_SYM/USX_NUM) and rest are vcode positions.
const int	UTF8_MASK [] = {0xE0, 0xF0, 0xF8}
	Not used.
const int	UTF8_PREFIX [] = {0xC0, 0xE0, 0xF0}
	Not used.
uint8_t	is_inited = 0
	global to indicate whether initialization is complete or not
unsigned int	usx_mask [] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF}
	Mask for retrieving each code to be encoded according to its length.
const uint8_t	count_bit_lens [5] = {2, 4, 7, 11, 16}
	Length of bits used to represent count for each level.
const int32_t	count_adder [5] = {4, 20, 148, 2196, 67732}
	Cumulative counts represented at each level.
const uint8_t	count_codes [] = {0x01, 0x82, 0xC3, 0xE4, 0xF4}
	Codes used to specify the level that the count belongs to.
const uint8_t	uni_bit_len [5] = {6, 12, 14, 16, 21}
	Length of bits used to represent delta code for each level.
const int32_t	uni_adder [5] = {0, 64, 4160, 20544, 86080}
	Cumulative delta codes represented at each level.
uint8_t	usx_vsections [] = {0x7F, 0xBF, 0xDF, 0xEF, 0xFF}
	Used by readVCodeIdx() for finding the section under which the code read using read8bitCode() falls.
uint8_t	usx_vsection_pos [] = {0, 4, 8, 12, 20}
	Used by readVCodeIdx() for finding the section vertical position offset.
uint8_t	usx_vsection_mask [] = {0x7F, 0x3F, 0x1F, 0x0F, 0x0F}
	Used by readVCodeIdx() for masking the code read by read8bitCode()
uint8_t	usx_vsection_shift [] = {5, 4, 3, 1, 0}
	Used by readVCodeIdx() for shifting the code read by read8bitCode() to obtain the vpos.
uint8_t	usx_vcode_lookup [36]
uint8_t	len_masks [] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF}

Detailed Description

Main code of Unishox2 Compression and Decompression library.

Author: Arundale Ramanathan, James Z. M. Gao

This file implements the code for the Unishox API function
defined in unishox2.h

Macro Definition Documentation

◆ DEC_OUTPUT_CHAR

#define DEC_OUTPUT_CHAR	(	out,
		olen,
		ol,
		c )

Value:

  do { \
  char *const obuf = (out); \
  const int oidx = (ol); \
  const int limit = (olen); \
  if (limit <= oidx) return limit + 1; \
  else if (oidx < 0) return 0; \
  else obuf[oidx] = (c); \
} while (0)

Macro to ensure that the decoder does not append more than olen bytes to out.

◆ DEC_OUTPUT_CHARS

#define DEC_OUTPUT_CHARS	(		olen,
			exp )

Value:

  do { \
  const int newidx = (exp); \
  const int limit = (olen); \
  if (newidx > limit) return limit + 1; \
} while (0)

Macro to ensure that the decoder does not append more than olen bytes to out.

◆ SAFE_APPEND_BITS

#define SAFE_APPEND_BITS ( exp )

Value:

  do { \
  const int newidx = (exp); \
  if (newidx < 0) return newidx; \
} while (0)

This is a safe call to append_bits() making sure it does not write past olen.

◆ SAFE_APPEND_BITS2

#define SAFE_APPEND_BITS2	(		olen,
			exp )

Value:

  do { \
  const int newidx = (exp); \
  const int __olen = (olen); \
  if (newidx < 0) return __olen >= 0 ? __olen + 1 : (1 - __olen) * 4; \
} while (0)

Macro used in the main compress function so that if the output len exceeds given maximum length (olen) it can exit.

Enumeration Type Documentation

◆ anonymous enum

anonymous enum

Enum indicating nibble type - USX_NIB_NUM means ch is a number '0' to '9',
USX_NIB_HEX_LOWER means ch is between 'a' to 'f',
USX_NIB_HEX_UPPER means ch is between 'A' to 'F'

Function Documentation

◆ append_bits()

int append_bits	(	char *	out,
		int	olen,
		int	ol,
		uint8_t	code,
		int	clen )

Appends specified number of bits to the output (out)
If maximum limit (olen) is reached, -1 is returned
Otherwise clen bits in code are appended to out starting with MSB

◆ getBaseCode()

uint8_t getBaseCode ( char ch )

Returns 4 bit code assuming ch falls between '0' to '9',
'A' to 'F' or 'a' to 'f'

◆ getNibbleType()

char getNibbleType ( char ch )

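Gets the nibble type of ch: USX_NIB_NUM if ch falls between '0' to '9',
USX_NIB_HEX_LOWER for 'a' to 'f', USX_NIB_HEX_UPPER for 'A' to 'F'
(presumably USX_NIB_NOT for any other character)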

◆ init_coder()

void init_coder ( )

Fills the 94-letter array usx_code_94 based on the sets of characters in usx_sets.
For each element in usx_code_94, the 3 most significant bits hold the set (USX_ALPHA / USX_SYM / USX_NUM)
and the remaining 5 bits indicate the vertical position in the corresponding set.

◆ matchLine()

int matchLine	(	const char *	in,
		int	len,
		int	l,
		char *	out,
		int	olen,
		int *	ol,
		struct us_lnk_lst *	prev_lines,
		uint8_t *	state,
		const uint8_t	usx_hcodes[],
		const uint8_t	usx_hcode_lens[] )

This is used only when encoding a string array. Finds the longest matching sequence from the previous array element to the beginning of the string array.
If a match is found and it is longer than NICE_LEN, it is encoded as a repeating sequence to out
This is also used for Unicode strings
This is a crude implementation that is not optimized. Assuming only short strings
are encoded, this is not much of an issue.

◆ matchOccurance()

int matchOccurance	(	const char *	in,
		int	len,
		int	l,
		char *	out,
		int	olen,
		int *	ol,
		uint8_t *	state,
		const uint8_t	usx_hcodes[],
		const uint8_t	usx_hcode_lens[] )

Finds the longest matching sequence from the beginning of the string.
If a match is found and it is longer than NICE_LEN, it is encoded as a repeating sequence to out
This is also used for Unicode strings
This is a crude implementation that is not optimized. Assuming only short strings
are encoded, this is not much of an issue.

◆ readHCodeIdx()

int readHCodeIdx	(	const char *	in,
		int	len,
		int *	bit_no_p,
		const uint8_t	usx_hcodes[],
		const uint8_t	usx_hcode_lens[] )

Decodes the horizontal code from the given bitstream at in
depending on the hcodes defined using usx_hcodes and usx_hcode_lens
Returns the horizontal code index or 99 if match could not be found.
Also updates bit_no_p by the number of bits used by the horizontal code.

◆ readUnicode()

int32_t readUnicode	(	const char *	in,
		int *	bit_no_p,
		int	len )

Decodes the Unicode codepoint from the given bit stream at in. Also updates bit_no_p
When the step code is 5, reads the next step code to find out the special code.

◆ readVCodeIdx()

int readVCodeIdx	(	const char *	in,
		int	len,
		int *	bit_no_p )

Decodes the vertical code from the given bitstream at in
This is designed to use less memory using a 36 uint8_t buffer
compared to using a 256 uint8_t buffer to decode the next 8 bits read by read8bitCode()
by splitting the list of vertical codes.
Decoder is designed for using less memory, not speed.
Returns the vertical code index or 99 if match could not be found.
Also updates bit_no_p by the number of bits used by the vertical code.

Variable Documentation

◆ len_masks

uint8_t len_masks[] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF}

Mask for retrieving each code to be decoded according to its length
Same as usx_mask so redundant

◆ usx_code_94

uint8_t usx_code_94[94]

Stores position of letter in usx_sets. First 3 bits - position in usx_hcodes. Next 5 bits - position in usx_vcodes.

◆ usx_sets

uint8_t usx_sets[][28]

Initial value:

= {{  0, ' ', 'e', 't', 'a', 'o', 'i', 'n',
                        's', 'r', 'l', 'c', 'd', 'h', 'u', 'p', 'm', 'b',
                        'g', 'w', 'f', 'y', 'v', 'k', 'q', 'j', 'x', 'z'},
                       {'"', '{', '}', '_', '<', '>', ':', '\n',
                          0, '[', ']', '\\', ';', '\'', '\t', '@', '*', '&',
                        '?', '!', '^', '|', '\r', '~', '`', 0, 0, 0},
                       {  0, ',', '.', '0', '1', '9', '2', '5', '-',
                        '/', '3', '4', '6', '7', '8', '(', ')', ' ',
                        '=', '+', '$', '%', '#', 0, 0, 0, 0, 0}}

This 2D array has the characters for the sets USX_ALPHA, USX_SYM and USX_NUM. Where a character cannot fit into a uint8_t, 0 is used and handled in code.

◆ usx_vcode_lens

uint8_t usx_vcode_lens[]

Initial value:

= {  2,    3,    3,    4,    4,    4,    4,
                           4,    5,    5,    6,    6,    6,    7,
                           7,    7,    7,    7,    8,    8,    8,
                           8,    8,    8,    8,    8,    8,    8 }

Length of each vertical code.

◆ usx_vcode_lookup

uint8_t usx_vcode_lookup[36]

Initial value:

= {
  (1 << 5) + 0,  (1 << 5) + 0,  (2 << 5) + 1,  (2 << 5) + 2,  
  (3 << 5) + 3,  (3 << 5) + 4,  (3 << 5) + 5,  (3 << 5) + 6,  
  (3 << 5) + 7,  (3 << 5) + 7,  (4 << 5) + 8,  (4 << 5) + 9,  
  (5 << 5) + 10, (5 << 5) + 10, (5 << 5) + 11, (5 << 5) + 11, 
  (5 << 5) + 12, (5 << 5) + 12, (6 << 5) + 13, (6 << 5) + 14,
  (6 << 5) + 15, (6 << 5) + 15, (6 << 5) + 16, (6 << 5) + 16, 
  (6 << 5) + 17, (6 << 5) + 17, (7 << 5) + 18, (7 << 5) + 19,
  (7 << 5) + 20, (7 << 5) + 21, (7 << 5) + 22, (7 << 5) + 23,
  (7 << 5) + 24, (7 << 5) + 25, (7 << 5) + 26, (7 << 5) + 27
}

Vertical decoder lookup table - 3 bits code len, 5 bits vertical pos. The code len stored is one less than the actual length, as 8 cannot be accommodated in 3 bits.

◆ usx_vcodes

uint8_t usx_vcodes[]

Initial value:

= { 0x00, 0x40, 0x60, 0x80, 0x90, 0xA0, 0xB0,
                        0xC0, 0xD0, 0xD8, 0xE0, 0xE4, 0xE8, 0xEC,
                        0xEE, 0xF0, 0xF2, 0xF4, 0xF6, 0xF7, 0xF8,
                        0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }

Vertical codes starting from the MSB.

Macros

Typedefs

Enumerations

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ DEC_OUTPUT_CHAR

◆ DEC_OUTPUT_CHARS

◆ SAFE_APPEND_BITS

◆ SAFE_APPEND_BITS2

Enumeration Type Documentation

◆ anonymous enum

Function Documentation

◆ append_bits()

◆ getBaseCode()

◆ getNibbleType()

◆ init_coder()

◆ matchLine()

◆ matchOccurance()

◆ readHCodeIdx()

◆ readUnicode()

◆ readVCodeIdx()

Variable Documentation

◆ len_masks

◆ usx_code_94

◆ usx_sets

◆ usx_vcode_lens

◆ usx_vcode_lookup

◆ usx_vcodes