Unishox
A hybrid encoder for Short Unicode Strings
unishox2.h File Reference

API for Unishox2 Compression and Decompression. More...

Data Structures

struct  us_lnk_lst
 

Macros

#define UNISHOX_VERSION   "2.0"
 Unicode spec version.
 
#define UNISHOX_API_WITH_OUTPUT_LEN   0
 
#define UNISHOX_MAGIC_BITS   0xFF
 Up to 8 bits of an initial magic bit sequence can be included. The bit count can be specified with UNISHOX_MAGIC_BIT_LEN.
 
#define UNISHOX_MAGIC_BIT_LEN   1
 Desired length of Magic bits defined by UNISHOX_MAGIC_BITS.
 
#define USX_HCODES_DFLT   (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0xE0}
 Default Horizontal codes. When the composition of the text is known beforehand, the other hcodes in this section can be used to achieve more compression.
 
#define USX_HCODE_LENS_DFLT   (const unsigned char[]) {2, 2, 2, 3, 3}
 Length of each default hcode.
 
#define USX_HCODES_ALPHA_ONLY   (const unsigned char[]) {0x00, 0x00, 0x00, 0x00, 0x00}
 Horizontal codes preset for English Alphabet content only.
 
#define USX_HCODE_LENS_ALPHA_ONLY   (const unsigned char[]) {0, 0, 0, 0, 0}
 Length of each Alpha only hcode.
 
#define USX_HCODES_ALPHA_NUM_ONLY   (const unsigned char[]) {0x00, 0x00, 0x80, 0x00, 0x00}
 Horizontal codes preset for Alpha Numeric content only.
 
#define USX_HCODE_LENS_ALPHA_NUM_ONLY   (const unsigned char[]) {1, 0, 1, 0, 0}
 Length of each Alpha numeric hcode.
 
#define USX_HCODES_ALPHA_NUM_SYM_ONLY   (const unsigned char[]) {0x00, 0x80, 0xC0, 0x00, 0x00}
 Horizontal codes preset for Alpha Numeric and Symbol content only.
 
#define USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY   (const unsigned char[]) {1, 2, 2, 0, 0}
 Length of each Alpha numeric and symbol hcode.
 
#define USX_HCODES_FAVOR_ALPHA   (const unsigned char[]) {0x00, 0x80, 0xA0, 0xC0, 0xE0}
 Horizontal codes preset favouring Alphabet content.
 
#define USX_HCODE_LENS_FAVOR_ALPHA   (const unsigned char[]) {1, 3, 3, 3, 3}
 Length of each hcode favouring Alpha content.
 
#define USX_HCODES_FAVOR_DICT   (const unsigned char[]) {0x00, 0x40, 0xC0, 0x80, 0xE0}
 Horizontal codes preset favouring repeating sequences.
 
#define USX_HCODE_LENS_FAVOR_DICT   (const unsigned char[]) {2, 2, 3, 2, 3}
 Length of each hcode favouring repeating sequences.
 
#define USX_HCODES_FAVOR_SYM   (const unsigned char[]) {0x80, 0x00, 0xA0, 0xC0, 0xE0}
 Horizontal codes preset favouring symbols.
 
#define USX_HCODE_LENS_FAVOR_SYM   (const unsigned char[]) {3, 1, 3, 3, 3}
 Length of each hcode favouring symbols.
 
#define USX_HCODES_FAVOR_UMLAUT   (const unsigned char[]) {0x80, 0xA0, 0xC0, 0xE0, 0x00}
 Horizontal codes preset favouring umlaut letters.
 
#define USX_HCODE_LENS_FAVOR_UMLAUT   (const unsigned char[]) {3, 3, 3, 3, 1}
 Length of each hcode favouring umlaut letters.
 
#define USX_HCODES_NO_DICT   (const unsigned char[]) {0x00, 0x40, 0x80, 0x00, 0xC0}
 Horizontal codes preset for no repeating sequences.
 
#define USX_HCODE_LENS_NO_DICT   (const unsigned char[]) {2, 2, 2, 0, 2}
 Length of each hcode for no repeating sequences.
 
#define USX_HCODES_NO_UNI   (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0x00}
 Horizontal codes preset for no Unicode characters.
 
#define USX_HCODE_LENS_NO_UNI   (const unsigned char[]) {2, 2, 2, 2, 0}
 Length of each hcode for no Unicode characters.
 
#define USX_FREQ_SEQ_DFLT   (const char *[]) {"\": \"", "\": ", "</", "=\"", "\":\"", "://"}
 Default frequently occurring sequences. When the composition of the text is known beforehand, the other sequences in this section can be used to achieve more compression.
 
#define USX_FREQ_SEQ_TXT   (const char *[]) {" the ", " and ", "tion", " with", "ing", "ment"}
 Frequently occurring sequences in text content.
 
#define USX_FREQ_SEQ_URL   (const char *[]) {"https://", "www.", ".com", "http://", ".org", ".net"}
 Frequently occurring sequences in URL content.
 
#define USX_FREQ_SEQ_JSON   (const char *[]) {"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"}
 Frequently occurring sequences in JSON content.
 
#define USX_FREQ_SEQ_HTML   (const char *[]) {"</", "=\"", "div", "href", "class", "<p>"}
 Frequently occurring sequences in HTML content.
 
#define USX_FREQ_SEQ_XML   (const char *[]) {"</", "=\"", "\">", "<?xml version=\"1.0\"", "xmlns:", "://"}
 Frequently occurring sequences in XML content.
 
#define USX_TEMPLATES   (const char *[]) {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0}
 Commonly occurring templates (ISO Date/Time, ISO Date, US Phone number, ISO Time, Unused).
 
#define USX_PSET_DFLT   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Default preset parameter set. When the composition of the text is known beforehand, the other parameter sets in this section can be used to achieve more compression.
 
#define USX_PSET_ALPHA_ONLY   USX_HCODES_ALPHA_ONLY, USX_HCODE_LENS_ALPHA_ONLY, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set for English Alphabet only content.
 
#define USX_PSET_ALPHA_NUM_ONLY   USX_HCODES_ALPHA_NUM_ONLY, USX_HCODE_LENS_ALPHA_NUM_ONLY, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set for Alpha numeric content.
 
#define USX_PSET_ALPHA_NUM_SYM_ONLY   USX_HCODES_ALPHA_NUM_SYM_ONLY, USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for Alpha numeric and symbol content.
 
#define USX_PSET_ALPHA_NUM_SYM_ONLY_TXT   USX_HCODES_ALPHA_NUM_SYM_ONLY, USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for Alpha numeric symbol content having predominantly text.
 
#define USX_PSET_FAVOR_ALPHA   USX_HCODES_FAVOR_ALPHA, USX_HCODE_LENS_FAVOR_ALPHA, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set favouring Alphabet content.
 
#define USX_PSET_FAVOR_DICT   USX_HCODES_FAVOR_DICT, USX_HCODE_LENS_FAVOR_DICT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set favouring repeating sequences.
 
#define USX_PSET_FAVOR_SYM   USX_HCODES_FAVOR_SYM, USX_HCODE_LENS_FAVOR_SYM, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set favouring symbols.
 
#define USX_PSET_FAVOR_UMLAUT   USX_HCODES_FAVOR_UMLAUT, USX_HCODE_LENS_FAVOR_UMLAUT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set favouring umlaut letters.
 
#define USX_PSET_NO_DICT   USX_HCODES_NO_DICT, USX_HCODE_LENS_NO_DICT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for when there are no repeating sequences.
 
#define USX_PSET_NO_UNI   USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for when there are no unicode symbols.
 
#define USX_PSET_NO_UNI_FAVOR_TEXT   USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set for when there are no unicode symbols favouring text.
 
#define USX_PSET_URL   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_URL, USX_TEMPLATES
 Preset parameter set favouring URL content.
 
#define USX_PSET_JSON   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_JSON, USX_TEMPLATES
 Preset parameter set favouring JSON content.
 
#define USX_PSET_JSON_NO_UNI   USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_JSON, USX_TEMPLATES
 Preset parameter set favouring JSON content having no Unicode symbols.
 
#define USX_PSET_XML   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_XML, USX_TEMPLATES
 Preset parameter set favouring XML content.
 
#define USX_PSET_HTML   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_HTML, USX_TEMPLATES
 Preset parameter set favouring HTML content.
 
#define UNISHOX_API_OUT_AND_LEN(out, olen)   out
 

Functions

int unishox2_compress_simple (const char *in, int len, char *out)
 
int unishox2_decompress_simple (const char *in, int len, char *out)
 
int unishox2_compress (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
 
int unishox2_decompress (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
 
int unishox2_compress_lines (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
 
int unishox2_decompress_lines (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
 

Detailed Description

API for Unishox2 Compression and Decompression.

Author
Arundale Ramanathan, James Z. M. Gao

This file describes each function of the Unishox2 API.
To find out how this API can be used in your program,
please see test_unishox2.c.

Macro Definition Documentation

◆ UNISHOX_API_OUT_AND_LEN

#define UNISHOX_API_OUT_AND_LEN (   out,
  olen 
)    out

This macro is for internal use, but builds upon the macro UNISHOX_API_WITH_OUTPUT_LEN. When the macro UNISHOX_API_WITH_OUTPUT_LEN is enabled, all the API functions except the simple API functions accept an additional parameter olen that enables the developer to pass the size of the output buffer provided, so that the API function does not write beyond that length. This can be disabled if the developer knows that the buffer provided is large enough; in that case no additional parameter is passed and the program is faster, since the additional check for output length is not performed at each step.

◆ UNISHOX_API_WITH_OUTPUT_LEN

#define UNISHOX_API_WITH_OUTPUT_LEN   0

Macro switch to enable/disable the output buffer length parameter in the low level API.
Disabled by default.
When this macro is enabled, all the API functions
except the simple API functions accept an additional parameter olen
that enables the developer to pass the size of the output buffer provided,
so that the API function does not write beyond that length.
This can be disabled if the developer knows that the buffer provided is large enough;
in that case no additional parameter is passed and the program is faster, since the additional check
for output length is not performed at each step.
The simple API, i.e. unishox2_(de)compress_simple, will always omit the buffer length.

Function Documentation

◆ unishox2_compress()

int unishox2_compress ( const char *  in,
int  len,
UNISHOX_API_OUT_AND_LEN(char *out, int olen)  ,
const unsigned char  usx_hcodes[],
const unsigned char  usx_hcode_lens[],
const char *  usx_freq_seq[],
const char *  usx_templates[] 
)

Comprehensive API for compressing a string

Presets are available for the last four parameters so they can be passed as a single parameter.
See USX_PSET_* macros. Example call:
unishox2_compress(in, len, out, olen, USX_PSET_ALPHA_ONLY);

Parameters
[in] in: Input ASCII / UTF-8 string
[in] len: length in bytes
[out] out: output buffer - should be large enough to hold compressed output
[in] olen: length of 'out' buffer in bytes. Can be omitted if sufficient buffer is provided
[in] usx_hcodes: Horizontal codes (array of bytes). See macro section for samples.
[in] usx_hcode_lens: Length of each element in usx_hcodes array
[in] usx_freq_seq: Frequently occurring sequences. See USX_FREQ_SEQ_* macros for samples
[in] usx_templates: Templates of frequently occurring patterns. See USX_TEMPLATES macro.

◆ unishox2_compress_lines()

int unishox2_compress_lines ( const char *  in,
int  len,
UNISHOX_API_OUT_AND_LEN(char *out, int olen)  ,
const unsigned char  usx_hcodes[],
const unsigned char  usx_hcode_lens[],
const char *  usx_freq_seq[],
const char *  usx_templates[],
struct us_lnk_lst *  prev_lines 
)

More Comprehensive API for compressing array of strings

See unishox2_compress() function for parameter definitions.
This function takes an additional parameter, i.e. 'prev_lines' - the us_lnk_lst structure.
See -g parameter in test_unishox2.c to find out how this can be used.
This function is used when an array of strings needs to be compressed
and stored in a compressed array of bytes for use as a constant in other programs
where each element of the array can be decompressed and used at runtime.

◆ unishox2_compress_simple()

int unishox2_compress_simple ( const char *  in,
int  len,
char *  out 
)

Simple API for compressing a string

Parameters
[in] in: Input ASCII / UTF-8 string
[in] len: length in bytes
[out] out: output buffer - should be large enough to hold compressed output

◆ unishox2_decompress()

int unishox2_decompress ( const char *  in,
int  len,
UNISHOX_API_OUT_AND_LEN(char *out, int olen)  ,
const unsigned char  usx_hcodes[],
const unsigned char  usx_hcode_lens[],
const char *  usx_freq_seq[],
const char *  usx_templates[] 
)

Comprehensive API for de-compressing a string

Presets are available for the last four parameters so they can be passed as a single parameter.
See USX_PSET_* macros. Example call:
unishox2_decompress(in, len, out, olen, USX_PSET_ALPHA_ONLY);

Parameters
[in] in: Input compressed bytes (output of unishox2_compress functions)
[in] len: length of 'in' in bytes
[out] out: output buffer - should be large enough to hold de-compressed output
[in] olen: length of 'out' buffer in bytes. Can be omitted if sufficient buffer is provided
[in] usx_hcodes: Horizontal codes (array of bytes). See macro section for samples.
[in] usx_hcode_lens: Length of each element in usx_hcodes array
[in] usx_freq_seq: Frequently occurring sequences. See USX_FREQ_SEQ_* macros for samples
[in] usx_templates: Templates of frequently occurring patterns. See USX_TEMPLATES macro.

◆ unishox2_decompress_lines()

int unishox2_decompress_lines ( const char *  in,
int  len,
UNISHOX_API_OUT_AND_LEN(char *out, int olen)  ,
const unsigned char  usx_hcodes[],
const unsigned char  usx_hcode_lens[],
const char *  usx_freq_seq[],
const char *  usx_templates[],
struct us_lnk_lst *  prev_lines 
)

More Comprehensive API for de-compressing array of strings
This function is not to be used in conjunction with unishox2_compress_lines().

See unishox2_decompress() function for parameter definitions.
Typically an array is compressed using unishox2_compress_lines() and
a header (.h) file is generated using the resultant compressed array.
This header file can be used in another program with another decompress
routine which takes this compressed array as parameter and index to be
decompressed.

◆ unishox2_decompress_simple()

int unishox2_decompress_simple ( const char *  in,
int  len,
char *  out 
)

Simple API for decompressing a string

Parameters
[in] in: Input compressed bytes (output of unishox2_compress functions)
[in] len: length of 'in' in bytes
[out] out: output buffer for ASCII / UTF-8 string - should be large enough