Unishox
A hybrid encoder for Short Unicode Strings
Loading...
Searching...
No Matches
unishox2.h File Reference

API for Unishox2 Compression and Decompression. More...

Data Structures

struct  us_lnk_lst

Macros

#define UNISHOX_VERSION   "2.0"
 Unishox spec version.
#define UNISHOX_API_WITH_OUTPUT_LEN   0
#define UNISHOX_MAGIC_BITS   0xFF
 Up to 8 bits of initial magic bit sequence can be included. The bit count can be specified with UNISHOX_MAGIC_BIT_LEN.
#define UNISHOX_MAGIC_BIT_LEN   1
 Desired length of Magic bits defined by UNISHOX_MAGIC_BITS.
#define USX_HCODES_DFLT   (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0xE0}
 Default Horizontal codes. When the composition of the text is known beforehand, the other hcodes in this section can be used to achieve more compression.
#define USX_HCODE_LENS_DFLT   (const unsigned char[]) {2, 2, 2, 3, 3}
 Length of each default hcode.
#define USX_HCODES_ALPHA_ONLY   (const unsigned char[]) {0x00, 0x00, 0x00, 0x00, 0x00}
 Horizontal codes preset for English Alphabet content only.
#define USX_HCODE_LENS_ALPHA_ONLY   (const unsigned char[]) {0, 0, 0, 0, 0}
 Length of each Alpha only hcode.
#define USX_HCODES_ALPHA_NUM_ONLY   (const unsigned char[]) {0x00, 0x00, 0x80, 0x00, 0x00}
 Horizontal codes preset for Alpha Numeric content only.
#define USX_HCODE_LENS_ALPHA_NUM_ONLY   (const unsigned char[]) {1, 0, 1, 0, 0}
 Length of each Alpha numeric hcode.
#define USX_HCODES_ALPHA_NUM_SYM_ONLY   (const unsigned char[]) {0x00, 0x80, 0xC0, 0x00, 0x00}
 Horizontal codes preset for Alpha Numeric and Symbol content only.
#define USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY   (const unsigned char[]) {1, 2, 2, 0, 0}
 Length of each Alpha numeric and symbol hcode.
#define USX_HCODES_FAVOR_ALPHA   (const unsigned char[]) {0x00, 0x80, 0xA0, 0xC0, 0xE0}
 Horizontal codes preset favouring Alphabet content.
#define USX_HCODE_LENS_FAVOR_ALPHA   (const unsigned char[]) {1, 3, 3, 3, 3}
 Length of each hcode favouring Alpha content.
#define USX_HCODES_FAVOR_DICT   (const unsigned char[]) {0x00, 0x40, 0xC0, 0x80, 0xE0}
 Horizontal codes preset favouring repeating sequences.
#define USX_HCODE_LENS_FAVOR_DICT   (const unsigned char[]) {2, 2, 3, 2, 3}
 Length of each hcode favouring repeating sequences.
#define USX_HCODES_FAVOR_SYM   (const unsigned char[]) {0x80, 0x00, 0xA0, 0xC0, 0xE0}
 Horizontal codes preset favouring symbols.
#define USX_HCODE_LENS_FAVOR_SYM   (const unsigned char[]) {3, 1, 3, 3, 3}
 Length of each hcode favouring symbols.
#define USX_HCODES_FAVOR_UMLAUT   (const unsigned char[]) {0x80, 0xA0, 0xC0, 0xE0, 0x00}
 Horizontal codes preset favouring umlaut letters.
#define USX_HCODE_LENS_FAVOR_UMLAUT   (const unsigned char[]) {3, 3, 3, 3, 1}
 Length of each hcode favouring umlaut letters.
#define USX_HCODES_NO_DICT   (const unsigned char[]) {0x00, 0x40, 0x80, 0x00, 0xC0}
 Horizontal codes preset for no repeating sequences.
#define USX_HCODE_LENS_NO_DICT   (const unsigned char[]) {2, 2, 2, 0, 2}
 Length of each hcode for no repeating sequences.
#define USX_HCODES_NO_UNI   (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0x00}
 Horizontal codes preset for no Unicode characters.
#define USX_HCODE_LENS_NO_UNI   (const unsigned char[]) {2, 2, 2, 2, 0}
 Length of each hcode for no Unicode characters.
#define USX_PSET_DFLT   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Default preset parameter set. When the composition of the text is known beforehand, the other parameter sets in this section can be used to achieve more compression.
#define USX_PSET_ALPHA_ONLY   USX_HCODES_ALPHA_ONLY, USX_HCODE_LENS_ALPHA_ONLY, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set for English Alphabet only content.
#define USX_PSET_ALPHA_NUM_ONLY   USX_HCODES_ALPHA_NUM_ONLY, USX_HCODE_LENS_ALPHA_NUM_ONLY, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set for Alpha numeric content.
#define USX_PSET_ALPHA_NUM_SYM_ONLY   USX_HCODES_ALPHA_NUM_SYM_ONLY, USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for Alpha numeric and symbol content.
#define USX_PSET_ALPHA_NUM_SYM_ONLY_TXT   USX_HCODES_ALPHA_NUM_SYM_ONLY, USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for Alpha numeric symbol content having predominantly text.
#define USX_PSET_FAVOR_ALPHA   USX_HCODES_FAVOR_ALPHA, USX_HCODE_LENS_FAVOR_ALPHA, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set favouring Alphabet content.
#define USX_PSET_FAVOR_DICT   USX_HCODES_FAVOR_DICT, USX_HCODE_LENS_FAVOR_DICT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set favouring repeating sequences.
#define USX_PSET_FAVOR_SYM   USX_HCODES_FAVOR_SYM, USX_HCODE_LENS_FAVOR_SYM, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set favouring symbols.
#define USX_PSET_FAVOR_UMLAUT   USX_HCODES_FAVOR_UMLAUT, USX_HCODE_LENS_FAVOR_UMLAUT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set favouring umlaut letters.
#define USX_PSET_NO_DICT   USX_HCODES_NO_DICT, USX_HCODE_LENS_NO_DICT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for when there are no repeating sequences.
#define USX_PSET_NO_UNI   USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_DFLT, USX_TEMPLATES
 Preset parameter set for when there are no Unicode symbols.
#define USX_PSET_NO_UNI_FAVOR_TEXT   USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_TXT, USX_TEMPLATES
 Preset parameter set for when there are no Unicode symbols, favouring text.
#define USX_PSET_URL   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_URL, USX_TEMPLATES
 Preset parameter set favouring URL content.
#define USX_PSET_JSON   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_JSON, USX_TEMPLATES
 Preset parameter set favouring JSON content.
#define USX_PSET_JSON_NO_UNI   USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_JSON, USX_TEMPLATES
 Preset parameter set favouring JSON content having no Unicode symbols.
#define USX_PSET_XML   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_XML, USX_TEMPLATES
 Preset parameter set favouring XML content.
#define USX_PSET_HTML   USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_HTML, USX_TEMPLATES
 Preset parameter set favouring HTML content.
#define UNISHOX_API_OUT_AND_LEN(out, olen)

Functions

int unishox2_compress_simple (const char *in, int len, char *out)
int unishox2_decompress_simple (const char *in, int len, char *out)
int unishox2_compress (const char *in, int len, char *out, int olen, const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
int unishox2_decompress (const char *in, int len, char *out, int olen, const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[])
int unishox2_compress_lines (const char *in, int len, char *out, int olen, const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)
int unishox2_decompress_lines (const char *in, int len, char *out, int olen, const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines)

Variables

const char * USX_FREQ_SEQ_DFLT []
 Default frequently occurring sequences. When the composition of the text is known beforehand, the other sequences in this section can be used to achieve more compression.
const char * USX_FREQ_SEQ_TXT []
 Frequently occurring sequences in text content.
const char * USX_FREQ_SEQ_URL []
 Frequently occurring sequences in URL content.
const char * USX_FREQ_SEQ_JSON []
 Frequently occurring sequences in JSON content.
const char * USX_FREQ_SEQ_HTML []
 Frequently occurring sequences in HTML content.
const char * USX_FREQ_SEQ_XML []
 Frequently occurring sequences in XML content.
const char * USX_TEMPLATES []

Detailed Description

API for Unishox2 Compression and Decompression.

Author
Arundale Ramanathan, James Z. M. Gao

This file describes each function of the Unishox2 API.
To find out how this API can be used in your program,
please see test_unishox2.c.

Macro Definition Documentation

◆ UNISHOX_API_OUT_AND_LEN

#define UNISHOX_API_OUT_AND_LEN ( out,
olen )
Value:
out

This macro is for internal use, but builds upon the macro UNISHOX_API_WITH_OUTPUT_LEN. When the macro UNISHOX_API_WITH_OUTPUT_LEN is defined, all the API functions except the simple API functions accept an additional parameter olen that lets the developer pass the size of the output buffer provided, so that the API function does not write beyond that length. This can be disabled if the developer knows that the buffer provided is large enough; in that case no additional parameter is passed and the program is faster, since the additional check for output length is not performed at each step.

◆ UNISHOX_API_WITH_OUTPUT_LEN

#define UNISHOX_API_WITH_OUTPUT_LEN   0

Macro switch to enable/disable the output buffer length parameter in the low-level API.
Disabled by default.
When this macro is defined, all the API functions
except the simple API functions accept an additional parameter olen
that lets the developer pass the size of the output buffer provided,
so that the API function does not write beyond that length.
This can be disabled if the developer knows that the buffer provided is large enough;
in that case no additional parameter is passed and the program is faster, since the additional check
for output length is not performed at each step.
The simple API, i.e. unishox2_(de)compress_simple, will always omit the buffer length.