Unishox
A hybrid encoder for Short Unicode Strings
|
API for Unishox2 Compression and Decompression. More...
Data Structures | |
struct | us_lnk_lst |
Macros | |
#define | UNISHOX_VERSION "2.0" |
Unishox spec version. | |
#define | UNISHOX_API_WITH_OUTPUT_LEN 0 |
#define | UNISHOX_MAGIC_BITS 0xFF |
Up to 8 bits of an initial magic bit sequence can be included. The bit count can be specified with UNISHOX_MAGIC_BIT_LEN. | |
#define | UNISHOX_MAGIC_BIT_LEN 1 |
Desired length of Magic bits defined by UNISHOX_MAGIC_BITS. | |
#define | USX_HCODES_DFLT (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0xE0} |
Default Horizontal codes. When the composition of the text is known beforehand, the other hcodes in this section can be used to achieve more compression. | |
#define | USX_HCODE_LENS_DFLT (const unsigned char[]) {2, 2, 2, 3, 3} |
Length of each default hcode. | |
#define | USX_HCODES_ALPHA_ONLY (const unsigned char[]) {0x00, 0x00, 0x00, 0x00, 0x00} |
Horizontal codes preset for English Alphabet content only. | |
#define | USX_HCODE_LENS_ALPHA_ONLY (const unsigned char[]) {0, 0, 0, 0, 0} |
Length of each Alpha only hcode. | |
#define | USX_HCODES_ALPHA_NUM_ONLY (const unsigned char[]) {0x00, 0x00, 0x80, 0x00, 0x00} |
Horizontal codes preset for Alpha Numeric content only. | |
#define | USX_HCODE_LENS_ALPHA_NUM_ONLY (const unsigned char[]) {1, 0, 1, 0, 0} |
Length of each Alpha numeric hcode. | |
#define | USX_HCODES_ALPHA_NUM_SYM_ONLY (const unsigned char[]) {0x00, 0x80, 0xC0, 0x00, 0x00} |
Horizontal codes preset for Alpha Numeric and Symbol content only. | |
#define | USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY (const unsigned char[]) {1, 2, 2, 0, 0} |
Length of each Alpha numeric and symbol hcode. | |
#define | USX_HCODES_FAVOR_ALPHA (const unsigned char[]) {0x00, 0x80, 0xA0, 0xC0, 0xE0} |
Horizontal codes preset favouring Alphabet content. | |
#define | USX_HCODE_LENS_FAVOR_ALPHA (const unsigned char[]) {1, 3, 3, 3, 3} |
Length of each hcode favouring Alpha content. | |
#define | USX_HCODES_FAVOR_DICT (const unsigned char[]) {0x00, 0x40, 0xC0, 0x80, 0xE0} |
Horizontal codes preset favouring repeating sequences. | |
#define | USX_HCODE_LENS_FAVOR_DICT (const unsigned char[]) {2, 2, 3, 2, 3} |
Length of each hcode favouring repeating sequences. | |
#define | USX_HCODES_FAVOR_SYM (const unsigned char[]) {0x80, 0x00, 0xA0, 0xC0, 0xE0} |
Horizontal codes preset favouring symbols. | |
#define | USX_HCODE_LENS_FAVOR_SYM (const unsigned char[]) {3, 1, 3, 3, 3} |
Length of each hcode favouring symbols. | |
#define | USX_HCODES_FAVOR_UMLAUT (const unsigned char[]) {0x80, 0xA0, 0xC0, 0xE0, 0x00} |
Horizontal codes preset favouring umlaut letters. | |
#define | USX_HCODE_LENS_FAVOR_UMLAUT (const unsigned char[]) {3, 3, 3, 3, 1} |
Length of each hcode favouring umlaut letters. | |
#define | USX_HCODES_NO_DICT (const unsigned char[]) {0x00, 0x40, 0x80, 0x00, 0xC0} |
Horizontal codes preset for no repeating sequences. | |
#define | USX_HCODE_LENS_NO_DICT (const unsigned char[]) {2, 2, 2, 0, 2} |
Length of each hcode for no repeating sequences. | |
#define | USX_HCODES_NO_UNI (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0x00} |
Horizontal codes preset for no Unicode characters. | |
#define | USX_HCODE_LENS_NO_UNI (const unsigned char[]) {2, 2, 2, 2, 0} |
Length of each hcode for no Unicode characters. | |
#define | USX_FREQ_SEQ_DFLT (const char *[]) {"\": \"", "\": ", "</", "=\"", "\":\"", "://"} |
Default frequently occurring sequences. When the composition of the text is known beforehand, the other sequences in this section can be used to achieve more compression. | |
#define | USX_FREQ_SEQ_TXT (const char *[]) {" the ", " and ", "tion", " with", "ing", "ment"} |
Frequently occurring sequences in text content. | |
#define | USX_FREQ_SEQ_URL (const char *[]) {"https://", "www.", ".com", "http://", ".org", ".net"} |
Frequently occurring sequences in URL content. | |
#define | USX_FREQ_SEQ_JSON (const char *[]) {"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"} |
Frequently occurring sequences in JSON content. | |
#define | USX_FREQ_SEQ_HTML (const char *[]) {"</", "=\"", "div", "href", "class", "<p>"} |
Frequently occurring sequences in HTML content. | |
#define | USX_FREQ_SEQ_XML (const char *[]) {"</", "=\"", "\">", "<?xml version=\"1.0\"", "xmlns:", "://"} |
Frequently occurring sequences in XML content. | |
#define | USX_TEMPLATES (const char *[]) {"tfff-of-tfTtf:rf:rf.fffZ", "tfff-of-tf", "(fff) fff-ffff", "tf:rf:rf", 0} |
Commonly occurring templates (ISO Date/Time, ISO Date, US Phone number, ISO Time, Unused) | |
#define | USX_PSET_DFLT USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Default preset parameter set. When the composition of the text is known beforehand, the other parameter sets in this section can be used to achieve more compression. | |
#define | USX_PSET_ALPHA_ONLY USX_HCODES_ALPHA_ONLY, USX_HCODE_LENS_ALPHA_ONLY, USX_FREQ_SEQ_TXT, USX_TEMPLATES |
Preset parameter set for English Alphabet only content. | |
#define | USX_PSET_ALPHA_NUM_ONLY USX_HCODES_ALPHA_NUM_ONLY, USX_HCODE_LENS_ALPHA_NUM_ONLY, USX_FREQ_SEQ_TXT, USX_TEMPLATES |
Preset parameter set for Alpha numeric content. | |
#define | USX_PSET_ALPHA_NUM_SYM_ONLY USX_HCODES_ALPHA_NUM_SYM_ONLY, USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set for Alpha numeric and symbol content. | |
#define | USX_PSET_ALPHA_NUM_SYM_ONLY_TXT USX_HCODES_ALPHA_NUM_SYM_ONLY, USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set for Alpha numeric symbol content having predominantly text. | |
#define | USX_PSET_FAVOR_ALPHA USX_HCODES_FAVOR_ALPHA, USX_HCODE_LENS_FAVOR_ALPHA, USX_FREQ_SEQ_TXT, USX_TEMPLATES |
Preset parameter set favouring Alphabet content. | |
#define | USX_PSET_FAVOR_DICT USX_HCODES_FAVOR_DICT, USX_HCODE_LENS_FAVOR_DICT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set favouring repeating sequences. | |
#define | USX_PSET_FAVOR_SYM USX_HCODES_FAVOR_SYM, USX_HCODE_LENS_FAVOR_SYM, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set favouring symbols. | |
#define | USX_PSET_FAVOR_UMLAUT USX_HCODES_FAVOR_UMLAUT, USX_HCODE_LENS_FAVOR_UMLAUT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set favouring umlaut letters. | |
#define | USX_PSET_NO_DICT USX_HCODES_NO_DICT, USX_HCODE_LENS_NO_DICT, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set for when there are no repeating sequences. | |
#define | USX_PSET_NO_UNI USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_DFLT, USX_TEMPLATES |
Preset parameter set for when there are no unicode symbols. | |
#define | USX_PSET_NO_UNI_FAVOR_TEXT USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_TXT, USX_TEMPLATES |
Preset parameter set for when there are no unicode symbols favouring text. | |
#define | USX_PSET_URL USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_URL, USX_TEMPLATES |
Preset parameter set favouring URL content. | |
#define | USX_PSET_JSON USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_JSON, USX_TEMPLATES |
Preset parameter set favouring JSON content. | |
#define | USX_PSET_JSON_NO_UNI USX_HCODES_NO_UNI, USX_HCODE_LENS_NO_UNI, USX_FREQ_SEQ_JSON, USX_TEMPLATES |
Preset parameter set favouring JSON content having no Unicode symbols. | |
#define | USX_PSET_XML USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_XML, USX_TEMPLATES |
Preset parameter set favouring XML content. | |
#define | USX_PSET_HTML USX_HCODES_DFLT, USX_HCODE_LENS_DFLT, USX_FREQ_SEQ_HTML, USX_TEMPLATES |
Preset parameter set favouring HTML content. | |
#define | UNISHOX_API_OUT_AND_LEN(out, olen) out |
Functions | |
int | unishox2_compress_simple (const char *in, int len, char *out) |
int | unishox2_decompress_simple (const char *in, int len, char *out) |
int | unishox2_compress (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[]) |
int | unishox2_decompress (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[]) |
int | unishox2_compress_lines (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines) |
int | unishox2_decompress_lines (const char *in, int len, UNISHOX_API_OUT_AND_LEN(char *out, int olen), const unsigned char usx_hcodes[], const unsigned char usx_hcode_lens[], const char *usx_freq_seq[], const char *usx_templates[], struct us_lnk_lst *prev_lines) |
API for Unishox2 Compression and Decompression.
This file describes each function of the Unishox2 API.
To find out how this API can be used in your program,
please see test_unishox2.c.
#define UNISHOX_API_OUT_AND_LEN | ( | out, | |
olen | |||
) | out |
This macro is for internal use, but builds upon the macro UNISHOX_API_WITH_OUTPUT_LEN. When the macro UNISHOX_API_WITH_OUTPUT_LEN is defined, all the API functions except the simple API functions accept an additional parameter olen that enables the developer to pass the size of the output buffer provided, so that the API function does not write beyond that length. This can be disabled if the developer knows that the buffer provided is large enough; in that case no additional parameter is passed and the program is faster, since the additional check for output length is not performed at each step.
#define UNISHOX_API_WITH_OUTPUT_LEN 0 |
Macro switch to enable/disable output buffer length parameter in low level api
Disabled by default
When this macro is defined, all the API functions
except the simple API functions accept an additional parameter olen
that enables the developer to pass the size of the output buffer provided,
so that the API function does not write beyond that length.
This can be disabled if the developer knows that the buffer provided is large enough;
in that case no additional parameter is passed and the program is faster, since the additional check
for output length is not performed at each step.
The simple api, i.e. unishox2_(de)compress_simple will always omit the buffer length
int unishox2_compress | ( | const char * | in, |
int | len, | ||
UNISHOX_API_OUT_AND_LEN(char *out, int olen) | , | ||
const unsigned char | usx_hcodes[], | ||
const unsigned char | usx_hcode_lens[], | ||
const char * | usx_freq_seq[], | ||
const char * | usx_templates[] | ||
) |
Comprehensive API for compressing a string
Presets are available for the last four parameters so they can be passed as single parameter.
See USX_PSET_* macros. Example call:
unishox2_compress(in, len, out, olen, USX_PSET_ALPHA_ONLY);
[in] | in | Input ASCII / UTF-8 string |
[in] | len | length in bytes |
[out] | out | output buffer - should be large enough to hold compressed output |
[in] | olen | length of 'out' buffer in bytes. Can be omitted if sufficient buffer is provided |
[in] | usx_hcodes | Horizontal codes (array of bytes). See macro section for samples. |
[in] | usx_hcode_lens | Length of each element in usx_hcodes array |
[in] | usx_freq_seq | Frequently occurring sequences. See USX_FREQ_SEQ_* macros for samples |
[in] | usx_templates | Templates of frequently occurring patterns. See USX_TEMPLATES macro. |
int unishox2_compress_lines | ( | const char * | in, |
int | len, | ||
UNISHOX_API_OUT_AND_LEN(char *out, int olen) | , | ||
const unsigned char | usx_hcodes[], | ||
const unsigned char | usx_hcode_lens[], | ||
const char * | usx_freq_seq[], | ||
const char * | usx_templates[], | ||
struct us_lnk_lst * | prev_lines | ||
) |
More Comprehensive API for compressing array of strings
See unishox2_compress() function for parameter definitions.
This function takes an additional parameter, i.e. 'prev_lines' - the us_lnk_lst structure
See -g parameter in test_unishox2.c to find out how this can be used.
This function is used when an array of strings needs to be compressed
and stored in a compressed array of bytes for use as a constant in other programs
where each element of the array can be decompressed and used at runtime.
int unishox2_compress_simple | ( | const char * | in, |
int | len, | ||
char * | out | ||
) |
Simple API for compressing a string
[in] | in | Input ASCII / UTF-8 string |
[in] | len | length in bytes |
[out] | out | output buffer - should be large enough to hold compressed output |
int unishox2_decompress | ( | const char * | in, |
int | len, | ||
UNISHOX_API_OUT_AND_LEN(char *out, int olen) | , | ||
const unsigned char | usx_hcodes[], | ||
const unsigned char | usx_hcode_lens[], | ||
const char * | usx_freq_seq[], | ||
const char * | usx_templates[] | ||
) |
Comprehensive API for de-compressing a string
Presets are available for the last four parameters so they can be passed as single parameter.
See USX_PSET_* macros. Example call:
unishox2_decompress(in, len, out, olen, USX_PSET_ALPHA_ONLY);
[in] | in | Input compressed bytes (output of unishox2_compress functions) |
[in] | len | length of 'in' in bytes |
[out] | out | output buffer - should be large enough to hold de-compressed output |
[in] | olen | length of 'out' buffer in bytes. Can be omitted if sufficient buffer is provided |
[in] | usx_hcodes | Horizontal codes (array of bytes). See macro section for samples. |
[in] | usx_hcode_lens | Length of each element in usx_hcodes array |
[in] | usx_freq_seq | Frequently occurring sequences. See USX_FREQ_SEQ_* macros for samples |
[in] | usx_templates | Templates of frequently occurring patterns. See USX_TEMPLATES macro. |
int unishox2_decompress_lines | ( | const char * | in, |
int | len, | ||
UNISHOX_API_OUT_AND_LEN(char *out, int olen) | , | ||
const unsigned char | usx_hcodes[], | ||
const unsigned char | usx_hcode_lens[], | ||
const char * | usx_freq_seq[], | ||
const char * | usx_templates[], | ||
struct us_lnk_lst * | prev_lines | ||
) |
More Comprehensive API for de-compressing array of strings
This function is not to be used in conjunction with unishox2_compress_lines()
See unishox2_decompress() function for parameter definitions.
Typically an array is compressed using unishox2_compress_lines() and
a header (.h) file is generated using the resultant compressed array.
This header file can be used in another program with another decompress
routine which takes this compressed array as parameter and index to be
decompressed.
int unishox2_decompress_simple | ( | const char * | in, |
int | len, | ||
char * | out | ||
) |
Simple API for decompressing a string
[in] | in | Input compressed bytes (output of unishox2_compress functions) |
[in] | len | length of 'in' in bytes |
[out] | out | output buffer for ASCII / UTF-8 string - should be large enough |