VCODEX(3)C LIBRARY FUNCTIONSVCODEX(3)


NAME

Vcodex - Data transformation including compression, encryption, etc.

SYNOPSIS

     #include <vcodex.h>
     libvcodex.a      -lvcodex

VCODEX TYPES
     Void_t*;
     Vcchar_t;
     Vcint_t;
     Vcbits_t;
     Vcdisc_t;
     Vcmethod_t;
     Vcmtarg_t;
     Vcodex_t;

VCODEX OPERATIONS

     Vcodex_t*   vcopen(Vcdisc_t* disc, Vcmethod_t* meth, Void_t* init,
                             Vcodex_t* coder, int flags);
ssize_t vcextract(Vcodex_t* vc, Void_t** datap, int state); Vcodex_t* vcrestore(Vcodex_t* vc, Void_t* data, size_t size); int vcclose(Vcodex_t* vc); ssize_t vcapply(Vcodex_t* vc, Void_t* data, size_t size,
                              Void_t** output);
size_t vcundone(Vcodex_t* vc); Vcdisc_t* vcdisc(Vcodex_t* vc, Vcdisc_t* disc); Vcchar_t* vcsetbuf(Vcodex_t* vc, Vcchar_t*, ssize_t size, ssize_t head); int vccontext(Vcodex_t* vc, int context); Vcmethod_t* vcgetmeth(const char* name, const char* ident); Vcmethod_t* vcnextmeth(Vcmethod_t* meth);

DATA TRANSFORM METHODS

     Vcmethod_t* Vcdelta;
     Vcmethod_t* Vchuffman;
     Vcmethod_t* Vchuffgroup;
     Vcmethod_t* Vchuffpart;
     Vcmethod_t* Vcrle;
     Vcmethod_t* Vcmtf;
     Vcmethod_t* Vcbw;
     Vcmethod_t* Vcmap;
     Vcmethod_t* Vctranspose;

DISCIPLINE AND EVENT HANDLING

     typedef int (*Vcevent_f)(Vcodex_t*, int, Void_t*, Vcdisc_t*);
     typedef struct
     { Void_t*   data;        /* source or key string         */
            size_t    size;        /* size of string               */
            Vcevent_f eventf;      /* event handler                */
} Vcdisc_t; #define VC_OPENING #define VC_CLOSING #define VC_DISC

DELTA COMPRESSION TYPES AND FUNCTIONS

     typedef struct
     { Vcchar_t    type;
            Vcchar_t    size;
            Vcchar_t    mode;
} Vcdinst_t; typedef struct { Vcinst_t inst1;
            Vcinst_t    inst2;
} Vcdcode_t; typedef struct { Vcchar_t s_near;
            Vcchar_t    s_same;
            Vcdcode_t   code[256];
} Vcdtable_t; ssize_t vcdputtable(Vcdtable_t* tbl, Void_t* buf, size_t n); int vcdgettable(Vcdtable_t* tbl, Void_t* buf, size_t n);

MATCHING WINDOWS TO DELTA-COMPRESS LARGE FILES

     typedef struct {...}      Vcwindow_t;
     typedef struct {...}      Vcwmethod_t;
     typedef int (*Vcwevent_f)(Vcw_t* vcw, int type, Void_t* data,
                                    Vcwdisc_t* disc);
typedef struct { Sfio_t* srcf; /* source file */
            Sfio_t*    tarf;       /* target file                 */
            Vcwevent_f eventf;     /* event handler               */
} Vcwdisc_t; typedef struct { int type; /* VCD_[SOURCE|TARGET]FILE */
            Void_t*    wdata;      /* window data                 */
            ssize_t    wsize;      /* window length               */
            Sfoff_t    wpos;       /* starting position in file   */
            ssize_t    msize;      /* data size to be matched     */
} Vcwmatch_t; Vcwmethod_t* Vcwmirror; /* match by file positions */ Vcwmethod_t* Vcwvote; /* match by n-gram frequencies */ Vcwmethod_t* Vcwprefix; /* use prefix matching */ Vcwindow_t* vcwopen(Vcwdisc_t* disc, Vcwmethod_t* meth); int vcwclose(Vcwindow_t* vcw); Vcwmatch_t* vcwapply(Vcwindow_t* vcw, Void_t* data, size_t size,
                                Sfoff_t pos);
ssize_t vcwfeedback(Vcwindow_t* vcw, ssize_t size); Vcwmethod_t* vcwgetmeth(char* name); Vcwmethod_t* vcwnextmeth(Vcwmethod_t* meth);

HUFFMAN CODING TYPES AND FUNCTIONS

     ssize_t      vchsize(ssize_t nsym, ssize_t* freq, ssize_t* size, int* run);
     ssize_t      vchbits(ssize_t nsym, ssize_t* size, Vcbits_t* bits);
     ssize_t      vchputcode(ssize_t nsym, ssize_t* size, ssize_t maxs,
                               Vcchar_t* data, size_t dtsz);
ssize_t vchgetcode(ssize_t nsym, ssize_t* size, ssize_t maxs,
                               Vcchar_t* data, size_t dtsz);

TABLE COMPRESSION TYPES AND FUNCTIONS

     typedef struct _vcpltrans_s /* column specifications    */
     { ssize_t       index;  /* column index                 */
            ssize_t       pred1;  /* <0 if self compressing       */
            ssize_t       pred2;  /* >=0 if supporting pred1      */
} Vcpltrans_t; typedef struct _vcplan_s /* transform plan */ { ssize_t ncols; /* # of columns or row size */
            Vcpltrans_t*  trans;  /* the plan to transform data   */
            Vcodex_t*     bwt;    /* BWT to encode the plan       */
} Vcplan_t; Vcplan_t* vcplopen(Void_t* data, size_t size, size_t ncols); void vcplclose(Vcplan_t* plan); ssize_t vcplencode(Vcplan_t* plan, Void_t** codep); Vcplan_t* vcpldecode(Void_t* code, size_t cdsz);

SUFFIX ARRAY TYPES AND FUNCTIONS

     typedef struct _vcsfx_s
     { ssize_t*      idx;    /* sorted suffix array          */
            ssize_t*      inv;    /* inverted index/rank          */
            Vcchar_t*     str;    /* string to be sorted          */
            size_t        nstr;   /* length of string             */
} Vcsfx_t; Vcsfx_t* vcsfxsort(const Void_t* str, size_t nstr); ssize_t vcperiod(const Void_t* data, size_t dtsz);

BLOCK-MOVE PARSING

     typedef int (*Vcparse_f)(Vcparse_t* vcpa, int type, ssize_t undt,
                                   ssize_t here, ssize_t mtsz, ssize_t mtch);
typedef struct _vcparse_s { } Vcparse_t; int vcparse(Vcparse_t* vcpa, ssize_t prune);

BIT/BYTE/STRING FAST I/O SUBSYSTEM
     typedef struct _vcio_s
     { ...
            Vcbits_t  bits;  /* bits to be read or being cached    */
            ssize_t   nbits; /* number of bits in the above        */
} Vcio_t; void vcioinit(Vcio_t* io, Vcchar_t* buf, int n); size_t vciosize(Vcio_t* io); size_t vciomore(Vcio_t* io); size_t vcioextent(Vcio_t* io); Vcchar_t* vciodata(Vcio_t* io); Vcchar_t* vcionext(Vcio_t* io); Vcchar_t* vcioskip(Vcio_t* io, int n); void vciomove(Vcio_t* to, Vcio_t* from, int n); ssize_t vcioputc(Vcio_t* io, int c); int vciogetc(Vcio_t* io); ssize_t vcioputu(Vcio_t* io, Vcint_t v); Vcint_t vciogetu(Vcio_t* io); ssize_t vcioputm(Vcio_t* io, Vcint_t v, Vcint_t max); Vcint_t vciogetm(Vcio_t* io, Vcint_t max); ssize_t vcioput2(Vcio_t* io, Vcint_t v, Vcchar_t a, Vcchar_t b); Vcint_t vcioget2(Vcio_t* io, Vcchar_t a, Vcchar_t b); ssize_t vcioputg(Vcio_t* io, Vcint_t v); Vcint_t vciogetg(Vcio_t* io);" ssize_t vcioputs(Vcio_t* io, Void_t* s, size_t len); ssize_t vciogets(Vcio_t* io, Void_t* s, size_t len); void vciosetb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, int type); void vcioendb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, int type); void vciofilb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, ssize_t need); Vcbits_t vciodelb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, ssize_t ndel); void vcioflsb(Vcio_t* io, Vcbits_t bits, ssize_t nbit); Vcbits_t vcioaddb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
                             Vcbits_t add, ssize_t nadd);

FILE I/O

     Vcsfio_t;
     typedef struct _vcsfmeth_s
     { Vcmethod_t*  meth;   /* requested method to invoke    */
            Void_t*      data;   /* argument or init parameters   */
            ssize_t      size;   /* size information              */
} Vcsfmeth_t; Vcsfmeth_t* VCSF_WINDOW; /* signal setting window data */ Vcsfmeth_t* VCSF_STATES; /* specifying file for states */ Vcsfio_t* vcsfio(Sfio_t* sf, Vcsfmeth_t* list, ssize_t n, int flags); ssize_t vcsfread(Vcsfio_t* vcsf, Void_t* buf, size_t n); ssize_t vcsfwrite(Vcsfio_t* vcsf, const Void_t* data, size_t n); int vcsfsync(Vcsfio_t* vcsf); int vcsfclose(Vcsfio_t* vcsf); Vcsfmeth_t* vcsfmeth(Vcsfio_t* vcsf, ssize_t* nmeth);

MISCELLANIES
     double    vclog(unsigned int v);

DESCRIPTION

Vcodex provides a collection of data transforms including compression, differencing and encryption via a discipline and method interface (see The Discipline and Method Architecture for Reusable Libraries, Software Practice & Experience, v.30, pp.107-128, 2000).

VCODEX TYPES

Void_t*
This type is used to pass data between Vcodex and application code. Void_t is defined as void for ANSI-C and C++ and char for older compilation environments.

Vcchar_t
This defines the type of bytes as manipulated by Vcodex.

Vcint_t
This defines an integer type of maximum supported size.

Vcbits_t
This defines an integer type suitable for bit manipulation and I/O.

Vcodex_t
This defines the type of a Vcodex handle.

Vcdisc_t
This defines the type of a discipline structure.

Vcmethod_t
This defines the type of a data transform.

Vcmtarg_t
This defines the type of a structure describing arguments to a method.

VCODEX OPERATIONS

Vcodex_t* vcopen(Vcdisc_t* disc, Vcmethod_t* meth, Void_t* init, Vcodex_t* coder, int flags)
vcopen() creates a handle to process data. It returns the new handle on success or NULL on failure.

disc:
a discipline structure to describe data (see DISCIPLINE section).
meth:
a data transform method (see METHODS section).
init:
any additional parameters for the given method.
coder:
Many transforms (e.g., the Burrow-Wheeler transform Vcbw) only rearrange data to be processed by other transforms. The coder argument specifies a continuation coder, i.e., one to continue data processing of the various data parts produced by the main transform. Note that this is one way to compose different transforms to process data.
flags:
This gives the bit values to control behaviors of the new handle. The bits are composed from VC_ENCODE, VC_DECODE, VC_CLOSECODER and VC_CLOSEPARAMS. Exactly one of VC_ENCODE and VC_DECODE should be used to indicate whether the handle is used for encoding or decoding. The bit VC_CLOSECODER, if specified, indicates the continuation coder, if any, should be closed on closing the handle. The bit VC_CLOSEPARAMS indicates that init parameter argument should be closed on closing the handle.

ssize_t vcextract(Vcodex_t* vc, Void_t** datap, int state)
This function extract the states of the encoder vc. The data is returned in *storep while vcextract() returns the data length on success and -1 on failure.

state == 0:
The result can be used to reconstruct the encoder vc and its associated continuation coders.
state != 0:
Only the states of the top level coder will be extracted. See vcrestore() below for details.

Vcodex_t* vcrestore(Vcodex_t* vc, Void_t* data, size_t size)
vcrestore() reconstructs or re-initializes a handle from the given data. It returns the handle on success and NULL on failure.

vc == NULL:
In this case, if the data describes a sequence of coders, they will be all reconstructed and composed to form a final handle.
vc != NULL:
The handle vc is re-initialized with the given data. In this case, only the top handle in a composed sequence of handles is initialized.

int vcclose(Vcodex_t* vc)
This closes the vc handle, meaning to free all associated memory resources. vcclose() returns 0 on success or -1 on error.

ssize_t vcapply(Vcodex_t* vc, Void_t* data, size_t size, Void_t** output)
This applies the data transform associated with vc to the given data. If the handle was opened for decoding, the given data and size should match exactly the previously encoded result. vcapply() returns the length of the output data buffer or -1 on failure. If output == NULL, only the size of the result will be returned. If output != NULL, the transformed data will be returned in *output (see also vcundone() below.) Note that each call to vcapply() returns a separate buffer of encoded or decoded data. These buffers should be freed as soon as the respective data have been consumed. See vcsetbuf() for how to do that.

size_t vcundone(Vcodex_t* vc)
Certain methods (e.g., Vctable) may not process all given data. After a successful vcapply() call (i.e., it returns a non-negative value), the call vcundone() tells the amount of data left unprocessed.

Vcdisc_t* vcdisc(Vcodex_t* vc, Vcdisc_t* disc)
This changes the discipline of vc to disc. On success, vcdisc() returns the previous discipline if it is not NULL; otherwise, it returns disc. On failure, vcdisc() returns NULL. See also the methods Vctable and Vctranspose.

Vcchar_t* vcsetbuf(Vcodex_t* vc, Vcchar_t* here, ssize_t size, ssize_t head)
This call manages the buffer pool for a handle. It returns NULL on failure. If size is negative, this call frees all buffers allocated via the handle vc and its continuation coders, then return NULL. Otherwise, there are two cases:

here == NULL:
This call allocates a new buffer of the requested size. It is guaranteed that there will be at least head writable bytes in front of the returned buffer. The new buffer is returned.
here != NULL:
In this case, here must be within the bounds of the most recent allocated buffer. The call is to ensure that the size of this buffer is at least size. Note that this may mean reallocating of the buffer to a different location. If successful, the call returns a pointer that is at the same offset as here was to the old buffer location.

int vccontext(Vcodex_t* vc, int context);
A handle may be used to compress different types of data with states carried thru different vcapply() calls per type. For example, a handle vc using the method Vctable would compute a transform plan on the first vcapply() call, then reuse it in subsequent calls. This would not work if vc is used to process tables with different characteristics. To do that, different contexts can be created for different data types. If the argument context is negative, the vccontext() call creates a new context. Otherwise, it sets the states of vc to the specified context. vccontext() returns the context set or -1 on failure.

Vcmethod_t* vcgetmeth(const char* name, const char* ident);

Vcmethod_t* vcnextmeth(Vcmethod_t* meth);
These functions allow searching for particular methods and to iterate through them. vcgetmeth() returns the method with the given name or identification ident. If name and ident are NULL, it returns the first method. vcnextmeth() returns the next method after meth, if any.

VCODEX DATA TRANSFORMING METHODS
Methods are of the type Vcmethod_t*. The type Vcmethod_t contains the below public fields:
     typedef struct _vcmethod_s
     {    ...
               char*          name;   /* the string name of the data transform  */
               unsigned char* ident;  /* identifier to use in persistent data   */
               char*          desc;   /* short description of the transform     */
               Vcmtarg_t*     args;   /* arguments to be used with vcopen()     */
               ssize_t        window; /* suggested window to use on large files */
} Vcmethod_t;
The args field of Vcmethod_t defines a list of arguments usable as the third argument of vcopen(). Note that although init is declared as Void_t* to suit the prototype of vcopen(), for most methods, it is a character string.
     typedef struct _vcmtarg_s
     {    char*          name;   /* the string name of the argument        */
               unsigned char* ident;  /* identifier to use in persistent data   */
               char*          desc;   /* short description of the argument      */
               Void_t*        data;   /* any associated data                    */
} Vcmtarg_t;
Vcodex currently provides the following data transforming methods:

Vcdelta
This method is a delta compressor based on generalized Lempel-Ziv parsing. It compresses a target data segment either by itself or against another source data segment. The source data segment, if any, should be given in the discipline structure. If the init argument to vcopen() is the string "s", a suffix sorting algorithm is used to compute matches. Otherwise, a fast approximation method based on hashing is used.

Vchuffman
This method performs Huffman coding.

Vchuffgroup
This method first divides the given data into small contiguous parts of equal size (except perhaps the last one). Then, the parts are collected into groups so that each group can be compressed separately with its own Huffman code table.

Vchuffpart
This method divides the given data into contiguous sections each of which uses a separate Huffman coding table optimized for it.

Vcrle
This method encodes runs in data. Normally, runs are encoded using a scheme whereby a run is indicated by some escape character and its length is encoded in a compact variable size format. The run lengths are coded in a separate area from the data so that the two types of data can be effectively compressed via a later continuation coder if there is one. An alternative run encoding method is indicated by giving the string "0" as the init argument of vcopen(). In this case, only runs of 0's are encoded using a binary encoding method. This is useful in a compression technique that produces large sequences of zeros such as the Burrows-Wheeler transform Vcbw or the table transform Vctable.

Vcbw
Vcbw performs a Burrows-Wheeler transform on the given data. This data is amenable to better compression via some combination of move-to-front, run-length encoding and entropy coding.

Vcmtf
This method performs a move-to-front transform on the data, i.e., mapping each data byte into their positions in some dynamically updated array of the 256 byte values. By default, the array is updated using a predictive method aiming at creating more zeros. If the init argument of vcopen() is the string "0", the array is updated by moving the encoded byte to position 0 (i.e., the traditional MTF transform).

Vctranspose
This method transposes the rows and columns of a table of data. It only processes the portion of data fitting the table profile, i.e., the maximum multiple of the row size that fits the data size. The amount left unprocessed can be told via vcundone(). By default, the number of columns is encoded at the start of transformed data. If this is not desirable, the string "0" should be given as the init argument of vcopen(). If the disc->size field is positive, it is taken to be the number of columns. Otherwise, the data to be transposed itself is analyzed to compute the number of columns. Note that when a handle is used repeatedly to transpose data, the number of columns is learned only once on the first vcapply() call. This number can be reset or relearned via an appropriate call to vcdisc(). See also vccontext().

Vctable
Vctable transforms the given tabular data into a form that is amenable to compression via combinations of move-to-front (Vcmtf), run-length encoding (Vcrle) and entropy coding (e.g., Vchuffgroup). Vctable only processes the portion of data fitting the table profile, i.e., the maximum multiple of the number of columns. The amount left unprocessed can be told via vcundone(). Normally, the number of columns is learned directly from data. However, it can also be explicitly set by setting disc.size in the associated discipline, if any.

Vcmap
This method maps each input byte to another byte. The mapping is defined in a Vcchar_t array of size 256 where the value at each position is the mapped value of the position. Application-specific mappings can be specified via the field data of the discipline structure. A number of predefined mappings are provided to translate between the ASCII codeset and various versions of the EBCDIC codeset. They can be selected on handle opening by passing an appropriate string as the init argument of vcopen(). These strings are:

a2e, e2a:
These pertain to the EBCDIC character set defined by the dd(1) command in the X/OPEN standard.
a2i, i2a:
These pertain to the EBCDIC character set defined on IBM systems as conforming with the dd(1) command in the X/OPEN standard.
a2h, h2a:
These pertain to the EBCDIC character set defined on IBM-37 AS/400 systems.
a2o, o2a:
These pertain to the EBCDIC character set defined defined on IBM OpenEdition Unix systems.
a2s, s2a:
These pertain to the EBCDIC character set defined on Siemens Posix systems.

VCODEX DISCIPLINE AND EVENT HANDLING

A delta compression method such as Vcdelta requires source data to compare against while other methods such as Vctranspose or Vctable may need training data. Such information must be supplied by an application via the discipline structure Vcdisc_t:

              typedef struct
              { Void_t*    data;
                size_t     size;
                Vcevent_f  eventf;
              } Vcdisc_t;

int (*eventf)(Vcodex_t* dt, int type, Void_t* data, Vcdisc_t* disc)
If not NULL, eventf announces various events. Unless noted otherwise, a negative value causes the calling operation to terminate with failure while a non-negative return value let the calling function proceed normally. Following are the events:

VC_OPENING:
If the argument data is NULL, this announces the opening of the handle vc. Id the argument data is the method used to open the handle, this indicates an error has happened during initialization.
VC_CLOSING:
The handle vc is being closed. The argument data will be NULL in this case.
VC_DISC:
The current discipline of vc is being changed to the new one given in (Vcdisc_t*)data.

VCDELTA TYPES AND FUNCTIONS

The types Vcdinst_t, Vcdcode_t and Vcdtable_t describe the structures of a code table and the instructions. Their usage is described in the paper IETF RFC draft-korn-vcdiff-xx.txt (http://www.ietf.org).

ssize_t vcdputtable(Vcdtable_t* table, Void_t* buf, size_t n)
This function encodes the given table and stores the result in buf. It returns the output size on success and -1 on error.

int vcdgettable(Vcdtable_t* table, Void_t* buf, size_t n)
This function decodes the data in buf into a code table which is stored in the space pointed to by table. It returns 0 on success and -1 on error.

WINDOWING METHODS
This collection of types and functions provides a subpackage for computing matching windows to help with delta compresssion.

Vcwmirror
This method reuses the position of the data in the target file as the tentative start of the computed matching window in the source file. It may add a small amount of data around that position to enhance string matching.

Vcwvote
This method uses the frequencies of the n-grams (for some private n) in a target window of data and a voting algorithm to find a matching window in either the source file or the target file.

Vcwprefix
This method uses prefix matching on blocks of data to determine windows matching portions of target data.

Vcwindow_t* vcwopen(Vcwdisc_t* disc, Vcwmethod_t* meth);
This opens a handle for computing matching windows based on the given method meth. The discipline structure, disc, provides source and target file handles and an event handling function. Source and target file handles, if given, should be opened for reading. Currently, the event handling function is only called with VCW_OPENING and VCW_CLOSING. If meth is NULL, the handle is opened for decoding data.

int vcwclose(Vcwindow_t* vcw);
This closes the handle vcw.

Vcwmatch_t* vcwapply(Vcwindow_t* vcw, Void_t* data, size_t size, Sfoff_t pos);
This applies the windowing method of vcw to compute a suitable matching window to the given data segment. pos gives the starting position of the data segment in the target file. If the handle was opened with both a source and a target files, the computed matching window can be anywhere in the source file but it is restricted to the part of the target file before pos. A matching window, if found, is returned in a Vcwmatch_t structure with elements:

type:
This is either VCD_SOURCEFILE or VCD_TARGETFILE to indicate which file the matching window is from.
wdata, wsize, wpos:
These give the desired window data and the position in the respective file.
msize:
This gives the amount of data from data that should be used for compressing against the computed window data.

ssize_t vcwfeedback(Vcwindow_t* vcw, ssize_t size);
This tells vcw the compression result from the use of the window computed in the last vcwapply() call. Certain methods may use this information to focus window searching.

Vcwmethod_t* vcwgetmeth(char* name);

Vcwmethod_t* vcwnextmeth(Vcwmethod_t* meth);
These functions allow searching for particular window matching methods and to iterate through them. vcwgetmeth() returns the window matching method with the given name if it is not NULL. Otherwise, it returns the first method. vcwnextmeth() returns the next method after meth, if any.

HUFFMAN CODING TYPES AND FUNCTIONS
The compression methods Vchuffman, Vchuffgroup and Vchuffpart are based on the Huffman encoding. The functions described here provide efficient algorithms for encoding and decoding of certain basic data structures required in Huffman encoding.

ssize_t vchsize(ssize_t nsym, ssize_t* freq, ssize_t* size, int* run)
This function computes the lengths of the Huffman codes for all symbols. Both arrays freq[] and size[] should be of size nsym. freq[] has the frequencies of the symbols. size[] will store the lengths in bits of the codes. If a data set consists of a single run, the run symbol is returned in *run. vchsize() returns the maximum size of any code.

ssize_t vchbits(ssize_t nsym, ssize_t* size, Vcbits_t* bits)
This function computes the actual bit strings to encode each symbol based on the sizes given in size[]. It returns the maximum code size.

ssize_t vchputcode(ssize_t nsym, ssize_t* size, ssize_t maxs, Vcchar_t* data, size_t dtsz)
The Huffman code is completely specified by the lengths of the code strings as given in the size array. This function encodes the size[] array in a compressed portable form suitable for output. The data array should have a sufficiently large size dtsz\P to store the result. It returns the length of the result in bytes.

ssize_t vchgetcode(ssize_t nsym, ssize_t* size, ssize_t maxs, Vcchar_t* data, size_t dtsz)
This function decodes the size[] array from the encoded data given in data. It returns the number of bytes consumed from data for decoding size[].

TABLE COMPRESSION TYPES AND FUNCTIONS
The Vctable method provides an efficient algorithm for compressing tabular data. Its execution plan is computed separately by vcplopen(). The data types and functions described here are used for computing execution plans for table compression.
     typedef struct _vcpltrans_s /* column specifications    */
     { ssize_t       index;  /* column index                 */
            ssize_t       pred1;  /* <0 if self compressing       */
            ssize_t       pred2;  /* if pred1<0, 1 for doing MTF  */
} Vcpltrans_t; typedef struct _vcplan_s /* transform plan */ { ssize_t ncols; /* row size or # of columns */
            Vcpltrans_t*  trans;  /* the plan to transform data   */
            Vcodex_t*     bwt;    /* BW-transform to code plan    */
} Vcplan_t;

Vcplan_t* vcplopen(Void_t* data, size_t size, size_t ncols);
This function uses the given data to compute a plan that can be used to compress tabular data of the same type. The argument ncols tells the number of columns in the data table. vcplopen() returns the plan on success or NULL on failure.

void vcplclose(Vcplan_t* plan);
This function frees a plan computed by vcplopen().

ssize_t vcplencode(Vcplan_t* plan, Void_t** codep);

Vcplan_t* vcpldecode(Void_t* code, size_t cdsz);
vcplencode() computes from the given plan a string suitable for permanent storage. It returns the length of the string on success and -1 on failure. The code string itself is returned in *codep on success. vcpldecode() reconstructs the plan encoded in the given string.

SUFFIX ARRAY TYPES AND FUNCTIONS
     typedef struct _vcsfx_s
     { ssize_t*      idx;    /* sorted suffix indices       */
            ssize_t*      inv;    /* inverse permuation of idx[] */
            Vcchar_t*     str;    /* string with sorted suffixes */
            size_t        nstr;   /* length of above string      */
} Vcsfx_t;

Vcsfx_t* vcsfxsort(const Void_t* str, size_t nstr);
vcsfxsort() sorts indices of suffixes of str to create a suffix array. The string is treated as if it ends with an element larger than all other bytes in the string. In this way, the order for all suffixes are well-defined. vcsfxsort() returns a structure of type Vcsfx_t as above on success or NULL on failure. The field Vcsfx_t.idx has the indices of all suffixes sorted lexicographically. The field Vcsfx_t.inv has the inverse permutation of Vcsfx_t.idx. That is, inv[idx[i]] == i.

ssize_t vcperiod(const Void_t* data, size_t dtsz);
vcperiod computes a quasi-period in the given data, if any. This is useful to determine the number of columns in table data.

BLOCK-MOVE PARSING
The delta compression method Vcdelta uses a greedy parsing scheme based on a combination of Lempel-Ziv'77 and Tichy's block-move. Source and target strings are conceptually concatenated into a superstring. The resulting data is then parsed starting from target data. Matching data is restricted to within the respective source or target strings. For example, consider the source sequence: abcde and the target sequence: abcdeabcdeabcdf. The target sequence can be parsed into three instructions as follows:
     COPY 5  0 /* copy 5 bytes from 0, the start of source data */
     COPY 9  5 /* copy 9 bytes from 5, the start of target data */
     ADD  1  f /* add 1 literal byte:  f                        */
     typedef int  (*Vcparse_f)(Vcparse_t* vcpa, int type, ssize_t undt,
                                    ssize_t here, ssize_t mtsz, ssize_t mtch);
typedef struct _vcparse_s { } Vcparse_t;

int vcparse(Vcparse_t* vcpa, ssize_t prune);
This function parses the target data and calls vcpa->parsef on each parsed instruction. Each call to vcpa->parsef needs the following arguments:

vcpa:
The handle specifying the data to be processed.
type:
The type of instruction.
unmt:
The start of unmatchable data.
here, mtsz, mtch:
Current data position, matchable length, and the matching address if any.

BIT/BYTE/STRING I/O SUBSYSTEM
The below functions are provided by Vcodex for portable data encoding. These functions perform only minimal bound checking, if any. Applications should take care of such details.

void vcioinit(Vcio_t* io, Vcchar_t* buf, int n);
This initializes a Vcio_t structure io with the given buffer buf with size n bytes.

size_t vciosize(Vcio_t* io);
This returns the amount that the buffer has been written to or read from.

size_t vciomore(Vcio_t* io);
This returns the size of the remaining part of the buffer that has not yet been written to or read from.

size_t vcioextent(Vcio_t* io);
This returns the size of the buffer.

Vcchar_t* vciodata(Vcio_t* io);
This returns the original data buffer.

Vcchar_t* vcionext(Vcio_t* io);
This returns the pointer to the next position in the buffer that can be written to or read from.

Vcchar_t* vcioskip(Vcio_t* io, int n);
This skips ahead n bytes and returns the resulting pointer to the buffer.

void vciomove(Vcio_t* to, Vcio_t* from, int n);
This moves n bytes between the given buffer and advances their positions accordingly.

ssize_t vcioputc(Vcio_t* io, int c);

int vciogetc(Vcio_t* io);
These functions write and read bytes respectively.

ssize_t vcioputs(Vcio_t* io, Void_t* s, size_t len)

ssize_t vciogets(Vcio_t* io, Void_t* s, size_t len)
These functions write and read arrays of bytes respectively. They return the number of bytes written or read.

ssize_t vcioputu(Vcio_t io, Vcint_t v);

Vcint_t vciogetu(Vcio_t* io);
These functions write and read unsigned integers using a 7-bit encoding in the Sfio style. vcputu() returns the number of bytes used to encode the value.

ssize_t vcioputm(Vcio_t* io, Vcint_t v, Vcint_t max);

Vcint_t vciogetm(Vcio_t* io, Vcint_t max)
These functions write and read unsigned integers using an 8-bit encoding in the Sfio style. The size of the encoding depends on the magnitude of max. For example, if max is 255, only one byte is used while, if max is 256, two bytes will be used. vcputm() returns the number of bytes used to encode the value.

ssize_t vcioput2(Vcio_t* io, Vcint_t v, Vcchar_t a, Vcchar_t b);

Vcint_t vcioget2(Vcio_t* io, Vcchar_t a, Vcchar_t b);
These functions write and read unsigned integers using a compact binary coding with a and b as digits. vcioput2() returns the number of bytes used to encode the value.

ssize_t vcioputg(Vcio_t* io, Vcint_t v);

Vcint_t vciogetg(Vcio_t* io);
These functions write and read positive integers using a variation of Elias Gamma code using fast table look-ups. vcioputg() returns the number of bits used to encode the value.

void vciosetb(Vcio_t* io, Vcbits_t bits, ssize_t nbits, int type);

void vcioendb(Vcio_t* io, Vcbits_t bits, ssize_t nbits, int type);

void vciofilb(Vcio_t* io, Vcbits_t bits, ssize_t nbits, need);

void vcioflsb(Vcio_t* io, Vcbits_t bits, ssize_t nbits);

Vcbits_t vcioaddb(Vcio_t* io, Vcbits_t bits, ssize_t nbits, Vcbits_t add, ssize_t nadd);

Vcbits_t vciodelb(Vcio_t* io, Vcbits_t bits, ssize_t nbits, ssize_t ndel);
These macro functions constitute a set of operations for fast bit I/O. The variables bits and nbits are typically declared by the application perhaps by using fast register variables to match hardware speed. These variables will be asociated with the handle io via vciosetb(). The association is for either encoding, i.e., type=VC_ENCODE, or decoding, i.e., type=VC_DECODE, and must be done before any bit operations. The structure Vcio_t also provides two fields bits and nbits that can be used as the variables discussed above. These fields enable applications to keep states in the Vcio_t structure across function calls. As they cannot be register variables, arithmetics done on them would not be at hardware speed. vcioendb() must be called to finalize bit I/O to get ready for other forms of I/O. vciofilb() fills bits with bits from io if nbits < need. nbits is updated to indicate the number of available bits. vcioflsb() flushes the bits available in bits out to io. Bits are flushed in 8-bit aggregates into bytes. nbits is updated to indicate the number of bits remained unflushed in bits. vcioaddb() adds nadd bits from add to bits. It is the caller's respobsibility to ensure that there is room in bits to add the new bits. Otherwise, bits may be lost. vciodelb() takes off ndel number of bits from bits. Such bits are lost forever.

FILE I/O
The functions described thus far enable data transforming in main memory. Vcodex also provides functions for processing file data. Depending on local environments, these functions are either based on the Sfio library or on Stdio, the C standard I/O interface.

Vcsfio_t
The type Vcsfio_t represents a stream for performing I/O. When Stdio is used, it is an opaque type keeping state information needed for data transformation. In this case, the I/O functions vcsfwrite() and vcsfread() should be used to read and write data. When Sfio is used, it is defined as Sfio_t. In this case, all normal Sfio operations could be used on the stream. Functions such as vcsfread() and vcsfwrite() will be macro redefinitions of corresponding Sfio functions.

Vcsfmeth_t
The type Vcsfmeth_t allows application to specify a list of data trandforming methods required for compression.
     typedef struct _vcsfmeth_s
     { Vcmethod_t*  meth;   /* requested method to invoke    */
            Void_t*      data;   /* argument or init parameters   */
            ssize_t      size;   /* size information              */
} Vcsfmeth_t;
The field meth normally specifies a data transform. The data and size fields give the arguments needed for the particular data transform. The field meth can also be VCSF_WINDOW or VCSF_STATES. In the first case, the field data specifies a windowing handle of type Vcwindow_t* while size specifies the window size, i.e., the amount of data to collect before transforming. In the second case, data specifies a file name to use to store data transforming states in a persistent format. Certain data transforms (e.g., Vctable) could store and restore states via this mechanism to reduce initialization cost.

Vcsfio_t* vcsfio(Sfio_t* sf, Vcsfmeth_t* list, ssize_t n, int flags);
This function constructs a Vcsfio_t handle to perform encoding or decoding on the stream sf. When the Sfio library is in use, the same stream sf is returned. When the Stdio library is in use, the type Sfio_t is a macro redefinition of the FILE type while Vcsfio_t is an opaque type as described earlier. The given list with n elements provides windowing data and methods used to encode data. On encoding, the entire list of methods must be supplied. On decoding, the methods will be retrieved from the encoded data itself. The flags argument is a combination of the bits VC_ENCODE, VC_DECODE and VC_CLOSEWINDOW:

VC_ENCODE, VC_DECODE:
Exactly one of these bits should be specified. In the former case, the stream sf should be opened for writing and, in the latter case, for reading.
VC_CLOSEWINDOW:
This bit, if given, indicates that any provided windowing handle should be closed when the stream is closed.

ssize_t vcsfread(Vcsfio_t* vcsf, Void_t* buf, size_t n);
This function reads from a stream of encoded data and decode them into the given buffer buf. It returns the length of the decoded data on success and a negative value on failure.

ssize_t vcsfwrite(Vcsfio_t* vcsf, const Void_t* data, size_t n);
This function writes the raw data given in data to the stream vcsf. Such data will be encoded before written out to the underlying data stream. On success, the amount of raw data written is returned. On failure, a negative value is returned.

int vcsfsync(Vcsfio_t* vcsf);
This function flushes unwritten data out to the underlying stream.

int vcsfclose(Vcsfio_t* vcsf);
This function closes the stream vcsf. Unwritten data will be flushed out to the underlying stream first.

Vcsfmeth_t* vcsfmeth(Vcsfio_t* vcsf, ssize_t* nmeth);
This function returns the list of transforms associated with vcsf. The list is an array terminated with an item whose meth field is NULL. In addition, if nmeth is not NULL, it will be used to return the number of items.

MISCELLANIES

double vclog(unsigned int v)
This function computes the base-2 logarithm of the unsigned integer v.

ACKNOWLEDGEMENTS

David Korn helped designing the VCDIFF data format standardized in RFC3284. Binh Vo helped designing and implementing a number of compression methods including the Huffman coders Vchuffgroup and Vchuffpart and the predictive move-to-front transform Vcmtf. Binh also invented the table compressor Vctable.

SEE ALSO

sfio(3)

AUTHOR

Kiem-Phong Vo, kpv@research.att.com


May 01 2005 November 07, 2006