VCODEX(3)C LIBRARY FUNCTIONSVCODEX(3)


NAME

Vcodex - Data transformation for compression, encryption, etc.

SYNOPSIS

#include <vcodex.h>
libvcodex.a      -lvcodex

BASIC TYPES

Void_t*;
Vcchar_t;
Vcint_t;
Vcbits_t;
Vcdisc_t;
Vcmethod_t;
Vcmtarg_t;
Vcwalk_f;
Vcodex_t;

BASIC OPERATIONS

Vcodex_t*    vcopen(Vcdisc_t* disc, Vcmethod_t* meth,
                         Void_t* init, Vcodex_t* coder, int flags);
Vcodex_t* vccreate(char* spec, int flags); ssize_t vcextract(Vcodex_t* vc, Void_t** datap); Vcodex_t* vcrestore(Vcodex_t* vc, Void_t* data,
                            size_t size);
int vcclose(Vcodex_t* vc);

ssize_t vcapply(Vcodex_t* vc, Void_t* data,

                          size_t size, Void_t** output);
ssize_t vcrecode(Vcodex_t* vc, Vcchar_t** datap,
                           ssize_t* sizep, ssize_t head);
size_t vcundone(Vcodex_t* vc);

Vcchar_t* vcbuffer(Vcodex_t* vc, Vcchar_t* trunc,

                           ssize_t size, ssize_t head);
int vctellbuf(Vcodex_t* vc, Vcbuffer_f bufferf);

Vccontext_t* vcinitcontext(Vcodex_t* vc, Vccontext_t* ctxt); Vccontext_t* vcfreecontext(Vcodex_t* vc, Vccontext_t* ctxt); type vcgetcontext(vc, type);

TYPES AND FUNCTIONS FOR DATA TRANSFORM METHODS

typedef struct _vcmethod_s
{ Vcapply_f  encodef;     /* function to encode a data set  */
       Vcapply_f  decodef;     /* function to decode a data set  */
       int        (*eventf)(Vcodex_t*, int, Void_t*);
       char*      name;        /* the name of the transform      */
       char*      desc;        /* a description of the transform */
       Vcmtarg_t* args;        /* arguments usable with vcopen() */
       ssize_t    window;      /* suggested window size          */
       int        type;        /* VC_MTSOURCE                    */
} Vcmethod_t;

typedef struct _vcmtarg_s
{ char*      name;        /* the name of the argument       */
       char*      desc;        /* a description of the argument  */
       Void_t*    data;        /* any associated data            */
} Vcmtarg_t;

typedef struct _vcmtcode_s
{ Vcchar_t*  data;        /* encoded data of a handle       */
       ssize_t    size;        /* size of encoded data           */
       Vcodex_t*  coder;       /* reconstructed coder            */
} Vcmtcode_t;

#define VC_OPENING #define VC_CLOSING #define VC_INITCONTEXT #define VC_FREECONTEXT #define VC_FREEBUFFER #define VC_SETMTARG #define VC_EXTRACT #define VC_RESTORE #define VC_GETIDENT #define VC_TELLBUFFER

int vcaddmeth(Vcmethod_t** list, ssize_t size); Vcmethod_t* vcgetmeth(char* name, int type); int vcwalkmeth(Vcwalk_f walkf, Void_t* disc);

char* vcgetident(Vcmethod_t* meth, char* buf, ssize_t size);

char* vcgetmtarg(char* data, char* val, ssize_t vlsz,

                             Vcmtarg_t* list, Vcmtarg_t** arg);
int vcsetmtarg(Vcodex_t* vc, char* name, void* val,
                             int type);
type vcgetmtdata(Vcodex_t* vc, type); void vcsetmtdata(Vcodex_t* vc, data);

AVAILABLE METHODS

Vcmethod_t*  Vcdelta;
Vcmethod_t*  Vcsieve;
Vcmethod_t*  Vchamming;
Vcmethod_t*  Vchuffman;
Vcmethod_t*  Vchuffgroup;
Vcmethod_t*  Vchuffpart;
Vcmethod_t*  Vcrle;
Vcmethod_t*  Vcmtf;
Vcmethod_t*  Vcbwt;
Vcmethod_t*  Vcmap;
Vcmethod_t*  Vctranspose;
Vcmethod_t*  Vctable;
Vcmethod_t*  Vcrdb;
Vcmethod_t*  Vcrelation;

DISCIPLINE AND EVENT HANDLING

typedef int  (*Vcevent_f)(Vcodex_t*, int, Void_t*, Vcdisc_t*);

typedef struct { Void_t* data; /* additional data, e.g., source */

       size_t     size;        /* size of the additional data   */
       Vcevent_f  eventf;      /* event handler                 */
} Vcdisc_t;

#define VC_OPENING #define VC_CLOSING #define VC_DISC

Vcdisc_t* vcdisc(Vcodex_t* vc, Vcdisc_t* disc); Vcdisc_t* vcgetdisc(Vcodex_t* vc);

DELTA COMPRESSION TYPES AND FUNCTIONS

typedef struct
{ Vcchar_t   type;
       Vcchar_t   size;
       Vcchar_t   mode;
} Vcdinst_t;

typedef struct { Vcinst_t inst1;

       Vcinst_t   inst2;
} Vcdcode_t;

typedef struct { Vcchar_t s_near;

       Vcchar_t   s_same;
       Vcdcode_t  code[256];
} Vcdtable_t;

ssize_t vcdputtable(Vcdtable_t* tbl, Void_t* buf, size_t n); int vcdgettable(Vcdtable_t* tbl, Void_t* buf, size_t n);

MATCHING WINDOWS TO DELTA-COMPRESS LARGE FILES

typedef struct {...}      Vcwindow_t;
typedef struct {...}      Vcwmethod_t;

typedef int (*Vcwevent_f)(Vcw_t* vcw, int type, Void_t* data,

                               Vcwdisc_t* disc);
typedef struct { Sfio_t* srcf; /* source file */
       Sfio_t*    tarf;       /* target file                 */
       Vcwevent_f eventf;     /* event handler               */
} Vcwdisc_t;

typedef struct
{ int        type;       /* VCD_[SOURCE|TARGET]FILE     */
       Void_t*    wdata;      /* window data                 */
       ssize_t    wsize;      /* window length               */
       Sfoff_t    wpos;       /* starting position in file   */
       ssize_t    msize;      /* data size to be matched     */
} Vcwmatch_t;

Vcwmethod_t* Vcwmirror; /* match by file positions */ Vcwmethod_t* Vcwvote; /* match by n-gram frequencies */ Vcwmethod_t* Vcwprefix; /* use prefix matching */

Vcwindow_t* vcwopen(Vcwdisc_t* disc, Vcwmethod_t* meth); int vcwclose(Vcwindow_t* vcw); Vcwmatch_t* vcwapply(Vcwindow_t* vcw, Void_t* data,

                           size_t size, Sfoff_t pos);
ssize_t vcwfeedback(Vcwindow_t* vcw, ssize_t size); Vcwmethod_t* vcwgetmeth(char* name); int vcwwalkmeth(Vcwalk_f walkf, Void_t* disc);

HUFFMAN CODING TYPES AND FUNCTIONS

ssize_t      vchsize(ssize_t* freq, ssize_t* size, int* runb);
ssize_t      vchbits(ssize_t* size, Vcbits_t* bits);
int          vchcopy(Vcodex_t* vc, ssize_t* freq,
                          ssize_t* size, ssize_t maxs);
ssize_t vchputcode(ssize_t* size, ssize_t maxs,
                             Vcchar_t* data, size_t dtsz);
ssize_t vchgetcode(ssize_t* size, ssize_t maxs,
                             Vcchar_t* data, size_t dtsz);

FIXED-LENGTH TABLE TRANSFORM TYPES AND FUNCTIONS

#define      VCTBL_CEE    /* conditional empirical entropy  */
#define      VCTBL_RLE    /* run-length compressive entropy */
#define      VCTBL_LEFT   /* left to right dependency       */
#define      VCTBL_RIGHT  /* right to left dependency       */
#define      VCTBL_SINGLE /* use a single predictor only    */

typedef struct _vctblcolumn_s /* column specifications      */
{ ssize_t    index;       /* column index                   */
       ssize_t    pred1;       /* <0 if self compressing         */
       ssize_t    pred2;       /* >=0 if supporting pred1        */
} Vctblcolumn_t;

typedef struct _vctblplan_s /* transform plan */ { ssize_t ncols; /* # of columns or row size */

       Vctblcolumn_t* trans;   /* the plan to transform data     */
} Vctblplan_t;

Vctblplan_t* vctblmakeplan(Void_t* data, size_t size,

                                size_t ncols, int flags);
void vctblfreeplan(Vctblplan_t* plan); ssize_t vctblencodeplan(Vctblplan_t* plan, Void_t** codep); Vctblplan_t* vctbldecodeplan(Void_t* code, size_t cdsz);

RELATIONAL DATA TRANSFORM TYPES AND FUNCTIONS

#define VCRD_FIELD        /* given data is field-oriented   */
#define VCRD_FULL         /* make fields or records full    */
#define VCRD_FIXED        /* fixed-length field             */

#define VCRD_DOT /* field was of the form x.y... */ #define VCRD_SLASH /* like above but form x.y.../z */

typedef struct _vcrdsepar_s
{ int           fsep;     /* computed field separator       */
       int           rsep;     /* record separator               */
       ssize_t       size;     /* actual amount of data used     */
       ssize_t       recn;     /* total number of records        */
       ssize_t       recf;     /* # records containing an fsep   */
       double        allz;     /* (entropy)size for all data     */
       double        fldz;     /* total (entropy)size by fields  */
} Vcrdsepar_t;

typedef struct _vcrdinfo_s
{ ssize_t       recn;     /* number of records, if known    */
       ssize_t       fldn;     /* number of fields, if known     */
       ssize_t*      flen;     /* field lengths, if known        */
       int           fsep;     /* field separator, if needed     */
       int           rsep;     /* record separator, if needed    */
} Vcrdinfo_t;

typedef struct _vcrdrecord_s
{ Vcchar_t*     data;     /* data in a field for a record   */
       ssize_t       dtsz;     /* size of data                   */
} Vcrdrecord_t;

typedef struct _vcrdfield_s
{ int           type;     /* field type data                */
       Vcrdrecord_t* rcrd;     /* list of records for this field */
       ssize_t       maxz;     /* maximum size of any record     */
       ssize_t*      vect;     /* transform vector by this field */
       Vcchar_t*     data;     /* field data if allocated here   */
       Vcchar_t*     exdt;     /* space for extracted data       */
} Vcrdfield_t;

typedef struct _vcrdtable_s
{ Vcrdinfo_t*   info;     /* information on table data      */
       ssize_t       parz;     /* size of parsed data            */
       ssize_t       recn;     /* number of records              */
       ssize_t       fldn;     /* number of fields               */
       Vcrdfield_t*  fld;      /* list of fields                 */
       ssize_t*      vtmp;     /* temp space of recn elements    */
       Vcchar_t*     exdt;     /* space for extracted data       */
} Vcrdtable_t;

typedef struct _vcrdplan_s
{ ssize_t    fldn;        /* number of fields               */
       ssize_t    pred[];      /* predictor list                 */
} Vcrdplan_t;

ssize_t      vcrdsepar(Vcrdsepar_t* rds, ssize_t nrds,
                         Vcchar_t* data, ssize_t dtsz, int rsep);
Vcrdtable_t* vcrdparse(Vcrdinfo_t* info,
                         Vcchar_t* data, ssize_t dtsz, int type);
void vcrdclose(Vcrdtable_t* tbl); ssize_t vcrdfield(Vcrdtable_t* tbl, ssize_t f, ssize_t n,
                         Vcchar_t* data, ssize_t dtsz);
int vcrdattrs(Vcrdtable_t* tbl, ssize_t f, int attrs,
                         int type);
ssize_t vcrdsize(Vcrdtable_t* tbl); ssize_t vcrdextract(Vcrdtable_t* tbl, ssize_t f,
                         Vcchar_t** datap, int type);
typedef ssize_t (*Vcrdweight_f)(Vcchar_t* str1, ssize_t n1,
                         Vcchar_t* str2, ssize_t n2);
ssize_t vcrdweight(Vcrdtable_t* tbl, ssize_t f,
                         ssize_t* vect, Vcrdweight_f wghtf);
int vcrdvector(Vcrdtable_t* tbl, ssize_t f,
                         Vcchar_t* trdt, ssize_t dtsz, int type);
Vcrdplan_t* vcrdmakeplan(Vcrdtable_t* tbl, Vcrdweight_t wghtf); void vcrdfreeplan(Vcrdplan_t* plan); int vcrdexecplan(Vcrdtable_t* tbl, Vcrdplan_t* plan,
                         int type);

SUFFIX ARRAY TYPES AND FUNCTIONS

typedef struct _vcsfx_s
{ ssize_t*   idx;        /* sorted suffix array            */
       ssize_t*   inv;        /* inverted index/rank            */
       Vcchar_t*  str;        /* string to be sorted            */
       size_t     nstr;       /* length of string               */
} Vcsfx_t;

Vcsfx_t* vcsfxsort(const Void_t* str, size_t nstr); ssize_t vcperiod(const Void_t* data, size_t dtsz);

GENERALIZED LEMPEL-ZIV PARSING

#define      VCLZ_REVERSE
#define      VCLZ_MAP

typedef ssize_t (*Vclzparse_f)(Vclzparse_t* vcpa, int type,

                                  Vclzmatch_t* mtch, ssize_t n);
typedef struct _vclzmatch_s { ssize_t tpos; /* position in target string */
       ssize_t    mpos;        /* position of matched data      */
       ssize_t    size;        /* length of match               */
} Vclzmatch_t;

typedef struct _vclzparse_s
{ Vclzparse_f  parsef;    /* to process a set of segments  */ 
       Vcchar_t*  src;         /* source string to learn from   */
       ssize_t    nsrc;        /* length of source data         */
       Vcchar_t*  tar;         /* target string to be parsed    */
       ssize_t    ntar;        /* length of target data         */
       ssize_t    mmin;        /* minimum required match length */
       Vcchar_t*  cmap;        /* array to map bytes to bytes   */
       int        type;        /* VCLZ_REVERSE|VCLZ_MAP         */
} Vclzparse_t;

int vclzparse(Vclzparse_t* vcpa, ssize_t bound);

BIT/BYTE/STRING FAST I/O SUBSYSTEM

void         vcioinit(Vcio_t* io, Vcchar_t* buf, int n);
size_t       vciosize(Vcio_t* io);
size_t       vciomore(Vcio_t* io);
Vcchar_t*    vciodata(Vcio_t* io);
Vcchar_t*    vcionext(Vcio_t* io);
Vcchar_t*    vcioskip(Vcio_t* io, int n);
void         vciomove(Vcio_t* to, Vcio_t* from, int n);
ssize_t      vcioputc(Vcio_t* io, int c);
int          vciogetc(Vcio_t* io);
ssize_t      vcioputu(Vcio_t* io, Vcint_t v);
Vcint_t      vciogetu(Vcio_t* io);
ssize_t      vcioputm(Vcio_t* io, Vcint_t v, Vcint_t max);
Vcint_t      vciogetm(Vcio_t* io, Vcint_t max);
ssize_t      vcioput2(Vcio_t* io, Vcint_t v,
                           Vcchar_t a, Vcchar_t b);
Vcint_t vcioget2(Vcio_t* io, Vcchar_t a, Vcchar_t b); ssize_t vcioputg(Vcio_t* io, Vcint_t v); Vcint_t vciogetg(Vcio_t* io); ssize_t vcioputlist(Vcio_t* io, Vcint_t* list, ssize_t n); Vcint_t vciogetlist(Vcio_t* io, Vcint_t* list, ssize_t n); ssize_t vcioputs(Vcio_t* io, Void_t* s, size_t len); ssize_t vciogets(Vcio_t* io, Void_t* s, size_t len);
void         vciosetb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
                           int type);
void vcioendb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
                           int type);
void vciofilb(Vcio_t* io, Vcbits_t bits, ssize_t nbit); Vcbits_t vciodelb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
                           ssize_t ndel);
void vcioflsb(Vcio_t* io, Vcbits_t bits, ssize_t nbit); Vcbits_t vcioaddb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
                           Vcbits_t add, ssize_t nadd);

STREAM PROCESSING

Vcsfio_t;
typedef void (*Vcsferror_f)(const char*);

typedef struct _vcsfdata_s { int type; /* various processing modes */

       char*       trans;        /* transformation specification */
       char*       window;       /* window specification         */
       char*       source;       /* source file to delta against */
       Vcsferror_f errorf;       /* to process errors            */
} Vcsfdata_t;

Vcsfio_t* vcsfio(Sfio_t* sf, Vcsfdata_t* sfdt, int type); ssize_t vcsfread(Vcsfio_t* vcsf, Void_t* buf, size_t n); ssize_t vcsfwrite(Vcsfio_t* vcsf, Void_t* buf, size_t n); int vcsfsync(Vcsfio_t* vcsf); int vcsfclose(Vcsfio_t* vcsf);

ALIAS PROCESSING

void         vcaddalias(char** list);
char*        vcgetalias(char* name, char* spec, ssize_t size);
int          vcwalkalias(Vcwalk_f walkf, Void_t* disc);

MISCELLANIES

double       vclog(size_t v);
size_t       vclogi(size_t v);
size_t       vcsizeu(Vcint_t v);
Vcint_t      vcatoi(char* buf);
ssize_t      vcitoa(Vcint_t v, char* buf, ssize_t n);

Vcint_t vcintcode(Vcint_t v, Vcint_t near,

                            Vcint_t min, Vcint_t max, int type);
char* vcstrcode(char* s, char* buf, ssize_t n);

typedef int (*Vccompare_f)(void* elt1, void* elt2, void* disc); void vcqsort(void* list, ssize_t nelts, ssize_t eltsize,

                          Vccompare_f comparf, void* disc);
ssize_t vcbcktsort(ssize_t* idx, ssize_t* tmp, ssize_t n,
                             Vcchar_t*, ssize_t* maxc);
char* vcsubstring(char* data, int separ,
                              char* buf, ssize_t bufsz, int type);

DESCRIPTION

Vcodex provides a collection of data transforms including compression, differencing and encryption via a discipline and method interface (see The Discipline and Method Architecture for Reusable Libraries, Software Practice & Experience, v.30, pp.107-128, 2000).

BASIC TYPES

Void_t*
This type is used to pass data between Vcodex and application code. Void_t is defined as void for ANSI-C and C++ and char for older compilation environments.

Vcchar_t
This defines the type of bytes as manipulated by Vcodex.

Vcint_t
This defines an integer type of maximum supported size.

Vcbits_t
This defines an integer type suitable for bit manipulation and I/O.

Vcdisc_t
This defines the type of a discipline structure.

Vcmethod_t
This defines the type of a data transform.

Vcmtarg_t
This defines the type of a structure describing arguments to a method.

Vcwalk_f
This defines the type of a function to be invoked while walking some list of data structures. Its prototype is:
         int (*Vcwalk_f)(Void_t* obj, char* name, char* value, Void_t* disc);
obj:
This is the object being walked. For example, if the list of methods is being walked, this would be a pointer to a Vcmethod_t object.
name, value:

The name and value of the object being walked.
disc:

The structure passed along to the invoking walk function. See also vcwalkmeth() and vcwalkalias().

Vcodex_t
This defines the type of a Vcodex handle.

BASIC OPERATIONS

Vcodex_t* vcopen(Vcdisc_t* disc, Vcmethod_t* meth, Void_t* init, Vcodex_t* coder, int flags)
vcopen() creates a handle to process data. It returns the new handle on success or NULL on failure.
disc:

A discipline structure to describe data (see DISCIPLINE section).
meth:

a data transform method (see METHODS section).
init:

Any additional parameters for the given method.
coder:

This specifies a continuation coder, i.e., another handle to process various data parts produced by the main transform.
flags:

This gives the bit values to control behaviors of the new handle. Exactly one of VC_ENCODE and VC_DECODE should be specified to indicate whether the handle is used for encoding or decoding. The bit VC_CLOSECODER, if specified, indicates that the continuation coder, if any, should be closed on handle closing.

vccreate(char* spec, int flags);
This function creates a composite handle based on the specification of a sequence of methods. It returns the composite handle on success and NULL on failure. See section ALIAS PROCESSING for the syntax of a specification.

ssize_t vcextract(Vcodex_t* vc, Void_t** datap)
This function extract data from a composite encoding handle vc that can be used later by vcrestore() to construct a composite decoder. The data is returned in *storep. vcextract() returns the data length on success and -1 on failure.

Vcodex_t* vcrestore(Vcodex_t* vc, Void_t* data, size_t size)
vcrestore() constructs a composite decoding handle from the given data. It returns the handle on success and NULL on failure.

int vcclose(Vcodex_t* vc)
This closes the vc handle, meaning to free all associated memory resources. vcclose() returns 0 on success or -1 on error.

ssize_t vcapply(Vcodex_t* vc, Void_t* data, size_t size, Void_t** output)
This applies the data transform associated with vc to the given data. If the handle was opened for decoding, the given data and size should match exactly the previously encoded result. vcapply() returns the length of the output data buffer or -1 on failure. If output == NULL, only the size of the result will be returned. If output != NULL, the transformed data will be returned in *output (see also vcundone() below.) Note that each call to vcapply() returns a separate buffer of encoded or decoded data. These buffers should be freed as soon as the respective data have been consumed. See vcbuffer() for how to do that.

ssize_t vcrecode(Vcodex_t* vc, Vcchar_t** datap, ssize_t* sizep, ssize_t head);
This function is intended to be used by method writers for continuation of data process in secondary coders. It invokes vcapply() on the continuation coder to transform the data given in *datap and *sizep. The transformed data and its size are returned in the same arguments. head, if positive, causes buffer allocation in secondary coders to leave that much room in front of the transformed data. In this way, successive calls of vcrecode() can leave an accumulated amount of header space for adding data later by the callers. vcrecode() returns the size of the transformed data on success and -1 on error.

size_t vcundone(Vcodex_t* vc);
Certain methods (e.g., Vctable) may not process all given data. After a successful vcapply() call (i.e., it returns a non-negative value), the call vcundone() tells the amount of data left unprocessed.

Vcchar_t* vcbuffer(Vcodex_t* vc, Vcchar_t* trunc, ssize_t size, ssize_t head);
This function manages the buffer pool for a handle. It always returns NULL on freeing buffers. For buffer allocation, it returns the allocated buffer or NULL on failure.

When trunc != NULL, it specifies a buffer in vc or one of the continuation coders to be truncated down to size or freed if size < 0. When trunc == NULL, there are two cases. If size is negative, all buffers allocated via vc and continuation coders are freed. Otherwise, a new buffer of the given size is allocated. This buffer is guaranteed to have at least n writable bytes in front of the allocated buffer where n is head plus some other amount as defined by various vcrecode() calls during continuation processing.

Vcchar_t* vctellbuf(Vcodex_t* vc, Vcbuffer_f bufferf);
This function walks though all handles starting from vc calls bufferf() on the lists of buffers in these handles. The needed types are:

       typedef struct _vcbuffer_s    Vcbuffer_t;
       struct _vcbuffer_s
       {  Vcbuffer_t*     next;
          size_t          size;   /* total buffer size    */
          char*           file;   /* file allocating it   */
          int             line;   /* line number in file  */
          unsigned char   buf[1]; /* actual data buffer   */
       };

       int (*Vcbuffer_f)(Vcodex_t* vc,
                         Vcbuffer_t*  list, Vcmethod_t* meth);

Here, list is a linked list of buffers of the type Vcbuffer_t and meth is the method used for the given handle.

Vccontext_t* vcinitcontext(Vcodex_t* vc, Vccontext_t* ctxt);
The function creates a new context if ctxt is NULL or makes ctxt the current context if it is not NULL. It returns the current context or NULL on error.

In certain situations, states may be retained across calls of vcapply() or vcrecode(). For example, the Vctable transform computes and reuses a transformation plan repeatedly as appropriate. A handle using such a transform should create separate contexts to keep states for different table types. In particular, a transform such as Vctable could extend Vccontext_t by including it as the first element of a private context type. In that case, the method should also supply an eventf function capable of handling the events VC_INITCONTEXT and VC_FREECONTEXT. The first allocates a new context while the latter frees a given one.

int vcfreecontext(Vcodex_t* vc, Vccontext_t* ctxt);
This function deletes the given context ctxt.

type vcgetcontext(vc, type);
This macro function retrieves the current context of vc and coerces that to its given private type.

TYPES AND FUNCTIONS FOR DATA TRANSFORMING METHODS

Methods are of the type Vcmethod_t as defined below:

typedef struct _vcmethod_s
{    Vcapply_f  encodef; /* function to encode data         */
          Vcapply_f  decodef; /* function to decode data         */
          int        (*eventf)(Vcodex_t*, int, Void_t*);
          char*      name;   /* the string name of the transform */
          char*      desc;   /* a short description              */
          Vcmtarg_t* args;   /* arguments usable with vcopen()   */
          ssize_t    window; /* suggested window size            */
          int        type;	/* VC_MTSOURCE                      */
} Vcmethod_t;

encodef, decodef:

These are functions to encode and decode data.
eventf:

If not NULL, this is a function to handle events.
args:

This defines a list of arguments usable to initialize the method. See ALIAS PROCESSING for details on how to specify arguments. Each element of the list is of the type Vcmtarg_t:

typedef struct _vcmtarg_s
{ char*      name;  /* the name of the argument       */
       char*      desc;  /* a description of the argument  */
       Void_t*    data;  /* any associated data            */
} Vcmtarg_t;

type:

This provides flags telling certain characteristics of the method. Currently only VC_MTSOURCE is used to indicate that the method is a delta compressor.

The list of events passable to eventf are:

VC_OPENING:

A new handle is opened. The third argument is data for initialization.
VC_CLOSING;

A handle is being closed. All private method data should be deallocated.
VC_INITCONTEXT:

Create a new context for data transformation.
VC_FREECONTEXT:

Delete the given context.
VC_FREEBUFFER:

Free any allocated buffers for this handle or buffers allocated in other handles created internally.
VC_SETMTARG:

Set a parameter which should be specified in a name=value format. For example, to tell the Vctranspose data transform that the number of columns for the data to be transformed next is 10, the event VC_SETMTARG can be raised with associated data ``columns=10''.
VC_EXTRACT:

VC_RESTORE:

The third argument to the eventf function should be a structure of type Vcmtcode_t:

typedef struct _vcmtcode_s
{ Vcchar_t*  data;  /* encoded data of a handle       */
       ssize_t    size;  /* size of encoded data           */
       Vcodex_t*  coder; /* reconstructed coder            */
} Vcmtarg_t;

For VC_EXTRACT, internal states should be coded in a form suitable for persistent storage and returned in the fields Vcmtcode_t.data and Vcmtcode_t.size. For VC_RESTORE, a coder for decoding data should be constructed based on the data in the fields Vcmtcode_t.data and Vcmtcode_t.size and returned in Vcmtcode_t.coder. In both cases, the eventf function should return a positive value on success.

VC_GETIDENT:

Obtaining the identification string of the method. The handling of this event is optional. If not defined, the identification string will be the method's name translated into the ASCII character set.
VC_TELLBUFFER:

This event is used mostly for debugging memory usage. It is a request to process memory buffers being maintained by a handle and its continuation. The third argument to eventf should be the address of a function defined with the below type. The arguments are: vc the handle creating the buffer, list the list of buffers in that handle and meth the method in use.

typedef struct _vcbuffer_s  Vcbuffer_t;
typedef int    (*Vcbuffer_f)(Vcodex_t* vc,
                               Vcbuffer_t* list, Vcmethod_t* meth);
struct _vcbuffer_s { Vcbuffer_t* next; /* link to next buffer */
       ssize_t      size; /* size of the buffer       */
       char*        file; /* file creating the buffer */
       int          line; /* line number in the file  */
       ...
};

int vcaddmeth(Vcmethod_t** list, ssize_t size);
This function adds a list of methods to the collection of available methods. It is intended to be used by applications that wish to make their own transforms available to a search via vcgetmeth().

Vcmethod_t* vcgetmeth(char* name, int type);
This function returns a method matching name. If type is zero, name is assumed to be in the native codeset (e.g., ASCII or some form of EBCDIC). Otherwise, it should have been previously coded by vcstrcode(). Such coding of a string makes the data portable across platforms using different codesets.

int vcwalkmeth(Vcwalk_f walkf, Void_t* disc);
This function walks the currently available list of methods and invokes walkf as (*walkf)(meth,name,desc,disc). Here, meth is the method being walked while name and desc are its name and description.

char* vcgetident(Vcmethod_t* meth, char* buf, ssize_t size);
This function returns the identification string of the method meth. This string is guaranteed to be in ASCII so that it can be portably stored. buf and size define a buffer that vcgetident() may use to store the identification string. vcgetident() returns NULL on error.

char* vcgetmtarg(char* data, char* val, ssize_t vlsz, Vcmtarg_t* list, Vcmtarg_t** arg);
This call may be used by a method writer to parse the string data which should be composed of elements separated by periods. Each element can be either just a plain string name or a pair name=value. The string name will be matched against list to find a matching argument. If one is found, *arg will be set to it. value, if any, will be copied into the buffer val without exceeding vlsz. vcgetmtarg() returns the remainder of unparsed data or NULL on failure.

int vcsetmtarg(Vcodex_t* vc, char* name, void* val, int type);
This function invokes eventf() to set an argument (parameter). By convention, each transform should ignore any inapplicable parameters. The type argument can be 0, 1,2 to indicate that the val argument is a byte, an integer or a string respectively. A byte value is coded in C-style while an integer is coded in decimal. For example, the call vcsetarg(vc, "columns", 10, 1) set the number of columns to 10 for a method such as Vctranspose or Vctable.

type vcgetmtdata(Vcodex_t* vc, type);

void vcsetmtdata(Vcodex_t* vc, data);
These macro functions retrieve and set the private data structure created by the transform to use in the handle vc. vcgetmtdata() coerces the data structure to its private type.

AVAILABLE METHODS

Vcdelta
This method implements a generalized Lempel-Ziv parsing strategy to compress a target data segment either by itself or against another source data segment. The source data segment, if any, should be specified by the data and size field of a discipline structure. The compressed data are in the standard format defined by the IETF RFC3284. String matching is based on a suffix sorting algorithm if the init argument to vcopen() is the string "s". Otherwise, a hashing algorithm is used.

Vcsieve
This method implement various strategies to condition data for compressibility. The argument init to vcopen() can be composed from:
delta:

This does delta compression with approximate matching. A matched data segment is coded by subtracting it against the matching segment. Thus, in this case, Vcsieve does not reduce data. However, the data that it produces is amenable to compression by other transforms, for example, a run-length compressor together with a Huffman coder. If delta is not specified, only exact matching will be done and matched data will not be coded by subtracting as described.
reverse:

This allows matching data in reverse. For example abc can be matched with cba (as well as another instance of abc).
map:
This allows defining a set of pairs of letters that are allowed to match with one another. For example, map=ATGCatgc allows matching A with T, G with C, etc. For example, such a map specification along with reverse can be used to compress DNA data.
mmin:

This defines a minimum prefix that must be exactly matched before further matching either exact or approximate.

Vchamming
This code a target data segment by subtracting it against another source data segment given in the discipline structure. If the source data segment is short, it will be repeated as necessary.

Vchuffman
This method performs Huffman coding.

Vchuffgroup
This method first divides the given data into small contiguous parts of equal size (except perhaps the last one). Then, the parts are collected into groups so that each group can be compressed separately with its own Huffman code table.

Vchuffpart
This method divides the given data into contiguous sections each of which uses a separate Huffman coding table optimized for it.

Vcrle
This method encodes runs in data. Normally, runs are encoded using a scheme whereby a run is indicated by some escape character and its length is encoded in a compact variable size format. The run lengths are coded in a separate area from the data so that the two types of data can be effectively compressed via a later continuation coder if there is one. An alternative run encoding method is indicated by giving the string "0" as the init argument of vcopen(). In this case, only runs of 0's are encoded using a binary encoding method. This is useful in a compression technique that produces large sequences of zeros such as the Burrows-Wheeler transform Vcbwt or the table transform Vctable.

Vcbwt
Vcbwt performs a Burrows-Wheeler transform on the given data. This data is amenable to better compression via some combination of move-to-front, run-length encoding and entropy coding.

Vcmtf
This method performs a move-to-front transform on the data, i.e., mapping each data byte into their positions in some dynamically updated array of the 256 byte values. By default, the array is updated using a predictive method aiming at creating more zeros. If the init argument of vcopen() is the string "0", the array is updated by moving the encoded byte to position 0 (i.e., the traditional MTF transform).

Vctranspose
This method transposes rows and columns. If the init argument of vcopen() is the string "v", each row ends with some separator character. If the init argument is anything else, row lengths are fixed size. In that case, the row size (i.e., number of columns) is normally encoded at the start of transformed data unless the init argument is "0". The number of columns may be learned directly from the data. But it and the row separator can also be set via vcsetmtarg() using one of columns and rsep or fsep. Only data fitting the table profile, i.e., the maximum number of well-defined rows will be processed. The unprocessed amount can be retrieved via vcundone().

Vctable
This method transforms table data, i.e., two-dimensional byte arrays, into a form that is amenable to compression via a combination of move-to-front (Vcmtf), run-length encoding (Vcrle) and entropy coding (e.g., Vchuffgroup). Normally, the number of columns is learned directly from data. However, it can also be explicitly set via vcsetmtarg() by the parameter columns. Only data fitting the table profile will be processed. The unprocessed amount can be told via vcundone().

Vcrdb
This method transforms a relational data table to make it more compressible. The size of unprocessed data can be told with vcundone(). A relational data table is of one of two forms:
Flat:

This is a collection of records such that each of which terminates with a record separator. Within each record is a sequence of fields each of which terminates with a field separator. Field and record separators can be set via vcmtsetarg() using fsep and rsep. The attribute full can be used to coerce padding of field values so that they all have the same lengths. All attributes can be set in the init argument of vcopen().
Schematic:

This is a collection of records such that each of which consists of a set of fields, each of a fixed length. The lengths of such fields can be set via schema=[f1,f2,...] either via the init argument of vcopen() or via vcmtsetarg().

Vcrelation
This method is similar to Vcrdb for flat relational data tables but uses a faster algorithm with comparable or better performance for this class of data. Field and record separators can be set via vcmtsetarg() using fsep and rsep.

Vcmap
This method maps each byte to another byte. An application-specific mapping can be specified via the field data of the discipline structure. It must be a Vcchar_t array of size 256 where each position has its mapped value. Predefined mappings are provided to translate between ASCII and various versions of EBCDIC. These can be selected via init argument of vcopen():
a2e, e2a:

These pertain to the EBCDIC character set defined by the X/OPEN dd(1) command.
a2i, i2a:

These pertain to the EBCDIC character set defined the dd(1) command on IBM systems.
a2h, h2a:

These pertain to the EBCDIC character set defined on IBM-37 AS/400 systems.
a2o, o2a:

These pertain to the EBCDIC character set defined on IBM OpenEdition Unix systems.
a2s, s2a:

These pertain to the EBCDIC character set defined on Siemens Posix systems.

DISCIPLINE AND EVENT HANDLING

A delta compression method such as Vcdelta requires source data to compare against. Such information must be supplied by an application via the discipline structure Vcdisc_t:

typedef struct
{ Void_t*    data;
       size_t     size;
       Vcevent_f  eventf;
} Vcdisc_t;

int (*eventf)(Vcodex_t* dt, int type, Void_t* data, Vcdisc_t* disc)
If not NULL, eventf announces various events. Unless noted otherwise, a negative value causes the calling operation to terminate with failure while a non-negative return value let the calling function proceed normally. Following are the events:
VC_OPENING:

When data is NULL, this announces the opening of the handle vc. Otherwise, an error has happened during initialization and data was the method used to open the handle,
VC_CLOSING:

The handle vc is being closed. The argument data will be NULL in this case.
VC_DISC:

The current discipline of vc is being changed to the new one given in (Vcdisc_t*)data.

Vcdisc_t* vcdisc(Vcodex_t* vc, Vcdisc_t* disc);
This changes the discipline of vc to disc. On success, vcdisc() returns the previous discipline if it is not NULL; otherwise, it returns disc. On failure, vcdisc() returns NULL. See also the methods Vctable and Vctranspose.

vcgetdisc(Vcodex_t* vc);
This macro function retrieves the discipline for vc.

VCDELTA TYPES AND FUNCTIONS

The types Vcdinst_t, Vcdcode_t and Vcdtable_t describe the structures of a code table and the instructions. Their usage is described in the paper IETF RFC draft-korn-vcdiff-xx.txt (http://www.ietf.org).

ssize_t vcdputtable(Vcdtable_t* table, Void_t* buf, size_t n)
This function encodes the given table and stores the result in buf. It returns the output size on success and -1 on error.

int vcdgettable(Vcdtable_t* table, Void_t* buf, size_t n)
This function decodes the data in buf into a code table which is stored in the space pointed to by table. It returns 0 on success and -1 on error.

WINDOW MATCHING METHODS FOR DELTA COMPRESSION

This collection of types and functions provides a subpackage for computing matching windows to help with delta compresssion.

Vcwindow_t* vcwopen(Vcwdisc_t* disc, Vcwmethod_t* meth);
This opens a handle for computing matching windows based on the given method meth. The discipline structure, disc, provides source and target file handles and an event handling function. Source and target file handles, if given, should be opened for reading. Currently, the event handling function is only called with VCW_OPENING and VCW_CLOSING. If meth is NULL, the handle is opened for decoding data. Below are the methods:
Vcwmirror;

This method reuses the position of the data in the target file as the tentative start of the computed matching window in the source file. It may add a small amount of data around that position to enhance string matching.
Vcwvote:

This method uses the frequencies of the n-grams (for some private n) in a target window of data and a voting algorithm to find a matching window in either the source file or the target file.
Vcwprefix:

This method uses prefix matching on blocks of data to determine windows matching portions of target data.

int vcwclose(Vcwindow_t* vcw);
This closes the handle vcw.

Vcwmatch_t* vcwapply(Vcwindow_t* vcw, Void_t* data, size_t size, Sfoff_t pos);
This applies the windowing method of vcw to compute a suitable matching window to the given data segment. pos gives the starting position of the data segment in the target file. If the handle was opened with both a source and a target files, the computed matching window can be anywhere in the source file but it is restricted to the part of the target file before pos. A matching window, if found, is returned in a Vcwmatch_t structure with elements:
type:

This is either VCD_SOURCEFILE or VCD_TARGETFILE to indicate which file the matching window is from.
wdata, wsize, wpos:

These give the desired window data and the position in the respective file.
msize:

This gives the amount of data from data that should be used for compressing against the computed window data.

ssize_t vcwfeedback(Vcwindow_t* vcw, ssize_t size);
This tells vcw the compression result from the use of the window computed in the last vcwapply() call. Certain methods may use this information to focus window searching.

Vcwmethod_t* vcwgetmeth(char* name);
This function searches for a window matching method matching name. It returns the method or NULL on failure.

int vcwwalkmeth(Vcwalk_f walkf, Void_t* disc);
This walks the list of window matching methods in the same manner as vcwalkmeth().

HUFFMAN CODING TYPES AND FUNCTIONS

The compression methods Vchuffman, Vchuffgroup and Vchuffpart are based on the Huffman encoding. The functions described here provide efficient algorithms for encoding and decoding of certain basic data structures required in Huffman encoding.

ssize_t vchsize(ssize_t* freq, ssize_t* size, int* runb)
This function computes the code lengths in bits for all bytes given their frequency as defined in freq[]. Code lengths are returned in size[]. If there is a single run, the run byte is returned in *runb. vchsize() returns the maximum length of any code.

ssize_t vchbits(ssize_t* size, Vcbits_t* bits)
This function computes the actual codes to encode each byte based on the sizes given in size[]. It returns the maximum code size.

int vchcopy(Vcodex_t* vc, ssize_t* freq, ssize_t* size, in maxs)
This function sets the frequency and size arrays in handle vc if they are not NULL. maxs is the maximum code size as given given in size[]. vc will use these arrays for encoding instead of computing them from the data being encoded. vchcopy() returns 0 on success and -1 on failure.

ssize_t vchputcode(ssize_t* size, ssize_t maxs, Vcchar_t* data, size_t dtsz)
The Huffman code is completely specified by the lengths of the codes as given in the size array. This function encodes the size[] array in a compressed portable form suitable for output. The data array should have a sufficiently large size dtsz to store the result. It returns the length of the result in bytes.

ssize_t vchgetcode(ssize_t* size, ssize_t maxs, Vcchar_t* data, size_t dtsz)
This function decodes the size[] array from the encoded data given in data. It returns the number of bytes consumed from data for decoding size[].

FIXED-LENGTH TABLE TRANSFORM TYPES AND FUNCTIONS

Data in tables with fixed number of columns can be better compressed by transforming the data of each column based on whether or not it can be predicted well by some other column. The data types and functions needed for such a data transformation are described below.

typedef struct _vctblcolumn_s /* column specifications   */
{ ssize_t        index;  /* column index                 */
       ssize_t        pred1;  /* <0 if self compressing       */
       ssize_t        pred2;  /* if pred1<0, 1 for doing MTF  */
} Vctbltrans_t;

typedef struct _vctblplan_s /* transform plan */ { ssize_t ncols; /* row size or # of columns */

       Vctblcolumn_t* trans;  /* the plan to transform data   */
       Vcodex_t*      zip;    /* Vcdelta to code plan         */
} Vctblplan_t;

Vctbplan_t* vctblmakeplan(Void_t* data, size_t size, size_t ncols, int flags);
This function uses the given data to compute a plan that can be used to compress table with fixed-length rows. vctblmakeplan() returns the plan on success or NULL on failure. The argument ncols tells the number of columns. The argument flags should be a combination of the below bits:
VCTBL_RLE: use run-length compressive entropy to measure dependency strength (default).

VCTBL_CEE: use conditional empirical entropy to measure dependency strength.

VCTBL_LEFT: use left to right dependency relations (default).

VCTBL_RIGHT: use right to left dependency relations (default).

VCTBL_SINGLE: use only a single predictor. Default is two predictors.

void vctblfreeplan(Vctblplan_t* plan);
This function frees a plan computed by vctblmakeplan().

ssize_t vctblencodeplan(Vctblplan_t* plan, Void_t** codep);

Vctblplan_t* vctbldecodeplan(Void_t* code, size_t cdsz);
vctblencodeplan() computes from the given plan a string suitable for permanent storage. It returns the length of the string on success and -1 on failure. The code string itself is returned in *codep on success. vctbldecodeplan() reconstructs the plan encoded in the given string.

RELATIONAL DATA TRANSFORM TYPES AND FUNCTIONS

Each relational data table consists of a number of records where each record contains a number of fields. We allow fields to be any combinations of being fixed-length or delineated by some separators. The collection of data types and functions provided here enable construction of on-line data structures suitable for manipulating relational data tables. In particular, the type Vcrdtable_t describes the structure of a relational dataset. In the case when the data are records and fields purely delineated by record and field separators, a record may be allowed to not containing a full complement of fields. In such a case, the on-line structure fills in missing fields with data consisting of single record separators so that the table appears as if all records have the same number of fields.

ssize_t vcrdsepar(Vcrdsepar_t* rds, ssize_t nrds, Vcchar_t* data, ssize_t dtsz, int rsep);
A relational dataset may consist of records separated by some record separator. In addition, each record consists of fields separated by some field separator. Often, a dataset may be given without specification of either of these separators. This function computes candidates for record and field separators for such a dataset.
rds, nrds: These define an array with nrds elements of the type Vcrdsepar_t.

vcrdsepar() returns up to nrds best candidate separators in this array.
data, dtsz: These provide the data used in computing the separator candidates.

rsep: If this is non-negative and less than 256, it defines the record separator.

In that case, only the field separator will be computed.

Vcrdtable_t* vcrdparse(Vcrdinfo_t* info, Vcchar_t* data, ssize_t dtsz, int type);

ssize_t vcrdsize(Vcrdtable_t* tbl);

void vcrdclose(Vcrdtable_t* tbl);
These functions constructs and frees Vcrdtable_t structures to represent data by fields and records. vcrdsize() returns the actual amount of data from data used to construct the table after a vcrdparse() or vcrdfield() call (i.e., not all data may be used in such a call). Below are the arguments for vcrdparse():
info: This structure defines initial information about the table data.

The field flen, if not NULL, should be an array with positive fldn elements. Each element of flen[] can be (1) positive to define a fixed-length field, (2) zero to define a field delineated by a separator, and (3) negative to define a field delineated by some separator but having been filled in so that data across all records have the same length, the absolute value of the given value.
data, dtsz: The raw data of the table.

The fields recn and fldn of info must be positive if either data is NULL or dtsz <= 0. In this case, only a skeleton Vcrdtable_t structure will be constructed. Field data must be filled in later via vcrdfield().
type: This is either 0 or VCRD_FIELD. The latter means

that the given data is in a field-oriented format instead of the default record-oriented format. The fields recn and fldn of info must be positive if data is field-oriented.

ssize_t vcrdfield(Vcrdtable_t* tbl, ssize_t f, ssize_t n, Vcchar_t* data, ssize_t dtsz);
This call changes the data of a field in the table tbl. The field index is given in f while its length is given in n. The treatment of n is exactly the same as described above for elements in the flen[] array. vcrdfield() returns the size of data actually used.

vcrdattrs(Vcrdtable_t* tbl, ssize_t f, int attrs, int type);
This call sets the attributes of fields in the table tbl. If f is nonnegative, it indicates the single field to change. Otherwise, all fields will be considered to change. type can be zero or non-zero to turn off or on the given attributes. attrs is a bit combination of the below attributes:
VCRD_FULL:

If the field is delineated by a separator, values across records may have different lengths. When this attribute is on, such values will be padded with the separator as necessary to make all have the same length. Note that this padding is done virtually on data extraction only. See also vcrdextract().
VCRD_DOT:

Certain fields contain values such as floating point numbers or IP addresses are of the form xxx.yyy.zzz.... In the case of IP addresses, a value may also be ended in /nnn to indicate the number of bits to use. When this attribute is on, such a value will be converted into a fixed-length sequence of bytes, each representing a component of the value. The conversion is done only if all values of the field can be converted uniformly. On a successful conversion, the type attribute of the respective Vcrdfield_t will be set to contain VCRD_DOT and perhaps also VCRD_SLASH if the /nnn pattern was detected and converted.

ssize_t vcrdextract(Vcrdtable_t* tbl, ssize_t f, Vcchar_t** datap, int type);
This call extracts from table tbl the data of a field f if f is non-negative or the data of the entire table otherwise. The data array is returned in *datap and its size is the return value. type can be a bit combination of the following:
VCRD_FULL:

If this bit is on, records with missing fields will be filled in with the record separator as appropriate to make all records have the same number of fields. Note that variable-length fields may also be filled separately by their internal attributes as described in vcrdattrs().
VCRD_FIELD:

The default data orientation is by record. This bit causes data to be extracted in a field-oriented fashion. Note that this bit implies VCRD_FULL.

typedef struct _vcrdplan_s
{ ssize_t         fldn;   /* number of fields             */
       ssize_t         pred[]; /* predictor list               */
} Vcrdplan_t;

Vcrdplan_t defines the dependency relations among fields. fldn is the number of fields while pred[] gives the predictive relations. If pred[i] != i, field pred[i] predicts field i. Otherwise, i is unpredicted.

ssize_t vcrdweight(Vcrdtable_t* tbl, ssize_t f, ssize_t* vect, Vcrdweight_f wghtf);
This function computes the weight of field f in table tbl. The weight of a field is calculated by summing the weights of values in adjacent records. vect, if not NULL, is a vector used to reorder the records of fld before calculating its weight. wghtf, if not NULL, is a function to compute the weight of two given strings of data. When wghtf is NULL, an internal function is used which simply computes the length of the common prefix of the two given strings.

int vcrdvector(Vcrdtable_t* tbl, ssize_t f, Vcchar_t* trdt, ssize_t dtsz, int type);
This function computes a transform vector of field f in table tbl. It returns -1 on error and non-negative on success. On success, the return value 1 signifies that all values are equal.
type == VC_ENCODE:

If trdt is not NULL, it is memory of size dtsz. Record data will be transposed column by column into it based on the sort states up to a particular column. In this case, the internal transform vector, if any, will not be changed. On the other hand, if trdt is NULL, the internal transform vector will be computed if not yet done so.
type == VC_DECODE:

This inverts the data previously transformed by vcrdvector() and repopulates the field f. The transform vector will be updated to reflect the new data.

Vcrdplan_t* vcrdmakeplan(Vcrdtable_t* tbl, Vcrdweight_f wghtf);

void vcrdfreeplan(Vcrdplan_t* plan);
vcrdmakeplan() computes the field dependency in a relational data table to define a plan to transform the data for better compression. It returns NULL on error. wghtf, if not NULL or VCRD_NOPLAN, is a function to compute the weight of a pair of strings. vcrdfreeplan() frees all memory associated with the given plan.

int vcrdexecplan(Vcrdtable_t* tbl, Vcrdplan_t* plan, int type);
This function executes the given plan to transform the fields in table tbl. The argument type is either VC_ENCODE or VC_DECODE to indicate whether the data is being encoded or decoded. The function returns -1 on error.

SUFFIX ARRAY TYPES AND FUNCTIONS

typedef struct _vcsfx_s
{ ssize_t*      idx;    /* sorted suffix indices       */
       ssize_t*      inv;    /* inverse permuation of idx[] */
       Vcchar_t*     str;    /* string with sorted suffixes */
       size_t        nstr;   /* length of above string      */
} Vcsfx_t;

Vcsfx_t* vcsfxsort(const Void_t* str, size_t nstr);
vcsfxsort() sorts indices of suffixes of str to create a suffix array. The string is treated as if it ends with an element larger than all other bytes in the string. In this way, the order for all suffixes are well-defined. vcsfxsort() returns a structure of type Vcsfx_t as above on success or NULL on failure. The field Vcsfx_t.idx has the indices of all suffixes sorted lexicographically. The field Vcsfx_t.inv has the inverse permutation of Vcsfx_t.idx. That is, inv[idx[i]] == i.

ssize_t vcperiod(const Void_t* data, size_t dtsz);
vcperiod computes a quasi-period in the given data, if any. This is useful to determine the number of columns in table data.

GENERALIZED LEMPEL-ZIV PARSING

Certain transforms such as the delta compression method Vcdelta uses a greedy parsing scheme based on a combination of Lempel-Ziv'77 and Tichy's block-move. Source and target strings are conceptually concatenated into a superstring. The resulting data is then parsed starting from target data to construct a sequence of matching fragments.

As an example, consider parsing a target sequence abcde given the source sequence abcdabcdabcdf. The catenated sequence is abcdeabcdabcdabcdf so parsing may produce the below matching segments and a possible reconstruction plan from the Vcdelta transform with the shown COPY and ADD instructions:

5  4  0    /* position 5 matches 4 bytes starting at 0 */
9  8  5    /* position 9 matches 8 bytes starting at 5 */

COPY 4 0 /* abcd......... */ COPY 8 5 /* abcdabcdabcd. */ ADD 1 f /* abcdabcdabcdf */

int vclzparse(Vclzparse_t* vcpa, ssize_t bound);
This function parses the target data and calls vcpa->parsef on each parsed segments. Normally, hashing is used for string matching but if bound < 0 and the parsing does not use reverse matching or character mapping, then a suffix sorting algorithm is used for string matching. The handle vcpa is of type Vclzparse_t and specifies the data to be parsed and parsing parameters. The argument bound, if negative, means that a suffix sorting algorithm will be used for string matching. This tends to be slow but finds longer matches. If bound is non-negative, a hashing method is used for string matching. This is faster but may miss some long matches. Further, if bound is positive it provides a hint to prune the hash table of far away addresses.

typedef struct _vclzparse_s
{ Vclzparse_f parsef;   /* function to process a segment sequence   */
       Vcchar_t*   src;      /* the source string, if not NULL           */
       ssize_t     nsrc;     /* the length of the source data            */
       Vcchar_t*   tar;      /* the target string to be parsed           */
       ssize_t     ntar;     /* the length of the target data            */
       ssize_t     mmin;     /* the minimum length required for a match  */
       Vcchar_t*   cmap;     /* if !NULL, byte pairs to map for matching */
       int         type;     /* a combination of VCLZ_REVERSE|VCLZ_MAP   */
} Vclzparse_t;

typedef struct _vclzmatch_s { ssize_t tpos; /* the position of data in target string */

       ssize_t     mpos;     /* the matching position in source/target   */
       ssize_t     size;     /* the length of the data involved          */
} Vclzmatch_t;

If the bit VCLZ_REVERSE is on in the field Vclzparse_t.type, reverse matching is allowed. For example abc may match with cba as well as abc. Likewise, if the bit VCLZ_MAP is on, the field Vclzparse_t.map should be a string specifying pairs of characters that are allowed to match. For example, if map is "ATGCatcg", then A matches with T, G matches with C, and so on. This example pattern and reverse matching are useful for compressing DNA data.

ssize_t (*Vclzparse_f)(Vclzparse_t* vcpa, int type, Vclzmatch_t* mtch, ssize_t n);
This call-back function is invoked on sequences of data segments. It should return the amount of data actually consumed. A negative return value causes vclzparse() to stop and return immediately. A zero return value means that all the data covered by the given segments should be considered unmatchable.
vcpa:

The same handle that was passed to vclzparse().
type:

This is a combination of VCLZ_REVERSE and VCLZ_MAP to tell what type of matching was done.
mtch, n:

This is the list of parsed segments, each presented in a structure of type Vclzmatch_t. Only mtch[0] may have mpos < 0 to indicate a segment of unmatched data. Other elements should indicate actual matches.

BIT/BYTE/STRING I/O SUBSYSTEM

The below functions are provided by Vcodex for portable data encoding. These functions perform only minimal bound checking, if any. Applications should take care of such details.

void vcioinit(Vcio_t* io, Vcchar_t* buf, int n);
This initializes a Vcio_t structure io with the given buffer buf with size n bytes.

size_t vciosize(Vcio_t* io);
This returns the extent that the buffer has been written to or read from.

size_t vciomore(Vcio_t* io);
This returns the size of the remaining part of the buffer that has not yet been written to or read from.

Vcchar_t* vciodata(Vcio_t* io);
This returns the original data buffer.

Vcchar_t* vcionext(Vcio_t* io);
This returns the pointer to the next position in the buffer that can be written to or read from.

Vcchar_t* vcioskip(Vcio_t* io, int n);
This skips ahead n bytes and returns the resulting pointer to the buffer.

void vciomove(Vcio_t* to, Vcio_t* from, int n);
This moves n bytes between the given buffer and advances their positions accordingly.

ssize_t vcioputc(Vcio_t* io, int c);

int vciogetc(Vcio_t* io);
These functions write and read bytes respectively.

ssize_t vcioputs(Vcio_t* io, Void_t* s, size_t len);

ssize_t vciogets(Vcio_t* io, Void_t* s, size_t len);
These functions write and read arrays of bytes. They return the number of bytes written or read.

ssize_t vcioputu(Vcio_t io, Vcint_t v);

Vcint_t vciogetu(Vcio_t* io);
These functions write and read unsigned integers using a 7-bit encoding in the Sfio style. vcputu() returns the number of bytes used to encode the value.

ssize_t vcioputm(Vcio_t* io, Vcint_t v, Vcint_t max);

Vcint_t vciogetm(Vcio_t* io, Vcint_t max);
These functions write and read unsigned integers using an 8-bit encoding in the Sfio style. The size of the encoding depends on the magnitude of max. For example, if max is 255, only one byte is used while, if max is 256, two bytes will be used. vcputm() returns the number of bytes used to encode the value.

ssize_t vcioput2(Vcio_t* io, Vcint_t v, Vcchar_t a, Vcchar_t b);

Vcint_t vcioget2(Vcio_t* io, Vcchar_t a, Vcchar_t b);
These functions write and read unsigned integers using a compact binary coding with a and b as digits. vcput2() returns the number of bytes used to encode the value.

ssize_t vcioputg(Vcio_t* io, Vcint_t v);

Vcint_t vciogetg(Vcio_t* io);
These functions write and read integers using the Gamma code.

ssize_t vcioputlist(Vcio_t* io, Vcint_t* list, ssize_t n);

Vcint_t vciogetlist(Vcio_t* io, Vcint_t* list, ssize_t n);
These functions write and read lists of non-negative integers using a compact bit code. Note that the list size n given in vciogetl() must match exactly the list size given to vcioputl().

void vciosetb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, int type);

void vcioendb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, int type);

void vciofilb(Vcio_t* io, Vcbits_t bits, ssize_t nbit);

void vcioflsb(Vcio_t* io, Vcbits_t bits, ssize_t nbit);

Vcbits_t vcioaddb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, Vcbits_t add, ssize_t nadd);

Vcbits_t vciodelb(Vcio_t* io, Vcbits_t bits, ssize_t nbit, ssize_t ndel);

These macro functions constitute a set of operations for fast bit I/O.

The variables bits and nbit are declared by the application and will be asociated with the handle io via vciosetb(). The association is for either encoding, i.e., type=VC_ENCODE, or decoding, i.e., type=VC_DECODE, and must be done before any bit operations.

vcioendb() must be called to finalize bit I/O to get ready for other forms of I/O.

vciofilb() fills bits with some number of bits available from io. nbit is updated to indicate the number of available bits.

vcioflsb() flushes the bits available in bits out to io. Bits are flushed in 8-bit aggregates into bytes. nbit is updated to indicate the number of bits remained unflushed in bits.

vcioaddb() adds nadd bits from add to bits. It is the caller's respobsibility to ensure that there is room in bits to add the new bits. Otherwise, bits may be lost.

vciodelb() takes off ndel number of bits from bits. Such bits are lost forever.

STREAM PROCESSING

The functions described in this section process data from a stream. Depending on local environments, these functions are either based on the Sfio library or on Stdio, the ANSI-C I/O interface.

Vcsfio_t
The type Vcsfio_t represents a stream for performing I/O. When Stdio is used, it is an opaque type keeping state information needed for data transformation. In this case, the I/O functions vcsfwrite() and vcsfread() should be used to read and write data. When Sfio is used, it is defined as Sfio_t. In this case, all normal Sfio operations could be used on the stream. Functions such as vcsfread() and vcsfwrite() will be macro redefinitions of their corresponding Sfio functions.

Vcsfdata_t
The type Vcsfdata_t is used to pass initialization data to vcsfio().

typedef struct _vcsfdata_s
{       int         type;   /* various types of processing  */
             char*       trans;  /* transformation specification */
             char*       window; /* window specification         */
             char*       source; /* source file to delta against */
             Vcsferror_f errorf; /* to process errors            */
} Vcsfdata_t;

type:

This is composed from two bits: VCSF_VCDIFF for outputting a header as defined by the IETF RFC3284 for the VCDIFF delta encoding format or VCSF_PLAIN for outputting no header at all. If neither bits are set, a header based on methods used will be output.
trans:

This is the transform specification. See section ALIAS PROCESSING for details.
window:

This specifies window size and window matching method. The window size if defined should come first and be an integer with optional suffix k for kilobyte or m for megabyte. This window size defines the maximum amount of data to be transformed at one time. A window matching method should be defined starting with a period then the name of the method.
source:

Source file to delta-compress against if the first method is a delta compressor.
errorf:

This function announces an error encountered during data processing. It is called as (*errorf)(char* string) where string tells the error type.

Vcsfio_t* vcsfio(Sfio_t* sf, Vcsfdata_t* sfdt, int type);
This function constructs a Vcsfio_t handle to perform encoding or decoding on the stream sf depending on whether type is VC_ENCODE or VC_DECODE. When the Sfio library is in use, the same stream sf is returned. When the Stdio library is in use, the type Sfio_t is a macro redefinition of the FILE type while Vcsfio_t is an opaque type as described earlier.

The structure sfdt provides the specification for methods used to transform data. The method specification sfdt->spec must be supplied on encoding and also on decoding if sfdt->type has the bit VCSF_PLAIN. If type is VC_DECODE, sfdt can be NULL. In that case, the transforms needed to decode data will be obtained from starting data in the stream.

ssize_t vcsfread(Vcsfio_t* vcsf, Void_t* buf, size_t n);
This function reads from a stream of encoded data and decode them into the given buffer buf. It returns the length of the decoded data on success and a negative value on failure.

ssize_t vcsfwrite(Vcsfio_t* vcsf, const Void_t* data, size_t n);
This function writes the raw data given in data to the stream vcsf. Such data will be encoded before written out to the underlying data stream. On success, the amount of raw data written is returned. On failure, a negative value is returned.

int vcsfsync(Vcsfio_t* vcsf);
This function flushes unwritten data out to the underlying stream.

int vcsfclose(Vcsfio_t* vcsf);
This function closes the stream vcsf. Unwritten data will be flushed out to the underlying stream first.

ALIAS PROCESSING

void vcaddalias(char** list);
This function adds a set of aliases or short-hands for longer sequences of method specification. The given list is null-terminated.

Each item on list is a string of the form alias~=~specification. Each specification is a list of comma-separated elements. Each element should be a transform name followed by any arguments. The arguments are separated by '.'. Each argument may be a name or a name=value pair. In the latter case, value can have C characters embedded in them. For example, consider the below two specifications:

         rdb.fsep=[\t],bwt,mtf,rle.0,huffgroup
         rdb.schema=[10,20,30],table,mtf,rle.0,huffgroup

The first specification says that the Vcrdb transform of relational data should be used first. In this case, the data file is a flat file database with text lines as records and the field separator is the tab character as defined by fsep=[\t]. After rdb is done processing, it will pass its output onto the Burrows-Wheeler transform bwt which, in turn, will invoke the move-to-front transform mtf, and so on.

The second specification also invokes the Vcrdb first but this time the data consists of records with three fixed-length fields of lengths 10, 20 and 30 in that order. After rdb is done processing, it will invoke the table transform to process each field.

char* vcgetalias(char* alias, char* spec, ssize_t size);
This function returns the specification corresponding to the given alias. The arguments spec and size define a buffer to return the specification in. vcgetalias() returns the specification or NULL on failure.

int vcwalkalias(Vcwalk_f walkf, Void_t* disc);
This function invokes (*walkf)(0, name, spec, disc) on each defined alias.

MISCELLANIES

double vclog(size_t v);

ssize_t vclogi(size_t v);
vclog() computes the base-2 logarithm of v. vclogi() only returns the integer part.

ssize_t vcsizeu(Vcint_t v);
This macro function returns the number of bytes needed to encode v using vcioputu().

Vcint_t vcatoi(char* buf);

ssize_t vcitoa(Vcint_t v, char* buf, ssize_t n);
These functions convert a string to an integer and vice versa. vcitoa() requires that n be sufficiently large to store the constructed null-terminate string. It returns the string length.

Vcint_t vcintcode(Vcint_t v, Vcint_t near, Vcint_t min, Vcint_t max, int type);
This function encodes/decodes the integer v based on whether type is VC_ENCODE or VC_DECODE. v and near should be in the inclusive range [min,max]. v is coded assuming that it is close to near. This type of integer coding is useful to reduce data representation in a transform such as Vcsieve which tend to code sequence of addresses that are close to one another.

char* vcstrcode(char* s, char* buf, ssize_t n);
This function translates the characters in the string s from its native code set to the ASCII code set which can be used for portable persistent data. It returns buf which should have the translated string on success and NULL on error.

typedef int (*Vccompare_f)(void* elt1, void* elt2, void* disc);

void vcqsort(void* list, ssize_t nelts, ssize_t eltsize, Vccompare_f comparf, void* disc);
vcqsort() is like the qsort() function but it allows specifying an element disc to pass on to the comparison function comparf. This is useful to provide data about elements being compared.

ssize_t vcbcktsort(ssize_t* sort, ssize_t* list, ssize_t n, Vcchar_t* data, ssize_t bckt256);
This function sorts indices given in list and returns them in sort. The number of element is n and the bytes used to sort are given in data. The bckt[] array returns the cumulative counts of bytes. For example, bckt[0] is the count of byte 0, bckt[1] is the count of bytes 0 and 1, and so on.

char* vcsubstring(char* data, int separ, char* buf, ssize_t bufsz, int type);
This function extracts a substring of data and returns it in the buffer buf with a null byte appended. vcsubstring() returns a pointer to the remainder of data. Substrings are separated by separ. Characters in C-notation are allowed. Data can also be grouped by square brackets. The backslash escapes the next character.
type == 0:

Only search for separ but still respecting backslashes and grouping by brackets.
type == 1:

The backslash character means special treatment of characters as in the C language. The left square bracket means to obtain every character until the appearance of an un-escaped right square bracket. The brackets will be left out of the returned data.

AUTHOR

Kiem-Phong Vo

ACKNOWLEDGMENTS

David Korn helped designing the VCDIFF data format standardized in RFC3284. Binh Vo invented the table compressor Vctable. He also helped designing a number of transforms such as Vchuffgroup, Vchuffpart and Vcmtf.


Jun 26 2007 March 09, 2010