Vcodex_t* vcopen(Vcdisc_t* disc, Vcmethod_t* meth,
Void_t* init, Vcodex_t* coder, int flags);
Vcodex_t* vccreate(char* spec, int flags);
ssize_t vcextract(Vcodex_t* vc, Void_t** datap);
Vcodex_t* vcrestore(Vcodex_t* vc, Void_t* data,
size_t size);
int vcclose(Vcodex_t* vc);
ssize_t vcapply(Vcodex_t* vc, Void_t* data,
size_t size, Void_t** output);
ssize_t vcrecode(Vcodex_t* vc, Vcchar_t** datap,
ssize_t* sizep, ssize_t head);
size_t vcundone(Vcodex_t* vc);
Vcchar_t* vcbuffer(Vcodex_t* vc, Vcchar_t* trunc,
ssize_t size, ssize_t head);
int vctellbuf(Vcodex_t* vc, Vcbuffer_f bufferf);
Vccontext_t* vcinitcontext(Vcodex_t* vc, Vccontext_t* ctxt);
Vccontext_t* vcfreecontext(Vcodex_t* vc, Vccontext_t* ctxt);
type vcgetcontext(vc, type);
typedef struct _vcmethod_s
{ Vcapply_f encodef; /* function to encode a data set */
Vcapply_f decodef; /* function to decode a data set */
int (*eventf)(Vcodex_t*, int, Void_t*);
char* name; /* the name of the transform */
char* desc; /* a description of the transform */
Vcmtarg_t* args; /* arguments usable with vcopen() */
ssize_t window; /* suggested window size */
int type; /* VC_MTSOURCE */
} Vcmethod_t;
typedef struct _vcmtarg_s
{ char* name; /* the name of the argument */
char* desc; /* a description of the argument */
Void_t* data; /* any associated data */
} Vcmtarg_t;
typedef struct _vcmtcode_s
{ Vcchar_t* data; /* encoded data of a handle */
ssize_t size; /* size of encoded data */
Vcodex_t* coder; /* reconstructed coder */
} Vcmtcode_t;
#define VC_OPENING
#define VC_CLOSING
#define VC_INITCONTEXT
#define VC_FREECONTEXT
#define VC_FREEBUFFER
#define VC_SETMTARG
#define VC_EXTRACT
#define VC_RESTORE
#define VC_GETIDENT
#define VC_TELLBUFFER
int vcaddmeth(Vcmethod_t** list, ssize_t size);
Vcmethod_t* vcgetmeth(char* name, int type);
int vcwalkmeth(Vcwalk_f walkf, Void_t* disc);
char* vcgetident(Vcmethod_t* meth, char* buf, ssize_t size);
char* vcgetmtarg(char* data, char* val, ssize_t vlsz,
Vcmtarg_t* list, Vcmtarg_t** arg);
int vcsetmtarg(Vcodex_t* vc, char* name, void* val,
int type);
type vcgetmtdata(Vcodex_t* vc, type);
void vcsetmtdata(Vcodex_t* vc, data);
typedef struct {...} Vcwindow_t;
typedef struct {...} Vcwmethod_t;
typedef int (*Vcwevent_f)(Vcw_t* vcw, int type, Void_t* data,
Vcwdisc_t* disc);
typedef struct
{ Sfio_t* srcf; /* source file */
Sfio_t* tarf; /* target file */
Vcwevent_f eventf; /* event handler */
} Vcwdisc_t;
typedef struct
{ int type; /* VCD_[SOURCE|TARGET]FILE */
Void_t* wdata; /* window data */
ssize_t wsize; /* window length */
Sfoff_t wpos; /* starting position in file */
ssize_t msize; /* data size to be matched */
} Vcwmatch_t;
Vcwmethod_t* Vcwmirror; /* match by file positions */
Vcwmethod_t* Vcwvote; /* match by n-gram frequencies */
Vcwmethod_t* Vcwprefix; /* use prefix matching */
Vcwindow_t* vcwopen(Vcwdisc_t* disc, Vcwmethod_t* meth);
int vcwclose(Vcwindow_t* vcw);
Vcwmatch_t* vcwapply(Vcwindow_t* vcw, Void_t* data,
size_t size, Sfoff_t pos);
ssize_t vcwfeedback(Vcwindow_t* vcw, ssize_t size);
Vcwmethod_t* vcwgetmeth(char* name);
int vcwwalkmeth(Vcwalk_f walkf, Void_t* disc);
ssize_t vchsize(ssize_t* freq, ssize_t* size, int* runb);
ssize_t vchbits(ssize_t* size, Vcbits_t* bits);
int vchcopy(Vcodex_t* vc, ssize_t* freq,
ssize_t* size, ssize_t maxs);
ssize_t vchputcode(ssize_t* size, ssize_t maxs,
Vcchar_t* data, size_t dtsz);
ssize_t vchgetcode(ssize_t* size, ssize_t maxs,
Vcchar_t* data, size_t dtsz);
#define VCRD_FIELD /* given data is field-oriented */
#define VCRD_FULL /* make fields or records full */
#define VCRD_FIXED /* fixed-length field */
#define VCRD_DOT /* field was of the form x.y... */
#define VCRD_SLASH /* like above but form x.y.../z */
typedef struct _vcrdsepar_s
{ int fsep; /* computed field separator */
int rsep; /* record separator */
ssize_t size; /* actual amount of data used */
ssize_t recn; /* total number of records */
ssize_t recf; /* # records containing an fsep */
double allz; /* (entropy)size for all data */
double fldz; /* total (entropy)size by fields */
} Vcrdsepar_t;
typedef struct _vcrdinfo_s
{ ssize_t recn; /* number of records, if known */
ssize_t fldn; /* number of fields, if known */
ssize_t* flen; /* field lengths, if known */
int fsep; /* field separator, if needed */
int rsep; /* record separator, if needed */
} Vcrdinfo_t;
typedef struct _vcrdrecord_s
{ Vcchar_t* data; /* data in a field for a record */
ssize_t dtsz; /* size of data */
} Vcrdrecord_t;
typedef struct _vcrdfield_s
{ int type; /* field type data */
Vcrdrecord_t* rcrd; /* list of records for this field */
ssize_t maxz; /* maximum size of any record */
ssize_t* vect; /* transform vector by this field */
Vcchar_t* data; /* field data if allocated here */
Vcchar_t* exdt; /* space for extracted data */
} Vcrdfield_t;
typedef struct _vcrdtable_s
{ Vcrdinfo_t* info; /* information on table data */
ssize_t parz; /* size of parsed data */
ssize_t recn; /* number of records */
ssize_t fldn; /* number of fields */
Vcrdfield_t* fld; /* list of fields */
ssize_t* vtmp; /* temp space of recn elements */
Vcchar_t* exdt; /* space for extracted data */
} Vcrdtable_t;
typedef struct _vcrdplan_s
{ ssize_t fldn; /* number of fields */
ssize_t pred[]; /* predictor list */
} Vcrdplan_t;
ssize_t vcrdsepar(Vcrdsepar_t* rds, ssize_t nrds,
Vcchar_t* data, ssize_t dtsz, int rsep);
Vcrdtable_t* vcrdparse(Vcrdinfo_t* info,
Vcchar_t* data, ssize_t dtsz, int type);
void vcrdclose(Vcrdtable_t* tbl);
ssize_t vcrdfield(Vcrdtable_t* tbl, ssize_t f, ssize_t n,
Vcchar_t* data, ssize_t dtsz);
int vcrdattrs(Vcrdtable_t* tbl, ssize_t f, int attrs,
int type);
ssize_t vcrdsize(Vcrdtable_t* tbl);
ssize_t vcrdextract(Vcrdtable_t* tbl, ssize_t f,
Vcchar_t** datap, int type);
typedef ssize_t (*Vcrdweight_f)(Vcchar_t* str1, ssize_t n1,
Vcchar_t* str2, ssize_t n2);
ssize_t vcrdweight(Vcrdtable_t* tbl, ssize_t f,
ssize_t* vect, Vcrdweight_f wghtf);
int vcrdvector(Vcrdtable_t* tbl, ssize_t f,
Vcchar_t* trdt, ssize_t dtsz, int type);
Vcrdplan_t* vcrdmakeplan(Vcrdtable_t* tbl, Vcrdweight_t wghtf);
void vcrdfreeplan(Vcrdplan_t* plan);
int vcrdexecplan(Vcrdtable_t* tbl, Vcrdplan_t* plan,
int type);
void vcioinit(Vcio_t* io, Vcchar_t* buf, int n);
size_t vciosize(Vcio_t* io);
size_t vciomore(Vcio_t* io);
Vcchar_t* vciodata(Vcio_t* io);
Vcchar_t* vcionext(Vcio_t* io);
Vcchar_t* vcioskip(Vcio_t* io, int n);
void vciomove(Vcio_t* to, Vcio_t* from, int n);
ssize_t vcioputc(Vcio_t* io, int c);
int vciogetc(Vcio_t* io);
ssize_t vcioputu(Vcio_t* io, Vcint_t v);
Vcint_t vciogetu(Vcio_t* io);
ssize_t vcioputm(Vcio_t* io, Vcint_t v, Vcint_t max);
Vcint_t vciogetm(Vcio_t* io, Vcint_t max);
ssize_t vcioput2(Vcio_t* io, Vcint_t v,
Vcchar_t a, Vcchar_t b);
Vcint_t vcioget2(Vcio_t* io, Vcchar_t a, Vcchar_t b);
ssize_t vcioputg(Vcio_t* io, Vcint_t v);
Vcint_t vciogetg(Vcio_t* io);
ssize_t vcioputlist(Vcio_t* io, Vcint_t* list, ssize_t n);
Vcint_t vciogetlist(Vcio_t* io, Vcint_t* list, ssize_t n);
ssize_t vcioputs(Vcio_t* io, Void_t* s, size_t len);
ssize_t vciogets(Vcio_t* io, Void_t* s, size_t len);
void vciosetb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
int type);
void vcioendb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
int type);
void vciofilb(Vcio_t* io, Vcbits_t bits, ssize_t nbit);
Vcbits_t vciodelb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
ssize_t ndel);
void vcioflsb(Vcio_t* io, Vcbits_t bits, ssize_t nbit);
Vcbits_t vcioaddb(Vcio_t* io, Vcbits_t bits, ssize_t nbit,
Vcbits_t add, ssize_t nadd);
double vclog(size_t v);
size_t vclogi(size_t v);
size_t vcsizeu(Vcint_t v);
Vcint_t vcatoi(char* buf);
ssize_t vcitoa(Vcint_t v, char* buf, ssize_t n);
Vcint_t vcintcode(Vcint_t v, Vcint_t near,
Vcint_t min, Vcint_t max, int type);
char* vcstrcode(char* s, char* buf, ssize_t n);
typedef int (*Vccompare_f)(void* elt1, void* elt2, void* disc);
void vcqsort(void* list, ssize_t nelts, ssize_t eltsize,
Vccompare_f comparf, void* disc);
ssize_t vcbcktsort(ssize_t* idx, ssize_t* tmp, ssize_t n,
Vcchar_t*, ssize_t* maxc);
char* vcsubstring(char* data, int separ,
char* buf, ssize_t bufsz, int type);
Vcodex provides a collection of data transforms including
compression, differencing and encryption via a discipline and method
interface
(see The Discipline and Method Architecture for Reusable Libraries,
Software Practice & Experience, v.30, pp.107-128, 2000).
This type is used to pass data between
Vcodex and application code.
Void_t is defined as
void for ANSI-C and C++
and
char for older compilation environments.
This defines the type of bytes as manipulated by Vcodex.
This defines an integer type of maximum supported size.
This defines an integer type suitable for bit manipulation and I/O.
This defines the type of a discipline structure.
This defines the type of a data transform.
This defines the type of a structure describing arguments to a method.
This defines the type of a function to be invoked while walking
some list of data structures. Its prototype is:
int (*Vcwalk_f)(Void_t* obj, char* name, char* value, Void_t* disc);
- obj:
-
This is the object being walked. For example, if the list of methods
is being walked, this would be a pointer to a Vcmethod_t object.
- name, value:
The name and value of the object being walked.
- disc:
The structure passed along to the invoking walk function.
See also vcwalkmeth() and vcwalkalias().
This defines the type of a Vcodex handle.
vcopen() creates a handle to process data.
It returns the new handle on success or
NULL on failure.
- disc:
A discipline structure to describe data (see DISCIPLINE section).
- meth:
a data transform method (see METHODS section).
- init:
Any additional parameters for the given method.
- coder:
This specifies a continuation coder, i.e.,
another handle to process various data parts produced by
the main transform.
- flags:
This gives the bit values to control behaviors of the new handle.
Exactly one of VC_ENCODE and VC_DECODE should be specified
to indicate whether the handle is used for encoding or decoding.
The bit VC_CLOSECODER, if specified, indicates that
the continuation coder, if any, should be closed on handle closing.
This function creates a composite handle based on the specification of
a sequence of methods. It returns the composite handle on success and
NULL on failure.
See section ALIAS PROCESSING for the syntax of a specification.
This function extract data from a composite encoding handle
vc
that can be used later by
vcrestore() to construct a composite decoder.
The data is returned in
*storep.
vcextract() returns the data length on success and
-1 on failure.
vcrestore() constructs a composite decoding handle from the given data.
It returns the handle on success and
NULL on failure.
This closes the
vc handle,
meaning to free all associated memory resources.
vcclose() returns
0 on success or
-1 on error.
This applies the data transform associated with
vc
to the given
data.
If the handle was opened for decoding,
the given
data and
size should
match exactly the previously encoded result.
vcapply() returns the length of the output data buffer or
-1 on failure.
If
output == NULL, only the size of the result will be returned.
If
output != NULL, the transformed data will be returned
in
*output (see also
vcundone() below.)
Note that each call to
vcapply() returns a separate buffer
of encoded or decoded data. These buffers should be freed as soon
as the respective data have been consumed. See
vcbuffer() for
how to do that.
This function is intended to be used by method writers for continuation of
data process in secondary coders. It invokes
vcapply()
on the continuation coder to transform the data given
in
*datap and
*sizep. The transformed data and its size are returned
in the same arguments.
head, if positive, causes buffer allocation in secondary
coders to leave that much room in front of the transformed data.
In this way, successive calls of
vcrecode() can leave an accumulated amount
of header space for adding data later by the callers.
vcrecode() returns the size of the transformed data on success and
-1 on error.
Certain methods (e.g.,
Vctable) may not process all given data.
After a successful
vcapply() call (i.e., it returns
a non-negative value),
the call
vcundone() tells the amount of data left unprocessed.
This function manages the buffer pool for a handle.
It always returns
NULL on freeing buffers.
For buffer allocation, it returns the allocated buffer or
NULL on failure.
When trunc != NULL, it specifies a buffer in vc
or one of the continuation coders to be truncated down to size or freed
if size < 0.
When trunc == NULL, there are two cases.
If size is negative, all buffers allocated via
vc and continuation coders are freed.
Otherwise, a new buffer of the given size is allocated.
This buffer is guaranteed to have at least n writable
bytes in front of the allocated buffer where n is head
plus some other amount as defined by various vcrecode() calls
during continuation processing.
This function walks though all handles
starting from
vc calls
bufferf() on the lists
of buffers in these handles.
The needed types are:
typedef struct _vcbuffer_s Vcbuffer_t;
struct _vcbuffer_s
{ Vcbuffer_t* next;
size_t size; /* total buffer size */
char* file; /* file allocating it */
int line; /* line number in file */
unsigned char buf[1]; /* actual data buffer */
};
int (*Vcbuffer_f)(Vcodex_t* vc,
Vcbuffer_t* list, Vcmethod_t* meth);
Here, list is a linked list of buffers of the type Vcbuffer_t
and meth is the method used for the given handle.
The function creates a new context if
ctxt is
NULL
or makes
ctxt the current context if it is not
NULL. It returns the
current context or
NULL on error.
In certain situations, states may be retained across calls of vcapply() or vcrecode().
For example, the Vctable transform computes and reuses
a transformation plan repeatedly as appropriate.
A handle using such a transform should create separate contexts
to keep states for different table types.
In particular, a transform such as Vctable could extend
Vccontext_t by including it as the first element
of a private context type. In that case, the method should also supply an
eventf function capable of handling the events VC_INITCONTEXT
and VC_FREECONTEXT. The first allocates a new context while the latter
frees a given one.
This function deletes the given context
ctxt.
This macro function retrieves the current context of
vc and
coerces that to its given private
type.
Methods are of the type Vcmethod_t as defined below:
typedef struct _vcmethod_s
{ Vcapply_f encodef; /* function to encode data */
Vcapply_f decodef; /* function to decode data */
int (*eventf)(Vcodex_t*, int, Void_t*);
char* name; /* the string name of the transform */
char* desc; /* a short description */
Vcmtarg_t* args; /* arguments usable with vcopen() */
ssize_t window; /* suggested window size */
int type; /* VC_MTSOURCE */
} Vcmethod_t;
- encodef, decodef:
These are functions to encode and decode data.
- eventf:
If not NULL, this is a function to handle events.
- args:
This defines a list of arguments usable to initialize the method.
See ALIAS PROCESSING for details on how to specify arguments.
Each element of the list is of the type Vcmtarg_t:
typedef struct _vcmtarg_s
{ char* name; /* the name of the argument */
char* desc; /* a description of the argument */
Void_t* data; /* any associated data */
} Vcmtarg_t;
- type:
This provides flags telling certain characteristics of the method.
Currently only VC_MTSOURCE is used to indicate that
the method is a delta compressor.
The list of events passable to eventf are:
- VC_OPENING:
A new handle is opened. The third argument is data for initialization.
- VC_CLOSING;
A handle is being closed. All private method data should be deallocated.
- VC_INITCONTEXT:
Create a new context for data transformation.
- VC_FREECONTEXT:
Delete the given context.
- VC_FREEBUFFER:
Free any allocated buffers for this handle or buffers allocated in
other handles created internally.
- VC_SETMTARG:
Set a parameter which should be specified in a name=value format.
For example, to tell the Vctranspose data transform that the
number of columns for the data to be transformed next is 10,
the event VC_SETMTARG can be raised with associated data ``columns=10''.
- VC_EXTRACT:
- VC_RESTORE:
The third argument to the eventf function should be a structure of type Vcmtcode_t:
typedef struct _vcmtcode_s
{ Vcchar_t* data; /* encoded data of a handle */
ssize_t size; /* size of encoded data */
Vcodex_t* coder; /* reconstructed coder */
} Vcmtarg_t;
For VC_EXTRACT,
internal states should be coded in a form suitable for persistent storage
and returned in the fields Vcmtcode_t.data and Vcmtcode_t.size.
For VC_RESTORE,
a coder for decoding data should be constructed based on
the data in the fields Vcmtcode_t.data and Vcmtcode_t.size
and returned in Vcmtcode_t.coder.
In both cases, the eventf function should return a positive value on success.
- VC_GETIDENT:
Obtaining the identification string of the method.
The handling of this event is optional.
If not defined, the identification string will be
the method's name translated into the ASCII character set.
- VC_TELLBUFFER:
This event is used mostly for debugging memory usage. It is a request
to process memory buffers being maintained by a handle and its continuation.
The third argument to eventf should be the address of a function
defined with the below type. The arguments are:
vc the handle creating the buffer, list the list of buffers in that handle
and meth the method in use.
typedef struct _vcbuffer_s Vcbuffer_t;
typedef int (*Vcbuffer_f)(Vcodex_t* vc,
Vcbuffer_t* list, Vcmethod_t* meth);
struct _vcbuffer_s
{ Vcbuffer_t* next; /* link to next buffer */
ssize_t size; /* size of the buffer */
char* file; /* file creating the buffer */
int line; /* line number in the file */
...
};
This function adds a list of methods to the collection of available methods.
It is intended to be used by applications that wish to make their own
transforms available to a search via
vcgetmeth().
This function returns a method matching
name.
If
type is zero,
name is assumed to be in the native codeset
(e.g., ASCII or some form of EBCDIC).
Otherwise, it should have been previously coded by
vcstrcode().
Such coding of a string makes the data portable across platforms using different
codesets.
This function walks the currently available list of methods and
invokes
walkf as
(*walkf)(meth,name,desc,disc).
Here,
meth is the method being walked while
name and
desc
are its name and description.
This function returns the identification string of the method
meth.
This string is guaranteed to be in ASCII so that it can be portably stored.
buf and
size define a buffer that
vcgetident()
may use to store the identification string.
vcgetident() returns
NULL on error.
This call may be used by a method writer to parse the string
data which should be
composed of elements separated by periods.
Each element can be either just a plain string
name or a pair
name=value.
The string
name will be matched against
list to find a matching argument.
If one is found,
*arg will be set to it.
value, if any, will be copied into the buffer
val without exceeding
vlsz.
vcgetmtarg() returns the remainder of unparsed data or
NULL on failure.
This function invokes
eventf() to set an argument (parameter).
By convention, each transform should ignore any inapplicable parameters.
The
type argument can be
0,
1,
2 to indicate that
the
val argument is a
byte, an
integer or a
string respectively.
A byte value is coded in C-style while an integer is coded in decimal.
For example, the call
vcsetarg(vc, "columns", 10, 1)
set the number of columns to
10 for a method such as
Vctranspose or
Vctable.
These macro functions retrieve and set the private data structure
created by the transform to use in the handle
vc.
vcgetmtdata() coerces the data structure to its private type.
This method implements a generalized Lempel-Ziv parsing strategy
to compress a target data segment either by itself or
against another source data segment.
The source data segment, if any, should be specified
by the
data and
size field of a discipline structure.
The compressed data are in the standard format defined by the IETF RFC3284.
String matching is based on a suffix sorting algorithm
if the
init argument to
vcopen() is the string
"s".
Otherwise, a hashing algorithm is used.
This method implement various strategies to condition data for compressibility.
The argument
init to
vcopen() can be composed from:
- delta:
This does delta compression with approximate matching.
A matched data segment is coded by subtracting
it against the matching segment.
Thus, in this case, Vcsieve does not reduce data.
However, the data that it produces is amenable to compression
by other transforms, for example, a run-length compressor
together with a Huffman coder.
If delta is not specified, only exact matching will be done
and matched data will not be coded by subtracting as described.
- reverse:
This allows matching data in reverse. For example abc
can be matched with cba (as well as another instance of abc).
- map:
-
This allows defining a set of pairs of letters that are allowed to match
with one another. For example, map=ATGCatgc allows matching A
with T, G with C, etc. For example, such a map specification
along with reverse can be used to compress DNA data.
- mmin:
This defines a minimum prefix that must be exactly matched before further matching
either exact or approximate.
This code a target data segment by subtracting it against another source data segment
given in the discipline structure. If the source data segment is short, it will be
repeated as necessary.
This method performs Huffman coding.
This method first divides the given data into small contiguous parts
of equal size (except perhaps the last one). Then,
the parts are collected into groups so that
each group can be compressed
separately with its own Huffman code table.
This method divides the given data
into contiguous sections each of which uses a separate Huffman coding
table optimized for it.
This method encodes runs in data.
Normally, runs are encoded using a scheme whereby a run is indicated by
some escape character and its length is encoded in a compact variable size format.
The run lengths are coded in a separate area from the data so that
the two types of data can be effectively compressed via a later continuation coder
if there is one.
An alternative run encoding method is indicated
by giving the string
"0" as the
init argument of
vcopen().
In this case, only runs of 0's are encoded using a binary encoding method.
This is useful in a compression technique that produces large sequences of zeros
such as the Burrows-Wheeler transform
Vcbwt or the table transform
Vctable.
Vcbwt performs a Burrows-Wheeler transform on the given data.
This data is amenable to better compression via some combination of
move-to-front, run-length encoding and entropy coding.
This method performs a move-to-front transform on the data,
i.e., mapping each data byte into their positions in some
dynamically updated array of the 256 byte values.
By default, the array is updated using a predictive method
aiming at creating more zeros.
If the
init argument of
vcopen() is the string
"0",
the array is updated by moving the encoded byte to position 0
(i.e., the traditional MTF transform).
This method transposes rows and columns.
If the
init argument of
vcopen() is the string
"v",
each row ends with some separator character.
If the
init argument is anything else, row lengths are fixed size.
In that case, the row size (i.e., number of columns) is normally encoded
at the start of transformed data unless the
init argument is
"0".
The number of columns may be learned directly from the data.
But it and the row separator can also be set
via
vcsetmtarg() using one of
columns and
rsep or
fsep.
Only data fitting the table profile, i.e., the maximum number of well-defined rows
will be processed. The unprocessed amount can be retrieved via
vcundone().
This method transforms table data, i.e., two-dimensional byte arrays,
into a form that is amenable to compression via a combination of move-to-front (
Vcmtf),
run-length encoding (
Vcrle) and entropy coding (e.g.,
Vchuffgroup).
Normally, the number of columns is learned directly from data.
However, it can also be explicitly set via
vcsetmtarg()
by the parameter
columns.
Only data fitting the table profile will be processed.
The unprocessed amount can be told via
vcundone().
This method transforms a relational data table to make it more compressible.
The size of unprocessed data can be told with
vcundone().
A relational data table is of one of two forms:
- Flat:
This is a collection of records such that each of which terminates with a record separator.
Within each record is a sequence of fields each of which terminates with a field separator.
Field and record separators can be set via vcmtsetarg() using fsep and rsep.
The attribute full can be used to coerce padding of field values so that they all
have the same lengths.
All attributes can be set in the init argument of vcopen().
- Schematic:
This is a collection of records such that each of which consists
of a set of fields, each of a fixed length. The lengths of such fields
can be set via schema=[f1,f2,...] either via the init
argument of vcopen() or via vcmtsetarg().
This method is similar to
Vcrdb for flat relational data tables
but uses a faster algorithm with comparable or better performance for this class of data.
Field and record separators can be set via
vcmtsetarg() using
fsep and
rsep.
This method maps each byte to another byte.
An application-specific mapping can be specified via
the field
data of the discipline structure.
It must be a
Vcchar_t array of size 256 where each
position has its mapped value.
Predefined mappings are provided to translate between
ASCII and various versions of EBCDIC.
These can be selected via
init argument of
vcopen():
- a2e, e2a:
These pertain to the EBCDIC character set defined
by the X/OPEN dd(1) command.
- a2i, i2a:
These pertain to the EBCDIC character set defined
the dd(1) command on IBM systems.
- a2h, h2a:
These pertain to the EBCDIC character set defined
on IBM-37 AS/400 systems.
- a2o, o2a:
These pertain to the EBCDIC character set defined
on IBM OpenEdition Unix systems.
- a2s, s2a:
These pertain to the EBCDIC character set defined
on Siemens Posix systems.
A delta compression method such as Vcdelta
requires source data to compare against.
Such information must be supplied by an application via the discipline
structure Vcdisc_t:
typedef struct
{ Void_t* data;
size_t size;
Vcevent_f eventf;
} Vcdisc_t;
If not
NULL,
eventf announces various events.
Unless noted otherwise,
a negative value causes the calling operation to terminate with failure while
a non-negative return value let the calling function proceed normally.
Following are the events:
- VC_OPENING:
When data is NULL, this announces the opening of the handle vc.
Otherwise, an error has happened during initialization and
data was the method used to open the handle,
- VC_CLOSING:
The handle vc is being closed.
The argument data will be NULL in this case.
- VC_DISC:
The current discipline of vc is being changed to the new one given in
(Vcdisc_t*)data.
This changes the discipline of
vc to
disc.
On success,
vcdisc() returns the previous discipline if it is
not
NULL; otherwise, it returns
disc.
On failure,
vcdisc() returns
NULL.
See also the methods
Vctable and
Vctranspose.
This macro function retrieves the discipline for
vc.
This collection of types and functions provides a subpackage
for computing matching windows to help with delta compresssion.
This opens a handle for computing matching windows based
on the given method
meth.
The discipline structure,
disc, provides source and target file
handles and an event handling function. Source and target file handles,
if given, should be opened for reading. Currently, the event handling function
is only called with
VCW_OPENING and
VCW_CLOSING.
If
meth is
NULL, the handle is opened for decoding data.
Below are the methods:
- Vcwmirror;
This method reuses the position of the data in the target file
as the tentative start of the computed matching window
in the source file. It may add a small amount of data around
that position to enhance string matching.
- Vcwvote:
This method uses the frequencies of the n-grams (for some private n)
in a target window of data and a voting algorithm to find
a matching window in either the source file or the target file.
- Vcwprefix:
This method uses prefix matching on blocks of data to determine
windows matching portions of target data.
This closes the handle
vcw.
This applies the windowing method of
vcw to compute
a suitable matching window to the given data segment.
pos gives the starting position of the data segment in the target file.
If the handle was opened with both a source and a target files,
the computed matching window can be anywhere in the source file but it is restricted
to the part of the target file before
pos.
A matching window, if found,
is returned in a
Vcwmatch_t structure with elements:
- type:
This is either VCD_SOURCEFILE or VCD_TARGETFILE
to indicate which file the matching window is from.
- wdata, wsize, wpos:
These give the desired window data and the position in the respective file.
- msize:
This gives the amount of data from data that should be used for
compressing against the computed window data.
This tells
vcw the compression result from the use of the window computed
in the last
vcwapply() call. Certain methods may use this information
to focus window searching.
This function searches for a window matching method matching
name.
It returns the method or
NULL on failure.
This walks the list of window matching methods
in the same manner as
vcwalkmeth().
The compression methods Vchuffman, Vchuffgroup and Vchuffpart
are based on the Huffman encoding. The functions described here
provide efficient algorithms for encoding and decoding of certain basic
data structures required in Huffman encoding.
This function computes the code lengths in bits for all bytes given
their frequency as defined in
freq[].
Code lengths are returned in
size[].
If there is a single run, the run byte is returned in
*runb.
vchsize() returns the maximum length of any code.
This function computes the actual codes to encode each byte based on
the sizes given in
size[]. It returns the maximum code size.
This function sets the frequency and size arrays in handle
vc
if they are not
NULL.
maxs is the maximum code size as given
given in
size[].
vc will use these arrays for encoding
instead of computing them from the data being encoded.
vchcopy() returns
0 on success and
-1 on failure.
The Huffman code is completely specified by the lengths of the codes as given
in the
size array.
This function encodes the
size[] array in a compressed portable form
suitable for output. The
data array should have a sufficiently large
size
dtsz to store the result. It returns the length of the result in bytes.
This function decodes the
size[] array from the encoded data given
in
data. It returns the number of bytes consumed from
data
for decoding
size[].
Data in tables with fixed number of columns
can be better compressed by transforming the data of each column
based on whether or not it can be predicted well by some other column.
The data types and functions needed for such a data transformation
are described below.
typedef struct _vctblcolumn_s /* column specifications */
{ ssize_t index; /* column index */
ssize_t pred1; /* <0 if self compressing */
ssize_t pred2; /* if pred1<0, 1 for doing MTF */
} Vctbltrans_t;
typedef struct _vctblplan_s /* transform plan */
{ ssize_t ncols; /* row size or # of columns */
Vctblcolumn_t* trans; /* the plan to transform data */
Vcodex_t* zip; /* Vcdelta to code plan */
} Vctblplan_t;
This function uses the given
data to compute a plan that
can be used to compress table with fixed-length rows.
vctblmakeplan() returns the plan on success or
NULL on failure.
The argument
ncols tells the number of columns.
The argument
flags should be a combination of the below bits:
- VCTBL_RLE: use run-length compressive entropy to measure dependency strength (default).
- VCTBL_CEE: use conditional empirical entropy to measure dependency strength.
- VCTBL_LEFT: use left to right dependency relations (default).
- VCTBL_RIGHT: use right to left dependency relations (default).
- VCTBL_SINGLE: use only a single predictor. Default is two predictors.
This function frees a plan computed by
vctblmakeplan().
vctblencodeplan() computes from the given
plan a string suitable
for permanent storage.
It returns the length of the string on success
and
-1 on failure.
The code string itself is returned in
*codep on success.
vctbldecodeplan() reconstructs the plan encoded in the given string.
Each relational data table consists of a number of records where
each record contains a number of fields. We allow fields to be any
combinations of being fixed-length or delineated by some separators.
The collection of data types and functions provided here enable construction
of on-line data structures suitable for manipulating relational
data tables. In particular, the type Vcrdtable_t describes
the structure of a relational dataset.
In the case when the data are records and fields purely delineated by record
and field separators, a record may be allowed to not containing a full
complement of fields. In such a case, the on-line structure fills in
missing fields with data consisting of single record separators so that
the table appears as if all records have the same number of fields.
A relational dataset may consist of records separated by some record separator.
In addition, each record consists of fields separated by some field separator.
Often, a dataset may be given without specification of either of these separators.
This function computes candidates for record and field separators for such a dataset.
- rds, nrds: These define an array with nrds elements of the type Vcrdsepar_t.
vcrdsepar() returns up to nrds best candidate separators in this array.
- data, dtsz: These provide the data used in computing the separator candidates.
- rsep: If this is non-negative and less than 256, it defines the record separator.
In that case, only the field separator will be computed.
These functions constructs and frees
Vcrdtable_t structures
to represent data by fields and records.
vcrdsize() returns the actual amount of data from
data used to
construct the table after a
vcrdparse() or
vcrdfield() call
(i.e., not all data may be used in such a call). Below are the arguments for
vcrdparse():
- info: This structure defines initial information about the table data.
The field flen, if not NULL, should be an array with positive fldn elements.
Each element of flen[] can be (1) positive to define
a fixed-length field, (2) zero to define a field delineated by a separator, and
(3) negative to define a field delineated by some separator
but having been filled in so that data across all records have the same length, the absolute
value of the given value.
- data, dtsz: The raw data of the table.
The fields recn and fldn of info must be positive if
either data is NULL or dtsz <= 0. In this case,
only a skeleton Vcrdtable_t structure will be constructed.
Field data must be filled in later via vcrdfield().
- type: This is either 0 or VCRD_FIELD. The latter means
that the given data is in a field-oriented format instead of the default record-oriented format.
The fields recn and fldn of info must be positive if data is field-oriented.
This call changes the data of a field in the table
tbl.
The field index is given in
f while its length is given in
n.
The treatment of
n is exactly the same as described above for elements in the
flen[] array.
vcrdfield() returns the size of data actually used.
This call sets the attributes of fields in the table
tbl.
If
f is nonnegative, it indicates the single field to change.
Otherwise, all fields will be considered to change.
type can be zero or non-zero to turn off or on the given attributes.
attrs is a bit combination of the below attributes:
- VCRD_FULL:
If the field is delineated by a separator, values across records may have different lengths.
When this attribute is on, such values will be padded with the separator as necessary to make
all have the same length. Note that this padding is done virtually on data extraction only.
See also vcrdextract().
- VCRD_DOT:
Certain fields contain values such as floating point numbers or IP addresses
are of the form xxx.yyy.zzz.... In the case of IP addresses, a value
may also be ended in /nnn to indicate the number of bits to use.
When this attribute is on, such a value will be converted into a fixed-length sequence
of bytes, each representing a component of the value. The conversion is done
only if all values of the field can be converted uniformly. On a successful conversion,
the type attribute of the respective Vcrdfield_t will be set to contain
VCRD_DOT and perhaps also VCRD_SLASH if the /nnn pattern was detected and converted.
This call extracts from table
tbl the data of a field
f if
f is non-negative
or the data of the entire table otherwise. The data array is returned in
*datap and its
size is the return value.
type can be a bit combination of the following:
- VCRD_FULL:
If this bit is on, records with missing fields will be filled in with the record separator
as appropriate to make all records have the same number of fields.
Note that variable-length fields may also be filled separately by their internal attributes
as described in vcrdattrs().
- VCRD_FIELD:
The default data orientation is by record.
This bit causes data to be extracted in a field-oriented fashion.
Note that this bit implies VCRD_FULL.
typedef struct _vcrdplan_s
{ ssize_t fldn; /* number of fields */
ssize_t pred[]; /* predictor list */
} Vcrdplan_t;
Vcrdplan_t defines the dependency relations among fields.
fldn is the number of fields while pred[] gives the predictive relations.
If pred[i] != i, field pred[i] predicts field i.
Otherwise, i is unpredicted.
This function computes the weight of field
f in table
tbl.
The weight of a field is calculated by summing the weights of values in adjacent records.
vect, if not
NULL, is a vector used to reorder the records of
fld before
calculating its weight.
wghtf, if not
NULL, is a function to compute the weight
of two given strings of data. When
wghtf is
NULL, an internal function is used
which simply computes the length of the common prefix of the two given strings.
This function computes a transform vector of field
f in table
tbl.
It returns -1 on error and non-negative on success. On success, the return value 1 signifies
that all values are equal.
- type == VC_ENCODE:
If trdt is not NULL, it is memory of size dtsz.
Record data will be transposed column by column into it
based on the sort states up to a particular column.
In this case, the internal transform vector, if any, will not be changed.
On the other hand, if trdt is NULL, the internal transform vector will
be computed if not yet done so.
- type == VC_DECODE:
This inverts the data previously transformed by vcrdvector() and repopulates the field f.
The transform vector will be updated to reflect the new data.
vcrdmakeplan() computes the field dependency in a relational data table to define
a plan to transform the data for better compression. It returns
NULL on error.
wghtf, if not
NULL or
VCRD_NOPLAN,
is a function to compute the weight of a pair of strings.
vcrdfreeplan() frees all memory associated with the given plan.
This function executes the given
plan to transform the fields in table
tbl.
The argument
type is either
VC_ENCODE or
VC_DECODE to indicate whether
the data is being encoded or decoded.
The function returns
-1 on error.
typedef struct _vcsfx_s
{ ssize_t* idx; /* sorted suffix indices */
ssize_t* inv; /* inverse permuation of idx[] */
Vcchar_t* str; /* string with sorted suffixes */
size_t nstr; /* length of above string */
} Vcsfx_t;
vcsfxsort() sorts indices of suffixes of
str to create
a suffix array. The string is treated as if it ends with an element
larger than all other bytes in the string. In this way, the order for
all suffixes are well-defined.
vcsfxsort() returns a structure
of type
Vcsfx_t as above on success or
NULL on failure.
The field
Vcsfx_t.idx has the indices of all suffixes sorted
lexicographically. The field
Vcsfx_t.inv has the inverse
permutation of
Vcsfx_t.idx. That is,
inv[idx[i]] == i.
vcperiod computes a quasi-period in the given data, if any.
This is useful to determine the number of columns in table data.
Certain transforms such as the delta compression method Vcdelta uses
a greedy parsing scheme based on a combination of Lempel-Ziv'77 and
Tichy's block-move. Source and target strings are conceptually concatenated into
a superstring. The resulting data is then parsed starting from target data to
construct a sequence of matching fragments.
As an example, consider parsing a target sequence
abcde given the source sequence abcdabcdabcdf.
The catenated sequence is abcdeabcdabcdabcdf so parsing
may produce the below matching segments and a possible
reconstruction plan from the Vcdelta transform with the
shown COPY and ADD instructions:
5 4 0 /* position 5 matches 4 bytes starting at 0 */
9 8 5 /* position 9 matches 8 bytes starting at 5 */
COPY 4 0 /* abcd......... */
COPY 8 5 /* abcdabcdabcd. */
ADD 1 f /* abcdabcdabcdf */
This function parses the target data and calls
vcpa->parsef on each parsed segments.
Normally, hashing is used for string matching but if
bound < 0 and the parsing does not use reverse matching
or character mapping, then a suffix sorting algorithm is used for string matching.
The handle
vcpa is of type
Vclzparse_t
and specifies the data to be parsed and parsing parameters.
The argument
bound, if negative, means that a suffix sorting algorithm
will be used for string matching. This tends to be slow but finds longer matches.
If
bound is non-negative, a hashing method is used for string matching.
This is faster but may miss some long matches. Further, if
bound is positive
it provides a hint to prune the hash table of far away addresses.
typedef struct _vclzparse_s
{ Vclzparse_f parsef; /* function to process a segment sequence */
Vcchar_t* src; /* the source string, if not NULL */
ssize_t nsrc; /* the length of the source data */
Vcchar_t* tar; /* the target string to be parsed */
ssize_t ntar; /* the length of the target data */
ssize_t mmin; /* the minimum length required for a match */
Vcchar_t* cmap; /* if !NULL, byte pairs to map for matching */
int type; /* a combination of VCLZ_REVERSE|VCLZ_MAP */
} Vclzparse_t;
typedef struct _vclzmatch_s
{ ssize_t tpos; /* the position of data in target string */
ssize_t mpos; /* the matching position in source/target */
ssize_t size; /* the length of the data involved */
} Vclzmatch_t;
If the bit VCLZ_REVERSE is on in the field Vclzparse_t.type,
reverse matching is allowed. For example abc may match with cba
as well as abc. Likewise, if the bit VCLZ_MAP is on,
the field Vclzparse_t.map should be a string specifying pairs
of characters that are allowed to match. For example, if map
is "ATGCatcg", then A matches with T,
G matches with C, and so on.
This example pattern and reverse matching are useful for compressing DNA data.
This call-back function is invoked on sequences of data segments. It should return
the amount of data actually consumed. A negative return value causes
vclzparse()
to stop and return immediately. A zero return value means that all the data covered
by the given segments should be considered unmatchable.
- vcpa:
The same handle that was passed to vclzparse().
- type:
This is a combination of VCLZ_REVERSE and VCLZ_MAP to tell
what type of matching was done.
- mtch, n:
This is the list of parsed segments, each presented in a structure
of type Vclzmatch_t. Only mtch[0] may have mpos < 0
to indicate a segment of unmatched data.
Other elements should indicate actual matches.
The below functions are provided by Vcodex for portable data encoding.
These functions perform only minimal bound checking, if any.
Applications should take care of such details.
This initializes a
Vcio_t structure
io with
the given buffer
buf with size
n bytes.
This returns the extent that the buffer has been written to or read from.
This returns the size of the remaining part of the buffer that has
not yet been written to or read from.
This returns the original data buffer.
This returns the pointer to the next position in the buffer that can
be written to or read from.
This skips ahead
n bytes and returns the resulting pointer to the buffer.
This moves
n bytes between the given buffer and advances their
positions accordingly.
These functions write and read bytes respectively.
These functions write and read arrays of bytes.
They return the number of bytes written or read.
These functions write and read unsigned integers using a 7-bit encoding
in the Sfio style.
vcputu() returns the number of bytes used to encode the value.
These functions write and read unsigned integers
using an 8-bit encoding in the Sfio style.
The size of the encoding depends on the magnitude of
max.
For example, if
max is 255, only one byte is used while, if
max
is 256, two bytes will be used.
vcputm() returns the number of bytes used to encode the value.
These functions write and read unsigned integers using a
compact binary coding with
a and
b as digits.
vcput2() returns the number of bytes used to encode the value.
These functions write and read integers using the Gamma code.
These functions write and read lists of non-negative integers using
a compact bit code. Note that the list size
n given in
vciogetl()
must match exactly the list size given to
vcioputl().
These macro functions constitute a set of operations for fast bit I/O.
The variables bits and nbit are declared
by the application and will be asociated with the handle io via vciosetb().
The association is for either encoding, i.e., type=VC_ENCODE,
or decoding, i.e., type=VC_DECODE, and must be done before any bit operations.
vcioendb() must be called to finalize bit I/O to get ready for other forms of I/O.
vciofilb() fills bits with some number of bits available from io.
nbit is updated to indicate the number of available bits.
vcioflsb() flushes the bits available in bits out to io.
Bits are flushed in 8-bit aggregates into bytes.
nbit is updated to indicate the number of bits remained unflushed in bits.
vcioaddb() adds nadd bits from add to bits.
It is the caller's respobsibility to ensure that there is room in bits
to add the new bits. Otherwise, bits may be lost.
vciodelb() takes off ndel number of bits from bits.
Such bits are lost forever.
The functions described in this section process data from a stream.
Depending on local environments, these functions are either based
on the Sfio library or on Stdio, the ANSI-C I/O interface.
The type
Vcsfio_t represents a stream for performing I/O.
When Stdio is used, it is an opaque type keeping
state information needed for data transformation.
In this case, the I/O functions
vcsfwrite() and
vcsfread() should be used
to read and write data.
When Sfio is used, it is defined as
Sfio_t.
In this case, all normal Sfio operations could be used on the stream.
Functions such as
vcsfread() and
vcsfwrite() will be macro redefinitions of
their corresponding Sfio functions.
The type
Vcsfdata_t is used to pass initialization data to
vcsfio().
typedef struct _vcsfdata_s
{ int type; /* various types of processing */
char* trans; /* transformation specification */
char* window; /* window specification */
char* source; /* source file to delta against */
Vcsferror_f errorf; /* to process errors */
} Vcsfdata_t;
- type:
This is composed from two bits: VCSF_VCDIFF for outputting a header
as defined by the IETF RFC3284 for the VCDIFF delta encoding format or
VCSF_PLAIN for outputting no header at all. If neither bits are
set, a header based on methods used will be output.
- trans:
This is the transform specification.
See section ALIAS PROCESSING for details.
- window:
This specifies window size and window matching method.
The window size if defined should come first and be
an integer with optional suffix k for kilobyte
or m for megabyte. This window size defines the maximum amount
of data to be transformed at one time.
A window matching method
should be defined starting with a period then the name of the method.
- source:
Source file to delta-compress against if the first method
is a delta compressor.
- errorf:
This function announces an error encountered during data processing.
It is called as (*errorf)(char* string) where string
tells the error type.
This function constructs a
Vcsfio_t handle to perform encoding
or decoding on the stream
sf depending on whether
type is
VC_ENCODE or
VC_DECODE.
When the Sfio library is in use,
the same stream
sf is returned. When the Stdio library is in use,
the type
Sfio_t is a macro redefinition of the
FILE type while
Vcsfio_t is an opaque type as described earlier.
The structure sfdt provides the specification for methods used
to transform data.
The method specification sfdt->spec
must be supplied on encoding and also on decoding
if sfdt->type has the bit VCSF_PLAIN.
If type is VC_DECODE, sfdt can be NULL.
In that case, the transforms needed to decode data will be
obtained from starting data in the stream.
This function reads from a stream of encoded data and decode them into the
given buffer
buf. It returns the length of the decoded data on success and
a negative value on failure.
This function writes the raw data given in
data to the stream
vcsf.
Such data will be encoded before written out to the underlying data stream.
On success, the amount of raw data written is returned. On failure, a negative
value is returned.
This function flushes unwritten data out to the underlying stream.
This function closes the stream
vcsf. Unwritten data will be flushed
out to the underlying stream first.
This function adds a set of aliases or short-hands for
longer sequences of method specification.
The given
list is null-terminated.
Each item on list is a string of the form alias~=~specification.
Each specification is a list of comma-separated elements.
Each element should be a transform name
followed by any arguments. The arguments are separated by '.'.
Each argument may be a name or a name=value pair.
In the latter case,
value can have C characters embedded in them. For example, consider
the below two specifications:
rdb.fsep=[\t],bwt,mtf,rle.0,huffgroup
rdb.schema=[10,20,30],table,mtf,rle.0,huffgroup
The first specification says that the Vcrdb transform of relational data
should be used first. In this case, the data file is a flat file database with
text lines as records and the field separator is the tab character as
defined by fsep=[\t]. After rdb is done processing, it will pass
its output onto the Burrows-Wheeler transform bwt which, in turn, will invoke
the move-to-front transform mtf, and so on.
The second specification also invokes the Vcrdb first but this time
the data consists of records with three fixed-length fields
of lengths 10, 20 and 30 in that order. After rdb is done processing,
it will invoke the table transform to process each field.
This function returns the specification corresponding to the given
alias.
The arguments
spec and
size define a buffer to return
the specification in.
vcgetalias() returns the specification or
NULL
on failure.
This function invokes
(*walkf)(0, name, spec, disc) on each defined alias.
vclog() computes the base-2 logarithm of
v.
vclogi() only returns the integer part.
This macro function returns the number of bytes needed to encode
v
using
vcioputu().
These functions convert a string to an integer and vice versa.
vcitoa() requires that
n be sufficiently large to store
the constructed null-terminate string. It returns the string length.
This function encodes/decodes the integer
v based on whether
type is
VC_ENCODE or
VC_DECODE.
v and
near should be in the inclusive range
[min,max].
v is coded assuming that it is close to
near.
This type of integer coding is useful to reduce data representation
in a transform such as
Vcsieve which tend to code sequence of addresses
that are close to one another.
This function translates the characters in the string
s
from its native code set to the ASCII code set which can be
used for portable persistent data.
It returns
buf which should have the translated string
on success and
NULL on error.
vcqsort() is like the
qsort() function but it allows specifying
an element
disc to pass on to the comparison function
comparf.
This is useful to provide data about elements being compared.
This function sorts indices given in
list and returns them in
sort.
The number of element is
n and the bytes used to sort are given in
data.
The
bckt[] array returns the cumulative counts of bytes. For example,
bckt[0] is the count of byte
0,
bckt[1] is the count of bytes
0
and
1, and so on.
This function extracts a substring of
data and returns it in the buffer
buf
with a null byte appended.
vcsubstring() returns a pointer to the remainder
of
data. Substrings are separated by
separ.
Characters in C-notation are allowed. Data can also be grouped by square brackets.
The backslash escapes the next character.
- type == 0:
Only search for separ but still respecting backslashes and grouping by brackets.
- type == 1:
The backslash character means special treatment of characters as in the C language.
The left square bracket means to obtain every character until the appearance
of an un-escaped right square bracket. The brackets will be left out of
the returned data.