CWB
|
#include "../cl/globals.h"
#include "../cl/cl.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/storage.h"
#include "../cl/bitio.h"
#include "../cl/macros.h"
void bprintf(unsigned int i, int width, FILE *stream)
Prints a binary representation of an integer to a stream.
i | Integer to print |
width | Number of bits in the integer |
stream | Where to print to. |
Referenced by compute_code_lengths().
compute_code_lengths(): Compresses the token stream of a p-attribute.
Three files are created: the compressed token stream, the descriptor block, and a sync file.
attr | The attribute to compress. |
hc | Location for the resulting Huffman code descriptor block. |
fname | Base filename for the resulting files. |
References _Attribute::any, BFclose(), BFflush(), BFopen(), BFposition(), BFwriteWord(), bprintf(), CDA_OK, cderrno, cdperror, cl_calloc(), cl_cpos2id(), cl_malloc(), cl_max_cpos(), cl_max_id(), CL_MAX_LINE_LENGTH, CompCorpus, CompCorpusFreqs, CompHuffCodes, CompHuffSeq, CompHuffSync, CompLexicon, CompLexiconIdx, component_full_name(), corpus_id, do_protocol, ensure_component(), get_id_frequency, get_string_of_id, _huffman_code_descriptor::lcount, _huffman_code_descriptor::length, _huffman_code_descriptor::max_codelen, MAXCODELEN, _huffman_code_descriptor::min_code, _huffman_code_descriptor::min_codelen, NwriteInt(), print_heap(), protocol, TCorpus::registry_dir, TCorpus::registry_name, sift(), _huffman_code_descriptor::size, _huffman_code_descriptor::symbols, _huffman_code_descriptor::symindex, SYNCHRONIZATION, and WriteHCD().
Referenced by main().
void decode_check_huff(Attribute *attr, char *fname)
Checks a huffcoded attribute for errors by decompressing it.
This function assumes that compute_code_lengths() has been called beforehand and has ensured that the _uncompressed_ token sequence is used by the CL access functions.
attr | The attribute to check. |
fname | Base filename to use for the three compressed-attribute files. Can be NULL, in which case the filenames in the attribute are used. |
References _Attribute::any, BFclose(), BFflush(), BFopen(), BFposition(), BFread(), CDA_OK, cderrno, cl_cpos2id(), cl_max_cpos(), CL_MAX_LINE_LENGTH, CompCorpus, CompHuffCodes, CompHuffSeq, CompHuffSync, component_full_name(), corpus_id, _huffman_code_descriptor::length, _huffman_code_descriptor::min_code, NreadInt(), ReadHCD(), _huffman_code_descriptor::symbols, _huffman_code_descriptor::symindex, and SYNCHRONIZATION.
Referenced by main().
void dump_heap(int *heap, int heap_size, int node, int indent)
Dumps the specified heap of memory to the program output stream.
heap | Location of the heap to dump. |
heap_size | Number of nodes in the heap. |
node | Node at which to begin dumping. |
indent | How many tabs to indent the start of each line. |
References protocol.
Referenced by print_heap().
void huffcode_usage(char *msg, int error_code)
Prints a usage message and exits the program.
msg | A message about the error. |
error_code | Value to be returned by the program when it exits. |
References drop_corpus, progname, and VERSION.
Referenced by main().
int main(int argc, char **argv)
Main function for cwb-huffcode.
argc | Number of command-line arguments. |
argv | Command-line arguments. |
References _Attribute::any, ATT_POS, TCorpus::attributes, central_corpus_directory, cl_delete_corpus(), cl_new_attribute, cl_new_corpus(), compute_code_lengths(), corpus_id, debug, decode_check_huff(), DEFAULT_ATT_NAME, do_protocol, huffcode_usage(), progname, protocol, and registry_directory.
void print_heap(int *heap, int heap_size, char *title)
Prints a description of the specified heap of memory to the program output stream.
heap | Location of the heap to print. |
heap_size | Number of nodes in the heap. |
title | Title of the heap to print. |
References dump_heap(), node, and protocol.
Referenced by compute_code_lengths().
int ReadHCD(char *filename, HCD *hc)
Reads a Huffman code descriptor from file.
filename | Path to file where the descriptor is saved. |
hc | Pointer to the location into which the descriptor block will be loaded. |
References cl_malloc(), _huffman_code_descriptor::lcount, _huffman_code_descriptor::length, _huffman_code_descriptor::max_codelen, MAXCODELEN, _huffman_code_descriptor::min_code, _huffman_code_descriptor::min_codelen, NreadInt(), NreadInts(), _huffman_code_descriptor::size, _huffman_code_descriptor::symbols, and _huffman_code_descriptor::symindex.
Referenced by decode_check_huff().
static int sift(int *heap, int heap_size, int node) [static]
Sifts the heap into order.
heap | Location of the heap to sift. |
heap_size | Number of nodes in the heap. |
node | Node at which to begin sifting. |
Referenced by compute_code_lengths().
int WriteHCD(char *filename, HCD *hc)
Writes a Huffman code descriptor to file.
filename | Path to file where descriptor is to be saved. |
hc | Pointer to the descriptor block to save. |
References _huffman_code_descriptor::lcount, _huffman_code_descriptor::length, _huffman_code_descriptor::max_codelen, MAXCODELEN, _huffman_code_descriptor::min_code, _huffman_code_descriptor::min_codelen, NwriteInt(), NwriteInts(), _huffman_code_descriptor::size, _huffman_code_descriptor::symbols, and _huffman_code_descriptor::symindex.
Referenced by compute_code_lengths().
char* corpus_id = NULL |
int debug = 0 |
int do_protocol = 0 |
Level of progress-info (including compression protocol) message output: 0 = none.
Referenced by compute_code_lengths(), and main().
char* progname |
FILE* protocol |
File handle for this program's progress-info output: always stdout.
Referenced by compute_code_lengths(), dump_heap(), main(), and print_heap().