CWB
|
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include "../cl/globals.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/cdaccess.h"
#include "../cl/macros.h"
#include "corpmanag.h"
#include "eval.h"
#include "ranges.h"
#include "output.h"
#include "matchlist.h"
#include "options.h"
#define SORT_DEBUG 0 |
Referenced by i2compare(), and SortExternally().
#define spaceship | ( | A, | |
B | |||
) | ((A) > (B)) ? 1 : ((A) < (B)) ? -1 : 0 |
Referenced by random_compare().
#define USE_SORT_CACHE |
Defined if a sort cache is to be used in sorting concordance lines.
The sort cache (caching the lexicon IDs of the first two tokens to be compared in each line) is indispensable for the internal sorting algorithm since random accesses to a compressed corpus are painfully slow; when sorting variable length matches such as German NPs on word forms, the current implementation has a hit rate of around 99%.
Referenced by SortSubcorpus().
int _RS_compare_ranges | ( | const void * | pa, |
const void * | pb | ||
) |
qsort() helper function for RangeSort() below
References _Range::end, and _Range::start.
Referenced by RangeSort().
int calculate_leftboundary | ( | CorpusList * | cl, |
int | cpos, | ||
Context | spc | ||
) |
References calculate_ranges(), left, and right.
Referenced by expand_dataspace(), and findcorpus().
int calculate_ranges | ( | CorpusList * | cl, |
int | cpos, | ||
Context | spc, | ||
int * | left, | ||
int * | right | ||
) |
References ctxtsp::attrib, False, get_bounds_of_nth_struc, get_nr_of_strucs(), get_num_of_struc, get_struc_attribute, MAX, MIN, cl::mother_size, ctxtsp::size, structure, ctxtsp::type, and word.
Referenced by calculate_leftboundary(), calculate_rightboundary(), and evaluate_target().
int calculate_rightboundary | ( | CorpusList * | cl, |
int | cpos, | ||
Context | spc | ||
) |
References calculate_ranges(), left, and right.
Referenced by expand_dataspace(), findcorpus(), and simulate().
int copy_intervals | ( | CorpusList * | cp, |
Bitfield | intervals, | ||
int | mode, | ||
char * | subcorpname | ||
) |
Copy concordance hits from a query-generated subcorpus to a (new or existing) subcorpus.
This function is not currently in use.
cp | The CorpusList indicating the query to copy from. |
intervals | A Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not. |
mode | ALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to copy). |
subcorpname | Name for the subcorpus to which the lines are to be copied. |
References auto_save, cqpmessage(), delete_intervals(), dropcorpus(), duplicate_corpus(), BFBuf::elements, Error, False, findcorpus(), cl::mother_name, cl::name, RangeSetop(), RUnion, save_subcorpus(), cl::saved, SELECTED_LINES, cl::size, SUB, SYSTEM, toggle_bit(), cl::type, UNDEF, and UNSELECTED_LINES.
int delete_interval | ( | CorpusList * | cp, |
int | nr | ||
) |
Delete a single concordance hit from a query-generated subcorpus.
This function is not currently in use.
cp | The CorpusList indicating the query to delete from. |
nr | The index of the interval to delete (by setting its start and end values to -1). |
References cl_free, _Range::end, cl::range, RangeSetop(), RReduce, cl::size, cl::sortidx, _Range::start, SUB, and cl::type.
int delete_intervals | ( | CorpusList * | cp, |
Bitfield | intervals, | ||
int | mode | ||
) |
Delete a whole bunch of concordance hits from a query-generated subcorpus.
cp | The CorpusList indicating the query to delete from. |
intervals | A Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not. |
mode | ALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to delete). |
References ALL_LINES, auto_save, cl_free, BFBuf::elements, _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RReduce, save_subcorpus(), SELECTED_LINES, cl::size, cl::sortidx, _Range::start, SUB, cl::targets, TEMP, touch_corpus(), cl::type, and UNSELECTED_LINES.
Referenced by copy_intervals(), do_delete_lines(), do_delete_lines_num(), do_reduce(), and do_StandardQuery().
void FreeSortClause | ( | SortClause | sc | ) |
Frees a SortClause object.
References _sort_clause::attribute_name, and cl_free.
static int group2compare | ( | const void * | vidx1, |
const void * | vidx2 | ||
) | [static] |
Compares two groups of equivalent matches by group sizes (descending), breaking ties through i2compare.
References current_sortidx, EvaluationIsRunning, group_first, group_size, i2compare(), s1, and s2.
Referenced by SortSubcorpus().
static int i2compare | ( | const void * | vidx1, |
const void * | vidx2 | ||
) | [static] |
Compare two matches according to current sort settings in static variables (qsort callback used in query result sorting).
This is the primary query-hit-comparison function. It wraps cl_string_qsort_compare for string comparison, but does much more as well, because we are not just comparing individual strings but rather, potentially, whole bundles of strings from various different positions.
vidx1 | Pointer to the integer index of the first of the intervals to be compared (ie an index into an array of start/end positions). |
vidx2 | Pointer to the integer index of the second of the intervals to be compared. |
References break_ties, TCorpus::charset, cl_cpos2id(), cl_id2str(), cl_string_qsort_compare(), cl::corpus, EvaluationIsRunning, MIN, s1, s2, SORT_DEBUG, sort_id_cache, srt_ascending, srt_end, srt_flags, srt_reverse, and srt_start.
Referenced by group2compare(), and SortSubcorpus().
static int random_compare | ( | const void * | vidx1, |
const void * | vidx2 | ||
) | [static] |
Sorts hits in random order by comparing random numbers in the vector random_sort_keys[], breaking ties by start and end positions of matches (from *srt_cl) in order to ensure stable sorting.
This is another qsort callback function.
References _Range::end, random_sort_keys, cl::range, spaceship, and _Range::start.
Referenced by SortSubcorpusRandomize().
int RangeSetop | ( | CorpusList * | corpus1, |
RangeSetOp | operation, | ||
CorpusList * | corpus2, | ||
Bitfield | restrictor | ||
) |
Carries out one of a set of operations on corpus1.
The operations that can be carried out are as follows:
RUnion - copy intervals from corpus2 to corpus1 (no duplicates); RIntersection - remove from corpus1 any intervals that are not also in corpus2; RDiff; RMaximalMatches - remove spurious matches according to "longest" strategy; RMinimalMatches - remove spurious matches according to "shortest" strategy; RLeftMaximalMatches - remove spurious matches according to "standard" strategy; RNonOverlapping; RUniq - remove duplicate intervals from corpus1; RReduce - remove intervals marked for deletion (by having the start member set to -1).
TODO: to avoid confusion with the object, a better name for this function would be do_RangeSetOp.
corpus1 | The corpus to be changed. |
operation | Specifies which operation is to be carried out. |
corpus2 | The corpus that is the second argument for this operation. Can be NULL if no corpus2 is required for operation. |
restrictor | Specifies which intervals in corpus2 are to be taken notice of versus ignored. Can be NULL. |
References cl_free, cl_malloc(), cl_realloc(), _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RDiff, RIntersection, RLeftMaximalMatches, RMaximalMatches, RMinimalMatches, RNonOverlapping, RReduce, rs_cp_range(), RUnion, RUniq, cl::size, cl::sortidx, _Range::start, cl::targets, and touch_corpus().
Referenced by copy_intervals(), delete_interval(), delete_intervals(), do_cut(), do_setop(), do_StandardQuery(), evaluate_subset(), expand_dataspace(), findcorpus(), prepare_Query(), RangeSetop(), and set_corpus_matchlists().
void RangeSort | ( | CorpusList * | c, |
int | mk_sortidx | ||
) |
Make sure that ranges are sorted in 'natural' order (i.e. by start and end cpos).
This function has to be called when matching ranges are modified and may be needed when loading a query result (with "undump") that is not sorted in ascending order; with optional "mk_sortidx" flag, a sortidx corresponding to the original ordering is created.
c | The corpus (ie subcorpus/query) whose intervals ('ranges') are to be sorted. |
mk_sortidx | Boolean flag: if true a sortidx is created. |
References _RS_compare_ranges(), cl_free, cl_malloc(), cqpmessage(), Error, cl::keywords, cl::name, cl::range, cl::size, cl::sortidx, SUB, cl::targets, TEMP, cl::type, and Warning.
Referenced by do_undump(), evaluate_target(), and set_target().
void rs_cp_range | ( | Range * | rng, |
int * | target, | ||
int * | keyword, | ||
int | ins, | ||
CorpusList * | corpus, | ||
int | j | ||
) |
This is a rather specialised utility function for the UNION part of RangeSetop(): it copies a range, plus its keyword/target (if defined), in corpus into temporary lists.
References _Range::end, cl::keywords, cl::range, _Range::start, and cl::targets.
Referenced by RangeSetop().
int SortExternally | ( | void | ) |
Use an external program to sort a query.
No parameters - the assumption is that everything has already been set up by the SortSubcorpus() function, which calls this one.
References TCorpus::charset, cl_cpos2str(), cl_free, cl_malloc(), CL_MAX_LINE_LENGTH, cl_string_canonical(), cl_string_reverse(), cl::corpus, cqpmessage(), _Range::end, Error, ExternalSortingCommand, KeywordField, cl::keywords, line, MatchEndField, MatchField, open_temporary_file(), cl::range, cl::size, SORT_DEBUG, cl::sortidx, srt_anchor1, srt_anchor2, srt_ascending, srt_flags, srt_offset1, srt_offset2, srt_reverse, _Range::start, TargetField, cl::targets, TEMP_FILENAME_BUFSIZE, text_size, utf8, and Warning.
Referenced by SortSubcorpus().
int SortSubcorpus | ( | CorpusList * | cl, |
SortClause | sc, | ||
int | count_mode, | ||
struct Redir * | redir | ||
) |
Sort the (query) subcorpus specified by cl, or count frequencies of matching strings.
(Note that frequency counting and query result sorting are done via the same sorting algorithm.)
If the sort was not performed successfully, the sort index is reset to the default sort order, and the function returns false.
cl | Subcorpus designating the query to sort. |
sc | A sort clause. sc = NULL resets the sort index to the default sort order (i.e. sorted by corpus position). |
count_mode | Boolean: run the function in count frequency mode? |
redir | Redir object for where the output of string-counting is to be displayed. |
References access_corpus(), _sort_clause::anchor1, _sort_clause::anchor2, ATT_POS, _sort_clause::attribute_name, break_ties, TCorpus::charset, cl_cpos2id(), cl_cpos2str(), cl_free, cl_malloc(), cl_max_cpos(), cl_strdup(), cl_string_canonical(), cl_string_reverse(), close_stream(), cl::corpus, cqp, cqpmessage(), current_sortidx, DEFAULT_ATT_NAME, _Range::end, Error, EvaluationIsRunning, find_attribute, _sort_clause::flags, group2compare(), group_first, group_size, i2compare(), Info, insecure, install_signal_handler(), KeywordField, cl::keywords, MatchEndField, MatchField, cl::name, NoField, _sort_clause::offset1, _sort_clause::offset2, open_stream(), pretty_print, cl::range, cl::size, _sort_clause::sort_ascending, sort_id_cache, _sort_clause::sort_reverse, SortExternally(), cl::sortidx, srt_anchor1, srt_anchor2, srt_ascending, srt_end, srt_flags, srt_offset1, srt_offset2, srt_reverse, srt_start, _Range::start, Redir::stream, TargetField, cl::targets, text_size, touch_corpus(), USE_SORT_CACHE, UseExternalSorting, Warning, and which_app.
int SortSubcorpusRandomize | ( | CorpusList * | cl, |
int | seed | ||
) |
Sorts a query result in random order.
If seed > 0, a reproducible and stable ordering is generated based on the start and end corpus positions of matches (i.e. two given matches will always be sorted in the same order).
cl | Corpus-list object representing the query to sort. |
seed | Seed for the randomiser; should ideally be a prime number (2^31 is a particularly bad choice); if it is 0, then the internal RNG's standard random order is used. |
References access_corpus(), cl_free, cl_malloc(), cl_random(), cl_set_rng_state(), cqp, cqpmessage(), _Range::end, Error, EvaluationIsRunning, Info, install_signal_handler(), cl::name, random_compare(), random_sort_keys, cl::range, cl::size, cl::sortidx, _Range::start, touch_corpus(), Warning, and which_app.
static int srt_strcmp | ( | unsigned char * | s1, |
unsigned char * | s2, | ||
unsigned char * | maptable, | ||
int | reverse | ||
) | [static] |
Variable used by _RS_compare_ranges; global so data can be passed in without going through that function's parameter list!
int break_ties [static] |
whether to break ties (by comparison without cd flags, and by line number in the last instance)
Referenced by i2compare(), and SortSubcorpus().
int* current_sortidx [static] |
alias to newly created sortidx, so it can be accessed by the callback function
Referenced by group2compare(), and SortSubcorpus().
int* group_first [static] |
first match for each group of identical (or equivalent) sort strings
Referenced by group2compare(), and SortSubcorpus().
int* group_size [static] |
number of matches for each group of identical (or equivalent) sort strings
Referenced by group2compare(), and SortSubcorpus().
unsigned int* random_sort_keys [static] |
random keys for randomized sort order (ties are broken by cpos of matches)
Referenced by random_compare(), and SortSubcorpusRandomize().
int* sort_id_cache = NULL [static] |
FieldType srt_anchor1 [static] |
In a query sort, indicates the field type of the start of sort region.
Referenced by SortExternally(), and SortSubcorpus().
FieldType srt_anchor2 [static] |
In a query sort, indicates the field type of the end of sort region.
Referenced by SortExternally(), and SortSubcorpus().
int srt_ascending [static] |
boolean: sort query into ascending order or not
Referenced by i2compare(), SortExternally(), and SortSubcorpus().
Attribute* srt_attribute [static] |
The (p-)Attribute on which a query is to be sorted.
CorpusList* srt_cl [static] |
The CorpusList object representing a query to be sorted.
int* srt_end [static] |
When sorting a query, this contains end positions of intervals to be sorted.
Referenced by i2compare(), and SortSubcorpus().
int srt_flags [static] |
Whether to use the c and/or d flags when sorting a query.
Referenced by i2compare(), SortExternally(), and SortSubcorpus().
int srt_offset1 [static] |
In a query sort, indicates the offset of the start of sort region.
Referenced by SortExternally(), and SortSubcorpus().
int srt_offset2 [static] |
In a query sort, indicates the offset of the end of sort region.
Referenced by SortExternally(), and SortSubcorpus().
int srt_reverse [static] |
boolean: sort query on reversed-character-sequence strings (and reversed sequences OF strings) or not
Referenced by i2compare(), SortExternally(), and SortSubcorpus().
int* srt_start [static] |
When sorting a query, this contains start positions of intervals to be sorted.
Referenced by i2compare(), and SortSubcorpus().
int text_size [static] |
When sorting a query - this represents the size of the corpus the query belongs to.
Referenced by compose_kwic_line(), SortExternally(), and SortSubcorpus().