CWB
|
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include "../cl/globals.h"
#include "../cl/macros.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/cdaccess.h"
#include "../cl/fileutils.h"
#include "corpmanag.h"
#include "cqp.h"
#include "options.h"
#include "output.h"
#include "ranges.h"
#include "paths.h"
#define COLON ':' |
Referenced by findcorpus(), is_qualified(), load_corpusnames(), and split_subcorpus_name().
#define SLASH '^' |
Referenced by findcorpus().
#define subcorpload_debug 0 |
Referenced by attach_subcorpus().
#define SUBCORPMAGIC 36193928 |
magic number for {?? subcorpus files}
Referenced by attach_subcorpus(), check_stamp(), and save_subcorpus().
Boolean access_corpus | ( | CorpusList * | cl | ) |
Assesses whether a specified corpus can be accessed.
That is, it makes sure that the data for corpus in "cl" is loaded and accessible.
cl | A CorpusList specifying the corpus to check. |
References attach_subcorpus(), False, cl::loaded, cl::range, cl::saved, cl::size, SUB, SYSTEM, TEMP, True, and cl::type.
Referenced by catalog_corpus(), change_corpus(), CorpusLoad(), cqi_find_corpus(), cqi_lookup_attribute(), do_cqi_corpus_attributes(), do_cqi_corpus_full_name(), findcorpus(), prepare_AlignmentConstraints(), prepare_Query(), red_factor(), Setop(), SortSubcorpus(), and SortSubcorpusRandomize().
static Boolean accessible | ( | char * | dir, |
char * | file | ||
) | [static] |
Tests whether a file is accessible.
A file is considered accessible iff user can read it and it is not a (sub)directory.
This test is used for registry entries.
dir | Directory in which the file is to be found. |
file | The filename to check. |
References cl_malloc(), False, and True.
Referenced by load_corpusnames().
CorpusList* assign_temp_to_sub | ( | CorpusList * | tmp, |
char * | subname | ||
) |
Convert a temporary corpus to a real subcorpus.
assign_temp_to_sub assigns the temporary corpus in *tmp to a "real" subcorpus with name "subname". If such a subcorpus already exists, it is overwritten. The temporary corpus is deleted afterwards. The return value is the new subcorpus (which may be equal to tmp, but not necessarily).
tmp | Temporary corpus to convert. |
subname | Name to use for new subcorpus. |
References cl::abs_fn, auto_save, cl_free, cl_strdup(), cl::corpus, dropcorpus(), False, findcorpus(), initialize_cl(), cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, cl::query_corpus, cl::query_text, cl::range, cl::registry, save_subcorpus(), cl::saved, cl::size, cl::sortidx, SUB, cl::targets, TEMP, True, cl::type, and UNDEF.
Referenced by CorpusChangeTMPtoSUB(), do_undump(), and in_UnnamedCorpusCommand().
static Boolean attach_subcorpus | ( | CorpusList * | cl, |
char * | advertised_directory, | ||
char * | advertised_filename | ||
) | [static] |
References cl::abs_fn, cl_free, cl_malloc(), CL_MAX_FILENAME_LENGTH, cl_strdup(), cl::corpus, cqpmessage(), dropcorpus(), _Range::end, ensure_syscorpus(), False, file_length(), get_fulllocalpath(), initialize_cl(), cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, open_file(), cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, _Range::start, SUB, subcorpload_debug, SUBCORPMAGIC, cl::targets, TEMP, True, cl::type, and Warning.
Referenced by access_corpus(), and ensure_corpus_size().
Make a corpus accessible for searching as the "current" corpus.
change_corpus sets the current corpus to the corpus with name "name", first searching SUB corpora, then searching SYSTEM corpora.
When a corpus is "made accessible", its name is checked for validity and availability; if all is OK, set_current_corpus is called on it.
name | A string indicating the name of a corpus. |
silent | Boolean. Ignored. |
References access_corpus(), False, cl::name, search_corpus(), set_current_corpus(), and True.
static char* changecase_string | ( | char * | str, |
enum case_mode | mode | ||
) | [static] |
References cl_strdup(), and LOWER.
Referenced by get_fulllocalpath(), and GetSystemCorpus().
static char* changecase_string_no_copy | ( | char * | str, |
enum case_mode | mode | ||
) | [static] |
References LOWER.
Referenced by load_corpusnames().
void check_available_corpora | ( | enum corpus_type | ct | ) |
References load_corpusnames(), LOCAL_CORP_PATH, set_current_corpus(), SUB, SYSTEM, TEMP, and UNDEF.
Referenced by CorpusLoadDescriptors(), execute_side_effects(), and initialize_cqp().
int check_stamp | ( | char * | directory, |
char * | fname | ||
) |
References CL_MAX_FILENAME_LENGTH, open_file(), SUBCORPMAGIC, and SUBDIR_SEP_STRING.
CorpusList* CorpusChangeTMPtoSUB | ( | CorpusList * | tmp, |
char * | subname | ||
) |
References assign_temp_to_sub().
Boolean CorpusDiscard | ( | CorpusList * | cl, |
Boolean | remove_file_also, | ||
Boolean | save_if_unsaved | ||
) |
References dropcorpus(), and True.
Boolean CorpusDiscardTMPCorpora | ( | void | ) |
References drop_temp_corpora(), and True.
CorpusList* CorpusDuplicate | ( | CorpusList * | cl, |
char * | new_name, | ||
Boolean | force_overwrite | ||
) |
References duplicate_corpus().
CorpusList* CorpusDuplicateIntoTMP | ( | CorpusList * | cl, |
char * | new_name | ||
) |
References make_temp_corpus().
void CorpusListFree | ( | void | ) |
References free_corpuslist().
void CorpusListInit | ( | void | ) |
References init_corpuslist().
Boolean CorpusLoad | ( | CorpusList * | cl | ) |
References access_corpus().
void CorpusLoadDescriptors | ( | CorpusType | ct | ) |
References check_available_corpora().
Boolean CorpusNameQualified | ( | char * | name | ) |
References is_qualified().
Boolean CorpusNameValid | ( | char * | name | ) |
References valid_subcorpus_id().
Boolean CorpusSave | ( | CorpusList * | cl, |
char * | file_name | ||
) |
References save_subcorpus().
Boolean CorpusSaveAll | ( | void | ) |
References save_unsaved_subcorpora(), and True.
Boolean CorpusSetCurrent | ( | CorpusList * | cl | ) |
References set_current_corpus().
Boolean CorpusSetCurrentByname | ( | char * | name | ) |
References set_current_corpus_name().
void CorpusShowNames | ( | CorpusType | ct | ) |
References show_corpora_files().
Boolean CorpusTouch | ( | CorpusList * | cl | ) |
References touch_corpus().
void drop_temp_corpora | ( | void | ) |
Delete temporary corpora.
drop_temp_corpora clears the list of corpora of all temporary stuff.
References corpuslist, dropcorpus(), initialize_cl(), cl::next, TEMP, True, and cl::type.
Referenced by CorpusDiscardTMPCorpora(), do_undump(), in_UnnamedCorpusCommand(), and load_corpusnames().
void dropcorpus | ( | CorpusList * | cl | ) |
Remove a corpus from the global list of corpora.
cl | The corpus to drop. |
References corpuslist, current_corpus, initialize_cl(), cl::next, set_current_corpus(), and True.
Referenced by assign_temp_to_sub(), attach_subcorpus(), copy_intervals(), CorpusDiscard(), do_cqi_cqp_drop_subcorpus(), drop_temp_corpora(), ensure_corpus_size(), and main().
CorpusList* duplicate_corpus | ( | CorpusList * | cl, |
char * | new_name, | ||
Boolean | force_overwrite | ||
) |
Duplicate a corpus via its CorpusList object.
duplicate_corpus creates a copy of an existing corpus and casts its type to SUB. The new corpus is given the name "new_name". If a subcorpus of that name is already present, NULL is returned if force_overwrite is False. If force_overwrite is True, the old corpus is discarded.
cl | The corpus to duplicate |
new_name | Name for the duplicated corpus. |
force_overwrite | Boolean: whether or not to force an overwrite if the subcorpus you are attempting to create already exists. |
References cl::abs_fn, auto_save, cl_malloc(), cl_strdup(), cl::corpus, corpuslist, cqpmessage(), False, initialize_cl(), cl::keywords, cl::loaded, LoadedCorpus(), cl::mother_name, cl::mother_size, cl::name, cl::needs_update, NewCL(), cl::next, cl::query_corpus, cl::query_text, cl::range, cl::registry, save_subcorpus(), cl::saved, cl::size, cl::sortidx, SUB, SYSTEM, cl::targets, True, cl::type, and Warning.
Referenced by copy_intervals(), CorpusDuplicate(), findcorpus(), and in_CorpusCommand().
Boolean ensure_corpus_size | ( | CorpusList * | cl | ) |
This is an internal function used to ensure that a system corpus from the corpus list is accessible and that its size has been computed.
In case of subcorpora, this function implements delayed loading. It is necessary because of a hack that prevents CQP from determining the sizes of all known corpora at start-up (which caused annoying delays if one or more corpora are not accessible) and from reading all subcorpora in the local corpus directory (which caused a number of delays and crashes with MP templates). ensure_corpus_size() is needed by findcorpus() and ensure_syscorpus() at the very least. It may be needed in other places to keep CQP from crashing.
cl | The corpus whose accessibility is to be checked. |
References attach_subcorpus(), cderrno, cdperror_string, CL_MAX_FILENAME_LENGTH, cl::corpus, cqpmessage(), dropcorpus(), _Range::end, False, cl::loaded, cl::local_dir, cl::mother_name, cl::mother_size, cl::name, cl::range, SUB, SYSTEM, SystemCorpusSize(), True, cl::type, user_level, and Warning.
Referenced by ensure_syscorpus(), and findcorpus().
CorpusList* ensure_syscorpus | ( | char * | registry, |
char * | name | ||
) |
References corpuslist, ensure_corpus_size(), GetSystemCorpus(), LoadedCorpus(), cl::next, and SYSTEM.
Referenced by attach_subcorpus().
FieldType field_name_to_type | ( | char * | name | ) |
Returns a FieldType enumeration corresponding to the field name indicated by its string argument.
References KeywordField, MatchEndField, MatchField, NoField, and TargetField.
Referenced by do_cqi_cqp_fdist_1(), do_cqi_cqp_fdist_2(), and labellookup().
char* field_type_to_name | ( | FieldType | ft | ) |
Returns a pointer to an internal constant string that labels the FieldType argument.
References cqpmessage(), Error, KeywordField, MatchEndField, MatchField, NoField, and TargetField.
Referenced by do_AnchorPoint(), and prepare_do_subset().
CorpusList* findcorpus | ( | char * | s, |
CorpusType | type, | ||
int | try_recursive_search | ||
) |
Finds the pointer to the corpus with the given name.
When searching for s (name of corpus) strcmp() is used; no case conversion is done.
If "type" is UNDEF, it returns the first corpus with matching name. Otherwise the returned corpus has the type "type".
s | name of the corpus to find (as string) |
type | If this is UNDEF, all corpora are checked; if it is any other type, only corpora of that type are checked. |
try_recursive_search | Boolean: whether or not to try to find corpus through implicit expansion. |
References access_corpus(), ATT_STRUC, ctxtsp::attrib, calculate_leftboundary(), calculate_rightboundary(), COLON, cl::corpus, cqpmessage(), ctxtsp::direction, duplicate_corpus(), _Range::end, ensure_corpus_size(), expansion, False, find_attribute, left, leftright, LoadedCorpus(), cl::mother_name, cl::range, RangeSetop(), right, RUniq, cl::size, ctxtsp::size, SLASH, _Range::start, structure, SYSTEM, touch_corpus(), ctxtsp::type, and Warning.
Referenced by assign_temp_to_sub(), copy_intervals(), corpus_info(), cqi_find_corpus(), cqi_lookup_attribute(), do_cqi_corpus_attributes(), do_cqi_corpus_full_name(), do_undump(), make_temp_corpus(), prepare_AlignmentConstraints(), search_corpus(), set_current_corpus_name(), and valid_subcorpus_id().
CorpusList* FirstCorpusFromList | ( | ) |
Gets the CorpusList pointer for the first corpus on the currently-loaded list.
Function for iterating through the list of currently-loaded corpora.
References corpuslist.
Referenced by do_cqi_corpus_list_corpora(), do_cqi_cqp_list_subcorpora(), and main().
void free_corpuslist | ( | void | ) |
Frees the global list of currently-loaded corpora.
This function sets the corpus list to NULL and frees all members of the list.
References corpuslist, initialize_cl(), cl::next, set_current_corpus(), and True.
Referenced by CorpusListFree().
static char* get_fulllocalpath | ( | CorpusList * | cl, |
int | qualify | ||
) | [static] |
References changecase_string(), cl_free, CL_MAX_FILENAME_LENGTH, cl_strdup(), LOCAL_CORP_PATH, cl::mother_name, cl::name, and UPPER.
Referenced by attach_subcorpus().
CorpusList * GetSystemCorpus | ( | char * | name, |
char * | registry | ||
) |
References cl::abs_fn, changecase_string(), cl_strdup(), cl::corpus, _Range::end, False, cl::keywords, cl::loaded, LOWER, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, New, NewCL(), cl::next, cl::range, cl::registry, TCorpus::registry_dir, cl::saved, setup_corpus, cl::size, cl::sortidx, _Range::start, SYSTEM, cl::targets, True, and cl::type.
Referenced by ensure_syscorpus(), and load_corpusnames().
void init_corpuslist | ( | void | ) |
Initialises the global corpus list (sets it to NULL, no matter what its value was).
References set_current_corpus().
Referenced by CorpusListInit().
void initialize_cl | ( | CorpusList * | cl, |
int | free_name | ||
) |
Resets to empty a CorpusList object.
This is done, largely, by freeing all its members (and setting nonfreeable members to 0 or NULL)...
cl | The corpus list to initialise. |
free_name | Boolean: the name, mother_name and mother_size members will be cleared iff free_name. |
References cl::abs_fn, cl::cd, cl_free, cl::corpus, False, FreeContextDescriptor(), cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, cl::query_corpus, cl::query_text, cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, cl::targets, cl::type, and UNDEF.
Referenced by assign_temp_to_sub(), attach_subcorpus(), drop_temp_corpora(), dropcorpus(), duplicate_corpus(), free_corpuslist(), and make_temp_corpus().
Boolean is_qualified | ( | char * | corpusname | ) |
Checks whether corpusname is fully qualified (with name of mother corpus); does not imply syntactic validity.
References COLON.
Referenced by CorpusNameQualified(), do_undump(), and in_CorpusCommand().
void load_corpusnames | ( | enum corpus_type | ct | ) |
References accessible(), changecase_string_no_copy(), CL_MAX_FILENAME_LENGTH, CL_MAX_LINE_LENGTH, cl_standard_registry(), cl_strdup(), COLON, corpus, corpuslist, cqpmessage(), drop_temp_corpora(), False, get_path_component, GetSystemCorpus(), cl::loaded, LoadedCorpus(), LOCAL_CORP_PATH, cl::local_dir, cl::mother_name, cl::name, cl::needs_update, NewCL(), cl::next, registry, cl::saved, silent, SUB, SYSTEM, TEMP, True, cl::type, UPPER, and Warning.
Referenced by check_available_corpora().
CorpusList* LoadedCorpus | ( | char * | name, |
char * | qualifier, | ||
CorpusType | type | ||
) |
Finds a loaded corpus.
This function tries to find the corpus with name 'name' in the list of currently loaded corpora. In case of subcorpora, qualifier is the mother's name. In case of system corpora, qualifier is the registry. If qualifier is NULL, it is neglected and the first matching corpus is returned. If type is not UNDEF, only corpora of that type are returned. No side effects take place.
name | The corpus we are looking for. |
qualifier | An extra "bit" of the corpus name (see function description). |
type | Which type of corpus is wanted (may be UNDEF). |
References current_corpus, cl::mother_name, cl::name, cl::next, cl::registry, STREQ, SUB, SYSTEM, TEMP, cl::type, and UNDEF.
Referenced by duplicate_corpus(), ensure_syscorpus(), findcorpus(), and load_corpusnames().
CorpusList* make_temp_corpus | ( | CorpusList * | cl, |
char * | new_name | ||
) |
Copy a corpus as type TEMP.
make_temp_corpus makes a copy of the corpus in *cl into a corpus of type "TEMP" with name "new_name". If a temporary corpus with that name already exists, it is overwritten.
cl | The corpus to copy. |
new_name | Name for the temporary copy. |
References cl::abs_fn, cl_malloc(), cl_strdup(), cl::corpus, corpuslist, False, findcorpus(), initialize_cl(), cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, NewCL(), cl::next, cl::query_corpus, cl::query_text, cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, cl::targets, TEMP, True, and cl::type.
Referenced by CorpusDuplicateIntoTMP(), do_setop(), do_undump(), in_UnnamedCorpusCommand(), prepare_do_subset(), and prepare_Query().
CorpusList* NewCL | ( | void | ) |
Creates a new CorpusList object.
References cl::abs_fn, cl::cd, cl::corpus, False, cl::keywords, cl::loaded, cl::local_dir, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, New, cl::next, cl::query_corpus, cl::query_text, cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, cl::targets, cl::type, and UNDEF.
Referenced by duplicate_corpus(), GetSystemCorpus(), load_corpusnames(), and make_temp_corpus().
CorpusList* NextCorpusFromList | ( | CorpusList * | cl | ) |
Gets the CorpusList pointer for the next corpus on the currently-loaded list.
Function for iterating through the list of currently-loaded corpora.
cl | The current corpus on the list. |
References cl::next.
Referenced by do_cqi_corpus_list_corpora(), do_cqi_cqp_list_subcorpora(), and main().
int NrFieldValues | ( | CorpusList * | cl, |
FieldType | ft | ||
) |
References KeywordField, cl::keywords, MatchField, NoField, cl::size, TargetField, and cl::targets.
Boolean save_subcorpus | ( | CorpusList * | cl, |
char * | fname | ||
) |
References cl::abs_fn, CL_MAX_FILENAME_LENGTH, cqpmessage(), False, cl::keywords, cl::loaded, LOCAL_CORP_PATH, cl::mother_name, cl::name, cl::needs_update, open_file(), cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, SUB, SUBCORPMAGIC, SUBDIR_SEPARATOR, cl::targets, True, cl::type, and Warning.
Referenced by after_CorpusCommand(), assign_temp_to_sub(), copy_intervals(), CorpusSave(), delete_intervals(), do_save(), duplicate_corpus(), and save_unsaved_subcorpora().
void save_unsaved_subcorpora | ( | ) |
References cqpmessage(), False, LOCAL_CORP_PATH, cl::next, save_subcorpus(), cl::saved, SUB, cl::type, and Warning.
Referenced by CorpusSaveAll(), and cqp_parse_file().
CorpusList* search_corpus | ( | char * | name | ) |
Find the CorpusList object corresponding to a corpus name.
First the SUB corpora (created by queries) are searched, then the SYSTEM corpora.
name | String containing name of corpus to find. |
References findcorpus(), SUB, and SYSTEM.
Referenced by change_corpus().
int set_current_corpus | ( | CorpusList * | cp, |
int | force | ||
) |
Sets the current corpus (by pointer to the corpus).
Also, executes Xkwic side effects, if necessary.
cp | Pointer to the corpus to set as current. cp may be NULL, which is legal. |
force | If true, the current corpus is set to the specified corpus, even if it is ALREADY set to that corpus. |
References _context_description_block::attributes, CD, cl::corpus, current_corpus, DEFAULT_ATT_NAME, DestroyAttributeList(), FindInAL(), _attlist::list, _attrbuf::next, _attrbuf::status, _context_description_block::strucAttributes, and update_context_descriptor().
Referenced by after_CorpusCommand(), change_corpus(), check_available_corpora(), CorpusSetCurrent(), cqi_activate_corpus(), dropcorpus(), free_corpuslist(), init_corpuslist(), and set_current_corpus_name().
int set_current_corpus_name | ( | char * | name, |
int | force | ||
) |
Sets the current corpus (by name).
Also, executes Xkwic side effects, if necessary.
name | Name of the corpus to set as current. |
force | If true, the current corpus is set to the specified corpus, even if it is ALREADY set to that corpus. |
References findcorpus(), set_current_corpus(), and UNDEF.
Referenced by CorpusSetCurrentByname(), and initialize_cqp().
void show_corpora_files | ( | enum corpus_type | ct | ) |
A function to print out a list of corpora currently available.
"files" is a misnomer; it actually looks on the global list of currently loaded corpora, and prints their names.
Either system corpora (SYSTEM) or subcorpora (SUB) can be shown, depending on ct. If ct is UNDEF, both are shown.
For subcorpora, a bundle of other information is shown too.
ct | Type of corpus to show (SUB, SYSTEM or UNDEF). |
References show_corpora_files1(), SUB, SYSTEM, and UNDEF.
Referenced by CorpusShowNames().
void show_corpora_files1 | ( | enum corpus_type | ct | ) |
Function that does the work for show_corpora_files.
References cl_malloc(), end_indented_list(), cl::loaded, cl::mother_name, cl::name, cl::needs_update, cl::next, pretty_print, print_indented_list_br(), print_indented_list_item(), cl::saved, show_corpora_files_sort(), cl::size, start_indented_list(), SUB, SYSTEM, and cl::type.
Referenced by show_corpora_files().
static int show_corpora_files_sort | ( | const void * | p1, |
const void * | p2 | ||
) | [static] |
Internal function for sorting list of corpus names.
Referenced by show_corpora_files1().
char* split_subcorpus_name | ( | char * | corpusname, |
char * | mother_name | ||
) |
Splits a query result corpus-name into qualifier and local name.
This function splits query result name {corpusname} into qualifier (name of mother corpus) and local name; returns pointer to local name part, or NULL if {corpusname} is not syntactically valid; if mother_name is not NULL, it must point to a buffer of suitable length (CL_MAX_LINE_LENGTH is sufficient) where the qualifier will be stored (empty string for unqualified corpus, and return value == {corpusname} in this case)
References COLON.
Referenced by do_undump(), and valid_subcorpus_name().
int SystemCorpusSize | ( | Corpus * | corpus | ) |
References ATT_POS, DEFAULT_ATT_NAME, find_attribute, and get_attribute_size.
Referenced by ensure_corpus_size().
int touch_corpus | ( | CorpusList * | cp | ) |
Touches a corpus, ie, marks it as changed.
cp | The corpus to touch. This must be of type SUB. |
References cl::needs_update, cl::saved, SUB, and cl::type.
Referenced by CorpusTouch(), delete_intervals(), do_cut(), evaluate_target(), findcorpus(), RangeSetop(), set_target(), SortSubcorpus(), and SortSubcorpusRandomize().
Boolean valid_subcorpus_id | ( | char * | corpusname | ) |
References False, findcorpus(), SYSTEM, and True.
Referenced by CorpusNameValid().
Boolean valid_subcorpus_name | ( | char * | corpusname | ) |
Checks whether corpusname is syntactically valid as a query result name.
References False, split_subcorpus_name(), and True.
Referenced by do_undump().
Global list of currently-loaded corpora.
Referenced by drop_temp_corpora(), dropcorpus(), duplicate_corpus(), ensure_syscorpus(), FirstCorpusFromList(), free_corpuslist(), initialize_cqp(), load_corpusnames(), and make_temp_corpus().