index definition
index test1{ # index type # optional, default is 'plain' # known values are 'plain', 'distributed', and 'rt' (see samples below) # type = plain # document source(s) to index # multi-value, mandatory # document IDs must be globally unique across all sources source = src1 # index files path and file name, without extension # mandatory, path must be writable, extensions will be auto-appended path = /usr/local/sphinx/var/data/test1 # document attribute values (docinfo) storage mode # optional, default is 'extern' # known values are 'none', 'extern' and 'inline' docinfo = extern # dictionary type, 'crc' or 'keywords' # crc is faster to index when no substring/wildcards searches are needed # crc with substrings might be faster to search but is much slower to index # (because all substrings are pre-extracted as individual keywords) # keywords is much faster to index with substrings, and index is much (3-10x) smaller # keywords supports wildcards, crc does not, and never will # optional, default is 'keywords' dict = keywords # memory locking for cached data (.spa and .spi), to prevent swapping # optional, default is 0 (do not mlock) # requires searchd to be run from root mlock = 0 # a list of morphology preprocessors to apply # optional, default is empty # # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru', # 'soundex', and 'metaphone'; additional preprocessors available from # libstemmer are 'libstemmer_XXX', where XXX is algorithm code # (see libstemmer_c/libstemmer/modules.txt) # # morphology = stem_en, stem_ru, soundex # morphology = libstemmer_german # morphology = libstemmer_sv morphology = none # minimum word length at which to enable stemming # optional, default is 1 (stem everything) # # min_stemming_len = 1 # stopword files list (space separated) # optional, default is empty # contents are plain text, charset_table and stemming are both applied # # stopwords = /usr/local/sphinx/var/data/stopwords.txt # wordforms file, in "mapfrom > mapto" plain text format # optional, default is empty # # wordforms = /usr/local/sphinx/var/data/wordforms.txt # tokenizing exceptions file # optional, default is empty # # plain text, case sensitive, space insensitive in map-from part # one "Map Several Words => ToASingleOne" entry per line # # exceptions = /usr/local/sphinx/var/data/exceptions.txt # embedded file size limit # optional, default is 16K # # exceptions, wordforms, and stopwords files smaller than this limit # are stored in the index; otherwise, their paths and sizes are stored # # embedded_limit = 16K # minimum indexed word length # default is 1 (index everything) min_word_len = 1 # ignored characters list # optional, default value is empty # # ignore_chars = U+00AD # minimum word prefix length to index # optional, default is 0 (do not index prefixes) # # min_prefix_len = 0 # minimum word infix length to index # optional, default is 0 (do not index infixes) # # min_infix_len = 0 # maximum substring (prefix or infix) length to index # optional, default is 0 (do not limit substring length) # # max_substring_len = 8 # list of fields to limit prefix/infix indexing to # optional, default value is empty (index all fields in prefix/infix mode) # # prefix_fields = filename # infix_fields = url, domain # expand keywords with exact forms and/or stars when searching fit indexes # search-time only, does not affect indexing, can be 0 or 1 # optional, default is 0 (do not expand keywords) # # expand_keywords = 1 # n-gram length to index, for CJK indexing # only supports 0 and 1 for now, other lengths to be implemented # optional, default is 0 (disable n-grams) # # ngram_len = 1 # n-gram characters list, for CJK indexing # optional, default is empty # # ngram_chars = U+3000..U+2FA1F # phrase boundary characters list # optional, default is empty # # phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis # phrase boundary word position increment # optional, default is 0 # # phrase_boundary_step = 100 # blended characters list # blended chars are indexed both as separators and valid characters # for instance, AT&T will results in 3 tokens ("at", "t", and "at&t") # optional, default is empty # # blend_chars = +, &, U+23 # blended token indexing mode # a comma separated list of blended token indexing variants # known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure # optional, default is trim_none # # blend_mode = trim_tail, skip_pure # whether to strip HTML tags from incoming documents # known values are 0 (do not strip) and 1 (do strip) # optional, default is 0 html_strip = 0 # what HTML attributes to index if stripping HTML # optional, default is empty (do not index anything) # # html_index_attrs = img=alt,title; a=title; # what HTML elements contents to strip # optional, default is empty (do not strip element contents) # # html_remove_elements = style, script # whether to preopen index data files on startup # optional, default is 0 (do not preopen), searchd-only # # preopen = 1 # whether to enable in-place inversion (2x less disk, 90-95% speed) # optional, default is 0 (use separate temporary files), indexer-only # # inplace_enable = 1 # in-place fine-tuning options # optional, defaults are listed below # # inplace_hit_gap = 0 # preallocated hitlist gap size # inplace_docinfo_gap = 0 # preallocated docinfo gap size # inplace_reloc_factor = 0.1 # relocation buffer size within arena # inplace_write_factor = 0.1 # write buffer size within arena # whether to index original keywords along with stemmed versions # enables "=exactform" operator to work # optional, default is 0 # # index_exact_words = 1 # position increment on overshort (less that min_word_len) words # optional, allowed values are 0 and 1, default is 1 # # overshort_step = 1 # position increment on stopword # optional, allowed values are 0 and 1, default is 1 # # stopword_step = 1 # hitless words list # positions for these keywords will not be stored in the index # optional, allowed values are 'all', or a list file name # # hitless_words = all # hitless_words = hitless.txt # detect and index sentence and paragraph boundaries # required for the SENTENCE and PARAGRAPH operators to work # optional, allowed values are 0 and 1, default is 0 # # index_sp = 1 # index zones, delimited by HTML/XML tags # a comma separated list of tags and wildcards # required for the ZONE operator to work # optional, default is empty string (do not index zones) # # index_zones = title, h*, th # index per-document and average per-index field lengths, in tokens # required for the BM25A(), BM25F() in expression ranker # optional, default is 0 (do not index field lenghts) # # index_field_lengths = 1 # regular expressions (regexps) to filter the fields and queries with # gets applied to data source fields when indexing # gets applied to search queries when searching # multi-value, optional, default is empty list of regexps # # regexp_filter = \b(\d+)\" => \1inch # regexp_filter = (blue|red) => color # list of the words considered frequent with respect to bigram indexing # optional, default is empty # # bigram_freq_words = the, a, i, you, my # bigram indexing mode # known values are none, all, first_freq, both_freq # option, default is none (do not index bigrams) # # bigram_index = both_freq # snippet document file name prefix # preprended to file names when generating snippets using load_files option # WARNING, this is a prefix (not a path), trailing slash matters! # optional, default is empty # # snippets_file_prefix = /mnt/mydocs/server1 # whether to apply stopwords before or after stemming # optional, default is 0 (apply stopwords after stemming) # # stopwords_unstemmed = 0 # path to a global (cluster-wide) keyword IDFs file # optional, default is empty (use local IDFs) # # global_idf = /usr/local/sphinx/var/global.idf}