TextSearch API

Base.:* — Method

*(a::Dict{Ti,Tv}, b::Dict{Ti,Tv}) where {Ti,Tv<:Real}
*(a::Dict{K, V}, b::F) where K where {V<:Real} where {F<:Real}

Computes the element-wise product of a and b

source

Base.:+ — Method

+(a::Dict{Ti,Tv}, b::Dict{Ti,Tv}) where {Ti,Tv<:Real}
+(a::Dict, b::Pair)

Computes the sum of a and b

source

Base.:- — Method

-(a::Dict{Ti,Tv}, b::Dict{Ti,Tv}) where {Ti,Tv<:Real}

Substracts of b of a

source

Base.:/ — Method

/(a::Dict{K, V}, b::F) where K where {V<:Real} where {F<:Real}

Computes the element-wise division of a and b

source

Base.sum — Method

Base.sum(col::AbstractVector{<:Dict})

Computes the sum of the given list of vectors

source

Base.zero — Method

zero(::Type{Dict{Ti,Tv}}) where {Ti,Tv}

Creates an empty Dict vector

source

Distances.evaluate — Method

evaluate(::Angle, a::Dict, b::Dict)::Float64

Computes the angle between two Dict sparse vectors

source

Distances.evaluate — Method

evaluate(::Cosine, a::Dict, b::Dict)::Float64

Computes the cosine distance between two Dict sparse vectors

source

Distances.evaluate — Method

evaluate(::NormAngle, a::Dict, b::Dict)::Float64

Computes the angle between two Dict sparse vectors

It supposes that all bags are normalized (see normalize! function)

source

Distances.evaluate — Method

evaluate(::NormCosine, a::Dict, b::Dict)::Float64

Computes the cosine distance between two Dict sparse vectors

It supposes that bags are normalized (see normalize! function)

source

LinearAlgebra.dot — Method

dot(a::Dict, b::Dict)

Computes the dot product for two Dict vectors

source

LinearAlgebra.norm — Method

norm(a::Dict)

Computes a normalized Dict vector

source

LinearAlgebra.normalize! — Method

normalize!(bow::Dict)

Inplace normalization of bow

source

SimilaritySearch.search — Method

search(acceptpostinglist::Function, idx::BM25InvertedFile, ctx::InvertedFileContext, qtext::AbstractString, res::AbstractKnn search(idx::BM25InvertedFile, ctx::InvertedFileContext, qtext::AbstractString, res::AbstractKnn

Find candidates for solving query Q using idx. It calls callback on each candidate (docID, dist)

source

SparseArrays.sparsevec — Method

sparsevec(vec::Dict{Ti,Tv}, m=0) where {Ti<:Integer,Tv<:Number}

Creates a sparse vector from a Dict-based sparse vector

source

TextSearch.add! — Method

add!(a::Dict{Ti,Tv}, b::Dict{Ti,Tv}) where {Ti,Tv<:Real}
add!(a::Dict{Ti,Tv}, b::AbstractSparseArray) where {Ti,Tv<:Real}
add!(a::Dict{Ti,Tv}, b::Pair{Ti,Tv}) where {Ti,Tv<:Real}

Updates a to the sum of a+b

source

TextSearch.bagofwords! — Method

bagofwords!(bow::BOW, voc::Vocabulary, tokenlist::TokenizedText)
bagofwords!(buff::TextSearchBuffer, voc::Vocabulary, text)
bagofwords(voc::Vocabulary, messages)

Creates a bag of words from the given text (a string or a list of strings). If bow is given then updates the bag with the text. When config is given, the text is parsed according to it.

source

TextSearch.bagofwords! — Method

bagofwords(voc::Vocabulary, messages)
bagofwords!(buff, voc::Vocabulary, messages)

Computes a bag of words from messages

source

TextSearch.bagofwords_corpus — Method

bagofwords_corpus(voc::Vocabulary, corpus::AbstractVector; minbatch=0)

Computes a list of bag of words from a corpus

source

TextSearch.centroid — Method

centroid(cluster::AbstractVector{<:Dict})

Computes a centroid of the given list of Dict vectors

source

TextSearch.collocations — Method

collocations(q, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Computes a kind of collocations of the given text

source

TextSearch.dvec — Method

dvec(x::AbstractSparseVector)

Converts an sparse vector into a dict-based sparse vector

source

TextSearch.filter_tokens! — Method

filter_tokens!(voc::Vocabulary, text::TokenizedText)

Removes tokens from text array

source

TextSearch.filter_tokens! — Method

filter_tokens!(voc::Vocabulary, text::TokenizedText)

Removes tokens from a given tokenized text based using the valid vocabulary

source

TextSearch.filter_tokens — Method

filter_tokens(pred::Function, voc::Vocabulary)

Returns a copy of reduced vocabulary based on evaluating pred function for each entry in voc

source

TextSearch.flush_collocation! — Method

flush_collocations!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Pushes a collocation inside the buffer to the token list; it discards empty strings.

source

TextSearch.flush_nword! — Method

flush_nword!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Pushes the nword inside the buffer to the token list; it discards empty strings.

source

TextSearch.flush_qgram! — Method

flush_qgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Pushes the qgram inside the buffer to the token list; it discards empty strings.

source

TextSearch.flush_skipgram! — Method

flush_skipgram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Pushes the skipgram inside the buffer to the token list; it discards empty strings.

source

TextSearch.flush_unigram! — Method

flush_unigram!(buff::TextSearchBuffer, tt::AbstractTokenTransformation)

Pushes the word inside the buffer to the token list; it discards empty strings.

source

TextSearch.merge_voc — Method

merge_voc(voc1::Vocabulary, voc2::Vocabulary[, ...])
merge_voc(pred::Function, voc1::Vocabulary, voc2::Vocabulary[, ...])

Merges two or more vocabularies into a new one. A predicate function can be used to filter token entries.

Note: All vocabularies should had been created with a compatible TextConfig to be able to work on them.

source

TextSearch.normalize_text — Method

normalize_text(config::TextConfig, text::AbstractString, output::Vector{Char}; limits::Bool=true)

Normalizes a given text using the specified transformations of config

source

TextSearch.nwords — Method

nwords(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

source

TextSearch.qgrams — Method

qgrams(q::Integer, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Computes character q-grams for the given input

source

TextSearch.skipgrams — Method

skipgrams(q::Skipgram, buff::TextSearchBuffer, tt::AbstractTokenTransformation, mark_token_type)

Tokenizes using skipgrams

source

TextSearch.sparse_coo — Method

sparse(cols::AbstractVector{<:Dict}, m=0; minweight=1e-9) 
sparse_coo(cols::AbstractVector{<:Dict}, minweight=1e-9)

Creates a sparse matrix from an array of Dict sparse vectors.

source

TextSearch.tokenize — Method

tokenize(textconfig::TextConfig, text)
tokenize(copy_::Function, textconfig::TextConfig, text)

tokenize(textconfig::TextConfig, text, buff)
tokenize(copy_::Function, textconfig::TextConfig, text, buff)

Tokenizes text using the given configuration. The tokenize makes heavy usage of buffers, and when these buffers are shared it is mandatory to create a copy of the result (buff.tokens).

Change the default copy function to make an additional filtering of the tokens. You can also pass the identity function to avoid copying.

source

TextSearch.tokenize_and_append! — Method

tokenize_and_append!(voc::Vocabulary, corpus; minbatch=0)

Parse each document in the given corpus and appends each token to the vocabulary.

source

TextSearch.tokenize_corpus — Method

tokenize_corpus(textconfig::TextConfig, arr; minbatch=0, verbose=true)
tokenize_corpus(copy_::Function, textconfig::TextConfig, arr; minbatch=0, verbose=true)

Tokenize a list of texts. The copy_ function is passed to tokenize as first argument.

source

TextSearch.transform_collocation — Method

transform_collocation(::AbstractTokenTransformation, tok)

Hook applied in the tokenization stage to change the input token tok if needed. Return nothing to ignore the tok occurence (e.g., stop words).

source

TextSearch.transform_nword — Method

transform_nword(::AbstractTokenTransformation, tok)

Hook applied in the tokenization stage to change the input token tok if needed. For instance, it can be used to apply stemming or any other kind of normalization. Return nothing to ignore the tok occurence (e.g., stop words).

source

TextSearch.transform_qgram — Method

transform_qgram(::AbstractTokenTransformation, tok)

source

TextSearch.transform_skipgram — Method

transform_skipgram(::AbstractTokenTransformation, tok)

source

TextSearch.transform_unigram — Method

transform_unigram(::AbstractTokenTransformation, tok)

source

TextSearch.unigrams — Method

unigrams(buff::TextSearchBuffer, tt::AbstractTokenTransformation)

Performs the word tokenization

source

TextSearch.update_voc! — Method

update_voc!(voc::Vocabulary, another::Vocabulary)
update_voc!(pred::Function, voc::Vocabulary, another::Vocabulary)

Update voc vocabulary using another vocabulary. Optionally a predicate can be given to filter vocabularies.

Note 1: corpuslen remains unchanged (the structure is immutable and a new Vocabulary should be created to update this field). Note 2: Both voc and another vocabularies should had been created with a compatible TextConfig to be able to work on them.

source

TextSearch.vectorize! — Method

vectorize!(buff::TextSearchBuffer, model::VectorModel{G_,L_}, bow::BOW; normalize=true, minweight=1e-9) where {G_,L_}

Computes a weighted vector using the given bag of words and the specified weighting scheme.

source

TextSearch.vocab_from_small_collection — Method

Vocabulary(textconfig, corpus; minbatch=0)

Computes a vocabulary from a corpus using the TextConfig textconfig.

source

TextSearch.BM25InvertedFile — Type

struct BM25InvertedFile <: AbstractInvertedFile

Parameters

source

TextSearch.BM25InvertedFile — Method

BM25InvertedFile(voc::Vocabulary, db=nothing)

Fits the BM25 score from the given vocabulary it also creates the associated inverted file structure.

source

TextSearch.BinaryGlobalWeighting — Type

BinaryGlobalWeighting()

The weight is 1 for known tokens, 0 for out of vocabulary tokens

source

TextSearch.BinaryLocalWeighting — Type

BinaryLocalWeighting()

The weight is 1 for known tokens, 0 for out of vocabulary tokens

source

TextSearch.EntropyWeighting — Type

EntropyWeighting(; smooth=0.0, lowerweight=0.0, weights=:balance)

Entropy weighting uses the empirical entropy of the vocabulary along classes to produce a notion of importance for each token

source

TextSearch.FreqWeighting — Type

FreqWeighting()

Frequency weighting

source

TextSearch.GlobalWeighting — Type

GlobalWeighting

Abstract type for global weighting

source

TextSearch.IdfWeighting — Type

IdfWeighting()

Inverse document frequency weighting

source

TextSearch.LocalWeighting — Type

LocalWeighting

Abstract type for local weighting

source

TextSearch.Skipgram — Type

Skipgram(qsize, skip)

A skipgram is a kind of tokenization where qsize words having skip separation are used as a single token.

source

TextSearch.TextConfig — Type

TextConfig(;
    del_diac::Bool=true,
    del_dup::Bool=false,
    del_punc::Bool=false,
    group_num::Bool=true,
    group_url::Bool=true,
    group_usr::Bool=false,
    group_emo::Bool=false,
    lc::Bool=true,
    collocations::Int8=0,
    qlist::Vector=Int8[],
    nlist::Vector=Int8[],
    slist::Vector{Skipgram}=Skipgram[],
    mark_token_type::Bool = true
    tt=IdentityTokenTransformation()
)

Defines a preprocessing and tokenization pipeline

del_diac: indicates if diacritic symbols should be removed
del_dup: indicates if duplicate contiguous symbols must be replaced for a single symbol
del_punc: indicates if punctuaction symbols must be removed
group_num: indicates if numbers should be grouped _num
group_url: indicates if urls should be grouped as _url
group_usr: indicates if users (@usr) should be grouped as _usr
group_emo: indicates if emojis should be grouped as _emo
lc: indicates if the text should be normalized to lower case
collocations: window to expand collocations as tokens, please take into account that:
- 0 => disables collocations
- 1 => will compute words (ignored in favor of use typical unigrams)
- 2 => will compute bigrams (don't use this, but not disabled)
- 3 <= typical values
qlist: a list of character q-grams to use
nlist: a list of words n-grams to use
slist: a list of skip-grams tokenizers to use
mark_token_type: each token is marked with its type (qgram, skipgram, nword) when is true.
tt: An AbstractTokenTransformation struct

Note: If qlist, nlist, and slists are all empty arrays, then it defaults to nlist=[1]

source

TextSearch.TextModel — Type

Model

An abstract type that represents a weighting model

source

TextSearch.TfWeighting — Type

TfWeighting()

Term frequency weighting

source

TextSearch.TpWeighting — Type

TpWeighting()

Term probability weighting

source

TextSearch.VectorModel — Method

VectorModel(ent::EntropyWeighting, lw::LocalWeighting, corpus::BOW, labels;
    mindocs::Integer=1,
    smooth::Float64=0.0,
    weights=:balance
    comb::CombineWeighting=NormalizedEntropy(),
)

Creates a vector model using the input corpus.

source

TextSearch.Vocabulary — Method

Vocabulary(textconfig::TextConfig, trainsize::Int, numtokens::Int)

Creates a Vocabulary struct

source