hipdf.core.tokenize_vocabulary.TokenizeVocabulary#

21 min read time

Applies to Linux

class hipdf.core.tokenize_vocabulary.TokenizeVocabulary(vocabulary: Series)#

Bases: object

A vocabulary object used to tokenize input text.

Parameters#

vocabulary : Series

Strings column of vocabulary terms

__init__(vocabulary: Series) → None#

Methods

__init__(vocabulary)

tokenize(text[, delimiter, default_id])

Parameters: text (cudf string series) – The strings to be tokenized. delimiter (str) – Delimiter to identify tokens. Default is whitespace. default_id (int) – Value to use for tokens not found in the vocabulary. Default is -1.

__init__(vocabulary: Series) → None#
tokenize(text, delimiter: str = '', default_id: int = -1) → Series#

Parameters#

text : cudf string series

The strings to be tokenized.

delimiter : str

Delimiter to identify tokens. The default, an empty string, means tokens are separated by whitespace.

default_id : int

Value to use for tokens not found in the vocabulary. Default is -1.

Returns#

Tokenized strings