Source code for antu.io.token_indexers.single_id_token_indexer

from typing import Dict, List, Callable, TypeVar
from overrides import overrides
from .. import Vocabulary
from . import TokenIndexer
Indices = TypeVar("Indices", List[int], List[List[int]])


class SingleIdTokenIndexer(TokenIndexer):
    """
    A ``SingleIdTokenIndexer`` determines how string tokens get represented as
    arrays of single id indices in a model.

    Parameters
    ----------
    related_vocabs : ``List[str]``
        Which vocabularies are related to the indexer.
    transform : ``Callable[[str,], str]``, optional (default=``lambda x:x``)
        What changes need to be made to the token when counting or indexing.
        Commonly used are lowercase transformation functions.
    """

    def __init__(
            self,
            related_vocabs: List[str],
            transform: Callable[[str, ], str] = lambda x: x) -> None:
        self.related_vocabs = related_vocabs
        self.transform = transform

    @overrides
    def count_vocab_items(
            self,
            token: str,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        The token is counted directly as an element.

        Parameters
        ----------
        token : ``str``
            The token to count; ``transform`` is applied to it before counting.
        counters : ``Dict[str, Dict[str, int]]``
            Counters keyed by vocabulary name. Only the counters whose name
            appears in ``related_vocabs`` are updated; related vocabularies
            that have no counter are skipped.
        """
        # The transformed token is the same for every vocabulary, so compute
        # it once instead of once per related vocabulary.
        item = self.transform(token)
        for vocab_name in self.related_vocabs:
            if vocab_name in counters:
                counter = counters[vocab_name]
                # ``.get`` keeps this working for plain dicts as well as
                # ``Counter``/``defaultdict``: a bare ``+= 1`` raises KeyError
                # the first time a token is seen in a plain dict.
                counter[item] = counter.get(item, 0) + 1

    @overrides
    def tokens_to_indices(
            self,
            tokens: List[str],
            vocab: Vocabulary) -> Dict[str, List[int]]:
        """
        Takes a list of tokens and converts them to one or more sets of
        indices. During the indexing process, each item corresponds to an
        index in the vocabulary.

        Parameters
        ----------
        tokens : ``List[str]``
            The tokens to index; ``transform`` is applied to each of them
            before the lookup.
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.

        Returns
        -------
        res : ``Dict[str, List[int]]``
            if the token and index list is [w1:5, w2:3, w3:0], the result will
            be {'vocab_name' : [5, 3, 0]}
        """
        # Transform every token exactly once up front. This avoids repeating
        # the work for each vocabulary and also means a one-shot iterable of
        # tokens is not exhausted after the first vocabulary.
        transformed = [self.transform(tok) for tok in tokens]
        return {
            index_name: [
                vocab.get_token_index(tok, index_name) for tok in transformed
            ]
            for index_name in self.related_vocabs
        }