# Source code for antu.io.token_indexers.single_id_token_indexer

from typing import Dict, List, Callable, TypeVar
from overrides import overrides
from .. import Vocabulary
from . import TokenIndexer
# Type variable constrained to either a flat list of ids (``List[int]``) or a
# list of id lists (``List[List[int]]``). It is not referenced by the code
# visible in this module.
Indices = TypeVar("Indices", List[int], List[List[int]])


class SingleIdTokenIndexer(TokenIndexer):
    """
    A ``SingleIdTokenIndexer`` determines how string tokens get represented as
    arrays of single id indices in a model.

    Parameters
    ----------
    related_vocabs : ``List[str]``
        Which vocabularies are related to the indexer.
    transform : ``Callable[[str,], str]``, optional (default=``lambda x:x``)
        What changes need to be made to the token when counting or indexing.
        Commonly used are lowercase transformation functions.
    """

    def __init__(
            self,
            related_vocabs: List[str],
            transform: Callable[[str, ], str] = lambda x: x) -> None:
        self.related_vocabs = related_vocabs
        self.transform = transform

    @overrides
    def count_vocab_items(
            self,
            token: str,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        The token is counted directly as an element.

        The transformed token is added to the counter of every vocabulary in
        ``related_vocabs`` that is present in ``counters``; related
        vocabularies that have no counter are skipped.

        Parameters
        ----------
        token : ``str``
            The token to count. ``transform`` is applied to it before counting.
        counters : ``Dict[str, Dict[str, int]]``
            We count the number of strings if the string needs to be counted to
            some counters. It maps a vocabulary name to that vocabulary's
            ``{item: count}`` mapping and is updated in place.
        """
        # The transformed token is the same for every vocabulary, so compute
        # it once instead of once per matching vocabulary.
        item = self.transform(token)
        for vocab_name in self.related_vocabs:
            if vocab_name in counters:
                counter = counters[vocab_name]
                # ``get`` with a default of 0 lets a plain ``dict`` be used as
                # a counter (``+= 1`` would raise ``KeyError`` on the first
                # occurrence) while giving the same result for
                # ``defaultdict(int)`` and ``Counter``.
                counter[item] = counter.get(item, 0) + 1

    @overrides
    def tokens_to_indices(
            self,
            tokens: List[str],
            vocab: Vocabulary) -> Dict[str, List[int]]:
        """
        Takes a list of tokens and converts them to one or more sets of indices.
        During the indexing process, each item corresponds to an index in the
        vocabulary.

        Parameters
        ----------
        tokens : ``List[str]``
            The tokens to index. ``transform`` is applied to each token before
            it is looked up.
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item, through
            ``vocab.get_token_index(item, vocab_name)``.

        Returns
        -------
        res : ``Dict[str, List[int]]``
            One entry per name in ``related_vocabs``.
            if the token and index list is [w1:5, w2:3, w3:0], the result will
            be {'vocab_name' : [5, 3, 0]}
        """
        # Transform every token once up front: the result does not depend on
        # the vocabulary, and materialising it here also keeps a one-shot
        # iterable of tokens from being exhausted by the first vocabulary.
        items = [self.transform(tok) for tok in tokens]
        return {
            index_name: [vocab.get_token_index(item, index_name)
                         for item in items]
            for index_name in self.related_vocabs
        }