# Source code for antu.io.fields.text_field

from typing import Dict, Iterator, List, Optional

from overrides import overrides

from ..token_indexers import TokenIndexer
from .. import Vocabulary
from . import Field


class TextField(Field):
    """
    A ``TextField`` is a data field that is commonly used in NLP tasks, and we
    can use it to store text sequences such as sentences, paragraphs, POS tags,
    and so on.

    The field behaves like a read-only sequence over its tokens: it supports
    iteration, integer indexing and ``len()``.

    Parameters
    ----------
    name : ``str``
        Field name. This is necessary and must be unique (not the same as other
        field names).
    tokens : ``List[str]``
        Field content that contains a list of strings.
    indexers : ``List[TokenIndexer]``, optional (default=``None``)
        Indexer list that defines the vocabularies associated with the field.
        When omitted (or ``None``), the field gets its own new empty list.
    """

    def __init__(self,
                 name: str,
                 tokens: List[str],
                 indexers: Optional[List[TokenIndexer]] = None):
        self.name = name
        self.tokens = tokens
        # A default of ``list()`` in the signature would be evaluated once and
        # shared by every instance created without ``indexers``; build a fresh
        # list per instance instead so fields never alias each other's state.
        self.indexers = [] if indexers is None else indexers

    def __iter__(self) -> Iterator[str]:
        """Iterate over the tokens stored in this field."""
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        """Return the token at position ``idx``."""
        return self.tokens[idx]

    def __len__(self) -> int:
        """Return the number of tokens stored in this field."""
        return len(self.tokens)

    def __str__(self) -> str:
        """Render the field as ``'<name>: [<tok>, <tok>, ...]'``."""
        return '{}: [{}]'.format(self.name, ', '.join(self.tokens))

    @overrides
    def count_vocab_items(self, counters: Dict[str, Dict[str, int]]) -> None:
        """
        We count the number of strings if the string needs to be counted to
        some counters. Nothing happens if the field has no indexers.

        Every token of the field is passed, together with ``counters``, to the
        ``count_vocab_items`` method of every indexer attached to the field.

        Parameters
        ----------
        counters : ``Dict[str, Dict[str, int]]``
            Element statistics for datasets. If field indexers indicate that
            this field is related to some counters, we use field content to
            update the counters.
        """
        for idxer in self.indexers:
            for token in self.tokens:
                idxer.count_vocab_items(token, counters)

    @overrides
    def index(self, vocab: Vocabulary) -> None:
        """
        Gets one or more index mappings for each element in the Field.

        The result is stored in ``self.indexes``, which is rebuilt from scratch
        on every call. The mappings returned by each indexer's
        ``tokens_to_indices`` are merged with ``dict.update``, so on a key
        collision the entry from the later indexer wins.

        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.
        """
        self.indexes = {}
        for idxer in self.indexers:
            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))