Source code for antu.io.fields.text_field

from typing import Dict, Iterator, List, Optional

from overrides import overrides

from ..token_indexers import TokenIndexer
from .. import Vocabulary
from . import Field


class TextField(Field):
    """
    A ``TextField`` is a data field that is commonly used in NLP tasks, and we
    can use it to store text sequences such as sentences, paragraphs, POS tags,
    and so on.

    Parameters
    ----------
    name : ``str``
        Field name. This is necessary and must be unique (not the same as
        other field names).
    tokens : ``List[str]``
        Field content that contains a list of strings.
    indexers : ``List[TokenIndexer]``, optional (default=``None``)
        Indexer list that defines the vocabularies associated with the field.
        ``None`` is treated as an empty list.
    """

    def __init__(self,
                 name: str,
                 tokens: List[str],
                 indexers: Optional[List[TokenIndexer]] = None):
        self.name = name
        self.tokens = tokens
        # A fresh list per instance: a ``list()`` default in the signature is
        # evaluated only once and would be shared by every field created
        # without explicit indexers.
        self.indexers = [] if indexers is None else indexers

    def __iter__(self) -> Iterator[str]:
        """Iterate over the stored tokens in order."""
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        """Return the token stored at position ``idx``."""
        return self.tokens[idx]

    def __len__(self) -> int:
        """Return the number of stored tokens."""
        return len(self.tokens)

    def __str__(self) -> str:
        """Render the field as ``<name>: [<tok1>, <tok2>, ...]``."""
        return '{}: [{}]'.format(self.name, ', '.join(self.tokens))

    @overrides
    def count_vocab_items(self, counters: Dict[str, Dict[str, int]]) -> None:
        """
        We count the number of strings if the string needs to be counted to
        some counters. Nothing is counted when the field has no indexers.

        Parameters
        ----------
        counters : ``Dict[str, Dict[str, int]]``
            Element statistics for datasets. If field indexers indicate that
            this field is related to some counters, we use field content to
            update the counters.
        """
        # Every indexer is offered every token; the indexer itself performs
        # the update of ``counters``.
        for idxer in self.indexers:
            for token in self.tokens:
                idxer.count_vocab_items(token, counters)

    @overrides
    def index(self, vocab: Vocabulary) -> None:
        """
        Gets one or more index mappings for each element in the Field.

        The result is stored in ``self.indexes``, which is rebuilt from
        scratch on every call.

        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.
        """
        self.indexes = {}
        # Results are merged with ``dict.update``, so when two indexers
        # produce the same key the later indexer's entry wins.
        for idxer in self.indexers:
            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))