# Source code for torch_frame.config.text_tokenizer

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass

from torch_frame.typing import TextTokenizationOutputs


@dataclass
class TextTokenizerConfig:
    r"""Configuration for a text tokenizer that maps a list of
    strings/sentences into a dictionary of :class:`MultiNestedTensor`.

    Args:
        text_tokenizer (callable): A callable text tokenizer that takes a
            list of strings as input and outputs a list of dictionaries.
            Each dictionary contains keys that are arguments to the text
            encoder model and values are corresponding tensors such as
            tokens and attention masks.
        batch_size (int, optional): Batch size to use when tokenizing the
            sentences. If set to :obj:`None`, the sentences will be
            tokenized in a full-batch manner. (default: :obj:`None`)
    """
    # Callable applied to a list of sentences to produce tokenizer outputs.
    text_tokenizer: Callable[[list[str]], TextTokenizationOutputs]
    # Number of sentences per tokenization batch; ``None`` means one batch.
    batch_size: int | None = None