Source code for torch_frame.config.text_tokenizer

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass

from torch_frame.typing import TextTokenizationOutputs


[docs]@dataclass class TextTokenizerConfig: r"""Text tokenizer that maps a list of strings/sentences into a dictionary of :class:`MultiNestedTensor`. Args: text_tokenizer (callable): A callable text tokenizer that takes a list of strings as input and outputs a list of dictionaries. Each dictionary contains keys that are arguments to the text encoder model and values are corresponding tensors such as tokens and attention masks. batch_size (int, optional): Batch size to use when tokenizing the sentences. If set to :obj:`None`, the text embeddings will be obtained in a full-batch manner. (default: :obj:`None`) """ text_tokenizer: Callable[[list[str]], TextTokenizationOutputs] batch_size: int | None = None