Generic Text Dataset Module for Hugging Face Datasets¤

TextDataset should work with the following datasets:

- monology/pile-uncopyrighted
- the_pile_openwebtext2
- roneneldan/TinyStories
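
A minimal usage sketch, assuming the class is importable from the module path shown in the source listings below and that a GPT-2 tokenizer suits the chosen dataset:

from transformers import AutoTokenizer

from sparse_autoencoder.source_data.text_dataset import TextDataset

# Any Hugging Face dataset exposing a 'text' column should work, e.g. TinyStories.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = TextDataset(
    dataset_path="roneneldan/TinyStories",
    tokenizer=tokenizer,
    context_size=256,
)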

GenericTextDataBatch ¤

Bases: TypedDict

Generic Text Dataset Batch.

Assumes the dataset provides a 'text' field with a list of strings.

Source code in sparse_autoencoder/source_data/text_dataset.py
class GenericTextDataBatch(TypedDict):
    """Generic Text Dataset Batch.

    Assumes the dataset provides a 'text' field with a list of strings.
    """

    text: list[str]
    meta: list[dict[str, dict[str, str]]]  # Optional, depending on the dataset structure.
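
For illustration, a conforming batch might look like the following sketch (the metadata values are invented and will vary by dataset):

from sparse_autoencoder.source_data.text_dataset import GenericTextDataBatch

batch: GenericTextDataBatch = {
    "text": ["Once upon a time ...", "The quick brown fox ..."],
    "meta": [
        {"source": {"name": "example"}},  # invented metadata, for illustration only
        {"source": {"name": "example"}},
    ],
}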

TextDataset ¤

Bases: SourceDataset[GenericTextDataBatch]

Generic Text Dataset for any text-based dataset from Hugging Face.

Source code in sparse_autoencoder/source_data/text_dataset.py
@final
class TextDataset(SourceDataset[GenericTextDataBatch]):
    """Generic Text Dataset for any text-based dataset from Hugging Face."""

    tokenizer: PreTrainedTokenizerBase

    def preprocess(
        self,
        source_batch: GenericTextDataBatch,
        *,
        context_size: int,
    ) -> TokenizedPrompts:
        """Preprocess a batch of prompts.

        Tokenizes and chunks text data into lists of tokenized prompts with specified context size.

        Args:
            source_batch: A batch of source data, including 'text' with a list of strings.
            context_size: Context size for tokenized prompts.

        Returns:
            Tokenized prompts.
        """
        prompts: list[str] = source_batch["text"]

        tokenized_prompts = self.tokenizer(prompts, truncation=True, padding=False)

        # Chunk each tokenized prompt into blocks of context_size, discarding incomplete blocks.
        context_size_prompts = []
        for encoding in list(tokenized_prompts[self._dataset_column_name]):  # type: ignore
            chunks = [
                encoding[i : i + context_size]
                for i in range(0, len(encoding), context_size)
                if len(encoding[i : i + context_size]) == context_size
            ]
            context_size_prompts.extend(chunks)

        return {"input_ids": context_size_prompts}

    @validate_call(config={"arbitrary_types_allowed": True})
    def __init__(
        self,
        dataset_path: str,
        tokenizer: PreTrainedTokenizerBase,
        buffer_size: PositiveInt = 1000,
        context_size: PositiveInt = 256,
        dataset_dir: str | None = None,
        dataset_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None,
        dataset_split: str = "train",
        dataset_column_name: str = "input_ids",
        n_processes_preprocessing: PositiveInt | None = None,
        preprocess_batch_size: PositiveInt = 1000,
        *,
        pre_download: bool = False,
    ):
        """Initialize a generic text dataset from Hugging Face.

        Args:
            dataset_path: Path to the dataset on Hugging Face (e.g. `'monology/pile-uncopyrighted'`).
            tokenizer: Tokenizer to process text data.
            buffer_size: The buffer size to use when shuffling the dataset when streaming. When
                streaming a dataset, this just pre-downloads at least `buffer_size` items and then
                shuffles just that buffer. Note that the generated activations should also be
                shuffled before training the sparse autoencoder, so a large buffer may not be
                strictly necessary here. Note also that this is the number of items in the dataset
                (e.g. number of prompts) and is typically significantly less than the number of
                tokenized prompts once the preprocessing function has been applied.
            context_size: The context size to use when returning a list of tokenized prompts.
                *Towards Monosemanticity: Decomposing Language Models With Dictionary Learning* used
                a context size of 250.
            dataset_dir: The `data_dir` of the dataset configuration.
            dataset_files: Path(s) to source data file(s).
            dataset_split: Dataset split (e.g., 'train').
            dataset_column_name: The column name for the prompts.
            n_processes_preprocessing: Number of processes to use for preprocessing.
            preprocess_batch_size: Batch size for preprocessing (tokenizing prompts).
            pre_download: Whether to pre-download the whole dataset.
        """
        self.tokenizer = tokenizer

        super().__init__(
            buffer_size=buffer_size,
            context_size=context_size,
            dataset_dir=dataset_dir,
            dataset_files=dataset_files,
            dataset_path=dataset_path,
            dataset_split=dataset_split,
            dataset_column_name=dataset_column_name,
            n_processes_preprocessing=n_processes_preprocessing,
            pre_download=pre_download,
            preprocess_batch_size=preprocess_batch_size,
        )

    @validate_call
    def push_to_hugging_face_hub(
        self,
        repo_id: str,
        commit_message: str = "Upload preprocessed dataset using sparse_autoencoder.",
        max_shard_size: str | None = None,
        n_shards: PositiveInt = 64,
        revision: str = "main",
        *,
        private: bool = False,
    ) -> None:
        """Share preprocessed dataset to Hugging Face hub.

        Motivation:
            Pre-processing a dataset can be time-consuming, so it is useful to be able to share the
            pre-processed dataset with others. This function allows you to do that by pushing the
            pre-processed dataset to the Hugging Face hub.

        Warning:
            You must be logged into HuggingFace (e.g. with `huggingface-cli login` from the terminal)
            to use this.

        Warning:
            This will only work if the dataset is not streamed (i.e. if `pre_download=True` when
            initializing the dataset).

        Args:
            repo_id: Hugging Face repo ID to save the dataset to (e.g. `username/dataset_name`).
            commit_message: Commit message.
            max_shard_size: Maximum shard size (e.g. `'500MB'`). Should not be set if `n_shards`
                is set.
            n_shards: Number of shards to split the dataset into. A high number is recommended
                here to allow for flexible distributed training of SAEs across nodes (where e.g.
                each node fetches its own shard).
            revision: Branch to push to.
            private: Whether to save the dataset privately.

        Raises:
            TypeError: If the dataset is streamed.
        """
        if isinstance(self.dataset, IterableDataset):
            error_message = (
                "Cannot share a streamed dataset to Hugging Face. "
                "Please use `pre_download=True` when initializing the dataset."
            )
            raise TypeError(error_message)

        self.dataset.push_to_hub(
            repo_id=repo_id,
            commit_message=commit_message,
            max_shard_size=max_shard_size,
            num_shards=n_shards,
            private=private,
            revision=revision,
        )

__init__(dataset_path, tokenizer, buffer_size=1000, context_size=256, dataset_dir=None, dataset_files=None, dataset_split='train', dataset_column_name='input_ids', n_processes_preprocessing=None, preprocess_batch_size=1000, *, pre_download=False) ¤

Initialize a generic text dataset from Hugging Face.

Parameters:

- dataset_path (str, required): Path to the dataset on Hugging Face (e.g. 'monology/pile-uncopyrighted').
- tokenizer (PreTrainedTokenizerBase, required): Tokenizer to process text data.
- buffer_size (PositiveInt, default 1000): The buffer size to use when shuffling the dataset when streaming. When streaming a dataset, this just pre-downloads at least buffer_size items and then shuffles just that buffer. Note that the generated activations should also be shuffled before training the sparse autoencoder, so a large buffer may not be strictly necessary here. Note also that this is the number of items in the dataset (e.g. number of prompts) and is typically significantly less than the number of tokenized prompts once the preprocessing function has been applied.
- context_size (PositiveInt, default 256): The context size to use when returning a list of tokenized prompts. Towards Monosemanticity: Decomposing Language Models With Dictionary Learning used a context size of 250.
- dataset_dir (str | None, default None): The data_dir of the dataset configuration.
- dataset_files (str | Sequence[str] | Mapping[str, str | Sequence[str]] | None, default None): Path(s) to source data file(s).
- dataset_split (str, default 'train'): Dataset split (e.g. 'train').
- dataset_column_name (str, default 'input_ids'): The column name for the prompts.
- n_processes_preprocessing (PositiveInt | None, default None): Number of processes to use for preprocessing.
- preprocess_batch_size (PositiveInt, default 1000): Batch size for preprocessing (tokenizing prompts).
- pre_download (bool, default False): Whether to pre-download the whole dataset.
Source code in sparse_autoencoder/source_data/text_dataset.py
@validate_call(config={"arbitrary_types_allowed": True})
def __init__(
    self,
    dataset_path: str,
    tokenizer: PreTrainedTokenizerBase,
    buffer_size: PositiveInt = 1000,
    context_size: PositiveInt = 256,
    dataset_dir: str | None = None,
    dataset_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None,
    dataset_split: str = "train",
    dataset_column_name: str = "input_ids",
    n_processes_preprocessing: PositiveInt | None = None,
    preprocess_batch_size: PositiveInt = 1000,
    *,
    pre_download: bool = False,
):
    """Initialize a generic text dataset from Hugging Face.

    Args:
        dataset_path: Path to the dataset on Hugging Face (e.g. `'monology/pile-uncopyrighted'`).
        tokenizer: Tokenizer to process text data.
        buffer_size: The buffer size to use when shuffling the dataset when streaming. When
            streaming a dataset, this just pre-downloads at least `buffer_size` items and then
            shuffles just that buffer. Note that the generated activations should also be
            shuffled before training the sparse autoencoder, so a large buffer may not be
            strictly necessary here. Note also that this is the number of items in the dataset
            (e.g. number of prompts) and is typically significantly less than the number of
            tokenized prompts once the preprocessing function has been applied.
        context_size: The context size to use when returning a list of tokenized prompts.
            *Towards Monosemanticity: Decomposing Language Models With Dictionary Learning* used
            a context size of 250.
        dataset_dir: The `data_dir` of the dataset configuration.
        dataset_files: Path(s) to source data file(s).
        dataset_split: Dataset split (e.g., 'train').
        dataset_column_name: The column name for the prompts.
        n_processes_preprocessing: Number of processes to use for preprocessing.
        preprocess_batch_size: Batch size for preprocessing (tokenizing prompts).
        pre_download: Whether to pre-download the whole dataset.
    """
    self.tokenizer = tokenizer

    super().__init__(
        buffer_size=buffer_size,
        context_size=context_size,
        dataset_dir=dataset_dir,
        dataset_files=dataset_files,
        dataset_path=dataset_path,
        dataset_split=dataset_split,
        dataset_column_name=dataset_column_name,
        n_processes_preprocessing=n_processes_preprocessing,
        pre_download=pre_download,
        preprocess_batch_size=preprocess_batch_size,
    )
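
A sketch with non-default options (the data-file pattern below is hypothetical; substitute whatever files the target dataset actually provides):

from transformers import AutoTokenizer

from sparse_autoencoder.source_data.text_dataset import TextDataset

tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = TextDataset(
    dataset_path="monology/pile-uncopyrighted",
    tokenizer=tokenizer,
    context_size=250,  # the context size used in Towards Monosemanticity
    buffer_size=2000,
    dataset_files="train/00.jsonl.zst",  # hypothetical subset of the data files
    dataset_split="train",
    n_processes_preprocessing=4,
)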

preprocess(source_batch, *, context_size) ¤

Preprocess a batch of prompts.

Tokenizes and chunks text data into lists of tokenized prompts with specified context size.

Parameters:

- source_batch (GenericTextDataBatch, required): A batch of source data, including 'text' with a list of strings.
- context_size (int, required): Context size for tokenized prompts.

Returns:

- TokenizedPrompts: Tokenized prompts.

Source code in sparse_autoencoder/source_data/text_dataset.py
def preprocess(
    self,
    source_batch: GenericTextDataBatch,
    *,
    context_size: int,
) -> TokenizedPrompts:
    """Preprocess a batch of prompts.

    Tokenizes and chunks text data into lists of tokenized prompts with specified context size.

    Args:
        source_batch: A batch of source data, including 'text' with a list of strings.
        context_size: Context size for tokenized prompts.

    Returns:
        Tokenized prompts.
    """
    prompts: list[str] = source_batch["text"]

    tokenized_prompts = self.tokenizer(prompts, truncation=True, padding=False)

    # Chunk each tokenized prompt into blocks of context_size, discarding incomplete blocks.
    context_size_prompts = []
    for encoding in list(tokenized_prompts[self._dataset_column_name]):  # type: ignore
        chunks = [
            encoding[i : i + context_size]
            for i in range(0, len(encoding), context_size)
            if len(encoding[i : i + context_size]) == context_size
        ]
        context_size_prompts.extend(chunks)

    return {"input_ids": context_size_prompts}

push_to_hugging_face_hub(repo_id, commit_message='Upload preprocessed dataset using sparse_autoencoder.', max_shard_size=None, n_shards=64, revision='main', *, private=False) ¤

Share preprocessed dataset to Hugging Face hub.

Motivation

Pre-processing a dataset can be time-consuming, so it is useful to be able to share the pre-processed dataset with others. This function allows you to do that by pushing the pre-processed dataset to the Hugging Face hub.

Warning

You must be logged into HuggingFace (e.g. with huggingface-cli login from the terminal) to use this.

Warning

This will only work if the dataset is not streamed (i.e. if pre_download=True when initializing the dataset).

Parameters:

- repo_id (str, required): Hugging Face repo ID to save the dataset to (e.g. username/dataset_name).
- commit_message (str, default 'Upload preprocessed dataset using sparse_autoencoder.'): Commit message.
- max_shard_size (str | None, default None): Maximum shard size (e.g. '500MB'). Should not be set if n_shards is set.
- n_shards (PositiveInt, default 64): Number of shards to split the dataset into. A high number is recommended here to allow for flexible distributed training of SAEs across nodes (where e.g. each node fetches its own shard).
- revision (str, default 'main'): Branch to push to.
- private (bool, default False): Whether to save the dataset privately.

Raises:

- TypeError: If the dataset is streamed.

Source code in sparse_autoencoder/source_data/text_dataset.py
@validate_call
def push_to_hugging_face_hub(
    self,
    repo_id: str,
    commit_message: str = "Upload preprocessed dataset using sparse_autoencoder.",
    max_shard_size: str | None = None,
    n_shards: PositiveInt = 64,
    revision: str = "main",
    *,
    private: bool = False,
) -> None:
    """Share preprocessed dataset to Hugging Face hub.

    Motivation:
        Pre-processing a dataset can be time-consuming, so it is useful to be able to share the
        pre-processed dataset with others. This function allows you to do that by pushing the
        pre-processed dataset to the Hugging Face hub.

    Warning:
        You must be logged into HuggingFace (e.g. with `huggingface-cli login` from the terminal)
        to use this.

    Warning:
        This will only work if the dataset is not streamed (i.e. if `pre_download=True` when
        initializing the dataset).

    Args:
        repo_id: Hugging Face repo ID to save the dataset to (e.g. `username/dataset_name`).
        commit_message: Commit message.
        max_shard_size: Maximum shard size (e.g. `'500MB'`). Should not be set if `n_shards`
            is set.
        n_shards: Number of shards to split the dataset into. A high number is recommended
            here to allow for flexible distributed training of SAEs across nodes (where e.g.
            each node fetches its own shard).
        revision: Branch to push to.
        private: Whether to save the dataset privately.

    Raises:
        TypeError: If the dataset is streamed.
    """
    if isinstance(self.dataset, IterableDataset):
        error_message = (
            "Cannot share a streamed dataset to Hugging Face. "
            "Please use `pre_download=True` when initializing the dataset."
        )
        raise TypeError(error_message)

    self.dataset.push_to_hub(
        repo_id=repo_id,
        commit_message=commit_message,
        max_shard_size=max_shard_size,
        num_shards=n_shards,
        private=private,
        revision=revision,
    )
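
A sketch of the end-to-end flow (the repo ID is a placeholder; pre_download=True is required because a streamed dataset cannot be pushed):

from transformers import AutoTokenizer

from sparse_autoencoder.source_data.text_dataset import TextDataset

tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = TextDataset(
    dataset_path="roneneldan/TinyStories",
    tokenizer=tokenizer,
    pre_download=True,  # a streamed dataset raises TypeError on push
)
dataset.push_to_hugging_face_hub(
    repo_id="your-username/tinystories-tokenized",  # placeholder repo ID
    n_shards=64,
)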