Mock dataset¤

Mock dataset.

For use with tests and simple examples.

ConsecutiveIntHuggingFaceDataset ¤

Bases: IterableDataset

Consecutive integers Hugging Face dataset for testing.

Creates a dataset where the first item is [0,1,2...], and the second item is [1,2,3...] and so on.
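
For illustration, a minimal usage sketch (hedged: it assumes the class can be imported directly from sparse_autoencoder.source_data.mock_dataset, the module shown below):

from sparse_autoencoder.source_data.mock_dataset import ConsecutiveIntHuggingFaceDataset

# Each item is a window of consecutive integers, offset by one per item.
dataset = ConsecutiveIntHuggingFaceDataset(context_size=4, vocab_size=100, n_items=10)
iterator = iter(dataset)
print(next(iterator)["input_ids"])  # [0, 1, 2, 3]
print(next(iterator)["input_ids"])  # [1, 2, 3, 4]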

Source code in sparse_autoencoder/source_data/mock_dataset.py
class ConsecutiveIntHuggingFaceDataset(IterableDataset):
    """Consecutive integers Hugging Face dataset for testing.

    Creates a dataset where the first item is [0,1,2...], and the second item is [1,2,3...] and so
    on.
    """

    _data: Int[Tensor, "items context_size"]
    """Generated data."""

    _length: int
    """Size of the dataset."""

    _format: Literal["torch", "list"] = "list"
    """Format of the data."""

    def create_data(self, n_items: int, context_size: int) -> Int[Tensor, "items context_size"]:
        """Create the data.

        Args:
            n_items: The number of items in the dataset.
            context_size: The number of tokens in the context window.

        Returns:
            The generated data.
        """
        rows = torch.arange(n_items).unsqueeze(1)
        columns = torch.arange(context_size).unsqueeze(0)
        return rows + columns

    def __init__(self, context_size: int, vocab_size: int = 50_000, n_items: int = 10_000) -> None:
        """Initialize the mock HF dataset.

        Args:
            context_size: The number of tokens in the context window
            vocab_size: The size of the vocabulary to use.
            n_items: The number of items in the dataset.

        Raises:
            ValueError: If more items are requested than we can create with the vocab size (given
                that each item is a consecutive list of integers and unique).
        """
        self._length = n_items

        # Check we can create the data
        if n_items + context_size > vocab_size:
            error_message = (
                f"n_items ({n_items}) + context_size ({context_size}) must be less than "
                f"vocab_size ({vocab_size})"
            )
            raise ValueError(error_message)

        # Initialise the data
        self._data = self.create_data(n_items, context_size)

    def __iter__(self) -> Iterator:  # type: ignore (HF typing is incorrect)
        """Initialize the iterator.

        Returns:
            Iterator.
        """
        self._index = 0
        return self

    def __next__(self) -> TokenizedPrompts | TorchTokenizedPrompts:
        """Return the next item in the dataset.

        Returns:
            TokenizedPrompts: The next item in the dataset.

        Raises:
            StopIteration: If the end of the dataset is reached.
        """
        if self._index < self._length:
            item = self[self._index]
            self._index += 1
            return item

        raise StopIteration

    def __len__(self) -> int:
        """Len Dunder Method."""
        return self._length

    def __getitem__(self, index: int) -> TokenizedPrompts | TorchTokenizedPrompts:
        """Get Item."""
        item = self._data[index]

        if self._format == "torch":
            return {"input_ids": item}

        return {"input_ids": item.tolist()}

    def with_format(  # type: ignore (only support 2 types)
        self,
        type: Literal["torch", "list"],  # noqa: A002
    ) -> "ConsecutiveIntHuggingFaceDataset":
        """With Format."""
        self._format = type
        return self

__getitem__(index) ¤

Get Item.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def __getitem__(self, index: int) -> TokenizedPrompts | TorchTokenizedPrompts:
    """Get Item."""
    item = self._data[index]

    if self._format == "torch":
        return {"input_ids": item}

    return {"input_ids": item.tolist()}

__init__(context_size, vocab_size=50000, n_items=10000) ¤

Initialize the mock HF dataset.

Parameters:

    context_size (int, required): The number of tokens in the context window.
    vocab_size (int, default 50000): The size of the vocabulary to use.
    n_items (int, default 10000): The number of items in the dataset.

Raises:

    ValueError: If more items are requested than we can create with the vocab size (given that each item is a consecutive list of integers and unique).

Source code in sparse_autoencoder/source_data/mock_dataset.py
def __init__(self, context_size: int, vocab_size: int = 50_000, n_items: int = 10_000) -> None:
    """Initialize the mock HF dataset.

    Args:
        context_size: The number of tokens in the context window
        vocab_size: The size of the vocabulary to use.
        n_items: The number of items in the dataset.

    Raises:
        ValueError: If more items are requested than we can create with the vocab size (given
            that each item is a consecutive list of integers and unique).
    """
    self._length = n_items

    # Check we can create the data
    if n_items + context_size > vocab_size:
        error_message = (
            f"n_items ({n_items}) + context_size ({context_size}) must be less than "
            f"vocab_size ({vocab_size})"
        )
        raise ValueError(error_message)

    # Initialise the data
    self._data = self.create_data(n_items, context_size)
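
A brief sketch of the validation above, hedged: it assumes the class can be imported directly from sparse_autoencoder.source_data.mock_dataset.

from sparse_autoencoder.source_data.mock_dataset import ConsecutiveIntHuggingFaceDataset

# 95 items of 10 consecutive tokens each would need token ids beyond a 100-token vocabulary.
try:
    ConsecutiveIntHuggingFaceDataset(context_size=10, vocab_size=100, n_items=95)
except ValueError as error:
    print(error)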

__iter__() ¤

Initialize the iterator.

Returns:

    Iterator: The dataset itself, which serves as its own iterator.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def __iter__(self) -> Iterator:  # type: ignore (HF typing is incorrect)
    """Initialize the iterator.

    Returns:
        Iterator.
    """
    self._index = 0
    return self

__len__() ¤

Len Dunder Method.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def __len__(self) -> int:
    """Len Dunder Method."""
    return self._length

__next__() ¤

Return the next item in the dataset.

Returns:

    TokenizedPrompts | TorchTokenizedPrompts: The next item in the dataset.

Raises:

    StopIteration: If the end of the dataset is reached.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def __next__(self) -> TokenizedPrompts | TorchTokenizedPrompts:
    """Return the next item in the dataset.

    Returns:
        TokenizedPrompts: The next item in the dataset.

    Raises:
        StopIteration: If the end of the dataset is reached.
    """
    if self._index < self._length:
        item = self[self._index]
        self._index += 1
        return item

    raise StopIteration

create_data(n_items, context_size) ¤

Create the data.

Parameters:

    n_items (int, required): The number of items in the dataset.
    context_size (int, required): The number of tokens in the context window.

Returns:

    Int[Tensor, "items context_size"]: The generated data.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def create_data(self, n_items: int, context_size: int) -> Int[Tensor, "items context_size"]:
    """Create the data.

    Args:
        n_items: The number of items in the dataset.
        context_size: The number of tokens in the context window.

    Returns:
        The generated data.
    """
    rows = torch.arange(n_items).unsqueeze(1)
    columns = torch.arange(context_size).unsqueeze(0)
    return rows + columns
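
To see how the broadcasting builds the data, here is a small worked example with n_items=3 and context_size=4:

import torch

rows = torch.arange(3).unsqueeze(1)     # shape (3, 1): [[0], [1], [2]]
columns = torch.arange(4).unsqueeze(0)  # shape (1, 4): [[0, 1, 2, 3]]
print(rows + columns)
# tensor([[0, 1, 2, 3],
#         [1, 2, 3, 4],
#         [2, 3, 4, 5]])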

with_format(type) ¤

With Format.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def with_format(  # type: ignore (only support 2 types)
    self,
    type: Literal["torch", "list"],  # noqa: A002
) -> "ConsecutiveIntHuggingFaceDataset":
    """With Format."""
    self._format = type
    return self
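
A short illustrative sketch of how the format switch changes the returned item type (same import assumption as earlier):

dataset = ConsecutiveIntHuggingFaceDataset(context_size=4, vocab_size=100, n_items=10)
print(dataset[1]["input_ids"])                        # Python list: [1, 2, 3, 4]
print(dataset.with_format("torch")[1]["input_ids"])   # tensor([1, 2, 3, 4])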

MockDataset ¤

Bases: SourceDataset[TokenizedPrompts]

Mock dataset for testing.

For use with tests and simple examples.

Source code in sparse_autoencoder/source_data/mock_dataset.py
@final
class MockDataset(SourceDataset[TokenizedPrompts]):
    """Mock dataset for testing.

    For use with tests and simple examples.
    """

    tokenizer: PreTrainedTokenizerFast

    def preprocess(
        self,
        source_batch: TokenizedPrompts,
        *,
        context_size: int,  # noqa: ARG002
    ) -> TokenizedPrompts:
        """Preprocess a batch of prompts."""
        # Nothing to do here
        return source_batch

    @validate_call
    def __init__(
        self,
        context_size: PositiveInt = 250,
        buffer_size: PositiveInt = 1000,  # noqa: ARG002
        preprocess_batch_size: PositiveInt = 1000,  # noqa: ARG002
        dataset_path: str = "dummy",  # noqa: ARG002
        dataset_split: str = "train",  # noqa: ARG002
    ):
        """Initialize the Random Int Dummy dataset.

        Example:
            >>> data = MockDataset()
            >>> first_item = next(iter(data))
            >>> len(first_item["input_ids"])
            250

        Args:
            context_size: The context size to use when returning a list of tokenized prompts.
                *Towards Monosemanticity: Decomposing Language Models With Dictionary Learning* used
                a context size of 250.
            buffer_size: The buffer size to use when shuffling the dataset. As the dataset is
                streamed, this just pre-downloads at least `buffer_size` items and then shuffles
                just that buffer. Note that the generated activations should also be shuffled before
                training the sparse autoencoder, so a large buffer may not be strictly necessary
                here. Note also that this is the number of items in the dataset (e.g. number of
                prompts) and is typically significantly less than the number of tokenized prompts
                once the preprocessing function has been applied.
            preprocess_batch_size: The batch size to use just for preprocessing the dataset (e.g.
                tokenizing prompts).
            dataset_path: The path to the dataset on Hugging Face.
            dataset_split: Dataset split (e.g. `train`).
        """
        self.dataset = ConsecutiveIntHuggingFaceDataset(context_size=context_size)  # type: ignore
        self.context_size = context_size

__init__(context_size=250, buffer_size=1000, preprocess_batch_size=1000, dataset_path='dummy', dataset_split='train') ¤

Initialize the mock dataset.

Example:

    >>> data = MockDataset()
    >>> first_item = next(iter(data))
    >>> len(first_item["input_ids"])
    250

Parameters:

    context_size (PositiveInt, default 250): The context size to use when returning a list of tokenized prompts. Towards Monosemanticity: Decomposing Language Models With Dictionary Learning used a context size of 250.
    buffer_size (PositiveInt, default 1000): The buffer size to use when shuffling the dataset. As the dataset is streamed, this just pre-downloads at least buffer_size items and then shuffles just that buffer. Note that the generated activations should also be shuffled before training the sparse autoencoder, so a large buffer may not be strictly necessary here. Note also that this is the number of items in the dataset (e.g. number of prompts) and is typically significantly less than the number of tokenized prompts once the preprocessing function has been applied.
    preprocess_batch_size (PositiveInt, default 1000): The batch size to use just for preprocessing the dataset (e.g. tokenizing prompts).
    dataset_path (str, default 'dummy'): The path to the dataset on Hugging Face.
    dataset_split (str, default 'train'): Dataset split (e.g. train).

Source code in sparse_autoencoder/source_data/mock_dataset.py
@validate_call
def __init__(
    self,
    context_size: PositiveInt = 250,
    buffer_size: PositiveInt = 1000,  # noqa: ARG002
    preprocess_batch_size: PositiveInt = 1000,  # noqa: ARG002
    dataset_path: str = "dummy",  # noqa: ARG002
    dataset_split: str = "train",  # noqa: ARG002
):
    """Initialize the Random Int Dummy dataset.

    Example:
        >>> data = MockDataset()
        >>> first_item = next(iter(data))
        >>> len(first_item["input_ids"])
        250

    Args:
        context_size: The context size to use when returning a list of tokenized prompts.
            *Towards Monosemanticity: Decomposing Language Models With Dictionary Learning* used
            a context size of 250.
        buffer_size: The buffer size to use when shuffling the dataset. As the dataset is
            streamed, this just pre-downloads at least `buffer_size` items and then shuffles
            just that buffer. Note that the generated activations should also be shuffled before
            training the sparse autoencoder, so a large buffer may not be strictly necessary
            here. Note also that this is the number of items in the dataset (e.g. number of
            prompts) and is typically significantly less than the number of tokenized prompts
            once the preprocessing function has been applied.
        preprocess_batch_size: The batch size to use just for preprocessing the dataset (e.g.
            tokenizing prompts).
        dataset_path: The path to the dataset on Hugging Face.
        dataset_split: Dataset split (e.g. `train`).
    """
    self.dataset = ConsecutiveIntHuggingFaceDataset(context_size=context_size)  # type: ignore
    self.context_size = context_size

preprocess(source_batch, *, context_size) ¤

Preprocess a batch of prompts.

Source code in sparse_autoencoder/source_data/mock_dataset.py
def preprocess(
    self,
    source_batch: TokenizedPrompts,
    *,
    context_size: int,  # noqa: ARG002
) -> TokenizedPrompts:
    """Preprocess a batch of prompts."""
    # Nothing to do here
    return source_batch
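
As a hedged sketch of the no-op behaviour (assuming MockDataset is imported from the module above):

from sparse_autoencoder.source_data.mock_dataset import MockDataset

mock = MockDataset(context_size=4)
batch = {"input_ids": [[0, 1, 2, 3]]}
# The mock data is already tokenized, so preprocessing returns the batch unchanged.
assert mock.preprocess(batch, context_size=4) is batch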