Pre-process datasets

When training a sparse autoencoder (SAE), you often want to use a text dataset such as The Pile.

The TextDataset class can pre-process this for you on the fly (i.e. tokenize the text and split it into chunks of context_size tokens), so you can get started right away. However, if you're experimenting a lot, it can be nicer to run this pre-processing once and save the resulting dataset to HuggingFace. You can then use PreTokenizedDataset to load it directly, which saves you from re-running the pre-processing every time.
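
For example, once a pre-tokenized copy is on the hub, you can load it directly. The snippet below is a minimal sketch, assuming PreTokenizedDataset accepts a dataset_path and a context_size (mirroring TextDataset); check the class signature in your installed version.

from sparse_autoencoder import PreTokenizedDataset

# One of the pre-tokenized datasets uploaded by the code below
pre_tokenized_dataset = PreTokenizedDataset(
    dataset_path="alancooney/sae-roneneldan-TinyStories-tokenizer-gpt2",
    context_size=256,  # Assumed value; match the context size used for SAE training
)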

The following code shows you how to do this, and is also used to upload a set of commonly used datasets for SAE training to Alan Cooney's HuggingFace hub.

Setup

Note that you will also need to log in to HuggingFace via the CLI:

huggingface-cli login
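
If you prefer to authenticate from Python (e.g. inside a notebook), the huggingface_hub package also provides a login helper. This is just an alternative to the CLI command above:

from huggingface_hub import login

login()  # Prompts for a token; you can also pass token="hf_..." directly
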
# Check if we're in Colab
try:
    import google.colab  # noqa: F401 # type: ignore

    in_colab = True
except ImportError:
    in_colab = False

#  Install if in Colab
if in_colab:
    %pip install sparse_autoencoder transformer_lens transformers wandb datasets

# Otherwise enable hot reloading in dev mode
if not in_colab:
    %load_ext autoreload
    %autoreload 2
from dataclasses import dataclass
from datasets import load_dataset
from transformers import AutoTokenizer
from sparse_autoencoder import TextDataset

Upload helper

Here we define a dataclass to describe each source dataset, along with a helper function that pre-processes and uploads a list of them.

@dataclass
class DatasetToPreprocess:
    """Dataset to preprocess info."""

    source_path: str
    """Source path from HF (e.g. `roneneldan/TinyStories`)."""

    tokenizer_name: str
    """HF tokenizer name (e.g. `gpt2`)."""

    data_dir: str | None = None
    """Data directory to download from the source dataset."""

    data_files: list[str] | None = None
    """Data files to download from the source dataset."""

    hugging_face_username: str = "alancooney"
    """HF username for the upload."""

    @property
    def source_alias(self) -> str:
        """Create a source alias for the destination dataset name.

        Returns:
            The modified source path as source alias.
        """
        return self.source_path.replace("/", "-")

    @property
    def tokenizer_alias(self) -> str:
        """Create a tokenizer alias for the destination dataset name.

        Returns:
            The modified tokenizer name as tokenizer alias.
        """
        return self.tokenizer_name.replace("/", "-")

    @property
    def destination_repo_name(self) -> str:
        """Destination repo name.

        Returns:
            The destination repo name.
        """
        return f"sae-{self.source_alias}-tokenizer-{self.tokenizer_alias}"

    @property
    def destination_repo_id(self) -> str:
        """Destination repo ID.

        Returns:
            The destination repo ID.
        """
        return f"{self.hugging_face_username}/{self.destination_repo_name}"


def upload_datasets(datasets_to_preprocess: list[DatasetToPreprocess]) -> None:
    """Upload datasets to HF.

    Warning:
        Assumes you have already created the corresponding repos on HF (one way to do this is sketched after this cell).

    Args:
        datasets_to_preprocess: List of datasets to preprocess.

    Raises:
        ValueError: If the repo doesn't exist.
    """
    repositories_updating = [dataset.destination_repo_id for dataset in datasets_to_preprocess]
    print("Updating repositories:\n" "\n".join(repositories_updating))

    for dataset in datasets_to_preprocess:
        print("Processing dataset: ", dataset.source_path)

        # Preprocess
        tokenizer = AutoTokenizer.from_pretrained(dataset.tokenizer_name)
        text_dataset = TextDataset(
            dataset_path=dataset.source_path,
            tokenizer=tokenizer,
            pre_download=True,  # Must be True to upload the pre-processed dataset to the hub.
            dataset_files=dataset.data_files,
            dataset_dir=dataset.data_dir,
        )
        print("Size: ", text_dataset.dataset.size_in_bytes)
        print("Info: ", text_dataset.dataset.info)

        # Upload
        text_dataset.push_to_hugging_face_hub(repo_id=dataset.destination_repo_id)
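
Because the destination repo name is derived from the source path and tokenizer aliases, you can check where a dataset will land, and create the repo up front, before running the (slow) pre-processing. Below is a short sketch using create_repo from huggingface_hub; exist_ok=True makes it safe to re-run, and you would swap in your own hugging_face_username if you are not uploading to the default account.

from huggingface_hub import create_repo

example = DatasetToPreprocess(source_path="roneneldan/TinyStories", tokenizer_name="gpt2")
print(example.destination_repo_id)  # alancooney/sae-roneneldan-TinyStories-tokenizer-gpt2
create_repo(example.destination_repo_id, repo_type="dataset", exist_ok=True)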

Upload to Hugging Face

datasets: list[DatasetToPreprocess] = [
    DatasetToPreprocess(
        source_path="roneneldan/TinyStories",
        tokenizer_name="gpt2",
        # Get the newer versions (Generated with GPT-4 only)
        data_files=["TinyStoriesV2-GPT4-train.txt", "TinyStoriesV2-GPT4-valid.txt"],
    ),
    DatasetToPreprocess(
        source_path="monology/pile-uncopyrighted",
        tokenizer_name="gpt2",
        # Get just the first few (each file is 11GB so this should be enough for a large dataset)
        data_files=[
            "00.jsonl.zst",
            "01.jsonl.zst",
            "02.jsonl.zst",
            "03.jsonl.zst",
            "04.jsonl.zst",
            "05.jsonl.zst",
        ],
        data_dir="train",
    ),
    DatasetToPreprocess(
        source_path="monology/pile-uncopyrighted",
        tokenizer_name="EleutherAI/gpt-neox-20b",
        data_files=[
            "00.jsonl.zst",
            "01.jsonl.zst",
            "02.jsonl.zst",
            "03.jsonl.zst",
            "04.jsonl.zst",
            "05.jsonl.zst",
        ],
        data_dir="train",
    ),
]

upload_datasets(datasets)

Check a dataset is as expected

downloaded_dataset = load_dataset(
    "alancooney/sae-roneneldan-TinyStories-tokenizer-gpt2", streaming=True
)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

i = 0
first_k = 3
for data_item in iter(downloaded_dataset["train"]):  # type:ignore
    # Stop after the first few items
    if i >= first_k:
        break
    i += 1

    # Print the decoded item and its token count
    input_ids = data_item["input_ids"]
    decoded = tokenizer.decode(input_ids)
    print(f"{len(input_ids)} tokens: {decoded}")