Pre-process datasets¤
When training a sparse autoencoder (SAE), you often want to use a text dataset such as The Pile. The TextDataset class can pre-process this for you on the fly (i.e. tokenize the text and split it into context_size chunks of tokens), so that you can get started right away. However, if you're experimenting a lot, it can be nicer to run this pre-processing once and save the resulting dataset to Hugging Face. You can then use PreTokenizedDataset to load it directly (see the sketch at the end of this page), saving you from re-running the pre-processing every time you use the dataset.
The following code shows you how to do this. It is also the code used to upload a set of commonly used datasets for SAE training to Alan Cooney's Hugging Face Hub profile.
Setup¤
Note that you will also need to log in to Hugging Face via the CLI:
huggingface-cli login
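Alternatively, if you're working in a notebook (e.g. Colab), you can log in programmatically with huggingface_hub rather than the CLI:
from huggingface_hub import notebook_login

# Opens an interactive prompt for a Hugging Face access token
notebook_login()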
# Check if we're in Colab
try:
    import google.colab  # noqa: F401 # type: ignore

    in_colab = True
except ImportError:
    in_colab = False

# Install if in Colab
if in_colab:
    %pip install sparse_autoencoder transformer_lens transformers wandb datasets

# Otherwise enable hot reloading in dev mode
if not in_colab:
    %load_ext autoreload
    %autoreload 2
from dataclasses import dataclass
from datasets import load_dataset
from transformers import AutoTokenizer
from sparse_autoencoder import TextDataset
Upload helper¤
Here we define a dataclass to describe each source dataset, plus a helper function that pre-processes and uploads a list of them.
@dataclass
class DatasetToPreprocess:
    """Dataset to preprocess info."""

    source_path: str
    """Source path from HF (e.g. `roneneldan/TinyStories`)."""

    tokenizer_name: str
    """HF tokenizer name (e.g. `gpt2`)."""

    data_dir: str | None = None
    """Data directory to download from the source dataset."""

    data_files: list[str] | None = None
    """Data files to download from the source dataset."""

    hugging_face_username: str = "alancooney"
    """HF username for the upload."""

    @property
    def source_alias(self) -> str:
        """Create a source alias for the destination dataset name.

        Returns:
            The modified source path as source alias.
        """
        return self.source_path.replace("/", "-")

    @property
    def tokenizer_alias(self) -> str:
        """Create a tokenizer alias for the destination dataset name.

        Returns:
            The modified tokenizer name as tokenizer alias.
        """
        return self.tokenizer_name.replace("/", "-")

    @property
    def destination_repo_name(self) -> str:
        """Destination repo name.

        Returns:
            The destination repo name.
        """
        return f"sae-{self.source_alias}-tokenizer-{self.tokenizer_alias}"

    @property
    def destination_repo_id(self) -> str:
        """Destination repo ID.

        Returns:
            The destination repo ID.
        """
        return f"{self.hugging_face_username}/{self.destination_repo_name}"
def upload_datasets(datasets_to_preprocess: list[DatasetToPreprocess]) -> None:
    """Upload datasets to HF.

    Warning:
        Assumes you have already created the corresponding repos on HF.

    Args:
        datasets_to_preprocess: List of datasets to preprocess.

    Raises:
        ValueError: If the repo doesn't exist.
    """
    repositories_updating = [dataset.destination_repo_id for dataset in datasets_to_preprocess]
    print("Updating repositories:\n" + "\n".join(repositories_updating))

    for dataset in datasets_to_preprocess:
        print("Processing dataset: ", dataset.source_path)

        # Preprocess
        tokenizer = AutoTokenizer.from_pretrained(dataset.tokenizer_name)
        text_dataset = TextDataset(
            dataset_path=dataset.source_path,
            tokenizer=tokenizer,
            pre_download=True,  # Must be true to upload to the hub after pre-processing.
            dataset_files=dataset.data_files,
            dataset_dir=dataset.data_dir,
        )
        print("Size: ", text_dataset.dataset.size_in_bytes)
        print("Info: ", text_dataset.dataset.info)

        # Upload
        text_dataset.push_to_hugging_face_hub(repo_id=dataset.destination_repo_id)
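Note the warning in the docstring above: the destination repos must already exist on the hub. If you need to create them, a minimal sketch using huggingface_hub.create_repo could look like this (the helper name create_destination_repos is just for this example; exist_ok=True makes re-runs harmless):
from huggingface_hub import create_repo


def create_destination_repos(datasets_to_preprocess: list[DatasetToPreprocess]) -> None:
    """Create the destination dataset repos on HF if they don't already exist."""
    for dataset in datasets_to_preprocess:
        # repo_type="dataset" is required, as the default is a model repo
        create_repo(dataset.destination_repo_id, repo_type="dataset", exist_ok=True)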
Upload to Hugging Face¤
datasets: list[DatasetToPreprocess] = [
    DatasetToPreprocess(
        source_path="roneneldan/TinyStories",
        tokenizer_name="gpt2",
        # Get the newer versions (generated with GPT-4 only)
        data_files=["TinyStoriesV2-GPT4-train.txt", "TinyStoriesV2-GPT4-valid.txt"],
    ),
    DatasetToPreprocess(
        source_path="monology/pile-uncopyrighted",
        tokenizer_name="gpt2",
        # Get just the first few files (each is 11GB, so this should be enough for a large dataset)
        data_files=[
            "00.jsonl.zst",
            "01.jsonl.zst",
            "02.jsonl.zst",
            "03.jsonl.zst",
            "04.jsonl.zst",
            "05.jsonl.zst",
        ],
        data_dir="train",
    ),
    DatasetToPreprocess(
        source_path="monology/pile-uncopyrighted",
        tokenizer_name="EleutherAI/gpt-neox-20b",
        data_files=[
            "00.jsonl.zst",
            "01.jsonl.zst",
            "02.jsonl.zst",
            "03.jsonl.zst",
            "04.jsonl.zst",
            "05.jsonl.zst",
        ],
        data_dir="train",
    ),
]

upload_datasets(datasets)
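As a lightweight sanity check (a sketch; the decode check in the next section is more thorough), you can confirm each destination repo is visible on the hub with huggingface_hub.HfApi:
from huggingface_hub import HfApi

api = HfApi()
for dataset in datasets:
    # Raises an error if the repo doesn't exist or isn't accessible
    info = api.dataset_info(dataset.destination_repo_id)
    print("Found:", info.id)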
Check a dataset is as expected¤
downloaded_dataset = load_dataset(
    "alancooney/sae-roneneldan-TinyStories-tokenizer-gpt2", streaming=True
)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

first_k = 3
for i, data_item in enumerate(iter(downloaded_dataset["train"])):  # type: ignore
    # Stop after the first few items
    if i >= first_k:
        break

    # Print the decoded items
    input_ids = data_item["input_ids"]
    decoded = tokenizer.decode(input_ids)
    print(f"{len(input_ids)} tokens: {decoded}")