# Inspired by https://github.com/NVIDIA/Megatron-LM/blob/main/tasks/zeroshot_gpt/datasets.py
# Except we don't pad the last block and don't use overlapping eval
# And we return both the input and the target
import math

import numpy as np
import torch


class LMDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, seq_len, drop_last=True):
        """tokens should be a numpy array."""
        self.seq_len = seq_len
        ntokens = len(tokens)
        if drop_last:
            # Keep only full blocks: (ntokens - 1) becomes a multiple of seq_len,
            # with one extra token so the last block still has a shifted target.
            ntokens = ((ntokens - 1) // seq_len) * seq_len + 1
        self.ntokens = ntokens
        # We're careful not to slice tokens, since it could be a memmap'ed array or H5 dataset,
        # and slicing would load it to memory.
        self.tokens = tokens
        self.total_sequences = math.ceil((self.ntokens - 1) / self.seq_len)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        start_idx = idx * self.seq_len
        # The final block may be shorter than seq_len when drop_last=False.
        seq_len = min(self.seq_len, self.ntokens - 1 - start_idx)
        # Read seq_len + 1 tokens so input and target are one-token-shifted views.
        data = torch.as_tensor(self.tokens[start_idx:(start_idx + seq_len + 1)].astype(np.int64))
        return data[:-1], data[1:].clone()
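

if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original file): a small synthetic
    # token stream stands in for a real tokenized corpus (e.g. a memmap'ed array);
    # the array size, seq_len=8, and batch_size=4 below are arbitrary example values.
    tokens = np.arange(100, dtype=np.uint16)
    dataset = LMDataset(tokens, seq_len=8)
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=False)
    inputs, targets = next(iter(loader))
    print(inputs.shape, targets.shape)  # both (4, 8); targets are inputs shifted by one token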