diff options
author | Marcin Chrzanowski <m@m-chrzan.xyz> | 2021-05-22 19:23:23 +0200 |
---|---|---|
committer | Marcin Chrzanowski <m@m-chrzan.xyz> | 2021-05-22 19:23:23 +0200 |
commit | 8ff8739b236a00169339b0b78e1f39357fdfff17 (patch) | |
tree | ba6a77c21519e6a9d77110ea1dee17668c0ba3b6 | |
parent | 9638adad21940c8be270e05fa5749e9b692c6b04 (diff) |
Generate data
-rw-r--r-- | data/generate.py | 7 | ||||
-rw-r--r-- | data/testset.py | 18 |
2 files changed, 25 insertions, 0 deletions
diff --git a/data/generate.py b/data/generate.py new file mode 100644 index 0000000..4b0ba9c --- /dev/null +++ b/data/generate.py @@ -0,0 +1,7 @@ +import numpy as np + +def get_single_example(n_tokens=16, seqlen=64, max_count=9): + seq = np.random.randint(low=0, high=n_tokens, size=(seqlen,)) + label = [min(list(seq[:i]).count(x), max_count) for i, x in enumerate(seq)] + label = np.array(label) + return seq, label diff --git a/data/testset.py b/data/testset.py new file mode 100644 index 0000000..07fc811 --- /dev/null +++ b/data/testset.py @@ -0,0 +1,18 @@ +import torch + +from data.generate import get_single_example + +TEST_SIZE = 128 + +test_examples = [get_single_example() for i in range(TEST_SIZE)] + +def get_testset(device='cpu'): + # Transpositions are used, because the convention in PyTorch is to represent + # sequence tensors as <seq_len, batch_size> instead of <batch_size, seq_len>. + test_X = torch.tensor( + [x[0] for x in test_examples], device=device + ).transpose(0, 1) + test_Y = torch.tensor( + [x[1] for x in test_examples], device=device + ).transpose(0, 1) + return test_X, test_Y |