train/train.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

from time import time
import sys

import torch
from torch import nn
from torch import optim

from data.generate import get_single_example
from data.testset import get_testset

def do_verbose_test(model, n_tokens, seqlen, max_count):
    """Print one generated example next to the model's prediction for it.

    A single ``(x, y)`` pair is drawn from ``get_single_example`` and ``x`` is
    fed to the model as a batch of one. The input, the expected output and the
    arg-max of the model output over dimension 2 are printed.

    Args:
        model: Callable taking the input tensor and returning a tensor with at
            least three dimensions (the arg-max is taken over ``dim=2``).
        n_tokens: Forwarded unchanged to ``get_single_example``.
        seqlen: Forwarded unchanged to ``get_single_example``.
        max_count: Forwarded unchanged to ``get_single_example``.
    """
    print('verbose test:')
    x, y = get_single_example(n_tokens, seqlen, max_count)

    # Build the input on the same device as the model's weights; a CPU tensor
    # fed to a model living on another device raises a device-mismatch error.
    # Plain callables without parameters fall back to the CPU.
    params = model.parameters() if hasattr(model, 'parameters') else ()
    first_param = next(iter(params), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Wrap in a list to add a batch axis of size one, then swap the first two
    # axes, as is done for training batches.
    x = torch.tensor([x], device=device).transpose(0, 1)

    # Tensors are moved back to the CPU before printing so the printed text
    # does not depend on where the model lives.
    print('in          :', x.squeeze().cpu())
    print('expected out:', torch.tensor(y))
    # Pure inspection: no autograd graph is needed for this forward pass.
    with torch.no_grad():
        predicted = torch.argmax(model(x), dim=2).squeeze()
    print('model out   :', predicted.cpu())

def train_model(model, lr, num_steps, batch_size, n_tokens, seqlen, max_count, device='cpu'):
    """Train ``model`` on freshly generated batches and report progress.

    Every step draws ``batch_size`` new examples from ``get_single_example``,
    runs one Adam update with an unweighted cross-entropy loss over
    ``max_count + 1`` classes, and roughly every 1% of the steps (plus the
    final step) evaluates on the fixed test set from ``get_testset`` and
    prints a summary.

    Args:
        model: ``torch.nn.Module`` to train; it is moved to ``device`` in place.
        lr: Learning rate for the Adam optimizer.
        num_steps: Number of optimisation steps. Values below 100 are
            supported; a report is then produced on every step.
        batch_size: Number of examples generated per step.
        n_tokens: Forwarded unchanged to the data generators.
        seqlen: Forwarded unchanged to the data generators.
        max_count: Largest target class; logits are reshaped to
            ``max_count + 1`` columns.
        device: Device used for the model and the batches.

    Returns:
        A tuple ``(train_losses, test_losses, accs)`` of lists with one entry
        per report, each value rounded to two decimals.
    """
    # NOTE: anomaly detection is a global autograd debugging switch; it stays
    # enabled after this function returns and slows training down noticeably.
    torch.autograd.set_detect_anomaly(True)
    model.to(device)

    start_time = time()
    accs = []
    train_losses = []
    test_losses = []

    loss_function = nn.CrossEntropyLoss(
        # weight=torch.log(2 + torch.tensor(range(max_count+1), dtype=torch.float))
    )
    optimizer = optim.Adam(model.parameters(), lr=lr)

    test_X, test_Y = get_testset(n_tokens, seqlen, max_count)
    print('test size', test_X.shape)
    # The model lives on `device`, so its evaluation input must as well.
    # Targets stay where they are because the logits are moved to the CPU
    # before the test loss is computed.
    test_X = test_X.to(device)
    test_targets = test_Y.reshape(-1)

    n_classes = max_count + 1
    # Report about every 1% of the steps; clamp to 1 so that runs shorter
    # than 100 steps do not divide by zero in the modulo below.
    report_every = max(1, num_steps // 100)

    for step in range(num_steps):
        batch_examples = [get_single_example(n_tokens, seqlen, max_count)
                          for _ in range(batch_size)]

        # Stack the examples along the first axis, then swap the first two
        # axes before handing the batch to the model.
        batch_X = torch.tensor([example[0] for example in batch_examples],
                               device=device).transpose(0, 1)
        batch_Y = torch.tensor([example[1] for example in batch_examples],
                               device=device).transpose(0, 1)

        model.train()
        model.zero_grad()
        logits = model(batch_X)
        loss = loss_function(logits.reshape(-1, n_classes), batch_Y.reshape(-1))
        loss.backward()
        optimizer.step()

        if step % report_every == 0 or step == num_steps - 1:
            # Printing a summary of the current state of training.
            model.eval()
            # Evaluation needs no gradients; skipping the autograd graph
            # avoids holding activations for the whole test set.
            with torch.no_grad():
                predicted_logits = model(test_X).reshape(-1, n_classes)
                predicted_logits = predicted_logits.to('cpu')
                test_loss = loss_function(predicted_logits, test_targets)
                test_acc = (
                    torch.sum(torch.argmax(predicted_logits, dim=-1) == test_targets)
                    / test_targets.shape[0])
            print('step', step, 'out of', num_steps)
            print('loss train', float(loss))
            print('loss test', float(test_loss))
            print('accuracy test', float(test_acc))
            do_verbose_test(model, n_tokens, seqlen, max_count)
            print()
            sys.stdout.flush()
            accs.append(round(float(test_acc), 2))
            train_losses.append(round(float(loss.detach()), 2))
            test_losses.append(round(float(test_loss.detach()), 2))
    print('\nTRAINING TIME:', time()-start_time)
    model.eval()
    return train_losses, test_losses, accs