import torch

# Problem size and number of sampled episodes per start state.
states = 10
actions = 5
rewards = 7
epochs = 1000


def random_system(num_states, num_rewards, num_actions):
    """Return a random joint transition/reward model.

    The tensor has shape ``(num_states, num_rewards, num_states,
    num_actions)`` and is indexed ``[next_state, reward_index, state,
    action]``. For every ``(state, action)`` pair the entries over
    ``(next_state, reward_index)`` are normalized to sum to 1.
    """
    system = torch.rand(num_states, num_rewards, num_states, num_actions)
    # The sum over dims (0, 1) has shape (num_states, num_actions) and
    # broadcasts against the two trailing dims of ``system``.
    system /= system.sum((0, 1))
    return system


def random_policy(num_actions, num_states):
    """Return a random stochastic policy of shape ``(num_actions, num_states)``.

    Each column ``policy[:, state]`` is normalized to sum to 1.
    """
    policy = torch.rand(num_actions, num_states)
    policy /= policy.sum(0)
    return policy


def sample_gain(system, policy, state, terminal=0):
    """Simulate one episode from ``state`` and return its total reward.

    Args:
        system: tensor indexed ``[next_state, reward_index, state, action]``
            holding sampling weights for each ``(state, action)`` pair.
        policy: tensor indexed ``[action, state]`` holding action weights.
        state: index of the start state.
        terminal: index of the absorbing state that ends the episode.

    Returns:
        The undiscounted sum of rewards as a float. The reward of a step is
        the negated reward index, so the result is never positive.

    Note:
        The loop has no step limit; it only ends once ``terminal`` is
        sampled, so ``terminal`` must be reachable under ``system``.
    """
    num_rewards = system.shape[1]
    gain = 0.
    while state != terminal:
        action = torch.multinomial(policy[:, state], 1).item()
        # Flattening the (next_state, reward_index) slice row-major encodes
        # the pair as next_state * num_rewards + reward_index.
        outcome = torch.multinomial(
            system[:, :, state, action].flatten(), 1
        ).item()
        state, reward_index = divmod(outcome, num_rewards)
        gain -= reward_index
    return gain


def estimate_value(system, policy, state, num_episodes):
    """Return the Monte Carlo estimate of the value of ``state``.

    The estimate is the mean of ``sample_gain`` over ``num_episodes``
    independently simulated episodes.

    Raises:
        ValueError: if ``num_episodes`` is smaller than 1.
    """
    if num_episodes < 1:
        raise ValueError("num_episodes must be a positive integer")
    total = sum(
        sample_gain(system, policy, state) for _ in range(num_episodes)
    )
    return total / num_episodes


def main():
    """Build a random model and policy, then print one value per state."""
    system = random_system(states, rewards, actions)
    policy = random_policy(actions, states)
    for state in range(states):
        value = estimate_value(system, policy, state, epochs)
        print("%6.2f" % value, end=" ")
    # Terminate the row so the shell prompt starts on a fresh line.
    print()


if __name__ == "__main__":
    main()
