import numpy as np
from sklearn import datasets

# Softmax (multinomial logistic) regression on the Iris dataset: a single
# forward pass with zero-initialised parameters, the summed cross-entropy
# loss, and the gradient of that loss with respect to the parameters.

source = datasets.load_iris()
data = source.data
target = source.target

# Prepend a constant column of ones so the first parameter row acts as a bias.
design = np.insert(data, 0, 1., 1)

# Boolean one-hot encoding: onehot[n, c] is True iff target[n] == c.
# Assumes the labels are the integers 0..C-1, as the comparison against
# np.arange requires.
n_classes = int(target.max()) + 1
onehot = (np.arange(n_classes) == target[:, None])

# One parameter row per design column, one parameter column per class.
param = np.zeros((design.shape[1], n_classes))

# Class scores, one row per sample.
activation = design @ param

# Softmax is invariant to a per-row shift; subtracting the row maximum keeps
# every exponent <= 0 so np.exp cannot overflow.
shifted = activation - np.max(activation, 1, keepdims=True)
exp = np.exp(shifted)
norm = np.sum(exp, 1, keepdims=True)
activity = exp / norm

# Log-softmax computed directly instead of log(activity), so a probability
# that underflows to 0 gives a large finite value rather than -inf.
log = shifted - np.log(norm)

# Keep only the target-class term of each row. np.where (rather than
# multiplying by the boolean mask) avoids inf * 0 = nan in masked-out entries.
entropy = np.where(onehot, -log, 0.)
loss = np.sum(entropy)

# Gradient of the summed cross-entropy w.r.t. param, same shape as param.
# Equal (up to floating-point rounding) to summing the per-sample outer
# products design[n] x (activity[n] - onehot[n]) over all samples.
grad = design.T @ (activity - onehot)

print(grad)
