using LinearAlgebra: norm
using LinearAlgebra.BLAS: gemv!, set_num_threads
set_num_threads(1)
using Random: seed!, shuffle!
using ProgressMeter

"""
    sigmoid(x) = 1 / (1 + exp(-x))

Classic sigmoid activation function, evaluated in a numerically stable way
for both signs of `x`.
"""
@inline function sigmoid(x)
    t = exp(-abs(x))
    ifelse(x >= zero(x), inv(one(t) + t), t / (one(t) + t))
end

"""
    activate!(a, x, W, b)

Evaluate the layer activation, `a .= sigmoid.(W*x .+ b)`. Here `x` is the
input vector, `a` is the output vector, `W` is the matrix of weights, and
`b` is the vector of biases.
"""
@inline function activate!(a, x, W, b)
    # mul!(a, W, x) # <- slower than gemv!
    gemv!('n', true, W, x, false, a) # calculates a <- W * x
    a .+= b
    # slower than above
    # a .= W * x .+ b
    # Even slower
    # a .= muladd(W, x, b)
    a .= sigmoid.(a)
    return nothing
end

"""
    forward!(p, xp)

Forward pass through the neural network: feed the input `xp` through all
layers, storing each layer's output in the work arrays `p.wa`.
"""
function forward!(p, xp)
    W, B, wa = p
    l = length(wa)
    wa[1] .= xp
    for j = 2:l
        activate!(wa[j], wa[j-1], W[j-1], B[j-1])
    end
    return nothing
end

"""
    backward!(delta, p, yy)

Backward pass through the neural network: propagate the error against the
target `yy` from the output layer back, storing the deltas in `delta`.
"""
function backward!(delta, p, yy)
    W, _, wa = p
    l = length(wa)
    sp(z) = z * (1.0 - z) # sigmoid derivative in terms of its output; simplifies formulas below
    # delta[l] .= wa[l] .* (1.0 .- wa[l]) .* (wa[l] .- yy)
    delta[l] .= sp.(wa[l]) .* (wa[l] .- yy)
    for j = (l-1):-1:2
        # delta[j] .= wa[j] .* (1.0 .- wa[j]) .* (W[j]' * delta[j+1])
        # delta[j] .= sp.(wa[j]) .* (W[j]' * delta[j+1])
        gemv!('t', true, W[j], delta[j+1], false, delta[j]) # calculates delta[j] <- W[j]' * delta[j+1]
        delta[j] .*= sp.(wa[j])
    end
    return nothing
end

"""
    cost2(x, y, p)

Calculate the cost function: the sum of squared errors of the network
outputs against the targets `y`, over all columns (samples) of `x`.
"""
@views function cost2(x, y, p)
    s = 0.0
    for i = 1:size(x, 2)
        forward!(p, x[:, i])
        # s += (norm(y[:, i] .- p.wa[end], 2))^2 # <- this is allocating
        p.wa[end] .= p.wa[end] .- y[:, i]
        s += (norm(p.wa[end], 2))^2
    end
    return s
end

"""
    predict2(xy, p)

Run a forward pass on the input `xy` and return the network output,
normalized in place to sum to one (a probability vector).
"""
function predict2(xy, p)
    forward!(p, xy)
    # return p.wa[end] ./ sum(p.wa[end]) # probability # <- this is allocating
    nrm = sum(p.wa[end])
    p.wa[end] ./= nrm
    return p.wa[end] # probability
end

"""
    fbpropagate2!(savecosts, p, delta, eta, nepochs, x, y; show_progress=true)

Minimize the cost function by stochastic gradient descent: in each of the
`nepochs` epochs, run forward and backward passes over all samples in
random order and take a gradient step of size `eta`. The cost after each
epoch is recorded in `savecosts`.
"""
@views function fbpropagate2!(savecosts, p, delta, eta, nepochs, x, y; show_progress=true)
    np = size(x, 2)
    lst = collect(1:np)
    W, B, wa = p
    l = length(wa)
    prog = Progress(nepochs; enabled=show_progress)
    for epoch = 1:nepochs
        ord = shuffle!(lst)
        for k in ord
            # Forward pass
            forward!(p, x[:, k])
            # Backward pass
            backward!(delta, p, y[:, k])
            # Gradient step
            wa[1] .= x[:, k]
            for j = 2:l
                outer_product!(W[j - 1], delta[j], wa[j - 1], -eta)
                # W[j - 1] .-= eta .* delta[j] * wa[j - 1]'
                B[j - 1] .-= eta .* delta[j]
                # delta[j] .*= eta
                # W[j - 1] .-= delta[j] * wa[j - 1]'
                # B[j - 1] .-= delta[j]
            end
        end
        savecosts[epoch] = cost2(x, y, p)
        ProgressMeter.next!(prog, showvalues = [("Epoch: ", epoch)])
    end
    return nothing
end

"""
    outer_product!(A, b, c, eta)

Rank-one update `A .+= eta .* b * c'`, i.e. `A[i, j] += eta * b[i] * c[j]`,
without allocating the outer product.
"""
function outer_product!(A, b, c, eta)
    for j in eachindex(c)
        for i in eachindex(b)
            A[i, j] += eta * b[i] * c[j]
        end
    end
    return nothing
end

"""
    model_init(Ns)

Initialize the model for layer sizes `Ns`: random weights `W` and biases
`B`, work arrays `wa` for the layer outputs, and the `delta` arrays for
the backward pass.
"""
function model_init(Ns)
    # Weights
    W = [0.5 * randn(Ns[i], Ns[i-1]) for i = 2:length(Ns)]
    # Biases
    B = [0.5 * randn(Ns[i]) for i = 2:length(Ns)]
    # Work arrays
    wa = [Vector{Float64}(undef, Ns[i]) for i = 1:length(Ns)]
    p = (; W, B, wa)
    # Deltas
    delta = [Vector{Float64}(undef, Ns[i]) for i = 1:length(Ns)]
    return p, delta
end
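
# --------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal end-to-end example
# showing how model_init, fbpropagate2! and predict2 fit together. The demo
# data, layer sizes, learning rate and epoch count below are illustrative
# assumptions, not tuned values.
# --------------------------------------------------------------------------
function demo()
    seed!(42)
    # Toy two-class problem: two Gaussian blobs in 2D with one-hot targets.
    np = 200
    x = hcat(0.3 .* randn(2, np ÷ 2) .+ [1.0, 1.0],
             0.3 .* randn(2, np ÷ 2) .- [1.0, 1.0])
    y = hcat(repeat([1.0, 0.0], 1, np ÷ 2),
             repeat([0.0, 1.0], 1, np ÷ 2))
    # Network with layer sizes 2 -> 8 -> 2.
    p, delta = model_init([2, 8, 2])
    nepochs = 500
    savecosts = Vector{Float64}(undef, nepochs)
    fbpropagate2!(savecosts, p, delta, 0.05, nepochs, x, y; show_progress=false)
    @show savecosts[1] savecosts[end]   # the cost should drop over training
    @show predict2([1.0, 1.0], p)       # expected to favor the first class
    @show predict2([-1.0, -1.0], p)     # expected to favor the second class
    return nothing
end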