#!/usr/bin/python3.6
import math
import numpy as np
import h5py
import matplotlib.pyplot as plt
#import tensorflow.compat.v1 as tf
import tensorflow as tf
from tensorflow.python.framework import ops
from tf_utils import load_dataset_rb_dots, random_mini_batches, convert_to_one_hot, predict

# eager execution
#tf.compat.v1.enable_eager_execution()

TEST_CASE = True
#TEST_CASE = False

#NUM_EPOCH = 3000
NUM_EPOCH = 10000
#NUM_EPOCH = 90000

np.random.seed(1)

########################################
### This pgm is copied from the course 2 week 2 pgm => course2/week2/test_cr2_wk2.py
### We are going to write the same pgm with tensorflow functions now.
### We implement it for batch gd only (not the other optimizers).
########################################


# GRADED FUNCTION: create_placeholders

def create_placeholders(n_x, n_y):
    """
    Creates the placeholders for the tensorflow session.

    Arguments:
    n_x -- scalar, size of an image vector (num_px * num_px = 64 * 64 * 3 = 12288)
    n_y -- scalar, number of classes (from 0 to 5, so -> 6)

    Returns:
    X -- placeholder for the data input, of shape [n_x, None] and dtype "tf.float32"
    Y -- placeholder for the input labels, of shape [n_y, None] and dtype "tf.float32"

    Tips:
    - You will use None because it lets us be flexible about the number of examples
      fed into the placeholders. In fact, the number of examples during test/train is different.
    """

    ### START CODE HERE ### (approx. 2 lines)
    X = tf.placeholder(tf.float32, shape=[n_x, None])
    Y = tf.placeholder(tf.float32, shape=[n_y, None])
    ### END CODE HERE ###

    return X, Y


if (TEST_CASE):
    X, Y = create_placeholders(12288, 6)
    print("X = " + str(X))
    print("Y = " + str(Y))


# GRADED FUNCTION: initialize_parameters

def initialize_parameters(layer_dims):
    """
    Initializes parameters to build a neural network with tensorflow. The shapes are:
                        W1 : [25, 12288]
                        b1 : [25, 1]
                        W2 : [12, 25]
                        b2 : [12, 1]
                        W3 : [6, 12]
                        b3 : [6, 1]

    Returns:
    parameters -- a dictionary of tensors containing W1, b1, W2, b2, W3, b3
    """

    tf.set_random_seed(1)               # so that your "random" numbers match ours

    ### START CODE HERE ### (approx. 6 lines of code)
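    # Note: Xavier/Glorot-uniform initialization scales the random weights by the layer's
    # fan-in and fan-out. The commented tf.initializers.glorot_uniform line below is the
    # equivalent spelling for newer TensorFlow releases where tf.contrib has been removed
    # (whether you need it depends on your installed TensorFlow version).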
    W1 = tf.get_variable("W1", [layer_dims[1], layer_dims[0]], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    #W1 = tf.get_variable("W1", [layer_dims[1], layer_dims[0]], initializer=tf.initializers.glorot_uniform(seed=1))
    b1 = tf.get_variable("b1", [layer_dims[1], 1], initializer=tf.zeros_initializer())
    W2 = tf.get_variable("W2", [layer_dims[2], layer_dims[1]], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b2 = tf.get_variable("b2", [layer_dims[2], 1], initializer=tf.zeros_initializer())
    W3 = tf.get_variable("W3", [layer_dims[3], layer_dims[2]], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b3 = tf.get_variable("b3", [layer_dims[3], 1], initializer=tf.zeros_initializer())
    ### END CODE HERE ###

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3}

    return parameters


if (TEST_CASE):
    tf.reset_default_graph()
    with tf.Session() as sess:
        parameters = initialize_parameters([12288, 25, 12, 6])
        print("W1 = " + str(parameters["W1"]))
        print("b1 = " + str(parameters["b1"]))
        print("W2 = " + str(parameters["W2"]))
        print("b2 = " + str(parameters["b2"]))


# GRADED FUNCTION: forward_propagation

def forward_propagation(X, parameters):
    """
    Implements the forward propagation for the model:
    LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SOFTMAX

    Arguments:
    X -- input dataset placeholder, of shape (input size, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
                  the shapes are given in initialize_parameters

    Returns:
    Z3 -- the output of the last LINEAR unit
    """

    # Retrieve the parameters from the dictionary "parameters"
    # KA: These W1, b1, etc. are Tensors, and NOT numpy arrays
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W3 = parameters['W3']
    b3 = parameters['b3']

    ### START CODE HERE ### (approx. 5 lines)    # Numpy Equivalents:
    Z1 = tf.add(tf.matmul(W1, X), b1)            # Z1 = np.dot(W1, X) + b1
    A1 = tf.nn.relu(Z1)                          # A1 = relu(Z1)
    Z2 = tf.add(tf.matmul(W2, A1), b2)           # Z2 = np.dot(W2, A1) + b2
    A2 = tf.nn.relu(Z2)                          # A2 = relu(Z2)
    Z3 = tf.add(tf.matmul(W3, A2), b3)           # Z3 = np.dot(W3, A2) + b3
    ### END CODE HERE ###

    return Z3


if (TEST_CASE):
    tf.reset_default_graph()
    with tf.Session() as sess:
        X, Y = create_placeholders(12288, 6)
        parameters = initialize_parameters([12288, 25, 12, 6])
        Z3 = forward_propagation(X, parameters)
        print("Z3 = " + str(Z3))


# GRADED FUNCTION: compute_cost

def compute_cost(Z3, Y):
    """
    Computes the cost

    Arguments:
    Z3 -- output of forward propagation (output of the last LINEAR unit), of shape (6, number of examples)
    Y -- "true" labels vector placeholder, same shape as Z3

    Returns:
    cost -- Tensor of the cost function
    """

    # to fit the tensorflow requirement for tf.nn.softmax_cross_entropy_with_logits(...,...)
    logits = tf.transpose(Z3)
    labels = tf.transpose(Y)

    ### START CODE HERE ### (1 line of code)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    ### END CODE HERE ###

    return cost


if (TEST_CASE):
    tf.reset_default_graph()
    with tf.Session() as sess:
        X, Y = create_placeholders(12288, 6)
        parameters = initialize_parameters([12288, 25, 12, 6])
        Z3 = forward_propagation(X, parameters)
        cost = compute_cost(Z3, Y)
        print("cost = " + str(cost))


def model(X_train, Y_train, layers_dims, optimizer, learning_rate=0.0007,
          mini_batch_size=64, beta=0.9, beta1=0.9, beta2=0.999,
          epsilon=1e-8, num_epochs=10000, print_cost=True):
    """
    3-layer neural network model which can be run in different optimizer modes.
    Arguments:
    X_train -- input data, of shape (2, number of examples)
    Y_train -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
    layers_dims -- python list, containing the size of each layer
    learning_rate -- the learning rate, scalar
    mini_batch_size -- the size of a mini batch
    beta -- Momentum hyperparameter
    beta1 -- Exponential decay hyperparameter for the past gradients estimates
    beta2 -- Exponential decay hyperparameter for the past squared gradients estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates
    num_epochs -- number of epochs
    print_cost -- True to print the cost every 1000 epochs

    Returns:
    parameters -- python dictionary containing your updated parameters
    """

    L = len(layers_dims)        # number of layers in the neural networks
    costs = []                  # to keep track of the cost
    t = 0                       # initializing the counter required for Adam update
    seed = 10                   # For grading purposes, so that your "random" minibatches are the same as ours
    m = X_train.shape[1]        # number of training examples

    ########## from test_cr2_wk3.py ###########
    ops.reset_default_graph()   # to be able to rerun the model without overwriting tf variables
    tf.set_random_seed(1)       # to keep consistent results
    seed = 3                    # to keep consistent results
    (n_x, m) = X_train.shape    # (n_x: input size, m: number of examples in the train set)
    n_y = Y_train.shape[0]      # n_y: output size
    costs = []                  # To keep track of the cost
    print("n_x = ", n_x, " n_y = ", n_y, " m = ", m)

    # Create Placeholders of shape (n_x, n_y)
    ### START CODE HERE ### (1 line)
    X, Y = create_placeholders(n_x, n_y)
    ### END CODE HERE ###

    # Initialize parameters
    ### START CODE HERE ### (1 line)
    parameters = initialize_parameters(layers_dims)
    ### END CODE HERE ###

    # Forward propagation: Build the forward propagation in the tensorflow graph
    ### START CODE HERE ### (1 line)
    Z3 = forward_propagation(X, parameters)
    ### END CODE HERE ###

    # Cost function: Add cost function to tensorflow graph
    ### START CODE HERE ### (1 line)
    cost = compute_cost(Z3, Y)
    ### END CODE HERE ###

    # Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer.
    ### START CODE HERE ### (1 line)
    optimizer_tf = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)
    #optimizer_tf = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    ### END CODE HERE ###

    # Initialize all the variables
    init = tf.global_variables_initializer()
    #### end of test_cr2_wk3.py ###########

    # init param already done above
    # Initialize parameters
    #parameters = initialize_parameters(layers_dims)

    # init param for all algo done above. dW, db and any other needed param are all init by the tf optimizer.
    # Initialize the optimizer
    # Note: initialize_velocity/initialize_adam come from the course 2 week 2 pgm and are not
    # defined in this file; only optimizer == "gd" is actually used here.
    if optimizer == "gd":
        pass  # no initialization required for gradient descent
    elif optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    # Start the session to compute the tensorflow graph
    with tf.Session() as sess:

        # Run the initialization
        sess.run(init)

        # Optimization loop
        for i in range(num_epochs):

            # Define the random minibatches.
            # We increment the seed to reshuffle the dataset differently after each epoch.
            seed = seed + 1
            minibatches = random_mini_batches(X_train, Y_train, mini_batch_size, seed)
            cost_total = 0

            for minibatch in minibatches:

                # Select a minibatch
                (minibatch_X, minibatch_Y) = minibatch

                # Forward propagation
                #a3, caches = forward_propagation(minibatch_X, parameters)
                # Compute cost and add to the cost total
                #cost_total += compute_cost(a3, minibatch_Y)
                # Backward propagation
                #grads = backward_propagation(minibatch_X, minibatch_Y, caches)
                # Update parameters
                #if optimizer == "gd":
                #    parameters = update_parameters_with_gd(parameters, grads, learning_rate)
                #elif optimizer == "momentum":
                #    parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
                #elif optimizer == "adam":
                #    t = t + 1  # Adam counter
                #    parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t,
                #                                                   learning_rate, beta1, beta2, epsilon)

                #### Above code for fwd prop, compute_cost, back_prop and update_para is replaced by a single line in tf ###
                ### START CODE HERE ### (1 line)
                #para = sess.run(parameters)
                #print("tmp_W1 = " + str(para["W1"]))
                _, minibatch_cost = sess.run([optimizer_tf, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
                ### END CODE HERE ###

                #print("cost tot = ", minibatch_cost)
                # compute_cost uses tf.reduce_mean, so minibatch_cost is already the average loss per
                # example in this minibatch; averaging over the minibatches gives the epoch cost.
                cost_total += minibatch_cost / len(minibatches)

            # The cost averaging below was done in test_cr2_wk2.py, where compute_cost did no /m.
            # Here each minibatch_cost is already a per-example mean, so no further /m is needed.
            #cost_avg = cost_total / m

            # Print the cost every 1000 epochs
            if print_cost and i % 1000 == 0:
                print("Cost after epoch %i: %f" % (i, cost_total))
                #parameters = sess.run(parameters)
                #print("W1 = " + str(parameters["W1"]))
                #print("b1 = " + str(parameters["b1"]))
                #print("W2 = " + str(parameters["W2"]))
                #print("b2 = " + str(parameters["b2"]))
                #print("W3 = " + str(parameters["W3"]))
                #print("b3 = " + str(parameters["b3"]))
            if print_cost and i % 100 == 0:
                costs.append(cost_total)

        # plot the cost
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('epochs (per 100)')
        plt.title("Learning rate = " + str(learning_rate))
        plt.show()

        #### below 3 lines added from test_cr2_wk3.py
        # lets save the parameters in a variable
        parameters = sess.run(parameters)
        print("Parameters have been trained!")

        # Calculate the correct predictions
        correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y))

        # Calculate accuracy on the train set (the test set is not used here)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        print("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
        #print("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test}))

    return parameters


########################################
########## Main pgm ####################
########################################

train_X, train_Y_orig = load_dataset_rb_dots()
train_Y = convert_to_one_hot(train_Y_orig, 2)

print("number of training examples = " + str(train_X.shape[1]))
print("X_train shape: " + str(train_X.shape))
print("Y_train shape: " + str(train_Y.shape))

### Mini batch gd #####
print("\n Running mini batch gd \n")

# train 3-layer model
layers_dims = [train_X.shape[0], 25, 12, 2]
parameters = model(train_X, train_Y, layers_dims, optimizer="gd", num_epochs=NUM_EPOCH)

# Predict => Prediction already done within model()
#predictions = predict(train_X, train_Y, parameters)

# Plot decision boundary => Doesn't work as predict, predict_dec error out.
# Don't need it anyway, as we got our accuracy numbers from model() above.
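# As noted above, the course predict/predict_dec utilities error out here. As an illustrative
# alternative (not part of the original course utilities), a plain numpy forward pass over the
# trained parameters can serve as a decision function. This is a minimal sketch: predict_dec_np
# is a hypothetical helper introduced here, and it assumes parameters holds numpy arrays, which
# is true after model() runs sess.run(parameters). It mirrors forward_propagation() above.
def predict_dec_np(parameters, X):
    """Hypothetical helper: numpy LINEAR->RELU->LINEAR->RELU->LINEAR pass, returning argmax per column."""
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]
    Z1 = np.dot(W1, X) + b1
    A1 = np.maximum(Z1, 0)            # relu
    Z2 = np.dot(W2, A1) + b2
    A2 = np.maximum(Z2, 0)            # relu
    Z3 = np.dot(W3, A2) + b3
    return np.argmax(Z3, axis=0)      # predicted class index for each example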
plt.title("Model with Gradient Descent optimization") axes = plt.gca() axes.set_xlim([-1.5,2.5]) axes.set_ylim([-1,1.5]) #plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y) #plot_decision_boundary(lambda x: predict(parameters, x.T), train_X, train_Y)