
I am trying to implement Polynomial Regression with Gradient Descent. I want to fit the following function:

f(x) = (1/3)x^3 - 2x^2 + 2x + 2

The code I use is:

import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
from sklearn.preprocessing import PolynomialFeatures
np.random.seed(seed=42)


def create_data():
    x = PolynomialFeatures(degree=5).fit_transform(np.linspace(-10,10,100).reshape(100,-1))
    l = lambda x_i: (1/3)*x_i**3-2*x_i**2+2*x_i+2
    data = l(x[:,1])
    noise = np.random.normal(0,0.1,size=np.shape(data))
    y = data+noise
    y= y.reshape(100,1)
    return {'x':x,'y':y}

def plot_function(x,y):
    fig = plt.figure(figsize=(10,10))
    plt.plot(x[:,1],[(1/3)*x_i**3-2*x_i**2+2*x_i+2 for x_i in x[:,1]],c='lightgreen',linewidth=3,zorder=0)
    plt.scatter(x[:,1],y)
    plt.show()


def w_update(y,x,batch,w_old,eta):
    derivative = np.sum([(y[i]-np.dot(w_old.T,x[i,:]))*x[i,:] for i in range(np.shape(x)[0])])
    print(derivative)
    return w_old+eta*(1/batch)*derivative



# initialize variables
w = np.random.normal(size=(6,1))
data = create_data()
x = data['x']
y = data['y']
plot_function(x,y)



# Update w
w_s = []
Error = []
for i in range(500):
    error = (1/2)*np.sum([(y[i]-np.dot(w.T,x[i,:]))**2 for i in range(len(x))])
    Error.append(error)
    w_prime = w_update(y,x,np.shape(x)[0],w,0.001)
    w = w_prime
    w_s.append(w)
# Plot the predicted function
plt.plot(x[:,1],np.dot(x,w))
plt.show()

# Plot the error
fig3 = plt.figure()
plt.scatter(range(len(Error[10:])),Error[10:])
plt.show()

But as a result I receive something strange which is completely out of bounds... I have also tried to alter the number of iterations as well as the learning rate eta, but it did not help. I assume I have made a mistake in the update of w.

1 Answer

I have found the solution. The problem is indeed in the part where I calculate the gradient for the weight update. Specifically in:

np.sum([(y[d]-np.dot(w_old.T,x[d,:]))*x[d,:] for d in range(np.shape(x)[0])])

which should be like:

np.sum([-(y[d]-np.dot(w.T.copy(),x[d,:]))*x[d,:].reshape(np.shape(w)) for d in range(len(x))],axis=0)

We have to pass axis=0 to np.sum to get the dimensionality we want: the result must have the same shape as w. The NumPy sum documentation says:

The default, axis=None, will sum all of the elements of the input array.
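To see what that means for our case, here is a minimal sketch using a dummy array grads standing in for the stacked per-sample gradients (100 samples, each gradient with the same (7, 1) shape as w; the all-ones values are just placeholders):

import numpy as np

# stand-in for the 100 stacked per-sample gradients, each of shape (7, 1) like w
grads = np.ones((100, 7, 1))

print(np.shape(np.sum(grads)))          # () -> everything collapsed into a single scalar
print(np.shape(np.sum(grads, axis=0)))  # (7, 1) -> one summed gradient, same shape as w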

Summing everything into a single scalar is not what we want to achieve. Passing axis=0 sums over the first axis of our array, which has shape (100, 7, 1): the 100 per-sample gradients of shape (7, 1) are summed up and the resulting array has shape (7, 1), which is exactly what we want. Implementing this and cleaning up the code yields:

import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
np.random.seed(seed=42)


def create_data():
    x = PolynomialFeatures(degree=6).fit_transform(np.linspace(-2,2,100).reshape(100,-1))
    # Normalize the polynomial features to the range (-2, 2)
    x[:,1:] = MinMaxScaler(feature_range=(-2,2),copy=False).fit_transform(x[:,1:])
    l = lambda x_i: np.cos(0.8*np.pi*x_i)
    data = l(x[:,1])
    noise = np.random.normal(0,0.1,size=np.shape(data))
    y = data+noise
    y = y.reshape(100,1)
    return {'x':x,'y':y}

def plot_function(x,y,w,Error,w_s):
    fig,ax = plt.subplots(nrows=1,ncols=2,figsize=(40,10))
    ax[0].plot(x[:,1],[np.cos(0.8*np.pi*x_i) for x_i in x[:,1]],c='lightgreen',linewidth=3,zorder=0)
    ax[0].scatter(x[:,1],y)
    ax[0].plot(x[:,1],np.dot(x,w))
    ax[0].set_title('Function')
    ax[1].scatter(range(len(Error)),Error)
    ax[1].set_title('Error')



    plt.show()



# initialize variables
data = create_data()
x = data['x']
y = data['y']
w = np.random.normal(size=(np.shape(x)[1],1))
eta = 0.1
iterations = 10000
batch = 10



def stochastic_gradient_descent(x,y,w,eta):
    # gradient of the squared error for a single sample (x is one row of the design matrix)
    derivative = -(y-np.dot(w.T,x))*x.reshape(np.shape(w))
    return eta*derivative


def batch_gradient_descent(x,y,w,eta):
    # average gradient over all samples; axis=0 keeps the result in the same shape as w
    derivative = np.sum([-(y[d]-np.dot(w.T.copy(),x[d,:]))*x[d,:].reshape(np.shape(w)) for d in range(len(x))],axis=0)
    return eta*(1/len(x))*derivative


def mini_batch_gradient_descent(x,y,w,eta,batch):
    gradient_sum = np.zeros(shape=np.shape(w))
    for b in range(batch):
        # draw a random sample and accumulate its gradient
        choice = np.random.choice(list(range(len(x))))
        gradient_sum += -(y[choice]-np.dot(w.T,x[choice,:]))*x[choice,:].reshape(np.shape(w))
    # return the averaged step only after the whole mini batch has been accumulated
    return eta*(1/batch)*gradient_sum

# Update w
w_s = []
Error = []
for i in range(iterations):
    # Calculate error
    error = (1/2)*np.sum([(y[i]-np.dot(w.T,x[i,:]))**2 for i in range(len(x))])
    Error.append(error)
    # Stochastic Gradient Descent
    """
    for d in range(len(x)):
        w-= stochastic_gradient_descent(x[d,:],y[d],w,eta)
        w_s.append(w.copy())
    """
    # Minibatch Gradient Descent
    """
    w-= mini_batch_gradient_descent(x,y,w,eta,batch)
    """

    # Batch Gradient Descent

    w -= batch_gradient_descent(x,y,w,eta)




# Show the learned weights
print(w)

# Plot the predicted function and the Error
plot_function(x,y,w,Error,w_s)

As a result we get the two plots produced by plot_function: the fitted curve next to the target data, and the error over the iterations.

This can surely be improved by altering eta and the number of iterations, as well as by switching to Stochastic or Mini-Batch Gradient Descent or to more sophisticated optimization algorithms.
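For example, here is a minimal sketch of how a momentum term could be added on top of the batch_gradient_descent step from the script above; beta and velocity are my own additions for illustration, not part of the original code:

# Gradient descent with momentum, reusing batch_gradient_descent from the script above.
beta = 0.9                    # decay factor for the running average (a common default, chosen for illustration)
velocity = np.zeros_like(w)   # running average of the scaled gradient steps

for i in range(iterations):
    step = batch_gradient_descent(x, y, w, eta)   # eta * average gradient, as before
    velocity = beta*velocity + step               # exponentially decaying average of the steps
    w -= velocity                                 # update the weights with the smoothed step

The commented-out stochastic_gradient_descent and mini_batch_gradient_descent branches in the training loop can be swapped in the same way without touching the rest of the script.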