Linear Regression: Gradient Descent


Linear Regression and Gradient Descent

Linear regression with gradient descent was the first algorithm I came across when I decided to get into data science, first through Andrew Ng’s Machine Learning course and later through my Master’s program. Every algorithm I have implemented since builds on these basics, and it fascinates me every time. Let’s dive into it:

Linear Regression:

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the dataset and take a quick look at its contents and shape
data = pd.read_csv("/path")
data.head(1)
data.shape
data.head()

Feature Scaling for a Multivariate Problem

Why is feature scaling important?

When the features live on very different scales, the cost surface becomes elongated and gradient descent needs a much smaller learning rate and far more iterations to converge. Standardizing each feature to zero mean and unit variance keeps the update steps comparable across features.

data = data.values
m = len(data[:, -1])

# Standardize each feature: subtract the mean and divide by the standard deviation
X = data[:, 0:2].reshape(m, 2)
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mean) / std

# Add a column of ones for the bias term, then set up y and the initial parameters
X = np.append(np.ones((m, 1)), X, axis=1)
y = data[:, -1].reshape(m, 1)
theta = np.zeros((3, 1))
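
For reference, scikit-learn's StandardScaler performs the same standardization; a minimal sketch, assuming data has already been converted to a NumPy array as above:

from sklearn.preprocessing import StandardScaler

# Each column ends up with mean 0 and standard deviation 1, matching the manual scaling
X_scaled = StandardScaler().fit_transform(data[:, 0:2])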

Closed-Form Equation vs. Gradient Descent vs. Stochastic Gradient Descent

These are three different ways of estimating the parameters of a linear model. The closed-form normal equation solves for the parameters directly in one step, batch gradient descent iteratively follows the gradient computed over the whole training set, and stochastic gradient descent takes cheaper, noisier steps using one randomly chosen example at a time.
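
As a sanity check, scikit-learn provides both a closed-form least-squares solver and an SGD-based one; a minimal sketch, assuming the scaled X and y prepared above:

from sklearn.linear_model import LinearRegression, SGDRegressor

# Closed-form least-squares fit; drop the manually added bias column,
# since scikit-learn fits its own intercept
lr = LinearRegression().fit(X[:, 1:], y.ravel())

# Iterative fit with stochastic gradient descent
sgd = SGDRegressor(max_iter=1000, tol=1e-3).fit(X[:, 1:], y.ravel())

print(lr.intercept_, lr.coef_)
print(sgd.intercept_, sgd.coef_)

Both fits should land close to the parameters found by the hand-rolled implementations below.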

Closed-Form Formula (Normal Equation)

We don't need to implement feature scaling in this case, since the parameters are solved for directly rather than found by iterative updates.

Formula: theta = (XᵀX)⁻¹ Xᵀ y

# Solve for theta directly with the normal equation
XT = X.T
XTX = XT @ X

# pinv computes the Moore–Penrose pseudo-inverse, which also works when XTX is singular
inv = np.linalg.pinv(XTX)

thetanew = (inv @ XT) @ y
print(thetanew)
[[89597.90954361]
 [  139.21067402]
 [-8738.01911255]]

Finding the Cost Function (Loss Function) for Gradient Descent
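
The quantity being minimized is the (halved) mean squared error over the m training examples:

J(theta) = (1 / (2 * m)) * sum((X @ theta - y) ** 2)

which is exactly what computeCost below evaluates.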

def computeCost(X, y, theta):
    # Mean squared error cost J(theta) over the m training examples
    m = len(y)
    err = (np.dot(X, theta) - y) ** 2
    jtheta = np.sum(err) * (1 / (2 * m))
    return jtheta

computeCost(X,y,theta)

65591548106.45744

Gradient Descent
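
Batch gradient descent repeatedly updates all parameters at once using the gradient of J(theta) computed over the full training set:

theta := theta - (alpha / m) * X.T @ (X @ theta - y)

This is the update the function below implements. The fairly large learning rate used later (alpha = 1) works here largely because the features were standardized.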

def gradientDescent(X, y, theta, alpha, iterations):
    m = len(y)
    history = []
    for i in range(iterations):
        # Gradient step computed over the full training set
        descent = alpha * (1 / m) * np.dot(X.transpose(), np.dot(X, theta) - y)
        theta -= descent  # note: this also updates the passed-in theta in place
        history.append(computeCost(X, y, theta))

    return theta, history

thetanew1, chistory1 = gradientDescent(X, y, theta, 1, 50)
print(thetanew1)
[[340412.65957447]
 [109447.79646961]
 [ -6578.35485416]]
plt.plot(chistory1)
plt.xlabel("No. of iterations")
plt.ylabel("J(Theta)")
plt.title("Loss function using Gradient Descent")
[Figure: loss function J(Theta) vs. iterations for batch gradient descent]

Stochastic Gradient Descent
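
Instead of using the whole training set for each step, stochastic gradient descent updates theta with one randomly drawn example at a time:

theta := theta - alpha * Xi.T @ (Xi @ theta - yi)

(the implementation below additionally scales each step by 1/m). The steps are much cheaper but noisier, so the cost curve jumps around instead of decreasing smoothly.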

def stochastic_gradient_descent(X, y, theta, alpha, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)

    for i in range(iterations):
        cost = 0.0
        for j in range(m):
            # Pick one training example at random and take a gradient step on it alone
            rand_ind = np.random.randint(0, m)
            Xi = X[rand_ind, :].reshape(1, X.shape[1])
            yi = y[rand_ind].reshape(1, 1)
            descent = (1 / m) * alpha * Xi.transpose().dot(np.dot(Xi, theta) - yi)
            theta -= descent
            cost += computeCost(Xi, yi, theta)
        cost_history[i] = cost

    return theta, cost_history

thetanew2, chistory2 = stochastic_gradient_descent(X, y, theta, 1, 100)
print(thetanew2)
[[333857.22454677]
 [104732.15569427]
 [ -7188.6376759 ]]
n_iter = 100
theta = np.random.randn(3, 1)  # re-initialize theta randomly for the following runs
plt.plot(range(n_iter), chistory2)
plt.xlabel("No. of iterations")
plt.ylabel("J(Theta)")
plt.title("Loss function using Stochastic Gradient Descent")
[Figure: loss function J(Theta) vs. iterations for stochastic gradient descent]

# Hold out a test set and compare the two sets of parameters on train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
m = len(y_train)
computeCost(X_train,y_train,thetanew1)

1654614139.0828218
computeCost(X_test,y_test,thetanew1)

2796320254.1728415
computeCost(X_train,y_train,thetanew2)

1636964355.6928473
computeCost(X_test,y_test,thetanew2)

2833744177.3070307
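
The raw cost values above are hard to interpret on their own; a scale-free score such as R² is easier to read. A minimal sketch using sklearn.metrics, assuming the thetanew1 and thetanew2 fits from above:

from sklearn.metrics import r2_score

# Predictions of both models on the held-out test set
pred_gd = X_test @ thetanew1
pred_sgd = X_test @ thetanew2

print(r2_score(y_test.ravel(), pred_gd.ravel()))
print(r2_score(y_test.ravel(), pred_sgd.ravel()))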
thetap1, phistory1 = gradientDescent(X_train, y_train, theta, 0.2, 50)
print(thetap1)
[[319924.75939214]
 [ 93648.11271015]
 [ -2496.78859906]]
thetap2, phistory2 = stochastic_gradient_descent(X_train, y_train, theta, 0.2, 90)
print(thetap2)
[[321087.62748613]
 [ 91245.71339344]
 [ -3415.62095041]]
# Predict the test examples (indices 0–15) with the batch-GD parameters and compare to the labels
predictions1 = []
for i in range(0, 16):
    predict1 = np.dot(thetanew1.transpose(), X_test[i])
    predictions1 = np.append(predictions1, predict1)
plt.plot(predictions1)
plt.plot(y_test)

[Figure: predictions (batch gradient descent) vs. ground truth on the test set]

# Repeat the predictions with the SGD parameters (thetanew2)
predictions = []
for i in range(0, 16):
    predict = np.dot(thetanew2.transpose(), X_test[i])
    predictions = np.append(predictions, predict)
plt.plot(predictions)
plt.plot(y_test)

[Figure: predictions (stochastic gradient descent) vs. ground truth on the test set]

# Fit parameters on the test set, first using the batch-GD predictions as targets, then the true labels
eval1, ehistory1 = gradientDescent(X_test, predictions1.reshape(-1, 1), theta, 0.08, 150)
print(eval1)
eval2, ehistory2 = gradientDescent(X_test, y_test, theta, 0.08, 150)

# Predictions on the test set using the parameters fitted on the training split (thetap2)
predictions = []
for i in range(0, 16):
    predict = np.dot(thetap2.transpose(), X_test[i])
    predictions = np.append(predictions, predict)
print(predictions)
[381901.00005848 336311.78918155 253740.50903145 222780.39041256
 545581.6271754  227460.40834332 305580.70764912 440592.18871572
 436872.17446306 210780.34443624 244380.47316992 250609.53324454
 262860.54397345 280369.6472658  455221.28097376 612323.81046984]
testtheta1, ethistory1 = gradientDescent(X_test, y_test, theta, 0.1, 400)
print(testtheta1)

[[376464.08973221]
 [108344.58280849]
 [ -1408.17517797]]
m1 = len(predictions)
ypred = predictions.reshape(m1, 1)

# Fit parameters on the test set using the predictions above as targets
eval1, ehistory1 = gradientDescent(X_test, ypred, theta, 0.08, 150)
print(eval1)

[[318537.53041957]
 [ 94344.63835206]
 [ -1437.15342375]]

Gradient Descent with Regularization
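
Ridge (L2) regularization adds a penalty on the squared parameter values to the cost:

J(theta) = (1 / (2 * m)) * sum((X @ theta - y) ** 2) + (Lambda / (2 * m)) * sum(theta ** 2)

and the gradient of every parameter except the bias term picks up an extra (Lambda / m) * theta term, which is what computeCostR below returns alongside the cost.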

def computeCostR(X, y, theta, Lambda):
    m = len(y)
    predictions = X.dot(theta)
    square_err = (predictions - y) ** 2
    cost = 1 / (2 * m) * np.sum(square_err)

    # Add the L2 penalty to the cost
    regcost = cost + Lambda / (2 * m) * sum(theta ** 2)

    # Gradient: the bias term (index 0) is not regularized
    j_0 = 1 / m * (X.transpose() @ (predictions - y))[0]
    j_1 = 1 / m * (X.transpose() @ (predictions - y))[1:] + (Lambda / m) * theta[1:]
    grad = np.vstack((j_0[:, np.newaxis], j_1))

    return regcost[0], grad

computeCostR(X,y,theta,0.1)

(2195587442.501597, array([[-1754.41032517],
        [-2677.15865359],
        [ 2644.26335941]]))
def gradientDescentR(X, y, theta, alpha, num_iters, Lambda):
    m = len(y)
    history = []

    for i in range(num_iters):
        # Each iteration uses the regularized cost and gradient
        cost, grad = computeCostR(X, y, theta, Lambda)
        theta = theta - (alpha * grad)
        history.append(cost)

    return theta, history

thetanew3, chistory3 = gradientDescentR(X, y, theta, 1, 10, 0.1)
print("The regularized theta using ridge regression:\n", thetanew3)
The regularized theta using ridge regression:
 [[340412.65957447]
 [108768.39338594]
 [ -6362.50331981]]
plt.plot(chistory3)
plt.xlabel("No. of iterations")
plt.ylabel("J(Theta)")
plt.title("Loss function using Regularized Gradient Descent")
[Figure: loss function J(Theta) vs. iterations for regularized gradient descent]