In [1]:

import numpy as np
In [2]:

cd D:\Documents\Class\CSC478\Data
D:\Documents\Class\CSC478\Data

In [3]:

dataMat = genfromtxt('modified_jester_data.csv',delimiter=',')
In [8]:

print dataMat
[[  3.18  19.79   1.34 ...,   0.     0.     0.  ]
[ 15.08  10.71  17.36 ...,  11.34   6.68  12.07]
[  0.     0.     0.   ...,   0.     0.     0.  ]
...,
[ 16.58  16.63  15.85 ...,   0.     0.     0.  ]
[  3.67   4.45   3.67 ...,   3.77   3.77   3.28]
[  9.88  11.73   9.16 ...,   0.     0.     0.  ]]

In [38]:

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = numpy.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
        print "Step %d of %d; Error: %0.5f; Time: %0.2f" %(step+1, steps, e, time())
    return P, Q.T
In [39]:

M = len(dataMat)
N = len(dataMat[0])
K = 5
steps = 3000
In [40]:

P = numpy.random.rand(M,K)
Q = numpy.random.rand(N,K)
In [41]:

from time import time
t0 = time()
fP, fQ = matrix_factorization(dataMat, P, Q, K, steps=steps)
print("done in %0.3fs." % (time() - t0))
Step 1 of 3000; Error: 2868221.16586; Time: 1384587880.81
Step 2 of 3000; Error: 1512759.77139; Time: 1384587885.27
Step 3 of 3000; Error: 1388207.30863; Time: 1384587889.73
Step 4 of 3000; Error: 1349123.13502; Time: 1384587894.17
Step 5 of 3000; Error: 1335480.51889; Time: 1384587898.64
Step 6 of 3000; Error: 1330137.34727; Time: 1384587903.09
Step 7 of 3000; Error: 1327675.93351; Time: 1384587907.58
Step 8 of 3000; Error: 1326236.17043; Time: 1384587912.02
Step 9 of 3000; Error: 1325142.19446; Time: 1384587916.45
Step 10 of 3000; Error: 1324134.93046; Time: 1384587920.90
...

Step 2991 of 3000; Error: 1043435.97559; Time: 1384630971.85
Step 2992 of 3000; Error: 1043435.87466; Time: 1384630976.26
Step 2993 of 3000; Error: 1043435.77385; Time: 1384630980.68
Step 2994 of 3000; Error: 1043435.67319; Time: 1384630985.08
Step 2995 of 3000; Error: 1043435.57265; Time: 1384630989.48
Step 2996 of 3000; Error: 1043435.47224; Time: 1384630993.92
Step 2997 of 3000; Error: 1043435.37197; Time: 1384630998.33
Step 2998 of 3000; Error: 1043435.27182; Time: 1384631002.70
Step 2999 of 3000; Error: 1043435.17181; Time: 1384631007.11
Step 3000 of 3000; Error: 1043435.07193; Time: 1384631011.55
done in 43135.206s.

In [45]:

outP = open("jokes_p.csv", "w")
outQ = open("jokes_q.csv", "w")
savetxt(outP, fP, delimiter=',', fmt='%1.4f')
savetxt(outQ, fQ, delimiter=',', fmt='%1.4f')
In [46]:

Preds = dot(fP,fQ.T)
In [47]:

outPreds = open("jokes_predictions.csv", "w")
savetxt(outPreds, Preds, delimiter=',', fmt='%1.4f')
In [51]:

print dot(fP[979],fQ[9].T)
11.6722002871

In [53]:

totCount = 0
totError = 0
for u in range(M):
    err_u = 0
    rateCount_u = 0
    for j in range(N):
        if (dataMat[u,j] > 0):
            rateCount_u += 1
            err_u += abs(dot(fP[u],fQ[j]) - dataMat[u,j])
    print "Mean Absolute Error for User %d = %0.3f" %(u, err_u/rateCount_u)
    totCount += rateCount_u
    totError += err_u
print
print "Overall Mean Absolute Error = %0.3f" %(totError/totCount)

Mean Absolute Error for User 0 = 3.724
Mean Absolute Error for User 1 = 3.146
Mean Absolute Error for User 2 = 1.996
Mean Absolute Error for User 3 = 3.171
Mean Absolute Error for User 4 = 2.826
Mean Absolute Error for User 5 = 2.322
Mean Absolute Error for User 6 = 2.833
Mean Absolute Error for User 7 = 3.700
Mean Absolute Error for User 8 = 2.339
Mean Absolute Error for User 9 = 1.665
Mean Absolute Error for User 10 = 2.202
...
Mean Absolute Error for User 991 = 4.320
Mean Absolute Error for User 992 = 2.143
Mean Absolute Error for User 993 = 1.832
Mean Absolute Error for User 994 = 2.314
Mean Absolute Error for User 995 = 2.177
Mean Absolute Error for User 996 = 3.094
Mean Absolute Error for User 997 = 1.996
Mean Absolute Error for User 998 = 1.516
Mean Absolute Error for User 999 = 1.643

Overall Mean Absolute Error = 2.930

In [ ]: