Let's look at another example of using distances and similarities. This time, we'll look at a hypothetical search engine index and see how we can use k-nearest-neighbor search to identify the documents that are most similar to a specified query.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
cd D:\Documents\Class\CSC478\Data
D:\Documents\Class\CSC478\Data

In our data set, we have 15 documents. We assume that the documents have already been preprocessed, converted into word vectors (bags of words), and inserted into an index. After preprocessing and removing "stop words" we are left with 10 index terms (used as dimensions for the document vectors).

In [3]:
# Load the raw term-document matrix; the CSV has no header row, so pandas
# assigns integer column labels 0..15 (column 0 holds the term names).
DF = pd.read_csv('term-doc-mat.csv', header=None)
DF
Out[3]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 database 24 32 12 6 43 2 0 3 1 6 4 0 0 0 0
1 index 9 5 5 2 20 0 1 0 0 0 27 14 3 2 11
2 likelihood 0 3 0 0 3 7 12 4 27 4 0 1 0 0 0
3 linear 3 0 0 0 0 16 0 2 25 23 7 12 21 3 2
4 matrix 1 0 0 0 0 33 2 0 7 12 14 5 12 4 0
5 query 12 2 0 0 27 0 0 0 0 22 9 4 0 5 3
6 regression 0 0 0 0 0 18 32 22 34 17 0 0 0 0 0
7 retrieval 1 0 0 0 2 0 0 0 3 9 27 7 5 4 4
8 sql 21 10 16 7 31 0 0 0 0 0 0 0 0 1 0
9 vector 2 0 0 2 0 27 4 2 11 8 33 16 14 7 3

Let's remove the column containing the terms

In [4]:
# TD = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),dtype=int)
# Drop the term-name column (label 0), keeping the 15 document-count columns.
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0; the positional
# .iloc selects the same columns here (labels and positions coincide).
TD = DF.iloc[:, 1:]
TD
Out[4]:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 24 32 12 6 43 2 0 3 1 6 4 0 0 0 0
1 9 5 5 2 20 0 1 0 0 0 27 14 3 2 11
2 0 3 0 0 3 7 12 4 27 4 0 1 0 0 0
3 3 0 0 0 0 16 0 2 25 23 7 12 21 3 2
4 1 0 0 0 0 33 2 0 7 12 14 5 12 4 0
5 12 2 0 0 27 0 0 0 0 22 9 4 0 5 3
6 0 0 0 0 0 18 32 22 34 17 0 0 0 0 0
7 1 0 0 0 2 0 0 0 3 9 27 7 5 4 4
8 21 10 16 7 31 0 0 0 0 0 0 0 0 1 0
9 2 0 0 2 0 27 4 2 11 8 33 16 14 7 3
In [5]:
# terms = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(0),dtype=str)
# terms = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(0),dtype=str)
# Extract the term names (first column) as a Series. DataFrame.ix was removed
# in pandas 1.0; .iloc gives the same positional selection.
terms = DF.iloc[:, 0]
terms
Out[5]:
0      database
1         index
2    likelihood
3        linear
4        matrix
5         query
6    regression
7     retrieval
8           sql
9        vector
Name: 0, dtype: object

Transposing the TD matrix.

In [6]:
DT = TD.T

Now we have a document-term matrix:

In [7]:
DT
Out[7]:
0 1 2 3 4 5 6 7 8 9
1 24 9 0 3 1 12 0 1 21 2
2 32 5 3 0 0 2 0 0 10 0
3 12 5 0 0 0 0 0 0 16 0
4 6 2 0 0 0 0 0 0 7 2
5 43 20 3 0 0 27 0 2 31 0
6 2 0 7 16 33 0 18 0 0 27
7 0 1 12 0 2 0 32 0 0 4
8 3 0 4 2 0 0 22 0 0 2
9 1 0 27 25 7 0 34 3 0 11
10 6 0 4 23 12 22 17 9 0 8
11 4 27 0 7 14 9 0 27 0 33
12 0 14 1 12 5 4 0 7 0 16
13 0 3 0 21 12 0 0 5 0 14
14 0 2 0 3 4 5 0 4 1 7
15 0 11 0 2 0 3 0 4 0 3
In [8]:
DT.shape
Out[8]:
(15, 10)
In [9]:
# Length of one document row = number of index terms. DataFrame.ix was removed
# in pandas 1.0; .loc keeps its label-based semantics (DT's index labels are 1..15).
len(DT.loc[1])
Out[9]:
10
In [10]:
# Number of index terms = number of columns of the document-term matrix.
numTerms=DT.shape[1]
# could also say numTerms = len(DT.loc[1])  (.ix was removed in modern pandas)
# Number of documents = number of rows of the document-term matrix.
NDocs = DT.shape[0]
In [11]:
# Print the matrix dimensions. The parenthesized form prints identically
# under Python 2 and Python 3 for a single argument, unlike the Py2-only
# `print x` statement syntax.
print(numTerms)
print(NDocs)
10
15

Next, let's compute term frequencies to get an idea of their distributions across the corpus.

In [12]:
# Total occurrences of each term across all documents (row sums of TD).
termFreqs = TD.sum(axis=1)
# Parenthesized print works on both Python 2 and 3 for a single argument.
print(termFreqs)
0    133
1     99
2     61
3    114
4     90
5     84
6    123
7     62
8     86
9    129
dtype: int64
In [16]:
# Map each term to its corpus frequency. A dict comprehension over zip replaces
# the original index-based loop (which also had broken indentation), and the
# prints use the form that works on both Python 2 and 3.
dictTF = {term: freq for term, freq in zip(terms, termFreqs)}
print(sorted(dictTF.items()))
# Frequencies alone, in descending order, for the rank/frequency plot below.
sortedTF = sorted(dictTF.values(), reverse=True)
print(sortedTF)
[('database', 133), ('index', 99), ('likelihood', 61), ('linear', 114), ('matrix', 90), ('query', 84), ('regression', 123), ('retrieval', 62), ('sql', 86), ('vector', 129)]
[133, 129, 123, 114, 99, 90, 86, 84, 62, 61]
In [14]:
# Rank/frequency plot: term frequencies sorted in descending order.
plt.plot(sortedTF)
plt.show()

We convert the dataframe into a Numpy array which will be used as input for our search function.

In [17]:
# Convert the DataFrame to a plain NumPy array, the input type expected by
# the knn_search function below.
DT = np.array(DT)
DT
Out[17]:
array([[24,  9,  0,  3,  1, 12,  0,  1, 21,  2],
       [32,  5,  3,  0,  0,  2,  0,  0, 10,  0],
       [12,  5,  0,  0,  0,  0,  0,  0, 16,  0],
       [ 6,  2,  0,  0,  0,  0,  0,  0,  7,  2],
       [43, 20,  3,  0,  0, 27,  0,  2, 31,  0],
       [ 2,  0,  7, 16, 33,  0, 18,  0,  0, 27],
       [ 0,  1, 12,  0,  2,  0, 32,  0,  0,  4],
       [ 3,  0,  4,  2,  0,  0, 22,  0,  0,  2],
       [ 1,  0, 27, 25,  7,  0, 34,  3,  0, 11],
       [ 6,  0,  4, 23, 12, 22, 17,  9,  0,  8],
       [ 4, 27,  0,  7, 14,  9,  0, 27,  0, 33],
       [ 0, 14,  1, 12,  5,  4,  0,  7,  0, 16],
       [ 0,  3,  0, 21, 12,  0,  0,  5,  0, 14],
       [ 0,  2,  0,  3,  4,  5,  0,  4,  1,  7],
       [ 0, 11,  0,  2,  0,  3,  0,  4,  0,  3]], dtype=int64)

The search function takes a query object (in this case a vector of words), and searches for the K most similar (least distant) items in the data (our index of documents). The "measure" parameter allows us to use either the Euclidean distance or the inverse of Cosine similarity as our distance metric. The function returns the indices of the K most similar neighbors and a list of their distances to the query object.

In [18]:
def knn_search(x, D, K, measure):
    """Find the K nearest neighbours of query vector x among the rows of D.

    Parameters
    ----------
    x : 1-D array of term weights (the query vector).
    D : 2-D array; each row is a document vector with the same length as x.
    K : number of neighbours to return.
    measure : 0 for Euclidean distance, 1 for cosine distance
        (1 - cosine similarity).

    Returns
    -------
    (indices, distances) : the indices of the K nearest rows of D, closest
    first, and their distances to x in the same order.

    Raises
    ------
    ValueError : if measure is neither 0 nor 1 (the original fell through
    to an opaque NameError on `dists`).
    """
    if measure == 0:
        # Euclidean distance from x to every row of D, vectorized.
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Cosine distance: 1 - (D . x) / (||row|| * ||x||), row-wise.
        # np.linalg.norm(D, axis=1) replaces the original per-row Python loop.
        # NOTE(review): an all-zero row of D (or a zero query) divides by zero
        # here, as in the original — confirm inputs are non-zero vectors.
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        dists = 1 - np.dot(D, x) / (D_norm * x_norm)
    else:
        raise ValueError("measure must be 0 (Euclidean) or 1 (cosine)")
    idx = np.argsort(dists)  # row indices ordered by increasing distance
    # Index the already-computed order instead of re-sorting the whole array;
    # the returned distances stay aligned with the returned indices.
    return idx[:K], list(dists[idx[:K]])

Let's now try this on a new query object

In [19]:
# A hypothetical query object: a vector of weights over the 10 index terms,
# in the same term order as the rows of TD / columns of DT.
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
x
Out[19]:
array([ 3, 22,  0, 17,  9,  6,  1, 12,  0, 22])
In [20]:
# Finding the k=5 nearest neighbors using inverse of Cosine similarity as a distance metric
# (measure=1 selects the cosine branch of knn_search).
neigh_idx, distances = knn_search(x, DT, 5, 1)
In [21]:
neigh_idx
Out[21]:
array([11, 10, 13, 14, 12], dtype=int64)
In [22]:
distances
Out[22]:
[0.0073848320598431938,
 0.069510849359296967,
 0.15227630019906346,
 0.17224860028549083,
 0.19440045873610889]
In [23]:
DT[neigh_idx]
Out[23]:
array([[ 0, 14,  1, 12,  5,  4,  0,  7,  0, 16],
       [ 4, 27,  0,  7, 14,  9,  0, 27,  0, 33],
       [ 0,  2,  0,  3,  4,  5,  0,  4,  1,  7],
       [ 0, 11,  0,  2,  0,  3,  0,  4,  0,  3],
       [ 0,  3,  0, 21, 12,  0,  0,  5,  0, 14]], dtype=int64)
In [28]:
# Finding the k=5 nearest neighbors using Euclidean distance metric
# (measure=0 selects the Euclidean branch of knn_search).
neigh_idx, distances = knn_search(x, DT, 5, 0)
In [29]:
neigh_idx
Out[29]:
array([11, 10, 12, 14, 13], dtype=int64)
In [30]:
distances
Out[30]:
[13.45362404707371,
 22.516660498395403,
 23.345235059857504,
 29.512709126747414,
 30.364452901377952]
In [31]:
DT[neigh_idx]
Out[31]:
array([[ 0, 14,  1, 12,  5,  4,  0,  7,  0, 16],
       [ 4, 27,  0,  7, 14,  9,  0, 27,  0, 33],
       [ 0,  3,  0, 21, 12,  0,  0,  5,  0, 14],
       [ 0, 11,  0,  2,  0,  3,  0,  4,  0,  3],
       [ 0,  2,  0,  3,  4,  5,  0,  4,  1,  7]], dtype=int64)