#### Let's look at another example of using distances and similarities. This time, we'll look at a hypothetical search engine index and see how we can use k-nearest-neighbor search to identify the documents that are most similar to a specified query.¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# NOTE(review): hardcoded absolute Windows path — portable only on this machine;
# consider a configurable data directory instead
cd D:\Documents\Class\CSC478\Data

D:\Documents\Class\CSC478\Data


#### In our data set, we have 15 documents. We assume that the documents have already been preprocessed, converted into word vectors (bags of words), and inserted into an index. After preprocessing and removing "stop words" we are left with 10 index terms (used as dimensions for the document vectors).¶

In [3]:
# Load the term-document matrix. The file has no header row, so pandas
# auto-numbers the columns 0..15 (column 0 holds the term strings).
DF = pd.read_csv('term-doc-mat.csv', header=None)
DF

Out[3]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 database 24 32 12 6 43 2 0 3 1 6 4 0 0 0 0
1 index 9 5 5 2 20 0 1 0 0 0 27 14 3 2 11
2 likelihood 0 3 0 0 3 7 12 4 27 4 0 1 0 0 0
3 linear 3 0 0 0 0 16 0 2 25 23 7 12 21 3 2
4 matrix 1 0 0 0 0 33 2 0 7 12 14 5 12 4 0
5 query 12 2 0 0 27 0 0 0 0 22 9 4 0 5 3
6 regression 0 0 0 0 0 18 32 22 34 17 0 0 0 0 0
7 retrieval 1 0 0 0 2 0 0 0 3 9 27 7 5 4 4
8 sql 21 10 16 7 31 0 0 0 0 0 0 0 0 1 0
9 vector 2 0 0 2 0 27 4 2 11 8 33 16 14 7 3

#### Let's remove the column containing the terms¶

In [4]:
# TD = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),dtype=int)
# Keep only the numeric count columns (drop column 0, the term strings).
# .ix was deprecated in pandas 0.20 and removed in 1.0; this slice is purely
# positional, so .iloc is the exact replacement.
TD = DF.iloc[:, 1:]
TD

Out[4]:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 24 32 12 6 43 2 0 3 1 6 4 0 0 0 0
1 9 5 5 2 20 0 1 0 0 0 27 14 3 2 11
2 0 3 0 0 3 7 12 4 27 4 0 1 0 0 0
3 3 0 0 0 0 16 0 2 25 23 7 12 21 3 2
4 1 0 0 0 0 33 2 0 7 12 14 5 12 4 0
5 12 2 0 0 27 0 0 0 0 22 9 4 0 5 3
6 0 0 0 0 0 18 32 22 34 17 0 0 0 0 0
7 1 0 0 0 2 0 0 0 3 9 27 7 5 4 4
8 21 10 16 7 31 0 0 0 0 0 0 0 0 1 0
9 2 0 0 2 0 27 4 2 11 8 33 16 14 7 3
In [5]:
# terms = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(0),dtype=str)
# terms = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(0),dtype=str)
# Column 0 holds the index-term strings. .iloc replaces the removed .ix
# indexer (positional selection).
terms = DF.iloc[:, 0]
terms

Out[5]:
0      database
1         index
2    likelihood
3        linear
4        matrix
5         query
6    regression
7     retrieval
8           sql
9        vector
Name: 0, dtype: object

#### Transposing the TD matrix.¶

In [6]:
# Document-term matrix: rows become documents, columns become terms.
DT = TD.transpose()


#### Now we have a document-term matrix:¶

In [7]:
DT

Out[7]:
0 1 2 3 4 5 6 7 8 9
1 24 9 0 3 1 12 0 1 21 2
2 32 5 3 0 0 2 0 0 10 0
3 12 5 0 0 0 0 0 0 16 0
4 6 2 0 0 0 0 0 0 7 2
5 43 20 3 0 0 27 0 2 31 0
6 2 0 7 16 33 0 18 0 0 27
7 0 1 12 0 2 0 32 0 0 4
8 3 0 4 2 0 0 22 0 0 2
9 1 0 27 25 7 0 34 3 0 11
10 6 0 4 23 12 22 17 9 0 8
11 4 27 0 7 14 9 0 27 0 33
12 0 14 1 12 5 4 0 7 0 16
13 0 3 0 21 12 0 0 5 0 14
14 0 2 0 3 4 5 0 4 1 7
15 0 11 0 2 0 3 0 4 0 3
In [8]:
# (rows, columns) = (documents, terms)
DT.shape

Out[8]:
(15, 10)
In [9]:
# Length of one document vector (= number of terms). DT's row labels are the
# original column numbers 1..15, and the removed .ix indexer resolved an
# integer key by *label* on an integer index, so .loc — not .iloc — is the
# faithful replacement.
len(DT.loc[1])

Out[9]:
10
In [10]:
# Corpus dimensions: number of index terms and number of documents.
numTerms=DT.shape[1]
# could also say numTerms = len(DT.loc[1])
NDocs = DT.shape[0]

In [11]:
# Parenthesized print of a single argument behaves identically under
# Python 2 and Python 3, unlike the bare Python 2 print statement.
print(numTerms)
print(NDocs)

10
15


#### Next, let's compute term frequencies to get an idea of their distributions across the corpus.¶

In [12]:
# Total occurrences of each term across all documents (row sums of TD).
termFreqs = TD.sum(axis=1)
# print() with a single argument is valid in both Python 2 and 3.
print(termFreqs)

0    133
1     99
2     61
3    114
4     90
5     84
6    123
7     62
8     86
9    129
dtype: int64

In [16]:
# Map each index term to its corpus frequency, then list the frequencies in
# descending order to inspect the distribution. (Loop indentation restored —
# it was flattened in the notebook export; prints modernized to the
# parenthesized form valid in both Python 2 and 3.)
dictTF = {}
for i in range(numTerms):
    dictTF[terms[i]] = termFreqs[i]
print(sorted(dictTF.items()))
sortedTF = sorted(dictTF.values(), reverse=True)
print(sortedTF)

[('database', 133), ('index', 99), ('likelihood', 61), ('linear', 114), ('matrix', 90), ('query', 84), ('regression', 123), ('retrieval', 62), ('sql', 86), ('vector', 129)]
[133, 129, 123, 114, 99, 90, 86, 84, 62, 61]

In [14]:
# Rank/frequency plot of the term frequencies (descending). Title and axis
# labels added so the figure stands alone when the notebook is skimmed.
plt.plot(sortedTF)
plt.xlabel('Term rank')
plt.ylabel('Corpus frequency')
plt.title('Term frequency distribution')
plt.show()


#### We convert the dataframe into a Numpy array which will be used as input for our search function.¶

In [17]:
# Convert the document-term DataFrame into a plain NumPy array, the input
# format expected by knn_search below.
DT = np.array(DT)
DT

Out[17]:
array([[24,  9,  0,  3,  1, 12,  0,  1, 21,  2],
[32,  5,  3,  0,  0,  2,  0,  0, 10,  0],
[12,  5,  0,  0,  0,  0,  0,  0, 16,  0],
[ 6,  2,  0,  0,  0,  0,  0,  0,  7,  2],
[43, 20,  3,  0,  0, 27,  0,  2, 31,  0],
[ 2,  0,  7, 16, 33,  0, 18,  0,  0, 27],
[ 0,  1, 12,  0,  2,  0, 32,  0,  0,  4],
[ 3,  0,  4,  2,  0,  0, 22,  0,  0,  2],
[ 1,  0, 27, 25,  7,  0, 34,  3,  0, 11],
[ 6,  0,  4, 23, 12, 22, 17,  9,  0,  8],
[ 4, 27,  0,  7, 14,  9,  0, 27,  0, 33],
[ 0, 14,  1, 12,  5,  4,  0,  7,  0, 16],
[ 0,  3,  0, 21, 12,  0,  0,  5,  0, 14],
[ 0,  2,  0,  3,  4,  5,  0,  4,  1,  7],
[ 0, 11,  0,  2,  0,  3,  0,  4,  0,  3]], dtype=int64)

#### The search function takes a query object (in this case a vector of words), and searches for the K most similar (least distant) items in the data (our index of documents). The "measure" parameter allows us to use either the Euclidean distance or the inverse of Cosine similarity as our distance metric. The function returns the indices of the K most similar neighbors and a list of their distances to the query object.¶

In [18]:
def knn_search(x, D, K, measure):
    """Find the K nearest neighbours of query vector x among the rows of D.

    Parameters
    ----------
    x : 1-D numpy array — the query vector (same length as a row of D).
    D : 2-D numpy array — one row per document.
    K : int — number of neighbours to return.
    measure : int — 0 for Euclidean distance, 1 for cosine distance
        (1 - cosine similarity).

    Returns
    -------
    (indices, distances) : the indices of the K nearest rows of D and their
    distances to x, both ordered closest-first.

    Raises
    ------
    ValueError
        If measure is not 0 or 1. (The original fell through with `dists`
        undefined and raised a confusing NameError instead.)
    """
    if measure == 0:
        # Euclidean distance from x to every row of D (broadcasted).
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Cosine distance: 1 - cos(angle between x and each row).
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        dists = 1 - np.dot(D, x) / (D_norm * x_norm)
    else:
        raise ValueError("measure must be 0 (Euclidean) or 1 (cosine)")
    idx = np.argsort(dists)  # ascending distance
    # Index the distances by idx so each returned distance lines up with its
    # neighbour index (equivalent to the original sorted(dists)[:K], but the
    # pairing is explicit).
    return idx[:K], [dists[i] for i in idx[:K]]


#### Let's now try this on a new query object¶

In [19]:
# Query object: a word-count vector over the same 10 index terms
# (database, index, likelihood, ..., vector) as the document matrix.
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
x

Out[19]:
array([ 3, 22,  0, 17,  9,  6,  1, 12,  0, 22])
In [20]:
# Finding the k=5 nearest neighbors using inverse of Cosine similarity as a distance metric
# (measure=1 selects cosine distance inside knn_search)
neigh_idx, distances = knn_search(x, DT, 5, 1)

In [21]:
# Row indices of the 5 most cosine-similar documents, closest first
neigh_idx

Out[21]:
array([11, 10, 13, 14, 12], dtype=int64)
In [22]:
# Corresponding cosine distances (1 - similarity), in ascending order
distances

Out[22]:
[0.0073848320598431938,
0.069510849359296967,
0.15227630019906346,
0.17224860028549083,
0.19440045873610889]
In [23]:
# Term vectors of the 5 nearest documents (fancy indexing by neighbor index)
DT[neigh_idx]

Out[23]:
array([[ 0, 14,  1, 12,  5,  4,  0,  7,  0, 16],
[ 4, 27,  0,  7, 14,  9,  0, 27,  0, 33],
[ 0,  2,  0,  3,  4,  5,  0,  4,  1,  7],
[ 0, 11,  0,  2,  0,  3,  0,  4,  0,  3],
[ 0,  3,  0, 21, 12,  0,  0,  5,  0, 14]], dtype=int64)
In [28]:
# Finding the k=5 nearest neighbors using Euclidean distance metric
# (measure=0 selects Euclidean distance inside knn_search)
neigh_idx, distances = knn_search(x, DT, 5, 0)

In [29]:
# Row indices of the 5 nearest documents by Euclidean distance, closest first
neigh_idx

Out[29]:
array([11, 10, 12, 14, 13], dtype=int64)
In [30]:
# Corresponding Euclidean distances, in ascending order
distances

Out[30]:
[13.45362404707371,
22.516660498395403,
23.345235059857504,
29.512709126747414,
30.364452901377952]
In [31]:
# Term vectors of the 5 nearest documents under the Euclidean metric
DT[neigh_idx]

Out[31]:
array([[ 0, 14,  1, 12,  5,  4,  0,  7,  0, 16],
[ 4, 27,  0,  7, 14,  9,  0, 27,  0, 33],
[ 0,  3,  0, 21, 12,  0,  0,  5,  0, 14],
[ 0, 11,  0,  2,  0,  3,  0,  4,  0,  3],
[ 0,  2,  0,  3,  4,  5,  0,  4,  1,  7]], dtype=int64)