import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
cd D:\Documents\Class\CSC478\Data
# Load the term-document matrix from CSV with no header row. Column 0 is
# used as the term label and every remaining column as one document
# (presumably raw term counts -- confirm against the data file).
DF = pd.read_csv('term-doc-mat.csv', header=None)
DF
# NumPy-only alternative for the numeric part:
# TD = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),dtype=int)
# `.ix` has been removed from pandas; `.iloc` is the positional equivalent
# (with header=None the column labels coincide with their positions).
TD = DF.iloc[:, 1:]
TD
# NumPy-only alternative for the term labels:
# terms = genfromtxt('term-doc-mat.csv',delimiter=',',usecols=(0),dtype=str)
terms = DF.iloc[:, 0]
terms
# Transpose so that each row is a document and each column is a term.
DT = TD.T
DT
DT.shape
# Length of the first document row, i.e. the number of terms.
len(DT.iloc[0])
numTerms = DT.shape[1]
# could also say numTerms = len(DT.iloc[0])
NDocs = DT.shape[0]
# Parenthesised single-argument print runs on both Python 2 and Python 3.
print(numTerms)
print(NDocs)
# Total of each term's row across all documents.
termFreqs = TD.sum(axis=1)
print(termFreqs)
# Map each term to its total; terms and termFreqs share the same row order.
dictTF = dict(zip(terms, termFreqs))
print(sorted(dictTF.items()))
# Totals in descending order, plotted by rank.
sortedTF = sorted(dictTF.values(), reverse=True)
print(sortedTF)
plt.plot(sortedTF)
plt.show()
# From here on DT is a plain NumPy array (documents x terms), not a DataFrame.
DT = np.array(DT)
DT
def knn_search(x, D, K, measure):
    """Find the K nearest neighbours of data point x among the rows of D.

    Parameters
    ----------
    x : array-like, shape (n_features,)
        Query vector.
    D : array-like, shape (n_samples, n_features)
        Candidate points, one per row.
    K : int
        Number of neighbours to return; if K exceeds the number of rows,
        all rows are returned.
    measure : int
        0 for Euclidean distance, 1 for cosine distance
        (1 - cosine similarity).

    Returns
    -------
    idx : numpy.ndarray
        Row indexes of the K nearest neighbours, closest first.
    dists : list
        The distances of those neighbours, in the same order as ``idx``.

    Raises
    ------
    ValueError
        If ``measure`` is neither 0 nor 1.
    """
    D = np.asarray(D)
    x = np.asarray(x)
    if measure == 0:
        # euclidean distances from the other points
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Product of each row norm with the query norm (the cosine denominator).
        denom = np.linalg.norm(D, axis=1) * np.linalg.norm(x)
        # Cosine similarity is undefined when either vector has zero length;
        # treat it as 0 (distance 1) instead of dividing by zero, which would
        # put NaN into the ranking.
        sims = np.zeros(len(D), dtype=float)
        defined = denom > 0
        sims[defined] = np.dot(D, x)[defined] / denom[defined]
        dists = 1 - sims
    else:
        raise ValueError(
            "measure must be 0 (Euclidean) or 1 (cosine), got %r" % (measure,)
        )
    idx = np.argsort(dists)  # sorting
    nearest = idx[:K]
    # Read the distances through the sorted indexes rather than sorting a
    # second time, so they always line up with the returned indexes.
    return nearest, list(dists[nearest])
# Query vector to compare against the rows of DT (presumably one entry per
# term, in the same order as DT's columns -- confirm against the data).
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
x
# The 5 nearest neighbors of x under cosine distance, i.e. 1 - cosine
# similarity (measure=1).
neigh_idx, distances = knn_search(x, DT, K=5, measure=1)
neigh_idx
distances
DT[neigh_idx]
# The same query, ranked by Euclidean distance instead (measure=0).
neigh_idx, distances = knn_search(x, DT, K=5, measure=0)
neigh_idx
distances
DT[neigh_idx]