Document Classification using KNN

2 minute read

This was the first model that I built from scratch.

Text Preprocessing:

Lemmatization and stemming reduce the inflected variants of a word to a common base form, so that different forms of the same word are treated as a single feature.

Stop words are removed so that only the significant text features are captured.


from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
from collections import Counter

# Module-level NLP helpers shared by the tokenizer functions in this file.
stop_words = set(stopwords.words('english'))  # English stop words to filter out
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer instance
stemmer = PorterStemmer()  # Porter stemmer instance

Note: List comprehensions are used below; they are a concise and efficient way of building lists in Python.

def StemTokenizer(tokens):
    """Return a list holding the stem of each token, in input order.

    Stemming is delegated to the module-level ``stemmer`` object.
    """
    return [stemmer.stem(word) for word in tokens]


def Stemmer(text):
    """Tokenize ``text`` and return its stemmed, stop-word-free tokens.

    The text is lower-cased, split with ``word_tokenize``, filtered against
    the module-level ``stop_words`` set, and finally stemmed by
    ``StemTokenizer``.
    """
    lowered = text.lower()
    kept = [tok for tok in word_tokenize(lowered) if tok not in stop_words]
    return StemTokenizer(kept)


def lemmaTokenizer(tokens):
    """Return a list holding the lemma of each token, in input order.

    Lemmatization is delegated to the module-level ``lemmatizer`` object.
    """
    return [lemmatizer.lemmatize(word) for word in tokens]


def lemmatize(text):
    """Tokenize ``text`` and return its lemmatized, stop-word-free tokens.

    The text is lower-cased, split with ``word_tokenize``, filtered against
    the module-level ``stop_words`` set, and finally lemmatized by
    ``lemmaTokenizer``.
    """
    lowered = text.lower()
    kept = [tok for tok in word_tokenize(lowered) if tok not in stop_words]
    return lemmaTokenizer(kept)

To classify documents by genre or by some other feature, we use a bag-of-words approach. KNN is implemented from scratch, using cosine similarity as the similarity measure, to predict the class of each document. The standard approach is:

  • Take the lemmatized/stemmed words and convert them to vectors using TF-IDF (scikit-learn's TfidfVectorizer).
  • Prepare the training and testing datasets.
  • Implement KNN to classify the documents.
  • Train the model and test the model.

Implementing a K-Nearest Neighbour Classifier

  • Fitting a model and predicting with the fitted model, as implemented in the class below:
# KNN Classifier
class KNNClassifier():
    """K-nearest-neighbour text classifier using cosine similarity.

    Documents are vectorized with the module-level ``vec_tfidf`` vectorizer,
    which must be defined before ``fit`` is called. Each test document is
    assigned the most common label among its ``k`` most similar training
    documents.
    """

    def fit(self, dtrain, ltrain):
        """Store the training data and fit the vectorizer on it.

        Args:
            dtrain: Training documents, passed to ``vec_tfidf.fit_transform``.
            ltrain: Labels aligned with ``dtrain``; each label must be
                convertible with ``int()``.
        """
        self.dtrain = dtrain
        self.ltrain = ltrain
        self.vec_train = vec_tfidf.fit_transform(dtrain)

    def predict(self, data_test, k):
        """Predict an integer label for every document in ``data_test``.

        Args:
            data_test: Test documents, passed to ``vec_tfidf.transform``.
            k: Number of nearest neighbours that vote. Values larger than the
                number of training documents are clamped to that number.

        Returns:
            A list with one predicted integer label per test document.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")

        vec_test = vec_tfidf.transform(data_test)
        # cosine_similarity yields one row per training document; transpose so
        # that every row holds the similarities of a single test document.
        similarities = cosine_similarity(self.vec_train, vec_test).T
        # Never ask for more neighbours than there are training documents.
        k = min(k, similarities.shape[1])

        neighbours = []
        for row in similarities:
            # A stable sort on the negated similarities gives descending order
            # and breaks ties by the lower training index, which matches a
            # stable list sort with reverse=True.
            nearest = np.argsort(-row, kind="stable")[:k]
            votes = Counter(int(self.ltrain[idx]) for idx in nearest)
            neighbours.append(votes.most_common(1)[0][0])
        return neighbours


def Load_text(x):
    """Read a labelled training file and return ``(text, label)``.

    Each non-blank line is parsed as a one-character integer label, one
    separator character, and then the document text. Surrounding whitespace
    (including the trailing newline) is stripped before parsing, and blank
    lines are skipped instead of crashing the label conversion.

    Args:
        x: Path of the file to read.

    Returns:
        A tuple ``(text, label)`` of two parallel lists: the document strings
        and their integer labels.

    Raises:
        ValueError: If the first character of a non-blank line is not a digit.
    """
    text = []
    label = []
    with open(x) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            label.append(int(line[:1]))
            text.append(line[2:])
    return text, label


def Load_test(x):
    """Read an unlabelled test file and return its lines as documents.

    Surrounding whitespace (including the trailing newline) is stripped from
    every line. Blank lines are kept as empty strings so that the returned
    list stays aligned one-to-one with the lines of the input file.

    Args:
        x: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(x) as f:
        return [raw_line.strip() for raw_line in f]


# Driver script: load the data, build the vectorizer, fit the KNN classifier,
# predict labels for the test documents and write them to a file.

# Step 1: load the labelled training documents and their labels.
# NOTE: 'lable_train' is a misspelling of 'label_train'; the name is left
# unchanged because it is a module-level variable.
print("loading train data....")
data_train, lable_train = Load_text('train.txt')

# Step 2: load the unlabelled test documents.
print("loading test data....")
test_data = Load_test("test.txt")
# Disabled alternative: hold out part of the training data for evaluation.
# X_train, X_test, y_train, y_test = train_test_split(data_train, lable_train, test_size=0.2)

# Step 3: create the module-level TF-IDF vectorizer. KNNClassifier.fit and
# KNNClassifier.predict read 'vec_tfidf' as a global, so it must be defined
# before classifier.fit() is called. Stemmer is used as the tokenizer.
vec_tfidf = TfidfVectorizer(tokenizer=Stemmer, sublinear_tf=True, min_df=0.005, stop_words='english')
# Disabled alternative: tokenize with lemmatize instead of Stemmer.
# vec_tfidf = TfidfVectorizer(tokenizer=lemmatize, stop_words = 'english' )

# Step 4: create the classifier.
print("initializing classifier")
classifier = KNNClassifier()

# Step 5: fit the vectorizer on the training documents and keep their labels.
classifier.fit(data_train, lable_train)

# Step 6: predict a label for every test document using k = 11 neighbours.
print("prediction starting.....")
result = classifier.predict(test_data, 11)

# Step 7: write the predicted labels to the output file.
# NOTE(review): no 'fmt' is passed, so the integer labels are presumably
# written in NumPy's default float notation - confirm this is the intended
# output format.
print("writing result into file...")
np.savetxt('final1output.txt', result, delimiter='; ')
print("prediction completed....xxxxx")

An accuracy of 0.96 was obtained for this problem.

Tags:

Updated: