IF4072 Pemrosesan Teks dan Suara Bahasa Alami (Natural Language Text and Speech Processing)

Group Members

  1. 13515021 Dewita Sonya Tarabunga
  2. 13515057 Erick Wijaya
  3. 13515107 Roland Hartanto

Import

In [1]:
import os, stat, string, re
import pandas as pd
import numpy as np
import nltk
import math

from collections import Counter
from nltk import word_tokenize, NaiveBayesClassifier, DecisionTreeClassifier, MaxentClassifier, classify
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

Load Data

In [2]:
def load_data(src, size):
    # Read each email body into the list slot given by the numeric id in its
    # file name (e.g. TRAIN_12.eml or TEST_12.eml -> index 11).
    emails = ['' for _ in range(size)]
    if not os.path.exists(src):
        os.makedirs(src)
    for file in os.listdir(src):
        idx = file.replace('TRAIN_', '').replace('.eml', '').replace('TEST_', '')
        path = os.path.join(src, file)
        info = os.stat(path)
        if not stat.S_ISDIR(info.st_mode):
            with open(path, encoding='utf-8', errors='ignore') as fp:
                emails[int(idx) - 1] = fp.read()
    return emails

labels = pd.read_csv('spam-mail.tr.label')
labels = labels['Prediction'].tolist()
emails = load_data('TR-extracted', 2500)
test = load_data('TT-extracted', 1827)
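As a quick sanity check (an illustrative snippet; it assumes the label file holds exactly one Prediction per training email), the counts should line up with the 2500 training and 1827 test messages:

print(len(emails), len(labels))  # expected: 2500 2500
print(len(test))                 # expected: 1827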

Preprocessing

Remove Punctuation

In [3]:
trans = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
emails = [email.translate(trans) for email in emails]
test = [email.translate(trans) for email in test]
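For illustration, the translation table turns every punctuation mark into a space rather than deleting it, so words glued together by punctuation still split cleanly at tokenization time:

'Free!!! Click here...'.translate(trans)
# -> 'Free    Click here   '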

Tokenize

In [4]:
tokens = [word_tokenize(email.lower()) for email in emails]
test = [word_tokenize(email.lower()) for email in test]

Lemmatization

In [5]:
tagged = [nltk.pos_tag(token) for token in tokens]
test_tagged = [nltk.pos_tag(token) for token in test]
In [6]:
def get_pos(tag):
    # Map a Penn Treebank tag to the WordNet POS expected by the lemmatizer,
    # falling back to noun for any other tag.
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
tokens = [[lemmatizer.lemmatize(word, pos=get_pos(tag)) for word, tag in token] for token in tagged]
test = [[lemmatizer.lemmatize(word, pos=get_pos(tag)) for word, tag in token] for token in test_tagged]
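The POS tag matters for lemmatization quality; for example (a toy word, not taken from the corpus):

lemmatizer.lemmatize('meeting', pos=wordnet.NOUN)  # 'meeting'
lemmatizer.lemmatize('meeting', pos=wordnet.VERB)  # 'meet'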

Feature Selection

Stop Words Elimination

In [7]:
stop_words = set(stopwords.words('english'))
tokens = [[word for word in token if word not in stop_words] for token in tokens]
test = [[word for word in token if word not in stop_words] for token in test]

Convert to Trainable Data

In [8]:
def get_features(text):
    # Binary bag-of-words: every remaining token becomes a present/True feature.
    return {word: True for word in text}

train_data = [(get_features(email), label) for email, label in zip(tokens, labels)]
test_data = [get_features(email) for email in test]
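For example, a (hypothetical) token list maps to the following feature dictionary:

get_features(['free', 'click', 'winner'])
# -> {'free': True, 'click': True, 'winner': True}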

Training Model

Naive Bayes

In [9]:
classifierNB = NaiveBayesClassifier.train(train_data)
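The trained model can be inspected with NLTK's built-in report of the features whose per-class likelihood ratios are most extreme:

classifierNB.show_most_informative_features(10)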

Decision Tree

In [10]:
classifierDT = DecisionTreeClassifier.train(train_data, depth_cutoff=30)  # limit tree depth to 30

Maximum Entropy

In [11]:
classifierMaxentGIS = MaxentClassifier.train(train_data, 'GIS', trace=0, max_iter=500)  # Generalized Iterative Scaling, silent, at most 500 iterations

Classification

Naive Bayes

In [12]:
with open('answerNB.csv', 'w') as fp:
    fp.write('Id,Prediction\n')
    for idx, data in enumerate(test_data, start=1):
        ans = classifierNB.classify(data)
        fp.write('{},{}\n'.format(idx, ans))
In [13]:
classify.accuracy(classifierNB, train_data)
Out[13]:
0.9636

Decision Tree

In [14]:
with open('answerDT.csv', 'w') as fp:
    fp.write('Id,Prediction\n')
    for idx, data in enumerate(test_data, start=1):
        ans = classifierDT.classify(data)
        fp.write('{},{}\n'.format(idx, ans))
In [15]:
classify.accuracy(classifierDT, train_data)
Out[15]:
0.9744

Maximum Entropy

In [16]:
with open('answerMaxentGIS.csv', 'w') as fp:
    fp.write('Id,Prediction\n')
    for idx, data in enumerate(test_data, start=1):
        ans = classifierMaxentGIS.classify(data)
        fp.write('{},{}\n'.format(idx, ans))
In [17]:
classify.accuracy(classifierMaxentGIS, train_data)
Out[17]:
0.95
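
Note that all three accuracy figures above are computed on the same data the classifiers were trained on, so they measure fit rather than generalization. A minimal held-out check (an illustrative sketch; the 80/20 split and seed are arbitrary assumptions, not from the original pipeline) would look like:

import random

random.seed(13515)  # arbitrary seed for reproducibility
shuffled = train_data[:]
random.shuffle(shuffled)
cut = int(0.8 * len(shuffled))
train_split, dev_split = shuffled[:cut], shuffled[cut:]

clf = NaiveBayesClassifier.train(train_split)
print(classify.accuracy(clf, dev_split))  # accuracy on emails unseen during training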