데이터 사이언스 08 자연어 처리(2)-Naive Bayes 분류

게시: by Creative Commons Licence

1. 나이브 베이즈 분류

위키백과 - 나이브 베이즈 분류

1) 예제1

import nltk
from nltk.tokenize import word_tokenize

# Tiny labelled corpus: (sentence, sentiment) pairs.
train = [
    ('i like you', 'pos'),
    ('i hate you', 'neg'),
    ('you like me', 'neg'),
    ('i like her', 'pos'),
]

# Vocabulary: every distinct lower-cased token seen in the training sentences.
all_words = set()
for sentence, _label in train:
    for token in word_tokenize(sentence):
        all_words.add(token.lower())

all_words
{'hate', 'her', 'i', 'like', 'me', 'you'}
# Featurize each training pair as (feature_dict, label): every vocabulary
# word maps to True/False depending on whether the sentence contains it.
# Each sentence is tokenized ONCE here -- the original re-ran word_tokenize
# inside the dict comprehension for every single vocabulary word.
t = []
for sentence, label in train:
    sentence_tokens = set(word_tokenize(sentence))
    t.append(
        ({word: (word in sentence_tokens) for word in all_words}, label)
    )

t[0]
({'hate': False,
  'her': False,
  'i': True,
  'like': True,
  'me': False,
  'you': True},
 'pos')
# Fit an NLTK Naive Bayes model on the featurized pairs, then print the
# features whose presence/absence best separates 'pos' from 'neg'.
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
Most Informative Features
                     her = False             neg : pos    =      1.7 : 1.0
                    like = True              pos : neg    =      1.7 : 1.0
                       i = True              pos : neg    =      1.7 : 1.0
                    hate = False             pos : neg    =      1.7 : 1.0
                     you = True              neg : pos    =      1.7 : 1.0
                      me = False             pos : neg    =      1.7 : 1.0
test_sentence = "i like MeRui"

test_sent_features = {
    word.lower()  :  ( word in word_tokenize(test_sentence.lower()))
    for word in all_words
}

test_sent_features
{'hate': False,
 'her': False,
 'i': True,
 'like': True,
 'me': False,
 'you': False}
# Predict the sentiment label ('pos'/'neg') for the test feature dict.
classifier.classify( test_sent_features )
'pos'

2) 예제2

# Example 2 corpus: ten English (sentence, sentiment) pairs, five per class.
train = [
    ( 'I love this sandwich.', 'pos'),
    ( 'This is an amazing place!', 'pos' ),
    ( 'I feel very good about these beers.', 'pos'),
    ( 'This is my best work.', 'pos'),
    ( 'What an awesome view', 'pos'),
    ( 'I do not like this restaurant', 'neg'),
    ( 'I am tired of this stuff.', 'neg'),
    ( "I can't deal with this", 'neg'), 
    ( 'He is my sworn enemy!', 'neg'),
    ( 'My boss is horrible.', 'neg')    
]
from nltk.tokenize import word_tokenize

# Vocabulary: all distinct lower-cased tokens across the training sentences
# (punctuation tokens such as '.' and '!' are included by word_tokenize).
all_words = {
    token.lower()
    for sentence, _label in train
    for token in word_tokenize(sentence)
}

len(all_words)
39
# Featurize each training pair as (feature_dict, label).  Each sentence is
# tokenized ONCE -- the original called word_tokenize inside the dict
# comprehension for every vocabulary word (O(|vocab|) tokenizations/sentence).
t = []
for sentence, label in train:
    sentence_tokens = set(word_tokenize(sentence))
    t.append(
        ({word: (word in sentence_tokens) for word in all_words}, label)
    )

t[0]
({'!': False,
  '.': True,
  'about': False,
  'am': False,
  'amazing': False,
  'an': False,
  'awesome': False,
  'beers': False,
  'best': False,
  'boss': False,
  'ca': False,
  'deal': False,
  'do': False,
  'enemy': False,
  'feel': False,
  'good': False,
  'he': False,
  'horrible': False,
  'i': False,
  'is': False,
  'like': False,
  'love': True,
  'my': False,
  "n't": False,
  'not': False,
  'of': False,
  'place': False,
  'restaurant': False,
  'sandwich': True,
  'stuff': False,
  'sworn': False,
  'these': False,
  'this': True,
  'tired': False,
  'very': False,
  'view': False,
  'what': False,
  'with': False,
  'work': False},
 'pos')
# Fit the Naive Bayes model on example 2's features and show the most
# discriminative ones (note 'this' dominating -- it appears in 4/5 negatives).
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
Most Informative Features
                    this = True              neg : pos    =      2.3 : 1.0
                    this = False             pos : neg    =      1.8 : 1.0
                      an = False             neg : pos    =      1.6 : 1.0
                       . = True              pos : neg    =      1.4 : 1.0
                       . = False             neg : pos    =      1.4 : 1.0
                    like = False             pos : neg    =      1.2 : 1.0
                   these = False             neg : pos    =      1.2 : 1.0
                horrible = False             pos : neg    =      1.2 : 1.0
                     not = False             pos : neg    =      1.2 : 1.0
                      am = False             pos : neg    =      1.2 : 1.0
test_sentence = "This is the best band I've ever heards!"

test_sent_features = {
    word.lower()  :  ( word in word_tokenize(test_sentence.lower()))
    for word in all_words
}

test_sent_features
{'!': True,
 '.': False,
 'about': False,
 'am': False,
 'amazing': False,
 'an': False,
 'awesome': False,
 'beers': False,
 'best': True,
 'boss': False,
 'ca': False,
 'deal': False,
 'do': False,
 'enemy': False,
 'feel': False,
 'good': False,
 'he': False,
 'horrible': False,
 'i': True,
 'is': True,
 'like': False,
 'love': False,
 'my': False,
 "n't": False,
 'not': False,
 'of': False,
 'place': False,
 'restaurant': False,
 'sandwich': False,
 'stuff': False,
 'sworn': False,
 'these': False,
 'this': True,
 'tired': False,
 'very': False,
 'view': False,
 'what': False,
 'with': False,
 'work': False}
# Predict the sentiment label for the example-2 test feature dict.
classifier.classify( test_sent_features )
'pos'

3) 한글 예제

(1) 영어와 같은 방식

import nltk
from nltk.tokenize import word_tokenize
from konlpy.tag import Twitter

# KoNLPy POS tagger.  NOTE(review): the Twitter class was renamed Okt in
# later KoNLPy releases -- confirm against the installed version.
pos_tagger = Twitter()

# Korean toy corpus: (sentence, sentiment) pairs.
train = [
    ('메리가 좋아', 'pos'),
    ('고양이도 좋아', 'pos'),
    ('난 수업이 지루해', 'neg'),
    ('메리는 이쁜 고양이야', 'pos'),
    ('난 마치고 메리랑 놀거야', 'pos'),
]

# Vocabulary via plain word tokenization, i.e. NO morphological analysis --
# this section deliberately shows why that fails for Korean.
all_words = set()
for sentence, _label in train:
    for token in word_tokenize(sentence):
        all_words.add(token.lower())
all_words
{'고양이도',
 '고양이야',
 '난',
 '놀거야',
 '마치고',
 '메리가',
 '메리는',
 '메리랑',
 '수업이',
 '이쁜',
 '좋아',
 '지루해'}
# Featurize each Korean training pair as (feature_dict, label).  Each
# sentence is tokenized ONCE -- the original re-ran word_tokenize inside
# the dict comprehension for every vocabulary word.
t = []
for sentence, label in train:
    sentence_tokens = set(word_tokenize(sentence))
    t.append(
        ({word: (word in sentence_tokens) for word in all_words}, label)
    )

t[0]
({'고양이도': False,
  '고양이야': False,
  '난': False,
  '놀거야': False,
  '마치고': False,
  '메리가': True,
  '메리는': False,
  '메리랑': False,
  '수업이': False,
  '이쁜': False,
  '좋아': True,
  '지루해': False},
 'pos')
# Fit Naive Bayes on the word-level Korean features and inspect them.
# Because Korean particles are glued to stems ('메리가'/'메리는'/'메리랑' are
# all distinct tokens here), the features are badly fragmented.
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
Most Informative Features
                       난 = True              neg : pos    =      2.5 : 1.0
                      좋아 = False             neg : pos    =      1.5 : 1.0
                      이쁜 = False             neg : pos    =      1.1 : 1.0
                     마치고 = False             neg : pos    =      1.1 : 1.0
                     놀거야 = False             neg : pos    =      1.1 : 1.0
                     메리가 = False             neg : pos    =      1.1 : 1.0
                    고양이야 = False             neg : pos    =      1.1 : 1.0
                     메리랑 = False             neg : pos    =      1.1 : 1.0
                     메리는 = False             neg : pos    =      1.1 : 1.0
                    고양이도 = False             neg : pos    =      1.1 : 1.0
test_sentence = '난 수업이 마치면 메리랑 놀거야'
# Tokenize the test sentence ONCE (hoisted out of the comprehension -- the
# original called word_tokenize for every word in all_words), then build
# the same {word: present?} feature shape used for training.
test_tokens = set(word_tokenize(test_sentence.lower()))
test_sent_features = {word.lower(): (word in test_tokens) for word in all_words}
test_sent_features
{'고양이도': False,
 '고양이야': False,
 '난': True,
 '놀거야': True,
 '마치고': False,
 '메리가': False,
 '메리는': False,
 '메리랑': True,
 '수업이': True,
 '이쁜': False,
 '좋아': False,
 '지루해': False}
# Predict with word-level features; the transcript shows this misclassifies
# the positive sentence as 'neg' -- motivating morphological analysis below.
classifier.classify( test_sent_features )
'neg'
  • test_sentence가 'neg'로 분류되었다.
  • 영어와 같은 방식으로 나이브 베이즈 분류하는 것은 실패하였다.
  • 한글은 형태소 분석을 해야한다.

(2) 형태소 분석

def tokenize(doc):
    """POS-tag *doc* with the KoNLPy tagger (normalized + stemmed) and
    return its tokens formatted as 'morpheme/TAG' strings."""
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    result = []
    for morpheme, tag in tagged:
        result.append(morpheme + '/' + tag)
    return result

# Re-tokenize the training corpus at the morpheme level.
train_docs = [(tokenize(sentence), label) for sentence, label in train]

train_docs
[(['메리/Noun', '가/Josa', '좋다/Adjective'], 'pos'),
 (['고양이/Noun', '도/Josa', '좋다/Adjective'], 'pos'),
 (['난/Noun', '수업/Noun', '이/Josa', '지루하다/Adjective'], 'neg'),
 (['메리/Noun', '는/Josa', '이쁘다/Adjective', '고양이/Noun', '야/Josa'], 'pos'),
 (['난/Noun', '마치/Noun', '고/Josa', '메리/Noun', '랑/Josa', '놀다/Verb'], 'pos')]
# Flatten every training document's morpheme tokens into one list.
# Duplicates are kept; only membership matters for the binary features.
tokens = [token for doc, _label in train_docs for token in doc]

tokens[ : 10]
['메리/Noun',
 '가/Josa',
 '좋다/Adjective',
 '고양이/Noun',
 '도/Josa',
 '좋다/Adjective',
 '난/Noun',
 '수업/Noun',
 '이/Josa',
 '지루하다/Adjective']
def term_exists(doc, vocab=None):
    """Binary feature dict for *doc*: maps 'exists(<word>)' to whether
    <word> occurs in *doc*, for every word in *vocab*.

    *vocab* defaults to the module-level training ``tokens`` list, which
    the original hard-coded; passing it explicitly keeps the function
    reusable and testable.  ``set(doc)`` is built once instead of once
    per vocabulary word as in the original comprehension.
    """
    if vocab is None:
        vocab = tokens
    doc_set = set(doc)  # O(1) membership tests
    return {'exists({})'.format(word): (word in doc_set) for word in vocab}
# Build (feature_dict, label) pairs for training from the morpheme docs.
train_xy = [ ( term_exists(d),  c)  for  d,c in train_docs ]

train_xy[0]
({'exists(가/Josa)': True,
  'exists(고/Josa)': False,
  'exists(고양이/Noun)': False,
  'exists(난/Noun)': False,
  'exists(놀다/Verb)': False,
  'exists(는/Josa)': False,
  'exists(도/Josa)': False,
  'exists(랑/Josa)': False,
  'exists(마치/Noun)': False,
  'exists(메리/Noun)': True,
  'exists(수업/Noun)': False,
  'exists(야/Josa)': False,
  'exists(이/Josa)': False,
  'exists(이쁘다/Adjective)': False,
  'exists(좋다/Adjective)': True,
  'exists(지루하다/Adjective)': False},
 'pos')
# Fit Naive Bayes on morpheme-level 'exists(...)' features; stemming has
# merged variants (메리가/메리는/메리랑 all contribute to exists(메리/Noun)).
classifier = nltk.NaiveBayesClassifier.train(train_xy)
classifier.show_most_informative_features()
Most Informative Features
          exists(난/Noun) = True              neg : pos    =      2.5 : 1.0
         exists(메리/Noun) = False             neg : pos    =      2.5 : 1.0
        exists(고양이/Noun) = False             neg : pos    =      1.5 : 1.0
    exists(좋다/Adjective) = False             neg : pos    =      1.5 : 1.0
          exists(는/Josa) = False             neg : pos    =      1.1 : 1.0
         exists(놀다/Verb) = False             neg : pos    =      1.1 : 1.0
          exists(야/Josa) = False             neg : pos    =      1.1 : 1.0
          exists(고/Josa) = False             neg : pos    =      1.1 : 1.0
          exists(도/Josa) = False             neg : pos    =      1.1 : 1.0
          exists(랑/Josa) = False             neg : pos    =      1.1 : 1.0
# Single-element list (the inner parentheses in the original were redundant).
test_sentence = ['난 수업이 마치면 메리랑 놀거야']

# Raw POS tagging WITHOUT norm/stem, shown to contrast with tokenize(),
# which normalizes and stems ('놀거' vs '놀다').
test_docs = pos_tagger.pos(test_sentence[0])

test_docs
[('난', 'Noun'),
 ('수업', 'Noun'),
 ('이', 'Josa'),
 ('마치', 'Noun'),
 ('면', 'Josa'),
 ('메리', 'Noun'),
 ('랑', 'Josa'),
 ('놀거', 'Verb'),
 ('야', 'Eomi')]
# Re-tag via tokenize() (norm=True, stem=True) so the test tokens use the
# same 'morpheme/TAG' format as the training vocabulary.
test_docs = tokenize( test_sentence[0] )

test_docs
['난/Noun',
 '수업/Noun',
 '이/Josa',
 '마치/Noun',
 '면/Josa',
 '메리/Noun',
 '랑/Josa',
 '놀다/Verb']
# BUG FIX: the original built {token: (token in tokens)} -- keyed on BARE
# test tokens and valued by membership in the TRAINING token list.  The
# classifier was trained on 'exists(<token>)' keys, and NLTK silently
# ignores feature names it never saw, so the prediction came from the class
# priors rather than the sentence.  Build the features exactly as at
# training time: for every training token, does it occur in the test doc?
test_doc_set = set(test_docs)
test_sent_features = {
    'exists({})'.format(word): (word in test_doc_set)
    for word in tokens
}

test_sent_features
{'난/Noun': True,
 '놀다/Verb': True,
 '랑/Josa': True,
 '마치/Noun': True,
 '메리/Noun': True,
 '면/Josa': False,
 '수업/Noun': True,
 '이/Josa': True}
# Predict the sentiment label using the morpheme-level features.
classifier.classify( test_sent_features )
'pos'