Word_Tokenize

Updated: July 16, 2020

!pip install nltk 
!pip install konlpy 
!pip install kss 

import nltk

# Fetch the 'punkt' resource through NLTK's downloader before tokenizing.
nltk.download('punkt')

다양한 문장 처리 tokenize

from nltk.tokenize import word_tokenize

# Sample English text that the tokenizer examples in this file operate on.
# The triple-quoted literal begins and ends with a newline character.
text = """
I'm only one call away.
I'll be there to save the day.
Superman got nothing on me.
I'm only one call away.
Call me, baby, if you need a friend.
I just wanna give you love.
Come on, come on, come on.
"""

# Word-level tokenization with NLTK's word_tokenize.
print(word_tokenize(text))

['I', "'m", 'only', 'one', 'call', 'away', '.', 'I', "'ll", 'be', 'there', 'to', 'save', 'the', 'day', '.', 'Superman', 'got', 'nothing', 'on', 'me', '.', 'I', "'m", 'only', 'one', 'call', 'away', '.', 'Call', 'me', ',', 'baby', ',', 'if', 'you', 'need', 'a', 'friend', '.', 'I', 'just', 'wan', 'na', 'give', 'you', 'love', '.', 'Come', 'on', ',', 'come', 'on', ',', 'come', 'on', '.']

from nltk.tokenize import WordPunctTokenizer

print(WordPunctTokenizer().tokenize(text))

# The apostrophe (') is included as a token of its own

['I', "'", 'm', 'only', 'one', 'call', 'away', '.', 'I', "'", 'll', 'be', 'there', 'to', 'save', 'the', 'day', '.', 'Superman', 'got', 'nothing', 'on', 'me', '.', 'I', "'", 'm', 'only', 'one', 'call', 'away', '.', 'Call', 'me', ',', 'baby', ',', 'if', 'you', 'need', 'a', 'friend', '.', 'I', 'just', 'wanna', 'give', 'you', 'love', '.', 'Come', 'on', ',', 'come', 'on', ',', 'come', 'on', '.']

from nltk.tokenize import TreebankWordTokenizer

print(TreebankWordTokenizer().tokenize(text))

# Splits into words; tokens may keep (') and (.) attached, e.g. 'm and away.

['I', "'m", 'only', 'one', 'call', 'away.', 'I', "'ll", 'be', 'there', 'to', 'save', 'the', 'day.', 'Superman', 'got', 'nothing', 'on', 'me.', 'I', "'m", 'only', 'one', 'call', 'away.', 'Call', 'me', ',', 'baby', ',', 'if', 'you', 'need', 'a', 'friend.', 'I', 'just', 'wan', 'na', 'give', 'you', 'love.', 'Come', 'on', ',', 'come', 'on', ',', 'come', 'on', '.']

from nltk.tokenize import RegexpTokenizer

# Tokenize with a regular expression: every run of word characters (\w+)
# becomes a token. The pattern is written as a raw string so the backslash
# reaches the regex engine instead of being parsed as a (invalid) string escape.
print(RegexpTokenizer(r'\w+').tokenize(text))

# Tokens are split using a regular expression

['I', 'm', 'only', 'one', 'call', 'away', 'I', 'll', 'be', 'there', 'to', 'save', 'the', 'day', 'Superman', 'got', 'nothing', 'on', 'me', 'I', 'm', 'only', 'one', 'call', 'away', 'Call', 'me', 'baby', 'if', 'you', 'need', 'a', 'friend', 'I', 'just', 'wanna', 'give', 'you', 'love', 'Come', 'on', 'come', 'on', 'come', 'on']

from nltk.tokenize import sent_tokenize

print(sent_tokenize(text))

# Splits the text into sentences, using the period (.) as the boundary

["\nI'm only one call away.", "I'll be there to save the day.", 'Superman got nothing on me.', "I'm only one call away.", 'Call me, baby, if you need a friend.', 'I just wanna give you love.', 'Come on, come on, come on.']

from konlpy.tag import Okt, Kkma
import kss
# Tokenization can be applied to Korean text as well
ktext = '안녕하세요? 저는 임준형이라고 합니다'

# Split the Korean sentence into morphemes with the Okt analyzer
print(Okt().morphs(ktext))

['안녕하세요', '?', '저', '는', '임준', '형', '이라고', '합니다']

# Same sentence split into morphemes with the Kkma analyzer
print(Kkma().morphs(ktext))

['안녕', '하', '세요', '?', '저', '는', '임', '준형', '이', '라고', '하', 'ㅂ니다']

from nltk.stem import PorterStemmer

# Tokenize the sample text; `words` is reused by the later examples.
words = word_tokenize(text)

# Build the stemmer once and reuse it rather than constructing a new
# PorterStemmer for every token.
stemmer = PorterStemmer()
for word in words:
    # Print the stems on a single line, separated by spaces.
    print(stemmer.stem(word), end=' ')

I 'm onli one call away . I 'll be there to save the day . superman got noth on me . I 'm onli one call away . call me , babi , if you need a friend . I just wan na give you love . come on , come on , come on .

nltk.download('stopwords')
# Fetch the 'stopwords' resource through NLTK's downloader

True

from nltk.corpus import stopwords

# Load NLTK's English stopword list and display it.
sw = stopwords.words('english')
print(sw)

# Stopwords: a ready-made collection of words that carry little meaning in a sentence

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Show the token list (from word_tokenize(text)) before any filtering
print(words)

중요하지 않은 단어 제거하기

# My version: drop every token that matches a stopword, ignoring case.
# The result is stored in `filtered_words` rather than `str`, because binding
# a list to `str` hides the builtin and breaks any later str(...) call.
# Upper-casing the stopwords once into a set gives one O(1) lookup per token
# instead of a scan over the whole stopword list.
sw_upper = {stop_word.upper() for stop_word in sw}
filtered_words = [word for word in words if word.upper() not in sw_upper]
print(filtered_words)

["'m", 'one', 'call', 'away', '.', "'ll", 'save', 'day', '.', 'Superman', 'got', 'nothing', '.', "'m", 'one', 'call', 'away', '.', 'Call', ',', 'baby', ',', 'need', 'friend', '.', 'wan', 'na', 'give', 'love', '.', 'Come', ',', 'come', ',', 'come', '.']

# Teacher's version
# Here the filter list holds only the comma and the period, so nothing else
# is removed from the tokens.
sw = [',', '.']
sw_removed = [token for token in words if token.lower() not in sw]

print(sw_removed)

['I', "'m", 'only', 'one', 'call', 'away', 'I', "'ll", 'be', 'there', 'to', 'save', 'the', 'day', 'Superman', 'got', 'nothing', 'on', 'me', 'I', "'m", 'only', 'one', 'call', 'away', 'Call', 'me', 'baby', 'if', 'you', 'need', 'a', 'friend', 'I', 'just', 'wan', 'na', 'give', 'you', 'love', 'Come', 'on', 'come', 'on', 'come', 'on']

단어를 빈도수 기준으로 정렬

from collections import Counter

# Count how often each remaining token occurs (despite its name, count_list is a Counter)
count_list = Counter(sw_removed)
print(count_list)

Counter({'I': 4, 'on': 4, "'m": 2, 'only': 2, 'one': 2, 'call': 2, 'away': 2, 'me': 2, 'you': 2, 'come': 2, "'ll": 1, 'be': 1, 'there': 1, 'to': 1, 'save': 1, 'the': 1, 'day': 1, 'Superman': 1, 'got': 1, 'nothing': 1, 'Call': 1, 'baby': 1, 'if': 1, 'need': 1, 'a': 1, 'friend': 1, 'just': 1, 'wan': 1, 'na': 1, 'give': 1, 'love': 1, 'Come': 1})

# Print the 10 most frequent words as (word, count) pairs
common_cl = count_list.most_common(10)
print(common_cl)

[('I', 4), ('on', 4), ("'m", 2), ('only', 2), ('one', 2), ('call', 2), ('away', 2), ('me', 2), ('you', 2), ('come', 2)]

우선순위 주기

# My version
# Map each frequent word to its rank (0 = first entry of common_cl).
# enumerate() walks whatever common_cl actually contains, so a text with
# fewer than 10 distinct words no longer raises IndexError the way a fixed
# range(10) would.
common_cl_dict = {word: rank for rank, (word, _count) in enumerate(common_cl)}

print(common_cl_dict)

{'I': 0, 'on': 1, "'m": 2, 'only': 3, 'one': 4, 'call': 5, 'away': 6, 'me': 7, 'you': 8, 'come': 9}

# Teacher's version

# Pair every word of common_cl with its position in that list; the counts
# stored alongside the words are not needed for the ranking.
ranked_words = [key for key, _count in common_cl]
common_cl_dict = dict(zip(ranked_words, range(len(ranked_words))))

print(common_cl_dict)

{'I': 0, 'on': 1, "'m": 2, 'only': 3, 'one': 4, 'call': 5, 'away': 6, 'me': 7, 'you': 8, 'come': 9}

One-Hot Vector

# Teacher's version
def _one_hot(position, size):
    """Return a list of `size` zeros with a single 1 at index `position`."""
    vector = [0] * size
    vector[position] = 1
    return vector


# One vector per word in common_cl_dict, each as long as the dictionary.
vocab_size = len(common_cl_dict)
oh_vector_list = [_one_hot(rank, vocab_size) for rank in common_cl_dict.values()]

print(oh_vector_list)

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

워드 클라우드

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Render a word cloud on a white background from the frequencies in count_list.
# NOTE(review): `wordcloud` is not among the packages pip-installed at the top
# of this page — confirm it is available in the environment.
word_wc = WordCloud(background_color = 'white')
plt.imshow(word_wc.generate_from_frequencies(count_list))
plt.show()

Word_Cloud

Share on

Twitter Facebook LinkedIn

Lim Junhyeong

Word_Tokenize

다양한 문장 처리 tokenize

중요하지 않은 단어 제거하기

단어를 빈도수 기준으로 정렬

우선순위 주기

One-Hot Vector

워드 클라우드

Share on

Leave a comment

You may also enjoy

mariaDB 환경설정 계정 생성 및 권한 부여

프로젝트에 필요한 Git

여러개의 원격 저장소(git)

벽 부수고 이동하기 4_16946