Practice 4. Word Cloud Project
- Printing a word cloud from the contents of multiple articles
main.py
- Crawling
import requests
from bs4 import BeautifulSoup
from wc import create_word_cloud

def crawling(soup):
    # Keep only the bare text nodes of the article body: tags have a .name,
    # plain text nodes report None.
    result = ""
    for children in soup.find("div", class_="_article_body_contents").children:
        if children.name is None:
            result += children
    # Trim the script-comment markers that bracket the body text.
    start = result.find("// TV플레이어")
    result = result[start + len("// TV플레이어") + 1:]
    end = result.find("// 본문 내용")
    result = result[:end]
    return result.replace("\n", "").replace("\t", "")

def get_href(soup):
    # Collect the link of every article in the section's headline cluster.
    result = []
    cluster_body = soup.find("div", class_="cluster_body")
    for cluster_text in cluster_body.find_all("div", class_="cluster_text"):
        result.append(cluster_text.find("a")["href"])
    return result

def get_request(section):
    # The sid1 query parameter selects the news section.
    url = "https://news.naver.com/main/main.nhn"
    section_dict = {"정치": 100,
                    "경제": 101,
                    "사회": 102,
                    "생활": 103,
                    "세계": 104,
                    "과학": 105}
    return requests.get(url, params={"sid1": section_dict[section]})

def main():
    result = []
    # Enter a section name.
    section = input('Enter one of "정치", "경제", "사회", "생활", "세계", "과학".\n > ')
    req = get_request(section)
    soup = BeautifulSoup(req.text, "html.parser")
    list_href = get_href(soup)
    for href in list_href:
        href_req = requests.get(href)
        href_soup = BeautifulSoup(href_req.text, "html.parser")
        result.append(crawling(href_soup))
    text = " ".join(result)
    create_word_cloud(text)

if __name__ == "__main__":
    main()
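crawling relies on a BeautifulSoup detail: Tag objects carry a .name, while bare text nodes (NavigableStrings) report None, so the "children.name is None" test keeps only the text between tags. A minimal sketch of that behavior on a made-up fragment:

from bs4 import BeautifulSoup

# Tags have a .name; bare text nodes report None, which is what
# crawling() uses to keep only the text between tags.
soup = BeautifulSoup("<div>hello <b>bold</b> world</div>", "html.parser")
for child in soup.find("div").children:
    print(child.name, repr(str(child)))
# None 'hello '
# b '<b>bold</b>'
# None ' world'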
wc.py
- Word cloud
from wordcloud import WordCloud
from count import count_word_freq
from elice_utils import EliceUtils

elice_utils = EliceUtils()

def create_word_cloud(data):
    # Count word frequencies, render the cloud with a Korean font,
    # save it, and send the image to the Elice viewer.
    counter = count_word_freq(data)
    cloud = WordCloud(font_path='NanumBarunGothic.ttf', background_color='white')
    cloud.fit_words(counter)
    cloud.to_file('cloud.png')
    elice_utils.send_image('cloud.png')

if __name__ == "__main__":
    # Sample text for a standalone test run.
    create_word_cloud("뉴스 기사 본문 샘플 텍스트")
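fit_words accepts any word-to-frequency mapping, so a cloud can also be built straight from a Counter without going through count_word_freq. A minimal sketch with made-up frequencies:

from collections import Counter
from wordcloud import WordCloud

# Made-up frequencies, only to illustrate the input format of fit_words.
freq = Counter({"뉴스": 10, "경제": 7, "정치": 5})
cloud = WordCloud(font_path='NanumBarunGothic.ttf', background_color='white')
cloud.fit_words(freq)
cloud.to_file('sample_cloud.png')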
count.py
- Word frequency counting
from collections import Counter
from string import punctuation
import mecab

mecab = mecab.MeCab()

def count_word_freq(data):
    # Lowercase and strip punctuation before tokenizing.
    _data = data.lower()
    for p in punctuation:
        _data = _data.replace(p, "")
    # Extract nouns with the MeCab morphological analyzer.
    _data = mecab.nouns(_data)
    counter = Counter(_data)
    return counter
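A hypothetical standalone run of count_word_freq; the exact tokens and counts depend on the installed MeCab dictionary:

from count import count_word_freq

counter = count_word_freq("경제 전문가들이 경제 성장률 전망을 발표했다.")
# Output depends on the MeCab dictionary,
# e.g. [('경제', 2), ('전문가', 1), ('성장률', 1)]
print(counter.most_common(3))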
Building a word cloud from more articles
Only get_href changes here: instead of taking just the headline cluster's own links, it follows the cluster's topic link and collects every article link listed on that topic page (see the traversal sketch after the code).
main.py
import requests
from bs4 import BeautifulSoup
from wc import create_word_cloud

def crawling(soup):
    # Same article-body extraction as before: keep only bare text nodes,
    # then trim the script-comment markers around the body text.
    result = ""
    for children in soup.find("div", class_="_article_body_contents").children:
        if children.name is None:
            result += children
    start = result.find("// TV플레이어")
    result = result[start + len("// TV플레이어") + 1:]
    end = result.find("// 본문 내용")
    result = result[:end]
    return result.replace("\n", "").replace("\t", "")

def get_href(soup):
    # Follow the headline cluster's topic link, then collect every article
    # link listed on that topic page.
    result = []
    cluster_head = soup.find("h2", class_="cluster_head_topic")
    href = cluster_head.find("a")["href"]
    url = "https://news.naver.com" + href
    req = requests.get(url)
    new_soup = BeautifulSoup(req.text, "html.parser")
    main_content = new_soup.find("div", id="main_content")
    for ul in main_content.find_all("ul"):
        for a in ul.find_all("a"):
            result.append(a["href"])
    return result

def get_request(section):
    # The sid1 query parameter selects the news section.
    url = "https://news.naver.com/main/main.nhn"
    section_dict = {"정치": 100,
                    "경제": 101,
                    "사회": 102,
                    "생활": 103,
                    "세계": 104,
                    "과학": 105}
    return requests.get(url, params={"sid1": section_dict[section]})

def main():
    result = []
    # Enter a section name.
    section = input('Enter one of "정치", "경제", "사회", "생활", "세계", "과학".\n > ')
    req = get_request(section)
    soup = BeautifulSoup(req.text, "html.parser")
    list_href = get_href(soup)
    for href in list_href:
        href_req = requests.get(href)
        href_soup = BeautifulSoup(href_req.text, "html.parser")
        result.append(crawling(href_soup))
    text = " ".join(result)
    create_word_cloud(text)

if __name__ == "__main__":
    main()
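The rewritten get_href is a nested find/find_all traversal. A minimal sketch of the same pattern on a made-up HTML fragment (not the live Naver markup):

from bs4 import BeautifulSoup

html = ('<div id="main_content">'
        '<ul><li><a href="http://a">A</a></li></ul>'
        '<ul><li><a href="http://b">B</a></li></ul>'
        '</div>')
soup = BeautifulSoup(html, "html.parser")
links = [a["href"]
         for ul in soup.find("div", id="main_content").find_all("ul")
         for a in ul.find_all("a")]
print(links)  # ['http://a', 'http://b']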