Lab 4. Word Cloud Project

Updated:

  • Generate a word cloud from the contents of multiple articles

The project is split across three files: main.py (crawling), wc.py (rendering the cloud), and count.py (counting word frequencies).

main.py

  • Crawling
import requests
from bs4 import BeautifulSoup
from wc import create_word_cloud

def crawling(soup) :
    result = ""
    # Only plain-text nodes (NavigableString) have name == None;
    # tag children of the article body are skipped.
    for child in soup.find("div", class_="_article_body_contents").children :
        if child.name is None :
            result += child

    # The body text sits between two script comments; slice them away.
    start = result.find("// TV플레이어")
    result = result[start + len("// TV플레이어") + 1:]

    end = result.find("// 본문 내용")
    result = result[:end]

    return result.replace("\n", "").replace("\t", "")
    
def get_href(soup) :
    result = []

    # Each "cluster_text" block on the section page links to one article.
    cluster_body = soup.find("div", class_="cluster_body")

    for cluster_text in cluster_body.find_all("div", class_="cluster_text") :
        result.append(cluster_text.find("a")["href"])

    return result

def get_request(section) :
    url = "https://news.naver.com/main/main.nhn"
    # Naver encodes each section as the sid1 query parameter,
    # so e.g. "정치" is fetched as main.nhn?sid1=100.
    section_dict = { "정치" : 100,
                     "경제" : 101,
                     "사회" : 102,
                     "생활" : 103,
                     "세계" : 104,
                     "과학" : 105 }
    return requests.get(url, params={"sid1": section_dict[section]})

def main() :
    list_href = []
    result = []
    
    # Prompt for one of the section names (the keys of section_dict).
    section = input('"정치", "경제", "사회", "생활", "세계", "과학" 중 하나를 입력하세요.\n  > ')
    
    req = get_request(section)
    soup = BeautifulSoup(req.text, "html.parser")
    
    list_href = get_href(soup)
    
    for href in list_href :
        href_req = requests.get(href)
        href_soup = BeautifulSoup(href_req.text, "html.parser")
        result.append(crawling(href_soup))
    
    text = " ".join(result)
    create_word_cloud(text)

if __name__ == "__main__" :
    main()
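
The slicing in crawling() trims everything outside the two script-comment markers. A toy, offline sketch of that logic on a made-up snippet:

text = "header junk // TV플레이어\n실제 기사 본문입니다.\n// 본문 내용 footer junk"
start = text.find("// TV플레이어")
text = text[start + len("// TV플레이어") + 1:]  # drop the marker and the newline after it
end = text.find("// 본문 내용")
text = text[:end]
print(text.replace("\n", "").replace("\t", ""))  # 실제 기사 본문입니다.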

wc.py

  • Word cloud

from wordcloud import WordCloud
from count import count_word_freq
from elice_utils import EliceUtils
elice_utils = EliceUtils()

def create_word_cloud(data) :
    counter = count_word_freq(data)
    
    cloud = WordCloud(font_path='NanumBarunGothic.ttf', background_color='white')
    cloud.fit_words(counter)
    cloud.to_file('cloud.png')
    elice_utils.send_image('cloud.png')
    
if __name__ == "__main__" :
    # Standalone smoke test with a short sample text.
    create_word_cloud("파이썬 파이썬 파이썬 뉴스 뉴스 워드 클라우드")
    
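fit_words() accepts any word-to-frequency mapping, so the renderer can be exercised without crawling at all. A minimal sketch with hand-made counts (the words and numbers below are made up):

from collections import Counter
from wordcloud import WordCloud

counter = Counter({"뉴스": 10, "정치": 7, "경제": 5, "사회": 3})
cloud = WordCloud(font_path='NanumBarunGothic.ttf', background_color='white')
cloud.fit_words(counter)     # layout is computed directly from the counts
cloud.to_file('sample.png')  # write the rendered image to disk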

count.py

  • Counting word frequencies

from collections import Counter
from string import punctuation
import mecab
# python-mecab-ko exposes a MeCab class whose nouns() method
# extracts the nouns from a Korean sentence.
mecab = mecab.MeCab()

def count_word_freq(data) :
    _data = data.lower()
    
    for p in punctuation :
        _data = _data.replace(p, "")
    
    # Keep only the nouns; everything else is discarded
    _data = mecab.nouns(_data)
    
    counter = Counter(_data)
    
    return counter
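
A small self-test that could be appended to count.py; the exact nouns extracted depend on the MeCab dictionary installed, so the output below is only indicative:

if __name__ == "__main__" :
    sample = "파이썬으로 뉴스 기사를 모아 뉴스 단어의 빈도를 셉니다."
    print(count_word_freq(sample))
    # e.g. Counter({'뉴스': 2, '파이썬': 1, '기사': 1, '단어': 1, '빈도': 1})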


Clouding more articles

The section front page shows only a few featured links per news cluster. To cloud more articles, this version of get_href follows the cluster headline's link to its topic page and collects every article link listed there.

main.py

import requests
from bs4 import BeautifulSoup
from wc import create_word_cloud

def crawling(soup) :
    result = ""
    # Only plain-text nodes (NavigableString) have name == None;
    # tag children of the article body are skipped.
    for child in soup.find("div", class_="_article_body_contents").children :
        if child.name is None :
            result += child

    # The body text sits between two script comments; slice them away.
    start = result.find("// TV플레이어")
    result = result[start + len("// TV플레이어") + 1:]

    end = result.find("// 본문 내용")
    result = result[:end]

    return result.replace("\n", "").replace("\t", "")
    

def get_href(soup) :
    result = []

    # Follow the cluster headline link to its topic page, which lists
    # every article in the cluster rather than just the featured few.
    cluster_head = soup.find("h2", class_="cluster_head_topic")
    href = cluster_head.find("a")["href"]

    url = "https://news.naver.com" + href
    req = requests.get(url)
    new_soup = BeautifulSoup(req.text, "html.parser")

    main_content = new_soup.find("div", id="main_content")

    # Collect every article link from the topic page's lists.
    for ul in main_content.find_all("ul") :
        for a in ul.find_all("a") :
            result.append(a["href"])

    return result


def get_request(section) :
    url = "https://news.naver.com/main/main.nhn"
    section_dict = { "정치" : 100,
                     "경제" : 101,
                     "사회" : 102,
                     "생활" : 103,
                     "세계" : 104,
                     "과학" : 105 }
    return requests.get(url, params={"sid1": section_dict[section]})


def main() :
    list_href = []
    result = []
    
    # Prompt for one of the section names (the keys of section_dict).
    section = input('"정치", "경제", "사회", "생활", "세계", "과학" 중 하나를 입력하세요.\n  > ')
    
    req = get_request(section)
    soup = BeautifulSoup(req.text, "html.parser")
    
    list_href = get_href(soup)
    
    for href in list_href :
        href_req = requests.get(href)
        href_soup = BeautifulSoup(href_req.text, "html.parser")
        result.append(crawling(href_soup))
    
    text = " ".join(result)
    create_word_cloud(text)
    


if __name__ == "__main__" :
    main()
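
Some links gathered from the topic page may not point to a standard article layout; in that case soup.find(...) returns None inside crawling() and raises AttributeError. A hedged, more defensive variant of the download loop in main(), assuming the crawling function above:

def crawl_articles(hrefs):
    # Collect article bodies, skipping pages without the expected layout.
    texts = []
    for href in hrefs:
        href_soup = BeautifulSoup(requests.get(href).text, "html.parser")
        if href_soup.find("div", class_="_article_body_contents") is None:
            continue  # not a standard article page; skip instead of crashing
        texts.append(crawling(href_soup))
    return texts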
