실습_1. 크롤링의 기초

Updated:

네이버 뉴스 헤드라인 리스트 출력

import requests
from bs4 import BeautifulSoup

def crawling(soup):
    """Return the text of every link inside the ``div.list_issue`` element.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        A list with the text of each ``<a>`` tag found in the first
        ``<div class="list_issue">``, or an empty list when the document
        contains no such element.
    """
    container = soup.find("div", class_="list_issue")
    if container is None:
        # find() yields None when nothing matches; report "nothing found"
        # instead of failing with AttributeError on the chained call.
        return []

    # Keep only the text of each anchor tag.
    return [anchor.get_text() for anchor in container.find_all("a")]
    
def main():
    """Download the Naver front page and print the result of crawling()."""
    url = "http://www.naver.com"
    # Bound the wait so a stalled connection cannot hang the script forever.
    req = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    print(crawling(soup))


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

연합뉴스 속보 기사 제목 추출

import requests
from bs4 import BeautifulSoup

def crawling(soup):
    """Print and collect the text of every link inside ``div.list_body``.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        A list with the text of each ``<a>`` tag found in the first
        ``<div class="list_body">``, or an empty list when the document
        contains no such element. Each text is also printed as it is
        collected.
    """
    body = soup.find("div", class_="list_body")
    if body is None:
        # find() yields None when nothing matches; avoid the AttributeError
        # the chained find_all() call would otherwise raise.
        return []

    result = []
    for anchor in body.find_all("a"):
        title = anchor.get_text()  # extract once, reuse for print and result
        print(title)
        result.append(title)

    return result
    
def main():
    """Download the Naver News list page below and print the result of crawling()."""
    url = "https://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&sid1=001&sid2=140&oid=001&isYeonhapFlash=Y"
    # Bound the wait so a stalled connection cannot hang the script forever.
    req = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    print(crawling(soup))


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

벅스(Bugs) 실시간 음원 차트 순위별 곡 제목 추출


import requests
from bs4 import BeautifulSoup

def crawling(soup):
    """Collect the link text from every ``<p class="title">`` element.

    Prints how many matching ``<p>`` elements were found, then returns a
    list holding the text of the first ``<a>`` tag inside each of them.
    """
    title_tags = soup.find_all("p", class_="title")

    # Report the number of matched elements before extracting their text.
    print(len(title_tags))

    return [tag.find("a").get_text() for tag in title_tags]
    
def main():
    """Download the Bugs chart page and print the result of crawling()."""
    url = "https://music.bugs.co.kr/chart"
    # Bound the wait so a stalled connection cannot hang the script forever.
    req = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    print(crawling(soup))


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

영화 후기 수집하기

import requests
from bs4 import BeautifulSoup

def crawling(soup):
    """Return the text of every ``<strong>`` tag inside ``ul.rvw_list_area``.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        A list with the text of each ``<strong>`` tag found in the first
        ``<ul class="rvw_list_area">``, or an empty list when the document
        contains no such element.
    """
    review_list = soup.find("ul", class_="rvw_list_area")
    if review_list is None:
        # find() yields None when nothing matches; avoid the AttributeError
        # the chained find_all() call would otherwise raise.
        return []

    return [strong.get_text() for strong in review_list.find_all("strong")]

def main():
    """Download the Naver Movie review page below and print the result of crawling()."""
    url = "https://movie.naver.com/movie/bi/mi/review.nhn?code=168058#"
    # Bound the wait so a stalled connection cannot hang the script forever.
    req = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    print(crawling(soup))


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

커뮤니티에서 댓글 수집하기

import requests
from bs4 import BeautifulSoup

def crawling(soup):
    """Collect the comment text from every ``<dd class="usertxt">`` element.

    For each matching ``<dd>``, the text of its first ``<span>`` is taken
    and all newline and tab characters are removed. Entries without a
    ``<span>`` are skipped.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find_all`` API.

    Returns:
        The list of cleaned comment strings. The list is also printed
        before it is returned.
    """
    result = []

    for entry in soup.find_all("dd", class_="usertxt"):
        span = entry.find("span")
        if span is None:
            # find() yields None when nothing matches; skip this entry
            # rather than aborting the whole crawl with AttributeError.
            continue
        result.append(span.get_text().replace("\n", "").replace("\t", ""))

    print(result)

    return result
def main():
    """Download the Nate Pann post below and print the result of crawling()."""
    url = "https://pann.nate.com/talk/350939697"
    # Bound the wait so a stalled connection cannot hang the script forever.
    req = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    print(crawling(soup))


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

Leave a comment