실습_1. 크롤링의 기초
Updated:
네이버 뉴스 헤드라인 목록 출력
import requests
from bs4 import BeautifulSoup
def crawling(soup):
    """Return the text of every link inside the ``div.list_issue`` container.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        A list holding the ``get_text()`` value of each ``<a>`` tag found
        under the first ``<div class="list_issue">``, in the order
        ``find_all`` yields them. An empty list when the document has no
        such container.
    """
    container = soup.find("div", class_="list_issue")
    if container is None:
        # ``find`` returns None when nothing matches (e.g. the page layout
        # changed); report "nothing found" instead of raising
        # AttributeError on the chained ``find_all`` call.
        return []
    # Take every <a> tag under the container and keep only its text.
    return [anchor.get_text() for anchor in container.find_all("a")]
def main():
    """Download the Naver front page and print the link texts from ``crawling``.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    url = "http://www.naver.com"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    print(crawling(soup))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
연합뉴스 속보 기사 제목 추출
import requests
from bs4 import BeautifulSoup
def crawling(soup):
    """Print and collect the text of every link inside ``div.list_body``.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        A list holding the ``get_text()`` value of each ``<a>`` tag found
        under the first ``<div class="list_body">``. Each text is also
        printed as it is collected. An empty list (and no output) when the
        document has no such container.
    """
    container = soup.find("div", class_="list_body")
    if container is None:
        # ``find`` returns None when nothing matches; avoid the
        # AttributeError the chained ``find_all`` call would raise.
        return []
    result = []
    for anchor in container.find_all("a"):
        text = anchor.get_text()
        print(text)
        result.append(text)
    return result
def main():
    """Download the Naver news list page and print the titles from ``crawling``.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    url = "https://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&sid1=001&sid2=140&oid=001&isYeonhapFlash=Y"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    print(crawling(soup))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
bugs 실시간 음원차트 순위 추출
import requests
from bs4 import BeautifulSoup
def crawling(soup):
    """Collect the link text of every ``<p class="title">`` element.

    Prints the number of matching ``<p>`` elements before collecting.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find_all`` API.

    Returns:
        A list holding the ``get_text()`` value of the first ``<a>`` tag in
        each ``<p class="title">``. Elements that contain no ``<a>`` tag
        are skipped.
    """
    titles = soup.find_all("p", class_="title")
    print(len(titles))
    result = []
    for title in titles:
        anchor = title.find("a")
        if anchor is None:
            # A title without a link would make ``get_text`` fail on None;
            # skip it rather than abort the whole extraction.
            continue
        result.append(anchor.get_text())
    return result
def main():
    """Download the Bugs chart page and print the titles from ``crawling``.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    url = "https://music.bugs.co.kr/chart"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    print(crawling(soup))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
영화 후기 수집하기
import requests
from bs4 import BeautifulSoup
def crawling(soup):
    """Return the text of every ``<strong>`` tag inside ``ul.rvw_list_area``.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        A list holding the ``get_text()`` value of each ``<strong>`` tag
        found under the first ``<ul class="rvw_list_area">``. An empty list
        when the document has no such container.
    """
    review_list = soup.find("ul", class_="rvw_list_area")
    if review_list is None:
        # ``find`` returns None when nothing matches; avoid the
        # AttributeError the chained ``find_all`` call would raise.
        return []
    return [strong.get_text() for strong in review_list.find_all("strong")]
def main():
    """Download the Naver movie review page and print the texts from ``crawling``.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    url = "https://movie.naver.com/movie/bi/mi/review.nhn?code=168058#"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    print(crawling(soup))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
커뮤니티에서 댓글 수집하기
import requests
from bs4 import BeautifulSoup
def crawling(soup):
    """Collect the comment text of every ``<dd class="usertxt">`` element.

    Prints the collected list before returning it.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find_all`` API.

    Returns:
        A list holding, for each ``<dd class="usertxt">``, the text of its
        first ``<span>`` with newline and tab characters removed. Elements
        that contain no ``<span>`` are skipped.
    """
    comments = soup.find_all("dd", class_="usertxt")
    result = []
    for comment in comments:
        span = comment.find("span")
        if span is None:
            # A comment without a <span> would make ``get_text`` fail on
            # None; skip it rather than abort the whole extraction.
            continue
        text = span.get_text()
        result.append(text.replace("\n", "").replace("\t", ""))
    print(result)
    return result
def main():
    """Download the Nate Pann thread page and print the comments from ``crawling``.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    url = "https://pann.nate.com/talk/350939697"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    print(crawling(soup))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Leave a comment