Topic Modeling

(1) 데이터 수집

"코로나19" 관련 네이버 뉴스 기사를 페이지당 10개씩 101페이지를 최신순으로 크롤링하여 총 1010개의 데이터를 수집하였다.

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from konlpy.tag import Komoran
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from pyclustering.cluster import kmedoids
import numpy as np
import re
import requests
from gensim import corpora, models
import gensim

'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
< naver 뉴스 검색시 리스트 크롤링하는 프로그램 > _select사용
- 크롤링 해오는 것 : 링크,제목,신문사,날짜,내용요약본
- 내용요약본  -> 정제 작업 필요
- 리스트 -> 딕셔너리 -> df -> xlsx로 저장 
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# Directory prefix that is prepended to the output Excel file name.
RESULT_PATH = './'

# Module-level accumulators filled while crawling. They are later used as the
# columns of one DataFrame, so entry i of each list is meant to describe the
# same article.
contents_text = []   # cleaned article summaries
source_text = []     # press / news source names
link_text = []       # article URLs
title_text = []      # article headlines

# Placeholder for the column dictionary.
result = {}

def contents_cleansing(contents):
    """Strip markup from one search-result entry and store the summary text.

    The entry is converted with ``str()`` and three regex removals are
    applied in order, trimming surrounding whitespace after each one:

    1. everything from the opening ``<dl>`` up to the ``<dd>`` that starts
       the summary,
    2. the trailing related-articles list (``ul.relation_lst``),
    3. any remaining tags.

    The cleaned text is appended to the module-level ``contents_text`` list;
    nothing is returned.
    """
    cleaned = str(contents)
    for junk in (
        '<dl>.*?</a> </div> </dd> <dd>',
        '<ul class="relation_lst">.*?</dd>',
        '<.+?>',
    ):
        cleaned = re.sub(junk, '', cleaned).strip()
    contents_text.append(cleaned)

def crawler(maxpage, query, sort, s_date, e_date):
    """Crawl Naver news search results and save them to an Excel file.

    For every result page the article titles, links, press names and cleaned
    summaries are appended to the module-level lists ``title_text``,
    ``link_text``, ``source_text`` and ``contents_text``. After the last page
    the lists are combined into one DataFrame and written to
    ``RESULT_PATH + '1번.xlsx'``.

    Args:
        maxpage: Number of result pages to fetch (anything ``int()`` accepts).
        query: Search term.
        sort: Sort option passed through to Naver ("0", "1" or "2").
        s_date: Start date formatted like ``2019.01.04``.
        e_date: End date formatted like ``2019.01.05``.

    Raises:
        ValueError: If ``maxpage`` cannot be converted to an integer.

    NOTE(review): the four lists become DataFrame columns, so pandas is
    expected to reject the data if a page yields a different number of
    titles, sources and summaries — confirm against the live markup.
    """
    # The "nso" option takes the dates without dots (e.g. 20190104).
    s_from = s_date.replace(".", "")
    e_to = e_date.replace(".", "")
    # Naver addresses a page by the 1-based index of its first article:
    # 1 = page 1, 11 = page 2, 21 = page 3, ...
    last_start = (int(maxpage) - 1) * 10 + 1

    for page in range(1, last_start + 1, 10):
        # Let requests build and escape the query string, so that search
        # terms containing '&', '#', spaces, etc. cannot corrupt the URL.
        params = {
            "where": "news",
            "query": query,
            "sort": sort,
            "ds": s_date,
            "de": e_date,
            "nso": "so:r,p:from" + s_from + "to" + e_to + ",a:",
            "start": page,
        }
        response = requests.get("https://search.naver.com/search.naver", params=params)
        soup = BeautifulSoup(response.text, 'html.parser')

        for atag in soup.select('._sp_each_title'):
            title_text.append(atag.text)
            link_text.append(atag['href'])

        for source_list in soup.select('._sp_each_source'):
            source_text.append(source_list.text)

        for contents_list in soup.select('ul.type01 dl'):
            contents_cleansing(contents_list)

        # Progress indicator: start index of the page just processed.
        print(page)

    # Build the table once, after all pages. This also keeps the function from
    # failing on an unbound variable when no page was requested.
    result = {"title": title_text, "source": source_text, "contents": contents_text, "link": link_text}
    df = pd.DataFrame(result)

    outputFileName = '1번.xlsx'
    df.to_excel(RESULT_PATH + outputFileName, sheet_name='sheet1')

def main():
    """Ask for the crawl settings on standard input and run the crawler.

    The values are passed to ``crawler`` exactly as typed (as strings).
    """
    banner = "=" * 50
    # Show the instructions and wait for Enter; the typed value is not used.
    input(banner + "\n" + "입력 형식에 맞게 입력해주세요." + "\n" + " 시작하시려면 Enter를 눌러주세요." + "\n" + banner)

    maxpage = input("최대 크롤링할 페이지 수 입력하시오: ")
    query = input("검색어 입력: ")
    sort = input("뉴스 검색 방식 입력(관련도순=0  최신순=1  오래된순=2): ")
    s_date = input("시작날짜 입력(2019.01.04):")
    e_date = input("끝날짜 입력(2019.01.05):")

    crawler(maxpage, query, sort, s_date, e_date)
    
# Run the interactive crawl; it writes '1번.xlsx', which is read back below.
main()

# Morphological analyzer instance.
komo = Komoran()
Data = pd.read_excel('1번.xlsx')

Data.tail()

# Plain Python list of the article summaries, in row order.
contents = list(Data['contents'])
contents

(2) 전처리

기자의 이메일 주소나 말줄임표 등의 특수기호, 한글 자음, 모음이 모두 걸러지도록 cleansing 함수를 정의하였다.

def cleansing(text):
    """Return ``text`` with e-mail addresses, stray jamo and symbols removed.

    Three regex removals are applied in this order:

    1. e-mail addresses (done first, because the last step would otherwise
       strip the ``@`` and dots and leave the address fragments behind),
    2. runs of standalone Hangul consonants/vowels such as "ㅋㅋ" or "ㅠㅠ",
    3. every character that is neither a word character nor whitespace.

    Whitespace is left as is, so removed spans may leave double spaces.
    """
    removal_patterns = (
        '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+)',
        '([ㄱ-ㅎㅏ-ㅣ]+)',
        '[^\\w\\s]',
    )
    for regex in removal_patterns:
        text = re.sub(regex, '', text)
    return text

# Clean every summary; str() coerces non-string values before the regexes run.
clean_contents = [cleansing(str(text)) for text in contents]
clean_contents

(3) 형태소 분석기를 사용한 명사 추출

komoran을 사용하여 각 문장의 형태를 리스트로 유지하며 명사만 추출하였다.