Commit

Merge pull request #68 from Pseudo-Lab/project/news

Project/news

HWALIMLEE authored Nov 11, 2023
2 parents 4a67cef + 0386083 commit 7807a12
Showing 10 changed files with 220 additions and 0 deletions.
18 changes: 18 additions & 0 deletions project/news_dashboard/Dockerfile
@@ -0,0 +1,18 @@
FROM python:3.10

# konlpy's Hannanum tagger runs on the JVM via JPype, so a JDK is required.
RUN apt-get update && \
    apt-get install -y openjdk-17-jdk && \
    apt-get clean

ENV JAVA_HOME /usr/lib/jvm/java-17-openjdk-amd64
ENV PATH $JAVA_HOME/bin:$PATH

COPY requirements.txt requirements.txt
RUN pip install -U pip && \
    pip install -r requirements.txt

COPY . /app
WORKDIR /app

CMD ["python", "main.py"]
39 changes: 39 additions & 0 deletions project/news_dashboard/README.md
@@ -0,0 +1,39 @@
## Hot News DashBoard

> A dashboard built from title keywords of Naver's most-viewed news

Titles are crawled from the "most-viewed news" section on Naver, stored in Elasticsearch, and visualized with Kibana.


![news_dashboard.png](img/news_dashboard.png)
<br/><br/>

### Development Environment
- Airflow
- ElasticSearch
- Kibana
- Python
<br/><br/>


### Project Architecture
![architecture.png](img/architecture.png)
<br/><br/>

### Description
```
1. Fetch the most-viewed Naver news with Python.
2. Tokenize the titles and store them in Elasticsearch.
3. Build a dashboard with Kibana.
4. Use Airflow to pull fresh data periodically.
   (The project's Python code is packaged as a Docker image and
   executed with Airflow's KubernetesPodOperator; see the DAG sketch below.)
```
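
A minimal sketch of what such a DAG might look like. Only the use of KubernetesPodOperator comes from the description above; the DAG id, image name, schedule, and namespace are assumptions for illustration.

```
# Hypothetical DAG sketch; image, schedule, and namespace are assumed values.
from datetime import datetime

from airflow import DAG
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import (
    KubernetesPodOperator,
)

with DAG(
    dag_id="news_dashboard",
    start_date=datetime(2023, 11, 1),
    schedule_interval="0 * * * *",  # hourly; the real cadence is an assumption
    catchup=False,
) as dag:
    crawl_and_index = KubernetesPodOperator(
        task_id="crawl_and_index",
        name="news-dashboard",
        namespace="airflow",            # assumed namespace
        image="news-dashboard:latest",  # assumed image name/tag
        cmds=["python", "main.py"],     # matches the Dockerfile CMD
        get_logs=True,
    )
```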
<br/><br/>
### Airflow is deployed on a Kubernetes cluster
**spec**
- CPU 2, MEM 8GB
- 3 nodes (3 workers)
- CeleryExecutor

Helm Chart: https://artifacthub.io/packages/helm/airflow-helm/airflow
59 changes: 59 additions & 0 deletions project/news_dashboard/es/connection.py
@@ -0,0 +1,59 @@
from elasticsearch import Elasticsearch


class ES:
    def __init__(self, url):
        self.client = Elasticsearch(url)

    def create_analysis_index(self):
        # 'news' index: three nori tokenizer variants are registered, and the
        # custom 'my_nori' analyzer (mixed decompound mode) is applied to titles.
        body = {
            "settings": {
                "analysis": {
                    "tokenizer": {
                        "nori_none": {
                            "type": "nori_tokenizer",
                            "decompound_mode": "none"
                        },
                        "nori_discard": {
                            "type": "nori_tokenizer",
                            "decompound_mode": "discard"
                        },
                        "nori_mixed": {
                            "type": "nori_tokenizer",
                            "decompound_mode": "mixed"
                        }
                    },
                    "analyzer": {
                        "my_nori": {
                            "type": "custom",
                            "tokenizer": "nori_mixed"
                        }
                    }
                }
            },
            "mappings": {
                "properties": {
                    "title": {
                        "type": "text",
                        "analyzer": "my_nori",
                        "fields": {
                            "keyword": {"type": "keyword"}
                        }
                    },
                    "category": {
                        "type": "text",
                        "fields": {
                            "keyword": {"type": "keyword"}
                        }
                    },
                    "@timestamp": {"type": "date"},
                    "url": {"type": "text"}
                }
            }
        }
        res = self.client.indices.create(index='news', body=body)
        return res
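
Note that the nori tokenizer ships in Elasticsearch's analysis-nori plugin, which must be installed on the cluster before this index can be created. A minimal sketch for verifying the custom analyzer once the index exists; the cluster URL and sample text here are illustrative assumptions:

```
# Hypothetical verification snippet; URL and sample sentence are assumed.
from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")  # assumed cluster URL

res = client.indices.analyze(
    index="news",
    analyzer="my_nori",
    text="네이버 많이 본 뉴스",
)
print([t["token"] for t in res["tokens"]])  # tokens produced by my_nori
```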
Binary file added project/news_dashboard/img/architecture.png
Binary file added project/news_dashboard/img/news_dashboard.png
7 changes: 7 additions & 0 deletions project/news_dashboard/main.py
@@ -0,0 +1,7 @@
import time

from src.save_data import save_to_es

if __name__ == '__main__':
    start = time.time()
    save_to_es()
    end = time.time()
    print(f'save_to_es finished in {end - start:.2f}s')  # report elapsed time
16 changes: 16 additions & 0 deletions project/news_dashboard/requirements.txt
@@ -0,0 +1,16 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.2
elastic-transport==8.10.0
elasticsearch==8.10.1
fake-useragent==1.3.0
idna==3.4
JPype1==1.4.1
konlpy==0.6.0
lxml==4.9.3
numpy==1.26.1
packaging==23.2
pytz==2023.3.post1
requests==2.31.0
soupsieve==2.5
urllib3==2.0.7
57 changes: 57 additions & 0 deletions project/news_dashboard/src/get_data.py
@@ -0,0 +1,57 @@
import re
from datetime import datetime

import pytz
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

from src.preprocess_data import tokenize


class News:
    def __init__(self):
        self.request = requests.session()

    def get_channel(self):
        # Collect press IDs and names from the daily popular-news ranking page.
        user_agent = UserAgent()
        url = 'https://news.naver.com/main/ranking/popularDay.naver'
        res = self.request.get(url, headers={'User-Agent': user_agent.random})
        soup = BeautifulSoup(res.text, 'html.parser')

        channel_ids = soup.find_all('a', class_='rankingnews_box_head')
        channel_name = soup.find_all('strong', class_='rankingnews_name')

        channels = {}
        for (idx, name) in zip(channel_ids, channel_name):
            # The press ID is the three-digit code embedded in the link href.
            channels[str(re.search(r'press/(\d{3})/', idx['href']).group(1))] = name.text
        return channels

    def get_popular_news(self, channel):
        # Most-viewed news for a single press channel.
        news_link = []
        url = f'https://media.naver.com/press/{channel}/ranking?type=popular'
        res = self.request.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')

        hot_news = soup.find_all('a', class_='_es_pc_link')

        for news in hot_news:
            news_link.append(news['href'])
        return news_link

    def get_title_and_tag(self, link, channel):
        res = self.request.get(link)
        soup = BeautifulSoup(res.text, 'html.parser')
        title = soup.find('meta', property='og:title').get('content')

        categorize_item = soup.find('em', class_='media_end_categorize_item')
        category_text = ''
        if categorize_item:
            category_text = categorize_item.get_text(strip=True)
        return {'title': title,
                'tags': tokenize(title),
                'category': category_text,
                '@timestamp': datetime.now(pytz.timezone('Asia/Seoul')).isoformat(),
                'url': link,
                'channel': channel}
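
A hedged usage sketch of this scraper, handy for checking the CSS selectors locally before it runs inside the pipeline (results depend on the live pages):

```
# Illustrative local check; run from the project root so `src` is importable.
from src.get_data import News

news = News()
channels = news.get_channel()            # press id -> press name
press_id, press_name = next(iter(channels.items()))
links = news.get_popular_news(press_id)  # article URLs from the ranking page
doc = news.get_title_and_tag(links[0], press_name)
print(doc['title'], doc['tags'])
```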
10 changes: 10 additions & 0 deletions project/news_dashboard/src/preprocess_data.py
@@ -0,0 +1,10 @@
import re

from konlpy.tag import Hannanum


def tokenize(text):
    # Extract nouns with the Hannanum tagger, strip punctuation,
    # and keep only tokens longer than one character.
    han = Hannanum()
    processed_word = [re.sub(r'[^\w\s]', '', word) for word in han.nouns(text)]
    filtered_word = [word for word in processed_word if len(word) > 1]
    return filtered_word
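
A quick illustration of what tokenize() returns; Hannanum's noun extraction drives the result, so the exact output may vary by konlpy version:

```
# Illustrative call; the printed tokens depend on the Hannanum model.
from src.preprocess_data import tokenize

print(tokenize('네이버 많이 본 뉴스 제목 키워드'))
# e.g. ['네이버', '뉴스', '제목', '키워드'] -- single-character nouns are dropped
```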
14 changes: 14 additions & 0 deletions project/news_dashboard/src/save_data.py
@@ -0,0 +1,14 @@
from src.get_data import News
from es.connection import ES


def save_to_es():
    news = News()
    es = ES("http://34.64.93.25:9200").client

    channels = news.get_channel()
    for idx, name in channels.items():
        news_links = news.get_popular_news(idx)
        for news_link in news_links:
            data = news.get_title_and_tag(news_link, name)
            es.index(index='news', body=data)  # one indexing request per article
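
Each article is indexed with its own request above. A hedged alternative using the client's bulk helper would batch the same documents into fewer round trips; `save_to_es_bulk` is a hypothetical name, not part of the committed code:

```
# Hypothetical bulk variant of save_to_es(); not in the committed code.
from elasticsearch.helpers import bulk

from src.get_data import News
from es.connection import ES


def save_to_es_bulk():
    news = News()
    es = ES("http://34.64.93.25:9200").client
    actions = (
        {"_index": "news", "_source": news.get_title_and_tag(link, name)}
        for idx, name in news.get_channel().items()
        for link in news.get_popular_news(idx)
    )
    bulk(es, actions)  # one streamed batch instead of one request per document
```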
