Merge pull request #68 from Pseudo-Lab/project/news

Project/news
Pseudo-Lab · Nov 11, 2023 · 7807a12 · 7807a12
2 parents 4a67cef + 0386083
commit 7807a12
Show file tree

Hide file tree

Showing 10 changed files with 220 additions and 0 deletions.
diff --git a/project/news_dashboard/Dockerfile b/project/news_dashboard/Dockerfile
@@ -0,0 +1,18 @@
+# Runtime image for the news crawler: Python 3.10 plus a JDK.
+# NOTE(review): the JDK is presumably needed by konlpy/JPype1 from
+# requirements.txt - confirm before slimming it down to a JRE.
+FROM python:3.10
+
+# Drop the apt package lists in the same layer so they do not bloat the image.
+RUN apt-get update && \
+    apt-get install -y openjdk-17-jdk && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# The path below is the amd64 install location, so this image targets amd64.
+ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+
+ENV PATH=$JAVA_HOME/bin:$PATH
+
+# Install dependencies before copying the sources so this layer stays cached
+# when only application code changes.
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -U pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+COPY . /app
+WORKDIR /app
+
+CMD ["python", "main.py"]
diff --git a/project/news_dashboard/README.md b/project/news_dashboard/README.md
@@ -0,0 +1,39 @@
+## Hot News DashBoard
+
+> 네이버 많이 본 뉴스 제목 키워드 기반 대시보드
+
+네이버의 많이 본 뉴스 섹션에서 제목을 크롤링하여, 엘라스틱서치에 저장한 후 키바나로 시각화했습니다.
+
+
+![news_dashboard.png](img%2Fnews_dashboard.png)
+<br/><br>
+
+### 개발환경
+- Airflow
+- ElasticSearch
+- Kibana
+- Python
+<br/><br>
+
+
+### 프로젝트 아키텍처
+![architecture.png](img%2Farchitecture.png)
+<br/><br>
+
+### 설명
+```
+1. Python 으로 네이버 많이 본 뉴스 페이지를 크롤링해 기사 제목을 가져옵니다.
+2. 토크나이징해서 엘라스틱서치에 저장합니다.
+3. 키바나를 이용해 대시보드를 생성합니다.
+4. Airflow 를 활용해 주기적으로 데이터를 가져옵니다.
+(프로젝트 파이썬 코드를 Docker image 화해서 
+ Airflow KubernetesPodOperator를 활용해 실행시킵니다.)
+```
+<br/><br>
+### Airflow 는 Kubernetes Cluster 위에 구축했습니다.
+**spec**
+- CPU 2, MEM 8GB
+- node 3개 (worker 3개)
+- CeleryExecutor
+
+Helm Chart: https://artifacthub.io/packages/helm/airflow-helm/airflow
diff --git a/project/news_dashboard/es/connection.py b/project/news_dashboard/es/connection.py
@@ -0,0 +1,59 @@
+from elasticsearch import Elasticsearch
+
+
+class ES:
+    def __init__(self, url):
+        self.client = Elasticsearch(url)
+
+    def create_analysis_index(self):
+        body = {
+            "settings": {
+                "analysis": {
+                    "tokenizer": {
+                        "nori_none": {
+                            "type": "nori_tokenizer",
+                            "decompound_mode": "none"
+                        },
+                        "nori_discard": {
+                            "type": "nori_tokenizer",
+                            "decompound_mode": "discard"
+                        },
+                        "nori_mixed": {
+                            "type": "nori_tokenizer",
+                            "decompound_mode": "mixed"
+                        }
+                    },
+                    "analyzer": {
+                        "my_nori": {
+                            "type": "custom",
+                            "tokenizer": "nori_mixed"
+                        }
+                    }
+                }
+            },
+            "mappings": {
+                "properties": {
+                    "title": {
+                        "type": "text",
+                        "analyzer": "my_nori",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword"
+                            }
+                        }
+                    },
+                    "category": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword"
+                            }
+                        }
+                    },
+                    "@timestamp": {"type": "date"},
+                    "url": {"type": "text"}
+                }
+            }
+        }
+        res = self.client.indices.create(index='news', body=body)
+        return res
diff --git a/project/news_dashboard/img/architecture.png b/project/news_dashboard/img/architecture.png
diff --git a/project/news_dashboard/img/news_dashboard.png b/project/news_dashboard/img/news_dashboard.png
diff --git a/project/news_dashboard/main.py b/project/news_dashboard/main.py
@@ -0,0 +1,7 @@
+import time
+from src.save_data import save_to_es
+
+if __name__ == '__main__':
+    start = time.time()
+    save_to_es()
+    end = time.time()
diff --git a/project/news_dashboard/requirements.txt b/project/news_dashboard/requirements.txt
@@ -0,0 +1,16 @@
+beautifulsoup4==4.12.2
+certifi==2023.7.22
+charset-normalizer==3.3.2
+elastic-transport==8.10.0
+elasticsearch==8.10.1
+fake-useragent==1.3.0
+idna==3.4
+JPype1==1.4.1
+konlpy==0.6.0
+lxml==4.9.3
+numpy==1.26.1
+packaging==23.2
+pytz==2023.3.post1
+requests==2.31.0
+soupsieve==2.5
+urllib3==2.0.7
diff --git a/project/news_dashboard/src/get_data.py b/project/news_dashboard/src/get_data.py
@@ -0,0 +1,57 @@
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+import re
+import pytz
+from fake_useragent import UserAgent
+
+from src.preprocess_data import tokenize
+
+
+class News:
+    def __init__(self):
+        self.request = requests.session()
+
+    def get_channel(self):
+        user_agent = UserAgent()
+        url = 'https://news.naver.com/main/ranking/popularDay.naver'
+        res = self.request.get(url, headers={'User-Agent': user_agent.random})
+        soup = BeautifulSoup(res.text, 'html.parser')
+
+        channel_ids = soup.find_all('a', class_='rankingnews_box_head')
+        channel_name = soup.find_all('strong', class_='rankingnews_name')
+
+        channels = {}
+
+        for (idx, name) in zip(channel_ids, channel_name):
+            channels[str(re.search(r'press/(\d{3})/', idx['href']).group(1))] = name.text
+        return channels
+
+    def get_popular_news(self, channel):
+        # 많이 본 뉴스
+        news_link = []
+        url = f'https://media.naver.com/press/{channel}/ranking?type=popular'
+        res = self.request.get(url)
+        soup = BeautifulSoup(res.text, 'html.parser')
+
+        hot_news = soup.find_all('a', class_='_es_pc_link')
+
+        for news in hot_news:
+            news_link.append(news['href'])
+        return news_link
+
+    def get_title_and_tag(self, link, channel):
+        res = self.request.get(link)
+        soup = BeautifulSoup(res.text, 'html.parser')
+        title = soup.find('meta', property='og:title').get('content')
+
+        categorize_item = soup.find('em', class_='media_end_categorize_item')
+        category_text = ''
+        if categorize_item:
+            category_text = categorize_item.get_text(strip=True)
+        return {'title': title,
+                'tags': tokenize(title),
+                'category': category_text,
+                '@timestamp': datetime.now(pytz.timezone('Asia/Seoul')).isoformat(),
+                'url': link,
+                'channel': channel}
diff --git a/project/news_dashboard/src/preprocess_data.py b/project/news_dashboard/src/preprocess_data.py
@@ -0,0 +1,10 @@
+import re
+
+from konlpy.tag import Hannanum
+
+
+def tokenize(text):
+    han = Hannanum()
+    processed_word = [re.sub(r'[^\w\s]', '', word) for word in han.nouns(text)]
+    filtered_word = [word for word in processed_word if len(word) > 1]
+    return filtered_word
diff --git a/project/news_dashboard/src/save_data.py b/project/news_dashboard/src/save_data.py
@@ -0,0 +1,14 @@
+from src.get_data import News
+from es.connection import ES
+
+
+def save_to_es():
+    news = News()
+    es = ES("http://34.64.93.25:9200").client
+
+    channels = news.get_channel()
+    for idx, name in channels.items():
+        news_links = news.get_popular_news(idx)
+        for news_link in news_links:
+            data = news.get_title_and_tag(news_link, name)
+            response = es.index(index='news', body=data)