-
Notifications
You must be signed in to change notification settings - Fork 9
Candidate model #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Candidate model #11
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
|
||
|
||
from typing import Dict, List | ||
from recommendation_system.candidate_layer.base import BaseCandidateModel | ||
from candidate_model_util import FastBM25 | ||
|
||
|
||
# demo data | ||
from organic_data import user_features, article_data | ||
|
||
|
||
# class BaseCandidateModel: | ||
# def __init__(self): | ||
# a = 0 | ||
|
||
|
||
|
||
class DemoCandidateModel(BaseCandidateModel):
    """Demo candidate model that ranks the demo articles with BM25.

    The searchable text of each article is its title and subtitle joined
    by a comma. A user's tags are joined into one query string and every
    indexed article is returned, ordered by the ranking ``FastBM25.main``
    produces.
    """

    def __init__(self):
        super().__init__()
        # init simple article data data process
        self.document_list, self.title2organic = self.simple_article_data_process()
        # The processing step stores exactly one document per title and dict
        # keys keep insertion order, so title_list[i] names document_list[i].
        # FastBM25 relies on that positional alignment.
        title_list = list(self.title2organic.keys())

        # init BM25 model
        self.fast_bm25_model = FastBM25(corpus=self.document_list, title_list=title_list, batch_size=3)

        # TODO: init LM model (placeholder; nothing is initialised here yet)

    def get_candidates(self, user_features: Dict) -> List[Dict]:
        """Return the indexed articles ranked against the user's tags.

        Args:
            user_features: User feature mapping. Only the ``'tags'`` entry
                is read; it must be an iterable of strings.

        Returns:
            The article dicts in the order produced by ``FastBM25.main``.

        Raises:
            KeyError: If ``user_features`` has no ``'tags'`` entry.
        """
        tags = user_features['tags']
        query = ' '.join(tags)
        ranked = self.fast_bm25_model.main(query=query)
        # Each ranked entry starts with the article title, which is the
        # lookup key into title2organic.
        return [self.title2organic[entry[0]] for entry in ranked]

    def simple_article_data_process(self):
        """Build the BM25 corpus and the title lookup from ``article_data``.

        Articles whose title was already seen are skipped (the first
        occurrence wins). Without this, a repeated title would add a second
        document while the title lookup kept a single key, leaving the
        title list shorter than the corpus and breaking the positional
        alignment that ``FastBM25`` indexes by.

        Returns:
            A ``(document_list, title2organic)`` tuple: the list of
            ``"<title>,<subtitle>"`` strings, and a dict mapping each title
            to its original article dict, in the same order.
        """
        document_list = []
        title2organic = dict()
        for article in article_data:
            title = article['title']
            if title in title2organic:
                # Duplicate title: keep corpus and lookup aligned.
                continue
            document_list.append(title + ',' + article['subtitle'])
            title2organic[title] = article
        return document_list, title2organic
|
||
|
||
|
||
|
||
|
||
|
||
|
||
if __name__ == '__main__':
    # Smoke run: rank the demo articles for the demo user and print them.
    demo_model = DemoCandidateModel()
    print(demo_model.get_candidates(user_features=user_features))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
|
||
|
||
|
||
|
||
|
||
from gensim.summarization.bm25 import BM25 | ||
import jieba | ||
|
||
|
||
|
||
|
||
class FastBM25:
    """BM25 ranker over a jieba-segmented corpus.

    Documents are segmented into tokens in batches, indexed with gensim's
    ``BM25``, and scored against a segmented query by ``main``.
    """

    def __init__(self, corpus: list, title_list: list, batch_size: int = 32):
        """Segment ``corpus`` and build the BM25 index.

        Args:
            corpus: Document strings to index.
            title_list: Titles addressed by document position;
                ``title_list[i]`` is reported for ``corpus[i]``.
            batch_size: Number of documents segmented per batch.
        """
        # init
        self.title_list = title_list

        # word segmentation
        self.corpus_ws = self.word_segmentation(corpus=corpus, batch_size=batch_size)

        # launch bm25 model
        self.bm25 = BM25(self.corpus_ws)

    def word_segmentation(self, corpus: list, batch_size: int = 32) -> list:
        """Segment every document of ``corpus``, ``batch_size`` at a time.

        Args:
            corpus: Document strings to segment.
            batch_size: Positive number of documents per batch.

        Returns:
            One token list per document, in corpus order.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))
        corpus_ws = []
        # Stepping by batch_size visits each document exactly once and never
        # schedules a trailing empty batch.
        for start in range(0, len(corpus), batch_size):
            batch_corpus = corpus[start:start + batch_size]
            corpus_ws.extend(self.word_segmentation_func(batch_corpus=batch_corpus))
        return corpus_ws

    def word_segmentation_func(self, batch_corpus: list) -> list:
        """Segment each sentence of ``batch_corpus`` with ``jieba.cut``.

        Args:
            batch_corpus: Sentences (strings) to segment.

        Returns:
            One token list per sentence, in input order.
        """
        # Joining on spaces and re-splitting discards any whitespace-only
        # tokens the segmenter may yield.
        return [' '.join(jieba.cut(sent)).split() for sent in batch_corpus]

    def main(self, query: str) -> list:
        """Score every indexed document against ``query``.

        Args:
            query: Raw query string; it is segmented the same way as the
                corpus.

        Returns:
            A list of ``[title, tokens, score]`` entries, one per indexed
            document, sorted by score in descending order (ties keep corpus
            order).
        """
        query_ws = self.word_segmentation_func(batch_corpus=[query])[0]
        scores = self.bm25.get_scores(query_ws)
        element_with_score = [
            [self.title_list[i], sent_ws, scores[i]]
            for i, sent_ws in enumerate(self.corpus_ws)
        ]
        element_with_score.sort(key=lambda entry: entry[2], reverse=True)
        return element_with_score
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
|
||
|
||
|
||
# Demo user profile used as sample input for the candidate model.
# DemoCandidateModel.get_candidates reads only the "tags" entry; how the
# other fields are consumed is not visible here.
# NOTE(review): a reviewer on the pull request suggested moving these demo
# fixtures to a dedicated location so other contributors can share them.
user_features = {
    "demographic": {
        "age": 20,
        "gender": "M",
        "city": "Taipei",
        "career": "Software Engineer",
    },
    "is_volunteer": True,
    "python_experience": 4,  # unit not stated; presumably years -- TODO confirm
    "tags": ["youtube", "AI"],  # joined with spaces to form the BM25 query
    "vectors": [0, 0, 0]  # placeholder values; intended meaning not shown here
}
|
||
|
||
|
||
# Demo article fixtures. Every entry carries "title", "image_url" and
# "subtitle"; DemoCandidateModel.simple_article_data_process joins the title
# and subtitle into the searchable document text and keys its lookup table
# by title.
article_data = [
    {
        "title": "2021 PyCon TW x PyHug Meetup",
        "image_url": "https://pbs.twimg.com/media/E_Skh8MVQAUmPHm.jpg",
        "subtitle": "PyHug 簡介: 歡迎來到 PyHUG。我們是一群活動於新竹周邊的 Python 程式員。 我們會定期舉辦技術討論與程式設計的聚會。非常歡迎你加入我們!",
    },
    {
        "title": "#7 | FAANG 工作環境跟外面有什麼不一樣?想進入 FAANG 就要聽這集!- Kir Chou",
        "image_url": "https://i.imgur.com/GrMYBUa.png",
        "subtitle": "這次邀請到的來賓是正在日本 Google 工作的 Kir 跟我們分享他在兩間 FAANG 工作過的經驗。想知道 Kir 在 FAANG 擔任軟體工程師的時候怎麼使用 Python 以及在公司內部推動重要的專案?另外,聽說他沒有刷題就加入 FAANG?!Wow 懶得刷題的聽眾快來聽,這集聽到賺到!PyCast 終於回歸拉!主持人在今年大會過後忙到被 👻 抓走沒時間錄新節目QQ為了讓 PyCast 再次偉大,邀請 Apple Podcast 的聽眾動動手指給我們五星跟留言建議🙏🏼🙏🏼🙏🏼#faang #japan #swe #makepycastgreatagain",
    },
    {
        "title": "贊助商 - Berry AI",
        "image_url": "https://i.imgur.com/ktvzhsu.jpg",
        "subtitle": "Berry AI 是一間位於台北的 AI 新創,致力於運用電腦視覺技術幫助速食業者蒐集數據,改善現有營運流程。技術團隊由一群充滿熱情的 AI 及軟體工程師組成,分別來自海內外知名學術機構與大型科技公司。此外,我們得到台灣上市公司飛捷科技的注資與支持,該公司擁有多年為大型企業落地工業電腦的經驗,提供穩定的資金來源與客戶關係。如今,Berry AI 已與數間全球 Top-10 速食業者展開合作,業務與團隊都迅速擴張中。欲了解更多訊息,請瀏覽 berry-ai.com。",
    },
    {
        "title": "他媽的給我買票喔!",
        "image_url": "https://i.imgur.com/WYiNl3z.png",
        "subtitle": "公道價八萬一",
    }]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's a good idea and maybe you can share these fixtures with other DS.
c.c. hane1818 PuChenTW