Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions recommendation_system/candidate_layer/candidate_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@


from typing import Dict, List
from recommendation_system.candidate_layer.base import BaseCandidateModel
from candidate_model_util import FastBM25


# demo data
from organic_data import user_features, article_data
Comment on lines +8 to +9
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good idea, and maybe you can share these fixtures with other DS.
c.c. hane1818 PuChenTW



# class BaseCandidateModel:
# def __init__(self):
# a = 0



class DemoCandidateModel(BaseCandidateModel):
    """Demo candidate model that ranks the demo articles with BM25.

    The corpus is built from the module-level ``article_data`` fixtures: one
    document per article, made of its title and subtitle joined by a comma.
    """

    def __init__(self):
        super().__init__()

        # init simple article data process: BM25 corpus + title -> article lookup
        self.document_list, self.title2organic = self.simple_article_data_process()
        # Dict keys keep insertion order, so title_list[i] is the title of
        # document_list[i]; FastBM25 relies on that positional alignment.
        title_list = list(self.title2organic.keys())

        # init BM25 model
        self.fast_bm25_model = FastBM25(
            corpus=self.document_list, title_list=title_list, batch_size=3
        )

        # TODO: init LM model (placeholder only; nothing is built here yet)

    def get_candidates(self, user_features: Dict) -> List[Dict]:
        """Return the demo articles ordered by BM25 relevance to the user's tags.

        Args:
            user_features: User feature mapping; only ``user_features['tags']``
                (an iterable of strings) is read. The tags are joined with
                spaces to form the BM25 query.

        Returns:
            The article dicts looked up by title, in the order produced by
            ``FastBM25.main``.

        Raises:
            KeyError: If ``user_features`` has no ``'tags'`` entry.
        """
        query = ' '.join(user_features['tags'])
        ranked_elements = self.fast_bm25_model.main(query=query)
        # The first item of every ranked element is the document's title.
        return [self.title2organic[element[0]] for element in ranked_elements]

    def simple_article_data_process(self):
        """Turn ``article_data`` into a BM25 corpus and a title lookup.

        Returns:
            A ``(document_list, title2organic)`` tuple where
            ``document_list[i]`` is ``"<title>,<subtitle>"`` of the i-th
            distinct title and ``title2organic`` maps each title to its
            article dict, in the same order.
        """
        document_list = []
        title2organic = {}
        for article in article_data:
            title = article['title']
            if title in title2organic:
                # A repeated title would add a document without adding a key,
                # leaving the title list shorter than the corpus and breaking
                # the index-based lookup in FastBM25. Keep the first article.
                continue
            document_list.append(title + ',' + article['subtitle'])
            title2organic[title] = article
        return document_list, title2organic







if __name__ == '__main__':
    # Manual smoke run: print the candidates for the demo user.
    demo_model = DemoCandidateModel()
    print(demo_model.get_candidates(user_features=user_features))
52 changes: 52 additions & 0 deletions recommendation_system/candidate_layer/candidate_model_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@





from gensim.summarization.bm25 import BM25
import jieba




class FastBM25:
    """BM25 ranker over a corpus that is word-segmented with jieba."""

    def __init__(self, corpus: list, title_list: list, batch_size: int = 32):
        """Segment ``corpus`` and build the BM25 model on the result.

        Args:
            corpus: Documents (strings) to index.
            title_list: Labels for the documents; ``title_list[i]`` is
                reported for ``corpus[i]`` in the ranking.
            batch_size: Number of documents segmented per batch.

        Raises:
            ValueError: If ``title_list`` has fewer entries than ``corpus``
                or ``batch_size`` is not positive.
        """
        if len(title_list) < len(corpus):
            # main() indexes title_list by document position, so a shorter
            # list would only fail later with an IndexError on every query.
            raise ValueError(
                "title_list must provide a title for every document in corpus"
            )
        self.title_list = title_list

        # word segmentation: one token list per document
        self.corpus_ws = self.word_segmentation(corpus=corpus, batch_size=batch_size)

        # launch BM25 model
        self.bm25 = BM25(self.corpus_ws)

    def word_segmentation(self, corpus: list, batch_size: int = 32):
        """Segment every document of ``corpus``, ``batch_size`` at a time.

        Args:
            corpus: Documents (strings) to segment.
            batch_size: Number of documents handed to
                ``word_segmentation_func`` per call.

        Returns:
            A list with one token list per document, in corpus order.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        corpus_ws = []
        # Stepping by batch_size visits each document exactly once and never
        # produces an empty trailing batch.
        for start in range(0, len(corpus), batch_size):
            batch_corpus = corpus[start:start + batch_size]
            corpus_ws += self.word_segmentation_func(batch_corpus=batch_corpus)
        return corpus_ws

    def word_segmentation_func(self, batch_corpus: list):
        """Segment each sentence of ``batch_corpus`` into a list of tokens.

        Args:
            batch_corpus: Sentences (strings) to segment.

        Returns:
            One token list per sentence, in input order.
        """
        batch_corpus_ws = []
        for sent in batch_corpus:
            # Joining on spaces and splitting again drops whitespace-only tokens.
            sent_ws = ' '.join(jieba.cut(sent)).split()
            batch_corpus_ws.append(sent_ws)
        return batch_corpus_ws

    def main(self, query: str):
        """Score every document against ``query`` and rank them.

        Args:
            query: Query text; it is segmented the same way as the corpus.

        Returns:
            A list of ``[title, tokens, score]`` lists, one per document,
            sorted by score in descending order.
        """
        query_ws = self.word_segmentation_func(batch_corpus=[query])[0]
        scores = self.bm25.get_scores(query_ws)
        element_with_score = [
            [self.title_list[i], sent_ws, scores[i]]
            for i, sent_ws in enumerate(self.corpus_ws)
        ]
        element_with_score.sort(key=lambda element: element[2], reverse=True)
        return element_with_score



39 changes: 39 additions & 0 deletions recommendation_system/candidate_layer/organic_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@



# Demo user-feature fixture; the candidate model demo reads its "tags" entry.
# NOTE(review): a reviewer suggested moving these demo fixtures into a
# dedicated fixtures folder.
user_features = {
    "demographic": {
        "age": 20,
        "gender": "M",
        "city": "Taipei",
        "career": "Software Engineer",
    },
    "is_volunteer": True,
    "python_experience": 4,
    "tags": ["youtube", "AI"],
    "vectors": [0, 0, 0],
}



# Demo article fixtures: a list of dicts, each with "title", "image_url" and
# "subtitle" keys. The demo candidate model builds its search corpus from the
# "title" and "subtitle" values and looks articles up by "title".
article_data = [
{
"title": "2021 PyCon TW x PyHug Meetup",
"image_url": "https://pbs.twimg.com/media/E_Skh8MVQAUmPHm.jpg",
"subtitle": "PyHug 簡介: 歡迎來到 PyHUG。我們是一群活動於新竹周邊的 Python 程式員。 我們會定期舉辦技術討論與程式設計的聚會。非常歡迎你加入我們!",
},
{
"title": "#7 | FAANG 工作環境跟外面有什麼不一樣?想進入 FAANG 就要聽這集!- Kir Chou",
"image_url": "https://i.imgur.com/GrMYBUa.png",
"subtitle": "這次邀請到的來賓是正在日本 Google 工作的 Kir 跟我們分享他在兩間 FAANG 工作過的經驗。想知道 Kir 在 FAANG 擔任軟體工程師的時候怎麼使用 Python 以及在公司內部推動重要的專案?另外,聽說他沒有刷題就加入 FAANG?!Wow 懶得刷題的聽眾快來聽,這集聽到賺到!PyCast 終於回歸拉!主持人在今年大會過後忙到被 👻 抓走沒時間錄新節目QQ為了讓 PyCast 再次偉大,邀請 Apple Podcast 的聽眾動動手指給我們五星跟留言建議🙏🏼🙏🏼🙏🏼#faang #japan #swe #makepycastgreatagain",
},
{
"title": "贊助商 - Berry AI",
"image_url": "https://i.imgur.com/ktvzhsu.jpg",
"subtitle": "Berry AI 是一間位於台北的 AI 新創,致力於運用電腦視覺技術幫助速食業者蒐集數據,改善現有營運流程。技術團隊由一群充滿熱情的 AI 及軟體工程師組成,分別來自海內外知名學術機構與大型科技公司。此外,我們得到台灣上市公司飛捷科技的注資與支持,該公司擁有多年為大型企業落地工業電腦的經驗,提供穩定的資金來源與客戶關係。如今,Berry AI 已與數間全球 Top-10 速食業者展開合作,業務與團隊都迅速擴張中。欲了解更多訊息,請瀏覽 berry-ai.com。",
},
{
"title": "他媽的給我買票喔!",
"image_url": "https://i.imgur.com/WYiNl3z.png",
"subtitle": "公道價八萬一",
}]
2 changes: 1 addition & 1 deletion recommendation_system/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class RecommendationSystem(object):
def recommend(cls, recipient_id: Text) -> List[Dict]:
"""
main logic is as follows:
1. get experiment config
1. get experiment config
2. get feature
3. use candidate layer to get candidates
4. use ranking layer to sort candidates
Expand Down