-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
178 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
lib/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
"""Fetch frequencies""" | ||
from typing import Any, Dict | ||
|
||
from sqlalchemy.orm import Session | ||
|
||
from server.lib.utils import get_texts, get_type_and_pos_dicts | ||
|
||
|
||
def fetch_frequencies(category: str, tagset: str, data: Dict[str, Any], db: Session) -> Dict[str, Any]:
    """Collect type/POS frequencies over the stored texts of one language.

    Args:
        category: Selects which texts are fetched and which per-text
            frequency dict is read; also used as the output key that holds
            the type string of each entry.
        tagset: Selects which per-text frequency dict is read.
        data: Request payload; ``data["lang"]`` is the language to query.
        db: Database session passed through to ``get_texts``.

    Returns:
        A dict with three keys:

        * ``"<category>_pos"``: one ``{"count", "pos", <category>}`` entry per
          ``<type>_<pos>`` key, ordered by descending count.
        * ``"pos_list"``: ``(pos, count)`` pairs ordered by descending count.
        * ``"number_of_texts"``: number of texts the figures are based on.

    Raises:
        KeyError: If ``data`` has no ``"lang"`` entry.
    """
    language = data["lang"]
    texts = get_texts(db, language, category=category)
    type_dict, pos_dict = get_type_and_pos_dicts(category=category, tagset=tagset, texts=texts)

    entries = []
    for key, count in sorted(type_dict.items(), key=lambda item: item[1], reverse=True):
        # Split on the LAST underscore only: the type itself may contain
        # underscores, and get_type_and_pos_dicts derives the POS the same
        # way. Splitting from the left would turn "a_b_NN" into pos "b_NN".
        parts = key.rsplit("_", maxsplit=1)
        entries.append({"count": count, "pos": parts[-1], category: parts[0]})

    return {
        f"{category}_pos": entries,
        "pos_list": sorted(pos_dict.items(), key=lambda item: item[1], reverse=True),
        "number_of_texts": len(texts),
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
"""Fetch lengths""" | ||
from typing import Any, Dict | ||
|
||
from sqlalchemy.orm import Session | ||
|
||
from server.lib.utils import get_texts, get_type_and_pos_dicts | ||
from swegram_main.config import PT_TAGS, SUC_TAGS | ||
|
||
|
||
# Tags that fetch_lengths leaves out of the length statistics: the last three
# SUC tags, the last ten PT tags and the literal "PUNCT".
# NOTE(review): presumably these trailing entries are the punctuation tags of
# each tagset -- confirm against swegram_main.config.
PUNCT_TAGS = list(SUC_TAGS[-3:]) + list(PT_TAGS[-10:]) + ["PUNCT"]
|
||
|
||
def fetch_lengths(category: str, tagset: str, data: Dict[str, Any], db: Session) -> Dict[str, Any]:
    """Group type frequencies by type length and POS tag.

    Args:
        category: Selects which texts are fetched and which per-text
            frequency dict is read.
        tagset: Selects which per-text frequency dict is read.
        data: Request payload; ``data["lang"]`` is the language to query.
        db: Database session passed through to ``get_texts``.

    Returns:
        A dict with three keys:

        * ``"number_of_texts"``: number of texts the figures are based on.
        * ``"pos_list"``: ``{"label", "prop"}`` column descriptors for
          ``"Length"``, every POS tag (most frequent first) and ``"Total"``.
        * ``"length_list"``: one row per type length in ascending order. Each
          row maps ``"Length"``, every POS tag and ``"Total"`` to a
          ``{"total", "data"}`` cell.

    Raises:
        KeyError: If ``data`` has no ``"lang"`` entry.
    """
    texts = get_texts(db, data["lang"], category=category)
    type_dict, pos_dict = get_type_and_pos_dicts(category=category, tagset=tagset, texts=texts)

    ranked_tags = sorted(pos_dict.items(), key=lambda item: item[1], reverse=True)
    pos_columns = [tag for tag, _ in ranked_tags]

    # length -> POS tag -> type -> count
    by_length = {}
    for key, freq in type_dict.items():
        token, tag = key.rsplit("_", maxsplit=1)
        if tag in PUNCT_TAGS:
            continue
        bucket = by_length.setdefault(len(token), {}).setdefault(tag, {})
        bucket[token] = bucket[token] + freq if token in bucket else freq

    rows = []
    for length in sorted(by_length):
        per_tag = by_length[length]
        row = {
            "Length": {
                "total": length,
                "data": [
                    {"type": tag, "count": sum(types.values())}
                    for tag, types in per_tag.items()
                ],
            },
        }
        # Every known tag gets a cell, even when no type of this length has it.
        for tag in pos_columns:
            types = per_tag.get(tag, {})
            row[tag] = {
                "total": sum(types.values()),
                "data": [{"type": token, "count": freq} for token, freq in types.items()],
            }
        row["Total"] = {
            "total": sum(cell["count"] for cell in row["Length"]["data"]),
            "data": [],
        }
        rows.append(row)

    return {
        "number_of_texts": len(texts),
        "pos_list": [
            {"label": name, "prop": name}
            for name in ["Length", *pos_columns, "Total"]
        ],
        "length_list": rows,
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
"""utils.py""" | ||
from typing import Any, Dict, List, Optional, Tuple | ||
|
||
from sqlalchemy.orm import Session | ||
|
||
from server.models import Text | ||
|
||
|
||
def get_texts(db: Session, language: str, category: Optional[str] = None) -> List[Text]:
    """Return the activated texts stored for ``language``.

    Args:
        db: Database session used to query ``Text`` rows.
        language: Value matched against ``Text.language``.
        category: ``"norm"`` additionally requires ``Text.normalized``;
            ``"lemma"`` additionally requires ``Text.tagged``; any other
            value adds no further filter.

    Returns:
        The matching ``Text`` rows as a list.
    """
    # "== True" is deliberate: it builds a SQL expression, not a Python bool.
    query = db.query(Text).filter(Text.language == language).filter(Text.activated == True)  # noqa: E712

    if category == "norm":
        query = query.filter(Text.normalized == True)  # noqa: E712
    elif category == "lemma":
        query = query.filter(Text.tagged == True)  # noqa: E712

    return list(query)
|
||
|
||
def _add_count(totals: Dict[str, Any], key: str, amount: Any) -> None:
    """Add ``amount`` to ``totals[key]``, creating the entry on first use."""
    if key in totals:
        totals[key] += amount
    else:
        totals[key] = amount


def get_type_and_pos_dicts(category: str, tagset: str, texts: List[Text]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Merge the per-text frequency dicts into overall type and POS totals.

    For every text, the dict stored under ``freq_<category>_dict_<tagset>`` in
    ``text.as_dict()`` is read. Its keys have the form ``<type>_<pos>``, where
    the POS is whatever follows the last underscore.

    Args:
        category: Middle part of the frequency-dict key.
        tagset: Trailing part of the frequency-dict key.
        texts: Texts whose frequency dicts are merged.

    Returns:
        ``(type_dict, pos_dict)``: counts summed per ``<type>_<pos>`` key and
        counts summed per POS tag.

    Raises:
        ValueError: If a frequency key contains no underscore.
    """
    freq_key = f"freq_{category}_dict_{tagset}"
    type_totals: Dict[str, Any] = {}
    pos_totals: Dict[str, Any] = {}

    for text in texts:
        frequencies = text.as_dict()[freq_key]
        for type_pos, count in frequencies.items():
            _type, pos = type_pos.rsplit("_", maxsplit=1)
            _add_count(pos_totals, pos, count)
            _add_count(type_totals, type_pos, count)

    return type_totals, pos_totals
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from typing import Any, Dict | ||
|
||
from fastapi import APIRouter, Body, Depends, Path | ||
from fastapi.responses import JSONResponse | ||
from sqlalchemy.orm import Session | ||
|
||
from server.lib.fetch_frequencies import fetch_frequencies | ||
from server.routers.database import get_db | ||
|
||
|
||
router = APIRouter() | ||
|
||
|
||
@router.post("/{category}/{tagset}/")
def fetch_word_and_tag(
    category: str = Path(..., title="Category"),
    tagset: str = Path(..., title="Tagset"),
    data: Dict[str, Any] = Body(...),
    db: Session = Depends(get_db),
) -> JSONResponse:
    """Serve the frequency statistics for ``category`` and ``tagset``.

    The request body is handed unchanged to ``fetch_frequencies`` together
    with the database session, and the result is returned as JSON.
    """
    payload = fetch_frequencies(category, tagset, data, db)
    return JSONResponse(payload)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from typing import Any, Dict | ||
|
||
from fastapi import APIRouter, Body, Depends, Path | ||
from fastapi.responses import JSONResponse | ||
from sqlalchemy.orm import Session | ||
|
||
from server.routers.database import get_db | ||
from server.lib.fetch_lengths import fetch_lengths | ||
|
||
|
||
router = APIRouter() | ||
|
||
|
||
@router.post("/{category}/{tagset}/")
def fetch_word_and_tag(
    category: str = Path(..., title="Category"),
    tagset: str = Path(..., title="Tagset"),
    data: Dict[str, Any] = Body(...),
    db: Session = Depends(get_db),
) -> JSONResponse:
    """Serve the length statistics for ``category`` and ``tagset``.

    The request body is handed unchanged to ``fetch_lengths`` together with
    the database session, and the result is returned as JSON.
    """
    payload = fetch_lengths(category, tagset, data, db)
    return JSONResponse(payload)