Skip to content

Commit a30e538

Browse files
authored
[Feat & Fix & Comment] 오답분석 기능 답변 고도화 및 연관 개념 답변, 서버 로그 추가
[Feat & Fix & Comment] 오답분석 기능 답변 고도화 및 연관 개념 답변, 서버 로그 추가
2 parents 49a0a52 + 98ed29d commit a30e538

File tree

1 file changed

+81
-45
lines changed

1 file changed

+81
-45
lines changed

app/main.py

Lines changed: 81 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import os
1313
from openai import OpenAI
1414
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
15+
import time
16+
from datetime import datetime
1517

1618
# 로깅 추가
1719
logging.basicConfig(level=logging.INFO)
@@ -20,10 +22,10 @@
2022
app = FastAPI()
2123

2224
# 클라이언트 생성
23-
s3_client = boto3.client( "s3",
24-
region_name="ap-northeast-2")
25+
s3_client = boto3.client("s3",
26+
region_name="ap-northeast-2",)
2527
ssm_client = boto3.client('ssm',
26-
region_name='ap-northeast-2')
28+
region_name="ap-northeast-2",)
2729

2830
# s3 버킷 연결
2931
try:
@@ -150,7 +152,7 @@ async def upload_directly(upload_file: UploadFile = File(...)):
150152

151153

152154
@app.get("/analysis/whole")
153-
async def analysis(problem_url = None):
155+
async def analysis(problem_url=None):
154156
""" Curriculum-based Chat Completion API with CLOVA OCR & ChatGPT """
155157
await connect_milvus() # milvus 서버 연결
156158

@@ -159,11 +161,15 @@ async def analysis(problem_url = None):
159161
else:
160162
problem_text = await ocr(problem_url)
161163

162-
retrieving_result = await retrieve(problem_text)
164+
retrieving_result, subjects, units, concepts = await retrieve(problem_text)
163165
question = await augment(retrieving_result, problem_text)
164166
answer = await generate(question)
165167

166-
return JSONResponse(content={"message": "Problem Analysis Finished Successfully", "answer": answer})
168+
return JSONResponse(content={"message": "Problem Analysis Finished Successfully",
169+
"subject": list(set(subjects)),
170+
"unit": list(set(units)),
171+
"key_concept": list(set(concepts)),
172+
"answer": answer})
167173

168174

169175
@app.get("/analysis/ocr")
@@ -174,12 +180,13 @@ async def ocr(problem_url: str):
174180
import time
175181
import json
176182

177-
logger.info("Analyzing problem from this image URL: %s", problem_url)
178183
try:
184+
dt3 = datetime.fromtimestamp(time.time())
179185
s3_key = parse_s3_url(problem_url)
180186
img_bytes = download_image_from_s3(s3_key) # download from S3
181187
extension = s3_key.split(".")[-1]
182-
logger.info("Completed Download & Sending Requests... '%s'", s3_key)
188+
dt4 = datetime.fromtimestamp(time.time())
189+
logger.info(f"{dt3}~{dt4}: 이미지 다운로드 완료")
183190

184191
clova_api_url = ssm_client.get_parameter(
185192
Name='/ono/fastapi/CLOVA_API_URL',
@@ -211,18 +218,18 @@ async def ocr(problem_url: str):
211218
files = [
212219
('file', image_file)
213220
]
214-
logger.info("Processing OCR & Receiving Responses...")
215221

222+
dt3 = datetime.fromtimestamp(time.time())
216223
ocr_response = requests.request("POST", clova_api_url, headers=headers, data=payload, files=files).text
217224
ocr_response_json = json.loads(ocr_response)
218-
logger.info("***** Finished OCR Successfully *****")
225+
dt4 = datetime.fromtimestamp(time.time())
226+
logger.info(f"{dt3}~{dt4}: 이미지 OCR 완료")
219227

220228
infer_texts = []
221229
for image in ocr_response_json["images"]:
222230
for field in image["fields"]:
223231
infer_texts.append(field["inferText"])
224232
result = ' '.join(infer_texts)
225-
226233
return result
227234

228235
except Exception as pe:
@@ -256,14 +263,14 @@ async def upload_curriculum_txt(upload_file: UploadFile = File(...)):
256263

257264
# Milvus DB 연결
258265
SERVER = os.getenv('SERVER')
259-
logger.info(f"* log >> 환경변수 SERVER: {SERVER}로 받아왔습니다.")
266+
logger.info(f"* log >> 환경변수를 SERVER({SERVER})로 받아왔습니다.")
260267

261268
MILVUS_HOST = ssm_client.get_parameter(
262269
Name=f'/ono/{SERVER}/fastapi/MILVUS_HOST_NAME',
263270
WithDecryption=False
264271
)['Parameter']['Value']
265272
MILVUS_PORT = 19530
266-
COLLECTION_NAME = 'Math2015Curriculum'
273+
COLLECTION_NAME = 'Curriculum2015'
267274
DIMENSION = 1536
268275
INDEX_TYPE = "IVF_FLAT"
269276

@@ -272,13 +279,16 @@ async def upload_curriculum_txt(upload_file: UploadFile = File(...)):
272279
async def connect_milvus():
273280
try:
274281
# Milvus 서버 연결
275-
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
276-
logger.info(f"* log >> Milvus Server is connected to {MILVUS_HOST}:{MILVUS_PORT}")
282+
dt1 = str(datetime.fromtimestamp(time.time()))
283+
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT) # server 용
284+
# connections.connect(host="127.0.0.1", port=19530, db="default") # localhost 용
285+
dt2 = str(datetime.fromtimestamp(time.time()))
286+
logger.info(f"{dt1} ~ {dt2}: Milvus 서버 {MILVUS_HOST}:{MILVUS_PORT}에 연결 완료")
277287

278288
# 컬렉션의 스키마 출력
279289
if utility.has_collection(COLLECTION_NAME):
280290
collection = Collection(COLLECTION_NAME)
281-
logger.info("* 존재하는 Collection Schema:")
291+
logger.info(f"* 존재하는 Collection {COLLECTION_NAME} Schema:")
282292
for field in collection.schema.fields:
283293
logger.info(f" - Field Name: {field.name}, Data Type #: {field.dtype}")
284294

@@ -297,18 +307,16 @@ async def create_milvus():
297307
FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True),
298308
FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
299309
FieldSchema(name='content_embedding', dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
310+
FieldSchema(name='subject_name', dtype=DataType.VARCHAR, max_length=100), # Meta Data1
311+
FieldSchema(name='unit_name', dtype=DataType.VARCHAR, max_length=100), # Meta Data2
312+
FieldSchema(name='main_concept', dtype=DataType.VARCHAR, max_length=100), # Meta Data3
300313
]
301-
schema = CollectionSchema(fields=fields, description='Math2015Curriculum embedding collection')
314+
schema = CollectionSchema(fields=fields, description='2015 Korean High School Curriculum Collection')
302315
collection = Collection(name=COLLECTION_NAME, schema=schema)
303-
logger.info(f"* log >> Collection [{COLLECTION_NAME}] is created.")
316+
logger.info(f"* log >> New Collection [{COLLECTION_NAME}] is created.")
304317

305318
# 인덱스 생성
306-
# 스칼라 인덱스
307-
collection.create_index(
308-
field_name="id"
309-
)
310-
# 벡터 인덱스
311-
index_params = {
319+
index_params = { # 벡터 인덱스
312320
'index_type': INDEX_TYPE,
313321
'metric_type': 'COSINE',
314322
'params': {
@@ -342,15 +350,15 @@ def get_embedding(client, text_list):
342350

343351

344352
@app.get("/milvus/insert")
345-
async def insert_curriculum_embeddings():
353+
async def insert_curriculum_embeddings(subject: str):
346354
""" s3에서 교과과정을 읽고 임베딩하여 Milvus에 삽입 """
347355
# Milvus 연결
348356
await connect_milvus()
349357
collection = Collection(COLLECTION_NAME)
350358

351359
# S3 내 커리큘럼 데이터 로드
352-
texts = []
353-
prefix = 'curriculum/science2015/' # 경로
360+
texts, subject_names, unit_names, main_concepts = [], [], [], []
361+
prefix = f'curriculum/{subject}2015/' # 경로
354362
try:
355363
# 버킷에서 파일 목록 가져오기
356364
s3_curriculum_response = s3_client.list_objects_v2(Bucket=BUCKET_NAME, Prefix=prefix)
@@ -360,20 +368,34 @@ async def insert_curriculum_embeddings():
360368
# S3 객체 가져오기
361369
obj = s3_client.get_object(Bucket=BUCKET_NAME, Key=s3_key)
362370
# 텍스트 읽기
363-
text = obj['Body'].read().decode('utf-8')
371+
data = obj['Body'].read().decode('utf-8')
372+
lines = data.splitlines()
373+
# 메타 데이터 추출
374+
metadata_lines = [line.strip('#').strip() for line in lines[:3]]
375+
subject_name = metadata_lines[0]
376+
subject_names.append(subject_name)
377+
unit_name = metadata_lines[1]
378+
unit_names.append(unit_name)
379+
main_concept = metadata_lines[2]
380+
main_concepts.append(main_concept)
381+
# 교과과정 내용 추출
382+
text = '\n'.join(lines[3:]).strip()
364383
texts.append(text)
365384
logger.info(f"* log >> read {len(texts)} texts from S3")
366385
except Exception as e:
367386
logger.error(f"Error reading curriculum from S3: {e}")
368387

369-
# 데이터 임베딩
388+
# 교과과정 내용 임베딩
370389
content_embeddings = get_embedding(openai_client, texts)
371390
logger.info(f"* log >> embedding 완료. dimension: {DIMENSION}")
372391

373392
# 데이터 삽입
374393
data = [
375394
texts, # content 필드
376-
content_embeddings # content_embedding 필드
395+
content_embeddings, # content_embedding 필드
396+
subject_names, # subject_name 필드
397+
unit_names, # unit_name 필드
398+
main_concepts # main_concept 필드
377399
]
378400
collection.insert(data)
379401

@@ -390,66 +412,80 @@ async def retrieve(problem_text: str):
390412

391413
# 검색 테스트
392414
query = problem_text
415+
dt5 = str(datetime.fromtimestamp(time.time()))
393416
query_embeddings = [get_embedding(openai_client, [query])]
394417
if not query_embeddings or query_embeddings[0] is None:
395418
raise ValueError("Embedding generation failed")
396-
logger.info(f"* log >> Query embedding 완료")
419+
dt6 = str(datetime.fromtimestamp(time.time()))
420+
logger.info(f"{dt5} ~ {dt6}: 쿼리 임베딩 완료")
397421

398422
search_params = {
399423
'metric_type': 'COSINE',
400424
'params': {
401425
'probe': 20
402426
},
403427
}
428+
dt5 = str(datetime.fromtimestamp(time.time()))
404429
results = collection.search(
405430
data=query_embeddings[0],
406431
anns_field='content_embedding',
407432
param=search_params,
408433
limit=3,
409434
expr=None,
410-
output_fields=['content']
435+
output_fields=['content', 'subject_name', 'unit_name', 'main_concept']
411436
)
437+
dt6 = str(datetime.fromtimestamp(time.time()))
412438
context = ' '.join([result.entity.get('content') for result in results[0]])
413-
logger.info(f"* log >> context found")
414-
415-
# 결과 확인
416-
'''logger.info(f"* log >> 쿼리 결과")
439+
subjects_list = [result.entity.get('subject_name') for result in results[0]]
440+
unit_list = [result.entity.get('unit_name') for result in results[0]]
441+
main_concept_list = [result.entity.get('main_concept') for result in results[0]]
442+
logger.info(f"{dt5} ~ {dt6}: 검색 완료")
443+
logs = ""
417444
for result in results[0]:
418-
logger.info("\n-------------------------------------------------------------------")
419-
logger.info(f"Score : {result.distance}, \nText : \n{result.entity.get('content')}")'''
420-
421-
return context
445+
logs += ("\n"+f"Score : {result.distance}, \
446+
\nInfo: {result.entity.get('subject_name')}\
447+
> {result.entity.get('unit_name')}\
448+
> {result.entity.get('main_concept')}, \
449+
\nText : {result.entity.get('content')}"+"\n\n")
450+
logger.info(f"* log >> 검색 결과: {logs}")
451+
452+
return context, subjects_list, unit_list, main_concept_list
422453
except Exception as e:
423454
logger.error(f"Error in search: {e}")
424455
raise HTTPException(status_code=500, detail=str(e))
425456

426457
@app.get("/analysis/augmentation")
427458
async def augment(curriculum_context, query):
428-
prompt = "교과과정에 기반하여 이 문제에 필요한 개념을 말해줘. 응답은 자연어처럼 제공해줘. \n"
429-
context = curriculum_context
430-
passage = query
459+
prompt = "너는 공책이야. 내가 준 교육과정 중에 아래 문제와 관련된 교육과정을 골라서 고등학생 한 명에게\
460+
이 문제를 왜 틀리거나 헷갈릴 수 있으며, 어떤 개념과 사고과정 등이 필요해 보이는지를 \
461+
이 문제의 의도와 핵심을 짚어가되 너무 길지 않게 정리해서 고등학생에게 보여줘.\n"
462+
context = f"교과과정은 이렇고 {curriculum_context}"
463+
passage = f"문제는 이러해 {query}."
431464
augmented_query = prompt + context + passage
432465
return augmented_query
433466

434467
@app.get("/analysis/generation")
435468
async def generate(question):
436469
def get_chatgpt_response(client, question, model="gpt-4o-mini"):
437470
try:
471+
dt7 = str(datetime.fromtimestamp(time.time()))
438472
gpt_response = client.chat.completions.create(
439473
model=model,
440474
messages=[
441475
{"role": "user",
442476
"content": question
443477
}
444478
],
445-
temperature=0.5
479+
temperature=0.7
446480
)
481+
dt8 = str(datetime.fromtimestamp(time.time()))
482+
logger.info(f"{dt7} ~ {dt8}: LLM 응답 완료")
447483
return gpt_response.choices[0].message.content
448484
except Exception as e:
449485
logger.info(f"Error during GPT querying: {e}")
450486
return None
451487

452488
chatgpt_response = get_chatgpt_response(openai_client, question)
453-
logger.info(f"* log >> ChatGPT Response: {chatgpt_response}")
489+
logger.info(f"* log >> 응답 결과 \n {chatgpt_response}")
454490
return chatgpt_response
455491

0 commit comments

Comments
 (0)