import os
from openai import OpenAI
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
+import time
+from datetime import datetime

# Add logging
logging.basicConfig(level=logging.INFO)

app = FastAPI()

# Create clients
-s3_client = boto3.client( "s3",
-                          region_name="ap-northeast-2")
+s3_client = boto3.client("s3",
+                         region_name="ap-northeast-2",)
ssm_client = boto3.client('ssm',
-                          region_name='ap-northeast-2')
+                          region_name="ap-northeast-2",)

# Connect to the S3 bucket
try:
@@ -150,7 +152,7 @@ async def upload_directly(upload_file: UploadFile = File(...)):


@app.get("/analysis/whole")
-async def analysis(problem_url = None):
+async def analysis(problem_url=None):
    """ Curriculum-based Chat Completion API with CLOVA OCR & ChatGPT """
    await connect_milvus()  # connect to the Milvus server

@@ -159,11 +161,15 @@ async def analysis(problem_url = None):
    else:
        problem_text = await ocr(problem_url)

-    retrieving_result = await retrieve(problem_text)
+    retrieving_result, subjects, units, concepts = await retrieve(problem_text)
    question = await augment(retrieving_result, problem_text)
    answer = await generate(question)

-    return JSONResponse(content={"message": "Problem Analysis Finished Successfully", "answer": answer})
+    return JSONResponse(content={"message": "Problem Analysis Finished Successfully",
+                                 "subject": list(set(subjects)),
+                                 "unit": list(set(units)),
+                                 "key_concept": list(set(concepts)),
+                                 "answer": answer})


@app.get("/analysis/ocr")
@@ -174,12 +180,13 @@ async def ocr(problem_url: str):
    import time
    import json

-    logger.info("Analyzing problem from this image URL: %s", problem_url)
    try:
+        dt3 = datetime.fromtimestamp(time.time())
        s3_key = parse_s3_url(problem_url)
        img_bytes = download_image_from_s3(s3_key)  # download from S3
        extension = s3_key.split(".")[-1]
-        logger.info("Completed Download & Sending Requests... '%s'", s3_key)
+        dt4 = datetime.fromtimestamp(time.time())
+        logger.info(f"{dt3}~{dt4}: image download finished")

        clova_api_url = ssm_client.get_parameter(
            Name='/ono/fastapi/CLOVA_API_URL',
@@ -211,18 +218,18 @@ async def ocr(problem_url: str):
        files = [
            ('file', image_file)
        ]
-        logger.info("Processing OCR & Receiving Responses...")

+        dt3 = datetime.fromtimestamp(time.time())
        ocr_response = requests.request("POST", clova_api_url, headers=headers, data=payload, files=files).text
        ocr_response_json = json.loads(ocr_response)
-        logger.info("***** Finished OCR Successfully *****")
+        dt4 = datetime.fromtimestamp(time.time())
+        logger.info(f"{dt3}~{dt4}: image OCR finished")

        infer_texts = []
        for image in ocr_response_json["images"]:
            for field in image["fields"]:
                infer_texts.append(field["inferText"])
        result = ' '.join(infer_texts)
-
        return result

    except Exception as pe:
@@ -256,14 +263,14 @@ async def upload_curriculum_txt(upload_file: UploadFile = File(...)):

# Connect to the Milvus DB
SERVER = os.getenv('SERVER')
-logger.info(f"* log >> received environment variable SERVER: {SERVER}")
+logger.info(f"* log >> received the environment variable SERVER ({SERVER})")

MILVUS_HOST = ssm_client.get_parameter(
    Name=f'/ono/{SERVER}/fastapi/MILVUS_HOST_NAME',
    WithDecryption=False
)['Parameter']['Value']
MILVUS_PORT = 19530
-COLLECTION_NAME = 'Math2015Curriculum'
+COLLECTION_NAME = 'Curriculum2015'
DIMENSION = 1536
INDEX_TYPE = "IVF_FLAT"

@@ -272,13 +279,16 @@ async def upload_curriculum_txt(upload_file: UploadFile = File(...)):
async def connect_milvus():
    try:
        # Connect to the Milvus server
-        connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
-        logger.info(f"* log >> Milvus Server is connected to {MILVUS_HOST}:{MILVUS_PORT}")
+        dt1 = str(datetime.fromtimestamp(time.time()))
+        connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)  # for the server
+        # connections.connect(host="127.0.0.1", port=19530, db="default")  # for localhost
+        dt2 = str(datetime.fromtimestamp(time.time()))
+        logger.info(f"{dt1} ~ {dt2}: connected to Milvus server at {MILVUS_HOST}:{MILVUS_PORT}")

        # Print the schema of the existing collection
        if utility.has_collection(COLLECTION_NAME):
            collection = Collection(COLLECTION_NAME)
-            logger.info("* Existing Collection Schema:")
+            logger.info(f"* Existing Collection {COLLECTION_NAME} Schema:")
            for field in collection.schema.fields:
                logger.info(f" - Field Name: {field.name}, Data Type #: {field.dtype}")

@@ -297,18 +307,16 @@ async def create_milvus():
        FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='content_embedding', dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
+        FieldSchema(name='subject_name', dtype=DataType.VARCHAR, max_length=100),  # Meta Data1
+        FieldSchema(name='unit_name', dtype=DataType.VARCHAR, max_length=100),  # Meta Data2
+        FieldSchema(name='main_concept', dtype=DataType.VARCHAR, max_length=100),  # Meta Data3
    ]
-    schema = CollectionSchema(fields=fields, description='Math2015Curriculum embedding collection')
+    schema = CollectionSchema(fields=fields, description='2015 Korean High School Curriculum Collection')
    collection = Collection(name=COLLECTION_NAME, schema=schema)
-    logger.info(f"* log >> Collection [{COLLECTION_NAME}] is created.")
+    logger.info(f"* log >> New Collection [{COLLECTION_NAME}] is created.")

    # Create the index
-    # scalar index
-    collection.create_index(
-        field_name="id"
-    )
-    # vector index
-    index_params = {
+    index_params = {  # vector index
        'index_type': INDEX_TYPE,
        'metric_type': 'COSINE',
        'params': {
@@ -342,15 +350,15 @@ def get_embedding(client, text_list):


@app.get("/milvus/insert")
-async def insert_curriculum_embeddings():
+async def insert_curriculum_embeddings(subject: str):
    """ Read the curriculum from S3, embed it, and insert it into Milvus """
    # Connect to Milvus
    await connect_milvus()
    collection = Collection(COLLECTION_NAME)

    # Load the curriculum data from S3
-    texts = []
-    prefix = 'curriculum/science2015/'  # path
+    texts, subject_names, unit_names, main_concepts = [], [], [], []
+    prefix = f'curriculum/{subject}2015/'  # path
    try:
        # Get the list of files in the bucket
        s3_curriculum_response = s3_client.list_objects_v2(Bucket=BUCKET_NAME, Prefix=prefix)
@@ -360,20 +368,34 @@ async def insert_curriculum_embeddings():
            # Get the S3 object
            obj = s3_client.get_object(Bucket=BUCKET_NAME, Key=s3_key)
            # Read the text
-            text = obj['Body'].read().decode('utf-8')
+            data = obj['Body'].read().decode('utf-8')
+            lines = data.splitlines()
+            # Extract the metadata
+            metadata_lines = [line.strip('#').strip() for line in lines[:3]]
+            subject_name = metadata_lines[0]
+            subject_names.append(subject_name)
+            unit_name = metadata_lines[1]
+            unit_names.append(unit_name)
+            main_concept = metadata_lines[2]
+            main_concepts.append(main_concept)
+            # Extract the curriculum content
+            text = '\n'.join(lines[3:]).strip()
            texts.append(text)
        logger.info(f"* log >> read {len(texts)} texts from S3")
    except Exception as e:
        logger.error(f"Error reading curriculum from S3: {e}")

-    # Embed the data
+    # Embed the curriculum content
    content_embeddings = get_embedding(openai_client, texts)
    logger.info(f"* log >> embedding finished. dimension: {DIMENSION}")

    # Insert the data
    data = [
        texts,                # content field
-        content_embeddings    # content_embedding field
+        content_embeddings,   # content_embedding field
+        subject_names,        # subject_name field
+        unit_names,           # unit_name field
+        main_concepts         # main_concept field
    ]
    collection.insert(data)

@@ -390,66 +412,80 @@ async def retrieve(problem_text: str):

        # Search test
        query = problem_text
+        dt5 = str(datetime.fromtimestamp(time.time()))
        query_embeddings = [get_embedding(openai_client, [query])]
        if not query_embeddings or query_embeddings[0] is None:
            raise ValueError("Embedding generation failed")
-        logger.info(f"* log >> Query embedding finished")
+        dt6 = str(datetime.fromtimestamp(time.time()))
+        logger.info(f"{dt5} ~ {dt6}: query embedding finished")

        search_params = {
            'metric_type': 'COSINE',
            'params': {
                'probe': 20
            },
        }
+        dt5 = str(datetime.fromtimestamp(time.time()))
        results = collection.search(
            data=query_embeddings[0],
            anns_field='content_embedding',
            param=search_params,
            limit=3,
            expr=None,
-            output_fields=['content']
+            output_fields=['content', 'subject_name', 'unit_name', 'main_concept']
        )
+        dt6 = str(datetime.fromtimestamp(time.time()))
        context = ' '.join([result.entity.get('content') for result in results[0]])
-        logger.info(f"* log >> context found")
-
-        # Check the results
-        '''logger.info(f"* log >> query results")
+        subjects_list = [result.entity.get('subject_name') for result in results[0]]
+        unit_list = [result.entity.get('unit_name') for result in results[0]]
+        main_concept_list = [result.entity.get('main_concept') for result in results[0]]
+        logger.info(f"{dt5} ~ {dt6}: search finished")
+        logs = ""
        for result in results[0]:
-            logger.info("\n-------------------------------------------------------------------")
-            logger.info(f"Score : {result.distance}, \nText : \n{result.entity.get('content')}")'''
-
-        return context
+            logs += ("\n" + f"Score : {result.distance}, \
+                    \nInfo: {result.entity.get('subject_name')} \
+                    > {result.entity.get('unit_name')} \
+                    > {result.entity.get('main_concept')}, \
+                    \nText : {result.entity.get('content')}" + "\n\n")
+        logger.info(f"* log >> search results: {logs}")
+
+        return context, subjects_list, unit_list, main_concept_list
    except Exception as e:
        logger.error(f"Error in search: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/analysis/augmentation")
async def augment(curriculum_context, query):
-    prompt = "Based on the curriculum, tell me the concepts this problem requires. Give the response in natural language. \n"
-    context = curriculum_context
-    passage = query
+    prompt = "You are a notebook. From the curriculum I give you, pick the parts related to the problem below and show a high school student \
+              why this problem could be missed or confusing and which concepts and lines of reasoning it seems to require, \
+              touching on the problem's intent and key point without making it too long.\n"
+    context = f"The curriculum is as follows: {curriculum_context} "
+    passage = f"The problem is as follows: {query}."
    augmented_query = prompt + context + passage
    return augmented_query

@app.get("/analysis/generation")
async def generate(question):
    def get_chatgpt_response(client, question, model="gpt-4o-mini"):
        try:
+            dt7 = str(datetime.fromtimestamp(time.time()))
            gpt_response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user",
                     "content": question
                     }
                ],
-                temperature=0.5
+                temperature=0.7
            )
+            dt8 = str(datetime.fromtimestamp(time.time()))
+            logger.info(f"{dt7} ~ {dt8}: LLM response finished")
            return gpt_response.choices[0].message.content
        except Exception as e:
            logger.info(f"Error during GPT querying: {e}")
            return None

    chatgpt_response = get_chatgpt_response(openai_client, question)
-    logger.info(f"* log >> ChatGPT Response: {chatgpt_response}")
+    logger.info(f"* log >> response result\n{chatgpt_response}")
    return chatgpt_response
