@@ -76,12 +76,9 @@ std::vector<torch::jit::IValue> LlamacppHandler::Preprocess(
     std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
     std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
     std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
-  std::cout << "Initializing llama context" << std::endl;
-
+
   initialize_context();

-  std::cout << "Llama context initialized" << std::endl;
-
   std::vector<torch::jit::IValue> batch_ivalue;
   std::vector<torch::Tensor> batch_tensors;
   uint8_t idx = 0;
@@ -113,10 +110,6 @@ std::vector<torch::jit::IValue> LlamacppHandler::Preprocess(
       continue;
     }

-    std::cout << "Received Input: " << data_it->second << std::endl;
-
-    // std::vector new_data = request.parameters["data"];
-    // std::string msg = torchserve::Converter::VectorToStr(new_data);
     std::string msg = torchserve::Converter::VectorToStr(data_it->second);

     // tokenization
@@ -228,10 +221,6 @@ torch::Tensor LlamacppHandler::Inference(
         break;
       }

-      // print the new token :
-      std::cout << "New Token: " << llama_token_to_piece(llama_ctx, new_token_id)
-                << std::endl;
-
       // push this new token for next evaluation
       tokens_list.push_back(new_token_id);
     }
@@ -265,7 +254,6 @@ void LlamacppHandler::Postprocess(
     }

     std::string generated_text_str = generated_text_stream.str();
-    std::cout << "Generated Text Str: " << generated_text_str << std::endl;

     auto response = (*response_batch)[kv.second];
