diff --git a/jsk_perception/docker/ofa/server.py b/jsk_perception/docker/ofa/server.py
index 6c667e3328..82bb636486 100644
--- a/jsk_perception/docker/ofa/server.py
+++ b/jsk_perception/docker/ofa/server.py
@@ -61,7 +61,7 @@ def __init__(self, task, model_scale):
                 utils.split_paths(param_path),
                 arg_overrides=overrides)
         elif task == "refcoco":
-            tasks.register_task(self.task, RefcocoTask)
+            tasks.register_task(task, RefcocoTask)
             self.models, self.cfg, self.task = checkpoint_utils.load_model_ensemble_and_task(
                 utils.split_paths(param_path),
                 arg_overrides=overrides)
@@ -140,6 +140,15 @@ def encode_text(self, text, length=None, append_bos=False, append_eos=False):
             s = torch.cat([s, eos_item])
         return s
 
+    def convert_objects_to_text(self, text):
+        if len(text) == 1:
+            object_text = text[0]
+        elif len(text) >= 2:
+            object_text = ', '.join(text[:-1]) + f' or {text[-1]}'
+        else:
+            object_text = ''
+        return object_text
+
     def construct_sample(self, image, text):
         if self.task_name == "caption" or self.task_name == "vqa_gen":
             patch_image = self.patch_resize_transform(image).unsqueeze(0)
@@ -176,7 +185,8 @@ def construct_sample(self, image, text):
             h_resize_ratio = torch.tensor(patch_image_size / h).unsqueeze(0)
             patch_image = self.patch_resize_transform(image).unsqueeze(0)
             patch_mask = torch.tensor([True])
-            src_text = self.encode_text(' which region does the text " {} " describe?'.format(text), append_bos=True,
+            object_text = self.convert_objects_to_text(text)
+            src_text = self.encode_text(' which region does the text " {} " describe?'.format(object_text), append_bos=True,
                                         append_eos=True).unsqueeze(0)
             src_length = torch.LongTensor([s.ne(self.pad_idx).long().sum() for s in src_text])
             sample = {
@@ -214,7 +224,24 @@ def infer(self, img, text):
             text = result[0]['answer']
             return text
         elif self.task_name == "refcoco":
-            pass
+            # image = cv2.resize(img, dsize=(640, 480))  # NOTE: forcible resize
+            # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            image = Image.fromarray(image)
+            # Construct input sample & preprocess for GPU if cuda is available (for visual grounding)
+            sample = self.construct_sample(image, text)
+            sample = utils.move_to_cuda(sample) if self.use_cuda else sample
+            sample = utils.apply_to_sample(apply_half, sample) if self.use_fp16 else sample
+            with torch.no_grad():
+                result, scores = eval_step(self.task, self.generator, self.models, sample)
+            results = {}
+            object_text = self.convert_objects_to_text(text)
+            for i in range(len(result)):
+                box = result[i]["box"]
+                logit = scores[i].item()
+                results[i] = {"box": box, "logit": logit, "phrase": object_text}
+
+            return results
 
 # run
 if __name__ == "__main__":
@@ -232,6 +259,9 @@ def infer(self, img, text):
     elif ofa_task == "vqa_gen":
         vqa_infer = Inference("vqa_gen", ofa_model_scale)
 
+    elif ofa_task == "detection":
+        detection_infer = Inference("refcoco", ofa_model_scale)
+
     else:
         raise RuntimeError("No application is available")
 
@@ -274,5 +304,25 @@ def vqa_request():
             return Response(response=json.dumps({"results": results}), status=200)
     except NameError:
         print("Skipping create vqa_gen app")
-
+
+    try:
+        @app.route("/detection", methods=['POST'])
+        def detection_request():
+            data = request.data.decode("utf-8")
+            data_json = json.loads(data)
+            # process image
+            image_b = data_json['image']
+            image_dec = base64.b64decode(image_b)
+            data_np = np.fromstring(image_dec, dtype='uint8')
+            img = cv2.imdecode(data_np, 1)
+            # get text
+            texts = data_json['queries']
+            infer_results = detection_infer.infer(img, texts)
+            results = []
+            for i in range(len(infer_results)):
+                results.append({"id": i, "box": infer_results[i]["box"], "logit": infer_results[i]["logit"], "phrase": infer_results[i]["phrase"]})
+            return Response(response=json.dumps({"results": results}), status=200)
+    except NameError:
+        print("Skipping create detection app")
+
     app.run("0.0.0.0", 8080, threaded=True)
diff --git a/jsk_perception/src/jsk_perception/vil_inference_client.py b/jsk_perception/src/jsk_perception/vil_inference_client.py
index 8ff251bcc0..76233faf5f 100644
--- a/jsk_perception/src/jsk_perception/vil_inference_client.py
+++ b/jsk_perception/src/jsk_perception/vil_inference_client.py
@@ -211,6 +211,7 @@ def __init__(self):
                                                        DetectionTaskFeedback,
                                                        DetectionTaskResult,
                                                        "detection")
+        self.model_name = rospy.get_param("~model", default="dino")
         self.pub_class = rospy.Publisher('~class', ClassificationResult, queue_size=1)
         self.pub_rects = rospy.Publisher('~rects', RectArray, queue_size=1)
         self.pub_image = rospy.Publisher('~output/image', Image, queue_size=1)
@@ -255,7 +256,7 @@ def inference(self, img_msg, queries):
         classification_msg.label_names = labels
         classification_msg.label_proba = scores  # cosine similarities
         classification_msg.probabilities = scores  # sum(probabilities) is 1
-        classification_msg.classifier = 'dino'
+        classification_msg.classifier = self.model_name
         classification_msg.target_names = queries
         self.pub_class.publish(classification_msg)
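For reference, a minimal client sketch exercising the new /detection route added above. It assumes the OFA server is running and reachable at localhost:8080 (the address app.run binds); the image path "scene.jpg", the query strings, and the use of the requests library are illustrative assumptions, not part of the patch.

    import base64
    import json

    import requests

    # Base64-encode an image file, as detection_request expects in the 'image' field.
    with open("scene.jpg", "rb") as f:  # hypothetical input image
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "image": image_b64,
        # Multiple queries are joined server-side by convert_objects_to_text,
        # e.g. into the single phrase "a cup or a bottle".
        "queries": ["a cup", "a bottle"],
    }
    res = requests.post("http://localhost:8080/detection", data=json.dumps(payload))
    for r in res.json()["results"]:
        print(r["id"], r["box"], r["logit"], r["phrase"])

On the ROS client side, the classifier name reported in ClassificationResult now follows the new ~model parameter (default "dino") instead of the hard-coded 'dino' string, so it can be overridden per node, e.g. with _model:=ofa.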