diff --git a/.gitignore b/.gitignore index 7d7b138f9..506c445eb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,61 @@ +# Data files and directories common in repo root +datasets/ +logs/ +*.h5 +*.weights +results/ +temp/ +test/ *.jpg *.jpeg -*.weights -*.h5 + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# VS Studio Code +.vscode + +# PyCharm +.idea/ + +# Dropbox +.dropbox.attr + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ diff --git a/README.md b/README.md index 0875cfcba..b698777cf 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,26 @@ VOC (20 classes) (http://host.robots.ox.ac.uk/pascal/VOC/voc2012/) | 72% | check Grab the pretrained weights of yolo3 from https://pjreddie.com/media/files/yolov3.weights. +```wget -c https://pjreddie.com/media/files/yolov3.weights``` + +Environment setup: + +```pip install -r requirements.txt``` + +If Nvidia GPU is available: + +```pip uninstall tensorflow``` + +```pip install tensorflow-gpu``` + +Detection on single image: + ```python yolo3_one_file_to_detect_them_all.py -w yolo3.weights -i dog.jpg``` +If your webcam is available: + +```python yolo3_cam.py -w yolov3.weights``` + ## Training ### 1. 
# Shared Redis connection pool; every worker opens its sessions from it.
redispool = redis.ConnectionPool(host=config.REDIS_HOST,
                                 port=config.REDIS_PORT,
                                 db=config.REDIS_DB,
                                 socket_keepalive=True)

# Fail fast at start-up when the Redis server cannot be reached.
try:
    print('Testing Redis Connection')
    redisdbSession = redis.StrictRedis(connection_pool=redispool)
    response = redisdbSession.client_list()
    print('Redis Connection Established')
except redis.ConnectionError as e:
    print(e)
    sys.exit(1)

# Print arrays in full. A NaN threshold is rejected by current NumPy
# releases; sys.maxsize is the supported way to disable summarisation.
np.set_printoptions(threshold=sys.maxsize)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# set some parameters
net_h, net_w = 416, 416
obj_thresh, nms_thresh = 0.7, 0.7
anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]
labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
          "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
          "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
          "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
          "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
          "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
          "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
          "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
          "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
          "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]


class mlWorker(Process):
    """Worker process that serves YOLOv3 detections for images queued in Redis.

    Each worker pops JSON requests (keys ``id``, ``image`` and ``threshold``)
    from ``config.IMAGE_QUEUE``, runs the network on the decoded image and
    stores the JSON-encoded detections in Redis under the request id.
    """

    def __init__(self, LOCK, GPU="", FRAC=0):
        """Store the worker settings; the heavy initialisation happens in run().

        LOCK: lock shared by all workers that guards the read-and-trim of the queue.
        GPU:  GPUtil GPU object to run on, or a falsy value for the default device.
        FRAC: per-process GPU memory fraction handed to TensorFlow.
        """
        Process.__init__(self)
        self.lock = LOCK
        if GPU:
            print('{} using GPUid: {}, Name: {}'.format(self.name, str(GPU.id), str(GPU.name)))
            self.device = '/device:GPU:0'
        else:
            self.device = ''
        self.GPU = GPU
        self.frac = FRAC
        self.counter = 0
        self.dt = 0.0

    def run(self):
        """Build the model inside the child process and enter the serving loop."""
        if self.GPU:
            # Pin the device here, in the child. Doing it in __init__ changes
            # the parent's environment, so every worker started afterwards
            # would inherit the id of the last GPU that was configured.
            os.environ["CUDA_VISIBLE_DEVICES"] = str(self.GPU.id)

        # Imported lazily so TensorFlow/Keras are only loaded in the children.
        from utils.weightreader import WeightReader
        from utils.bbox import BoundBox
        from utils.tools import preprocess_input, decode_netout
        from utils.tools import correct_yolo_boxes, do_nms, draw_boxes
        from model.yolo3 import make_yolov3_model
        import tensorflow as tf
        from keras import backend as K
        from PIL import Image
        self.Image = Image
        self.preprocess_input = preprocess_input
        self.decode_netout = decode_netout
        self.correct_yolo_boxes = correct_yolo_boxes
        self.do_nms = do_nms
        self.draw_boxes = draw_boxes

        K.clear_session()
        if self.GPU:
            print('ML Process: {} starting, using GPU: {}, frac: {}'.format(self.name, self.GPU.id, self.frac))
            conf = tf.ConfigProto()
            conf.gpu_options.per_process_gpu_memory_fraction = self.frac
            K.set_session(tf.Session(config=conf))

        # make the yolov3 model to predict 80 classes on COCO
        _model = make_yolov3_model()

        # load the weights trained on COCO into the model
        weight_reader = WeightReader(config.MODEL_PATH)
        weight_reader.load_weights(_model)

        graph = tf.get_default_graph()
        print('ML Process: {} started'.format(self.name))
        self.mainloop(model=_model, graph=graph)

    def mainloop(self, model='', graph=''):
        """Poll the Redis queue forever and answer every request that is popped."""
        while True:
            try:
                redisdbSession = redis.StrictRedis(connection_pool=redispool)
                # Read-and-trim must be atomic across workers. The context
                # manager releases the lock even when Redis raises, which a
                # bare acquire()/release() pair does not.
                with self.lock:
                    query = redisdbSession.lrange(config.IMAGE_QUEUE, 0, config.BATCH_SIZE - 1)
                    redisdbSession.ltrim(config.IMAGE_QUEUE, len(query), -1)
            except Exception as e:
                # Keep the worker alive across transient Redis failures.
                print(e)
                time.sleep(config.SERVER_SLEEP)
                continue

            for item in query:
                try:
                    self._handle_request(redisdbSession, item, model, graph)
                except Exception as e:
                    # One malformed request must not drop the rest of the batch.
                    print(e)

            # sleep for a small amount
            time.sleep(config.SERVER_SLEEP * 2)

    def _handle_request(self, redisdbSession, item, model, graph):
        """Run detection for one queued request and store its result in Redis."""
        data = json.loads(item)
        image = self.base64_decode_image(data["image"])
        # Remember the original size: boxes are mapped back to it later.
        image_h, image_w = image.shape[:2]
        # NOTE(review): base64_decode_image yields RGB; confirm that
        # preprocess_input does not assume OpenCV's BGR channel order.
        net_input = self.preprocess_input(image, net_h, net_w)

        start = time.time()
        with graph.as_default():
            results = model.predict(net_input)
        self.dt += time.time() - start
        self.counter += 1
        print('avg dt: %f' % (self.dt / self.counter))

        output = self.extract_result(results, labels,
                                     throttle=float(data["threshold"]),
                                     image_h=image_h, image_w=image_w)
        redisdbSession.set(data["id"], json.dumps(output))

    def extract_result(self, results, labels, throttle='0.95', image_h=None, image_w=None):
        """Convert raw network outputs into a JSON-serialisable detection list.

        results:  list of YOLO output tensors returned by ``model.predict``.
        labels:   class names, indexed like ``box.classes``.
        throttle: minimum score a class needs to be reported.
        image_h, image_w: size of the original image; defaults to the network
            input size, in which case boxes are not rescaled.

        Returns a list of dicts with the keys ``bbox`` (xmin, ymin, xmax,
        ymax), ``label`` and ``score``.
        """
        score_thresh = float(throttle)
        image_h = net_h if image_h is None else image_h
        image_w = net_w if image_w is None else image_w

        boxes = []
        for netout, scale_anchors in zip(results, anchors):
            # decode the output of the network (one list of boxes per scale)
            boxes += self.decode_netout(netout[0], scale_anchors, score_thresh, net_h, net_w)
        # correct the sizes of the bounding boxes
        self.correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w)
        # suppress non-maximal boxes
        self.do_nms(boxes, nms_thresh)

        output = []
        for box in boxes:
            names = [name for i, name in enumerate(labels) if box.classes[i] > score_thresh]
            if names:
                output.append({
                    'bbox': [int(box.xmin), int(box.ymin), int(box.xmax), int(box.ymax)],
                    'label': ''.join(names),
                    'score': str(box.get_score())
                })
        return output

    def base64_decode_image(self, a):
        """Decode a base64-encoded image into an RGB numpy array."""
        img = self.Image.open(BytesIO(base64.b64decode(a)))
        if img.mode != "RGB":
            img = img.convert("RGB")
        img = np.array(img)
        return img


if __name__ == "__main__":
    LOCK = Lock()
    AVAIL_DEVICE_LIST = config.AVAIL_DEVICE_LIST
    AVAIL_DEVICE_MEMFRAC = config.AVAIL_DEVICE_MEMFRAC
    AVAIL_DEVICE_MAXTHREAD = config.AVAIL_DEVICE_MAXTHREAD

    proc_list = []
    print('{} GPUs Available'.format(len(AVAIL_DEVICE_LIST)))
    if AVAIL_DEVICE_LIST:
        for index, device in enumerate(AVAIL_DEVICE_LIST):
            # Cap the per-device settings at the configured maxima.
            thread_count = min(int(AVAIL_DEVICE_MAXTHREAD[index]), config.MAX_THREADS)
            mem_frac = min(float(AVAIL_DEVICE_MEMFRAC[index]), config.MAX_FRAC)
            print('Preparing {} process on GPU: {}, frac: {}'.format(thread_count, device.id, mem_frac))
            for _ in range(thread_count):
                p = mlWorker(LOCK, GPU=device, FRAC=mem_frac)
                p.daemon = True
                proc_list.append(p)
        print('Starting total: {} processes'.format(len(proc_list)))
        for proc in proc_list:
            proc.start()
        print('All processes started')
    else:
        # No usable GPU: run a single worker on the default device.
        p = mlWorker(LOCK)
        p.daemon = True
        p.start()
        p.join()

    for proc in proc_list:
        proc.join()
ALLOWED_EXTENSIONS = set(['jpg', 'jpeg'])

ROOT_DIR = os.getcwd()
UPLOAD_FOLDER = os.path.join(ROOT_DIR, "images")
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
# Log directory; it is created here but no constant keeps pointing at it.
os.makedirs(os.path.join(ROOT_DIR, "logs"), exist_ok=True)
# Local directory and path of the trained weights file.
MODEL_DIR = os.path.join(ROOT_DIR, "weights")
# The directory check used this name without ever defining it.
COCO_MODEL_DIR = MODEL_DIR
os.makedirs(COCO_MODEL_DIR, exist_ok=True)
# NOTE(review): the file name says Mask R-CNN/.h5, but the download below
# fetches the Darknet yolov3.weights file -- confirm before renaming.
MODEL_PATH = os.path.join(ROOT_DIR, "weights/mask_rcnn_coco.h5")

# Verbosity of the automatic download below (0 = silent).
VERBOSE = 1


def download_trained_weights(coco_model_path, verbose=1):
    """Download the COCO-trained YOLOv3 weights.

    coco_model_path: local path the weights are written to
    verbose: print progress messages when greater than 0
    """
    MODEL_URL = "https://pjreddie.com/media/files/yolov3.weights"
    if verbose > 0:
        print("Downloading pretrained model to " + coco_model_path + " ...")
    # Download next to the target and rename when complete, so an
    # interrupted transfer is never mistaken for a finished weights file.
    partial_path = coco_model_path + ".part"
    with urllib.request.urlopen(MODEL_URL) as resp, open(partial_path, 'wb') as out:
        shutil.copyfileobj(resp, out)
    os.replace(partial_path, coco_model_path)
    if verbose > 0:
        print("... done downloading pretrained model!")


# Download the COCO trained weights if they are not present yet
if not os.path.exists(MODEL_PATH):
    download_trained_weights(MODEL_PATH, verbose=VERBOSE)

import GPUtil

LEAST_GMEM = 2250  # MB
MAX_THREADS = 1
MIN_FRAC = 0.3
MAX_FRAC = 0.3
GPU_LAOD = 0.5
GMEM_LAOD_LIMIT = 1.0
AVAIL_DEVICE_LIST = []
AVAIL_DEVICE_MAT = []
AVAIL_DEVICE_MEMFRAC = []
AVAIL_DEVICE_MAXTHREAD = []
try:
    GPUs = GPUtil.getGPUs()
    Gall = ''
    Gfree = ''
    for GPU in GPUs:
        Gall = GPU.memoryTotal
        Gfree = GPU.memoryFree
        # Share of this GPU's memory that LEAST_GMEM represents.
        GMEM_LAOD_LIMIT = round(LEAST_GMEM / Gall, 2)
        # NOTE(review): the share is passed as maxMemory -- confirm this is
        # the intended availability criterion.
        if int(GPUtil.getAvailability([GPU], maxLoad=GPU_LAOD, maxMemory=GMEM_LAOD_LIMIT)) == 1:
            AVAIL_DEVICE_LIST.append(GPU)
            # Clamp the per-process memory fraction to [MIN_FRAC, MAX_FRAC].
            if GMEM_LAOD_LIMIT < MIN_FRAC:
                GMEM_LAOD_LIMIT = MIN_FRAC
            if GMEM_LAOD_LIMIT > MAX_FRAC:
                GMEM_LAOD_LIMIT = MAX_FRAC
            AVAIL_DEVICE_MEMFRAC.append(GMEM_LAOD_LIMIT)
            AVAIL_DEVICE_MAXTHREAD.append(int(1.0 / GMEM_LAOD_LIMIT))
except Exception as e:
    # GPU probing is best effort: with no usable GPU the lists stay empty.
    print(e)

# initialize Redis connection settings
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_DB = 0

BATCH_SIZE = 1
# initialize constants used for server queuing
IMAGE_QUEUE = "yolo3_queue"

SERVER_SLEEP = 0.1
CLIENT_SLEEP = 0.1

# Output Throttle
THROTTLE = 0.9
def IOU(ann, centroids):
    """Return the IoU between one box size and every centroid box size.

    Only widths and heights are compared, as if all boxes shared a corner,
    so the intersection is the product of the smaller width and height.

    ann:       (width, height) of one annotation.
    centroids: iterable of (width, height) pairs.

    Returns a numpy array with one similarity per centroid.
    """
    w, h = ann
    similarities = []

    for c_w, c_h in centroids:
        intersection = min(w, c_w) * min(h, c_h)
        union = w * h + c_w * c_h - intersection
        # Two zero-area boxes have no meaningful overlap.
        similarities.append(intersection / union if union > 0 else 0.0)

    return np.array(similarities)  # shape (k,)


def avg_IOU(anns, centroids):
    """Return the mean, over all annotations, of the best IoU with any centroid."""
    n, _ = anns.shape
    total = 0.

    for ann in anns:
        total += max(IOU(ann, centroids))

    return total / n


def print_anchors(centroids):
    """Print the centroids as pixel anchors for a 416x416 input, narrowest first.

    The printed string is also returned so callers can reuse it.
    """
    order = np.argsort(centroids[:, 0])
    out_string = ', '.join(
        str(int(centroids[i, 0] * 416)) + ',' + str(int(centroids[i, 1] * 416))
        for i in order)

    print(out_string)
    return out_string


def run_kmeans(ann_dims, anchor_num):
    """Cluster annotation sizes with k-means, using 1 - IoU as the distance.

    ann_dims:   array of shape (n, 2) holding (width, height) pairs.
    anchor_num: number of clusters; values convertible by int() are accepted.

    Returns an (anchor_num, 2) array of centroids once the assignment of
    samples to centroids stops changing.

    Raises ValueError when there are no annotations to cluster.
    """
    # Work on floats so integer input does not truncate the centroid means.
    ann_dims = np.asarray(ann_dims, dtype=float)
    anchor_num = int(anchor_num)
    ann_num = ann_dims.shape[0]
    if ann_num == 0:
        raise ValueError('run_kmeans needs at least one annotation')

    prev_assignments = None
    iteration = 0
    old_distances = np.zeros((ann_num, anchor_num))

    # Start from randomly chosen annotations (fancy indexing copies them).
    indices = [random.randrange(ann_num) for _ in range(anchor_num)]
    centroids = ann_dims[indices]

    while True:
        iteration += 1
        # distances.shape = (ann_num, anchor_num)
        distances = np.array([1 - IOU(ann, centroids) for ann in ann_dims])

        print("iteration {}: dists = {}".format(iteration, np.sum(np.abs(old_distances-distances))))

        # assign samples to centroids
        assignments = np.argmin(distances, axis=1)

        if prev_assignments is not None and (assignments == prev_assignments).all():
            return centroids

        # calculate new centroids
        for j in range(anchor_num):
            members = ann_dims[assignments == j]
            # An empty cluster keeps its previous centroid; resetting it to
            # zero would leave it with IoU 0 to every box for good.
            if len(members) > 0:
                centroids[j] = members.mean(axis=0)

        prev_assignments = assignments.copy()
        old_distances = distances.copy()


def _main_(argv):
    """Compute and print anchors for the dataset described by the config file.

    argv: parsed command-line arguments with the attributes ``conf`` (path
          to the JSON configuration) and ``anchors`` (number of anchors).
    """
    config_path = argv.conf
    num_anchors = int(argv.anchors)

    with open(config_path) as config_buffer:
        config = json.loads(config_buffer.read())

    train_imgs, _ = parse_openimg_annotation(
        config['train']['train_annot_file'],
        config['train']['train_image_folder'],
        config['train']['label_map'],
        config['train']['cache_name'],
        config['model']['labels']
    )

    # run k_mean to find the anchors
    annotation_dims = []
    for image in train_imgs:
        for obj in image['object']:
            relative_w = (float(obj['xmax']) - float(obj['xmin']))/image['width']
            relative_h = (float(obj['ymax']) - float(obj['ymin']))/image['height']
            annotation_dims.append((relative_w, relative_h))
    annotation_dims = np.array(annotation_dims)
    centroids = run_kmeans(annotation_dims, num_anchors)

    # report the anchors
    print('\naverage IOU for', num_anchors, 'anchors:', '%0.2f' % avg_IOU(annotation_dims, centroids))
    print_anchors(centroids)


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()

    argparser.add_argument(
        '-c',
        '--conf',
        default='config.json',
        help='path to configuration file')
    argparser.add_argument(
        '-a',
        '--anchors',
        type=int,
        default=9,
        help='number of anchors to use')

    args = argparser.parse_args()
    _main_(args)
a/config/config_carplate.json b/config/config_carplate.json new file mode 100644 index 000000000..0da74d04e --- /dev/null +++ b/config/config_carplate.json @@ -0,0 +1,38 @@ +{ + "model" : { + "min_input_size": 352, + "max_input_size": 448, + "anchors": [2,5, 2,7, 3,8, 4,11, 5,15, 7,20, 11,27, 16,46, 29,59], + "labels": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9","A", "B", "C", "D", "E", "F", + "G", "H", "I", "J", "K", "L", "M","N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "carPlate"] + }, + + "train": { + "train_image_folder": "/workspace/carPlateDataset/20180607/JPEGImages/", + "train_annot_folder": "/workspace/carPlateDataset/20180607/Annotations/", + "cache_name": "cache/carplate_train.pkl", + + "train_times": 8, + "batch_size": 12, + "learning_rate": 1e-4, + "nb_epochs": 100, + "warmup_epochs": 3, + "ignore_thresh": 0.5, + "gpus": "0", + "grid_scales": [1,1,1], + "obj_scale": 5, + "noobj_scale": 1, + "xywh_scale": 1, + "class_scale": 1, + "tensorboard_dir": "logs", + "saved_weights_name": "weights/carplate.h5", + "debug": false + }, + "valid": { + "valid_image_folder": "", + "valid_annot_folder": "", + "cache_name": "", + + "valid_times": 1 + } +} diff --git a/config/config_openimg.json b/config/config_openimg.json new file mode 100644 index 000000000..6730746d3 --- /dev/null +++ b/config/config_openimg.json @@ -0,0 +1,41 @@ +{ + "model" : { + "min_input_size": 448, + "max_input_size": 448, + "anchors": [55,69, 75,234, 133,240, 136,129, 142,363, 203,290, 228,184, 285,359, 341,260], + "labels": ["Person"] + }, + + "train": { + "train_image_folder": "/data1/openimages/600c_bbox/images/train/", + "train_annot_file": "/data1/openimages/600c_bbox/annotation/train-annotations-bbox.csv", + "train_annot_folder": "/data1/openimages/600c_bbox/annotation/", + "cache_name": "cache/openimg_train.pkl", + "label_map": "/data1/openimages/600c_bbox/annotation/class-descriptions-boxable.csv", + "train_times": 8, + "batch_size": 16, + "learning_rate": 
1e-4, + "nb_epochs": 100, + "warmup_epochs": 3, + "ignore_thresh": 0.5, + "gpus": "0,1", + + "grid_scales": [1,1,1], + "obj_scale": 5, + "noobj_scale": 1, + "xywh_scale": 1, + "class_scale": 1, + + "tensorboard_dir": "logs", + "saved_weights_name": "weights/openimg_600c.h5", + "debug": true + }, + + "valid": { + "valid_image_folder": "/data1/openimages/600c_bbox/images/validation/", + "valid_annot_folder": "/data1/openimages/600c_bbox/annotation/", + "train_annot_file": "validation-annotations-bbox.csv", + "cache_name": "openimg_val.pkl", + "valid_times": 1 + } +} diff --git a/config.json b/config/config_sample.json similarity index 84% rename from config.json rename to config/config_sample.json index 31722ac92..b2830a32a 100644 --- a/config.json +++ b/config/config_sample.json @@ -7,9 +7,9 @@ }, "train": { - "train_image_folder": "/home/andy/Desktop/github/kangaroo/images/", + "train_image_folder": "/data1/openimages/600c_bbox/images/train/", "train_annot_folder": "/home/andy/Desktop/github/kangaroo/annots/", - "cache_name": "kangaroo_train.pkl", + "cache_name": "cache/kangaroo_train.pkl", "train_times": 8, "batch_size": 16, @@ -26,7 +26,7 @@ "class_scale": 1, "tensorboard_dir": "logs", - "saved_weights_name": "kangaroo.h5", + "saved_weights_name": "weights/kangaroo.h5", "debug": true }, diff --git a/dataset/__init__.py b/dataset/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dataset/csv2xml.py b/dataset/csv2xml.py new file mode 100644 index 000000000..91fb0dc0e --- /dev/null +++ b/dataset/csv2xml.py @@ -0,0 +1,104 @@ +''' +### CSV Format ### +ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside +000026e7ee790996,freeform,/m/07j7r,1,0.071905,0.145346,0.206591,0.391306,0,1,1,0,0 +### File Format ### +/data/ + - /images/ + - /train/ + - /validation/ + - /test/ + - /annotations/ + - train-annotations-bbox.csv + - validation-annotations-bbox.csv + - test-annotations-bbox.csv + - 
def parse_tfrecord_annotation(ann_file, img_dir, lablefile, cache_name, labels=None):
    '''
    Parse an Open Images style bounding-box CSV into per-image records.

    ann_file:   /data/annotations/train-annotations-bbox.csv
    img_dir:    /data/images/train/ (prefixed as-is to "<ImageID>.jpg")
    lablefile:  accepted for interface compatibility; not read here
    cache_name: pickle file used to cache the parsed result
    labels:     label names to keep; empty or None keeps every label

    Returns (all_insts, seen_labels). all_insts lists one dict per image
    that has at least one kept object; seen_labels counts every label in
    the CSV, including labels that were filtered out.

    An annotation file that cannot be read is reported and ignored: the
    function then returns ([], {}) and writes no cache.
    '''
    if os.path.exists(cache_name):
        # NOTE: pickle must only be loaded from caches this tool wrote itself.
        with open(cache_name, 'rb') as handle:
            cache = pickle.load(handle)
        return cache['all_insts'], cache['seen_labels']

    try:
        rows = pandas.read_csv(ann_file).values
    except (OSError, ValueError) as e:
        # pandas parser errors derive from ValueError.
        print(e)
        print('Ignore this bad annotation: ' + ann_file)
        return [], {}

    seen_labels = {}
    imgs = {}
    for row in rows:
        image_id, name = row[0], row[2]
        seen_labels[name] = seen_labels.get(name, 0) + 1
        if labels and name not in labels:
            continue

        img = imgs.get(image_id)
        if img is None:
            fn = image_id + '.jpg'
            img = {'filename': fn, 'path': img_dir + fn, 'object': []}
            # The image is opened only to read its dimensions.
            with Image.open(img['path']) as im:
                width, height = im.size
            img['size'] = {'width': width, 'height': height, 'depth': 3}
            imgs[image_id] = img

        # Columns 4-7 are XMin, XMax, YMin, YMax as fractions of the image
        # size (see the CSV format in the module docstring).
        width, height = img['size']['width'], img['size']['height']
        img['object'].append({
            'name': name,
            'xmin': int(round(float(row[4]) * width)),
            'xmax': int(round(float(row[5]) * width)),
            'ymin': int(round(float(row[6]) * height)),
            'ymax': int(round(float(row[7]) * height)),
        })

    all_insts = [img for img in imgs.values() if len(img['object']) > 0]

    cache = {'all_insts': all_insts, 'seen_labels': seen_labels}
    cache_dir = os.path.dirname(cache_name)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_name, 'wb') as handle:
        pickle.dump(cache, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return all_insts, seen_labels
def parse_openimg_annotation(ann_file, img_dir, lable_map, cache_name, labels=None):
    '''
    Parse Open Images bounding-box annotations into per-image records.

    ann_file:   bounding-box CSV (format described in the module docstring)
    img_dir:    directory that holds the "<ImageID>.*" image files
    lable_map:  header-less CSV that maps label ids to label names
    cache_name: pickle file used to cache the parsed result
    labels:     label names to keep; empty or None keeps every label

    Returns (all_insts, seen_labels). all_insts lists one dict per image
    with at least one kept object (keys: filename, width, height, object);
    seen_labels counts every label in the CSV, including filtered ones.

    Annotation files that cannot be read are reported and ignored: the
    function then returns ([], {}) and writes no cache.
    '''
    if os.path.exists(cache_name):
        # NOTE: pickle must only be loaded from caches this tool wrote itself.
        with open(cache_name, 'rb') as handle:
            cache = pickle.load(handle)
        return cache['all_insts'], cache['seen_labels']

    try:
        # Read each file in one go; one DataFrame per row (chunksize=1)
        # is extremely slow on the multi-million-row Open Images CSVs.
        img_csv = pandas.read_csv(ann_file, sep=',', header=None, skiprows=1, dtype=str)
        label_csv = pandas.read_csv(lable_map, sep=',', header=None)
    except (OSError, ValueError) as e:
        # pandas parser errors derive from ValueError.
        print(e)
        print('Ignore this bad annotation: ' + str(ann_file))
        return [], {}

    label_map = {str(row[0]): str(row[1])
                 for row in label_csv.itertuples(index=False, name=None)}

    seen_labels = {}
    imgs = {}
    for row in img_csv.itertuples(index=False, name=None):
        iid = str(row[0])
        label_name = label_map[str(row[2])]
        seen_labels[label_name] = seen_labels.get(label_name, 0) + 1
        if labels and label_name not in labels:
            continue

        img = imgs.get(iid)
        if img is None:
            img = {'object': [], 'filename': os.path.join(img_dir, iid + '.jpg')}
            if not os.path.exists(img['filename']):
                # Fall back to any other extension stored for this image id.
                npath = glob.glob(os.path.join(img_dir, iid) + '.*')
                if not npath:
                    raise FileNotFoundError(
                        'no image file found for ' + iid + ' in ' + str(img_dir))
                img['filename'] = npath[0]
            # The image is opened only to read its dimensions.
            with Image.open(img['filename']) as im:
                img['width'], img['height'] = im.size
            imgs[iid] = img

        # CSV columns 4-7 are XMin, XMax, YMin, YMax as fractions of the
        # image size; x values scale with the width, y with the height.
        img['object'].append({
            'name': label_name,
            'xmin': int(round(float(row[4]) * img['width'])),
            'ymin': int(round(float(row[6]) * img['height'])),
            'xmax': int(round(float(row[5]) * img['width'])),
            'ymax': int(round(float(row[7]) * img['height']))})

    all_insts = [img for img in imgs.values() if len(img['object']) > 0]

    cache = {'all_insts': all_insts, 'seen_labels': seen_labels}
    cache_dir = os.path.dirname(cache_name)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_name, 'wb') as handle:
        pickle.dump(cache, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return all_insts, seen_labels
# Print arrays in full. A NaN threshold is rejected by current NumPy
# releases; sys.maxsize is the supported way to disable summarisation.
np.set_printoptions(threshold=sys.maxsize)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# set some parameters
net_h, net_w = 416, 416
obj_thresh, nms_thresh = 0.7, 0.7
anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]
labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
          "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
          "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
          "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
          "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
          "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
          "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
          "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
          "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
          "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]


def post_stream(yolos, boxes, image):
    """Turn raw network outputs into boxes drawn on the frame.

    yolos: list of YOLO output tensors from ``model.predict``.
    boxes: list the decoded boxes are appended to (extended in place).
    image: frame the boxes are drawn on; its shape gives the target size.

    Returns whatever ``draw_boxes`` returns for the frame.
    """
    image_h, image_w, _ = image.shape
    for netout, scale_anchors in zip(yolos, anchors):
        # decode the output of the network
        boxes += decode_netout(netout[0], scale_anchors, obj_thresh, net_h, net_w)

    # correct the sizes of the bounding boxes
    correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w)
    # suppress non-maximal boxes
    do_nms(boxes, nms_thresh)
    # draw bounding boxes on the image using labels
    return draw_boxes(image, boxes, labels, obj_thresh)


def _main_(args):
    """Run YOLOv3 on the default webcam until 'q' is pressed or capture fails."""
    weights_path = args.weights

    # make the yolov3 model to predict 80 classes on COCO
    yolov3 = make_yolov3_model()

    # load the weights trained on COCO into the model
    weight_reader = WeightReader(weights_path)
    weight_reader.load_weights(yolov3)

    cap = cv2.VideoCapture(0)
    try:
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)   # horizontal resolution
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)   # vertical resolution
        while True:
            # Capture frame-by-frame; stop instead of crashing when the
            # camera delivers nothing (unplugged, busy, stream ended).
            grabbed, image = cap.read()
            if not grabbed:
                print('Unable to read a frame from the webcam; stopping.')
                break
            # preprocess the image
            new_image = preprocess_input(image, net_h, net_w)
            # run the prediction
            yolos = yolov3.predict(new_image)
            boxes = []
            frame = post_stream(yolos, boxes, image)
            cv2.imshow('Yolo3', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, even on errors.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        description='test yolov3 network with coco weights')
    argparser.add_argument(
        '-w',
        '--weights',
        required=True,
        help='path to weights file')

    args = argparser.parse_args()
    _main_(args)
# Print arrays in full. A NaN threshold is rejected by current NumPy
# releases; sys.maxsize is the supported way to disable summarisation.
np.set_printoptions(threshold=sys.maxsize)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

argparser = argparse.ArgumentParser(
    description='test yolov3 network with coco weights')

argparser.add_argument(
    '-w',
    '--weights',
    required=True,
    help='path to weights file')

argparser.add_argument(
    '-i',
    '--image',
    required=True,
    help='path to image file')


def detected_image_path(image_path):
    """Return the output path: '_detected' inserted before the extension.

    Uses os.path.splitext, so extensions of any length (".jpeg") and paths
    without an extension are handled, unlike slicing off four characters.
    """
    root, ext = os.path.splitext(image_path)
    return root + '_detected' + ext


def _main_(args):
    """Detect objects in one image and write an annotated copy next to it."""
    weights_path = args.weights
    image_path = args.image

    # set some parameters
    net_h, net_w = 416, 416
    obj_thresh, nms_thresh = 0.5, 0.45
    anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]
    labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
              "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
              "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
              "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
              "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
              "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
              "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
              "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
              "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
              "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]

    # make the yolov3 model to predict 80 classes on COCO
    yolov3 = make_yolov3_model()

    # load the weights trained on COCO into the model
    weight_reader = WeightReader(weights_path)
    weight_reader.load_weights(yolov3)

    # preprocess the image
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing or unreadable file by returning None.
        raise IOError('cannot read image file: ' + str(image_path))
    image_h, image_w, _ = image.shape
    new_image = preprocess_input(image, net_h, net_w)

    # run the prediction
    yolos = yolov3.predict(new_image)
    boxes = []

    for netout, scale_anchors in zip(yolos, anchors):
        # decode the output of the network
        boxes += decode_netout(netout[0], scale_anchors, obj_thresh, net_h, net_w)

    # correct the sizes of the bounding boxes
    correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w)

    # suppress non-maximal boxes
    do_nms(boxes, nms_thresh)

    # draw bounding boxes on the image using labels
    draw_boxes(image, boxes, labels, obj_thresh)

    # write the image with bounding boxes to file
    cv2.imwrite(detected_image_path(image_path), (image).astype('uint8'))


if __name__ == '__main__':
    args = argparser.parse_args()
    _main_(args)
def _conv_block(inp, convs, skip=True):
    """Stack the described convolution layers, optionally with a residual skip.

    inp:   input tensor.
    convs: list of dicts with the keys 'filter', 'kernel', 'stride',
           'bnorm', 'leaky' and 'layer_idx', one per convolution.
    skip:  when true, the input of the second-to-last convolution is added
           to the output of the last one.

    Returns the output tensor.

    Raises ValueError when skip is requested with fewer than two layers,
    because there is then no tensor to take the skip connection from.
    """
    if skip and len(convs) < 2:
        raise ValueError('a skip connection needs at least two convolutions')

    x = inp
    skip_connection = None

    for count, conv in enumerate(convs):
        if skip and count == len(convs) - 2:
            skip_connection = x

        if conv['stride'] > 1:
            x = ZeroPadding2D(((1,0),(1,0)))(x)  # peculiar padding as darknet prefer left and top
        x = Conv2D(conv['filter'],
                   conv['kernel'],
                   strides=conv['stride'],
                   padding='valid' if conv['stride'] > 1 else 'same',  # peculiar padding as darknet prefer left and top
                   name='conv_' + str(conv['layer_idx']),
                   use_bias=False if conv['bnorm'] else True)(x)
        if conv['bnorm']:
            x = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(x)
        if conv['leaky']:
            x = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(x)

    return add([skip_connection, x]) if skip else x


def conv_block(inp, convs, skip=True):
    """Public alias of _conv_block; see there for the parameters.

    The previous copy of the code merged the skip connection before testing
    ``skip``, so every call with ``skip=False`` failed on the unassigned
    skip tensor. Delegating keeps the two entry points identical.
    """
    return _conv_block(inp, convs, skip=skip)


def _interval_overlap(interval_a, interval_b):
    """Return the length of the overlap of two 1-D intervals (0 if disjoint)."""
    x1, x2 = interval_a
    x3, x4 = interval_b

    return max(0, min(x2, x4) - max(x1, x3))


def _sigmoid(x):
    """Return the logistic sigmoid of x (scalar or numpy array)."""
    return 1. / (1. + np.exp(-x))


def bbox_iou(box1, box2):
    """Return the intersection-over-union of two boxes.

    Both arguments need the attributes xmin, xmax, ymin and ymax. Two boxes
    without any area have no meaningful overlap and yield 0.0 instead of a
    division by zero.
    """
    intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
    intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])

    intersect = intersect_w * intersect_h

    w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin
    w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin

    union = w1*h1 + w2*h2 - intersect
    if union <= 0:
        return 0.0

    return float(intersect) / union
tf.tile(tf.concat([cell_x,cell_y],-1), [batch_size, 1, 1, 3, 1]) + + super(YoloLayer, self).__init__(**kwargs) + + def build(self, input_shape): + super(YoloLayer, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, x): + input_image, y_pred, y_true, true_boxes = x + + # adjust the shape of the y_predict [batch, grid_h, grid_w, 3, 4+1+nb_class] + y_pred = tf.reshape(y_pred, tf.concat([tf.shape(y_pred)[:3], tf.constant([3, -1])], axis=0)) + + # initialize the masks + object_mask = tf.expand_dims(y_true[..., 4], 4) + + # the variable to keep track of number of batches processed + batch_seen = tf.Variable(0.) + + # compute grid factor and net factor + grid_h = tf.shape(y_true)[1] + grid_w = tf.shape(y_true)[2] + grid_factor = tf.reshape(tf.cast([grid_w, grid_h], tf.float32), [1,1,1,1,2]) + + net_h = tf.shape(input_image)[1] + net_w = tf.shape(input_image)[2] + net_factor = tf.reshape(tf.cast([net_w, net_h], tf.float32), [1,1,1,1,2]) + + """ + Adjust prediction + """ + pred_box_xy = (self.cell_grid[:,:grid_h,:grid_w,:,:] + tf.sigmoid(y_pred[..., :2])) # sigma(t_xy) + c_xy + pred_box_wh = y_pred[..., 2:4] # t_wh + pred_box_conf = tf.expand_dims(tf.sigmoid(y_pred[..., 4]), 4) # adjust confidence + pred_box_class = y_pred[..., 5:] # adjust class probabilities + + """ + Adjust ground truth + """ + true_box_xy = y_true[..., 0:2] # (sigma(t_xy) + c_xy) + true_box_wh = y_true[..., 2:4] # t_wh + true_box_conf = tf.expand_dims(y_true[..., 4], 4) + true_box_class = tf.argmax(y_true[..., 5:], -1) + + """ + Compare each predicted box to all true boxes + """ + # initially, drag all objectness of all boxes to 0 + conf_delta = pred_box_conf - 0 + + # then, ignore the boxes which have good overlap with some true box + true_xy = true_boxes[..., 0:2] / grid_factor + true_wh = true_boxes[..., 2:4] / net_factor + + true_wh_half = true_wh / 2. 
+ true_mins = true_xy - true_wh_half + true_maxes = true_xy + true_wh_half + + pred_xy = tf.expand_dims(pred_box_xy / grid_factor, 4) + pred_wh = tf.expand_dims(tf.exp(pred_box_wh) * self.anchors / net_factor, 4) + + pred_wh_half = pred_wh / 2. + pred_mins = pred_xy - pred_wh_half + pred_maxes = pred_xy + pred_wh_half + + intersect_mins = tf.maximum(pred_mins, true_mins) + intersect_maxes = tf.minimum(pred_maxes, true_maxes) + + intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) + intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] + + true_areas = true_wh[..., 0] * true_wh[..., 1] + pred_areas = pred_wh[..., 0] * pred_wh[..., 1] + + union_areas = pred_areas + true_areas - intersect_areas + iou_scores = tf.truediv(intersect_areas, union_areas) + + best_ious = tf.reduce_max(iou_scores, axis=4) + conf_delta *= tf.expand_dims(tf.to_float(best_ious < self.ignore_thresh), 4) + + """ + Compute some online statistics + """ + true_xy = true_box_xy / grid_factor + true_wh = tf.exp(true_box_wh) * self.anchors / net_factor + + true_wh_half = true_wh / 2. + true_mins = true_xy - true_wh_half + true_maxes = true_xy + true_wh_half + + pred_xy = pred_box_xy / grid_factor + pred_wh = tf.exp(pred_box_wh) * self.anchors / net_factor + + pred_wh_half = pred_wh / 2. + pred_mins = pred_xy - pred_wh_half + pred_maxes = pred_xy + pred_wh_half + + intersect_mins = tf.maximum(pred_mins, true_mins) + intersect_maxes = tf.minimum(pred_maxes, true_maxes) + intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) 
+ intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] + + true_areas = true_wh[..., 0] * true_wh[..., 1] + pred_areas = pred_wh[..., 0] * pred_wh[..., 1] + + union_areas = pred_areas + true_areas - intersect_areas + iou_scores = tf.truediv(intersect_areas, union_areas) + iou_scores = object_mask * tf.expand_dims(iou_scores, 4) + + count = tf.reduce_sum(object_mask) + count_noobj = tf.reduce_sum(1 - object_mask) + detect_mask = tf.to_float((pred_box_conf*object_mask) >= 0.5) + class_mask = tf.expand_dims(tf.to_float(tf.equal(tf.argmax(pred_box_class, -1), true_box_class)), 4) + recall50 = tf.reduce_sum(tf.to_float(iou_scores >= 0.5 ) * detect_mask * class_mask) / (count + 1e-3) + recall75 = tf.reduce_sum(tf.to_float(iou_scores >= 0.75) * detect_mask * class_mask) / (count + 1e-3) + avg_iou = tf.reduce_sum(iou_scores) / (count + 1e-3) + avg_obj = tf.reduce_sum(pred_box_conf * object_mask) / (count + 1e-3) + avg_noobj = tf.reduce_sum(pred_box_conf * (1-object_mask)) / (count_noobj + 1e-3) + avg_cat = tf.reduce_sum(object_mask * class_mask) / (count + 1e-3) + + """ + Warm-up training + """ + batch_seen = tf.assign_add(batch_seen, 1.) 
+ + true_box_xy, true_box_wh, xywh_mask = tf.cond(tf.less(batch_seen, self.warmup_batches+1), + lambda: [true_box_xy + (0.5 + self.cell_grid[:,:grid_h,:grid_w,:,:]) * (1-object_mask), + true_box_wh + tf.zeros_like(true_box_wh) * (1-object_mask), + tf.ones_like(object_mask)], + lambda: [true_box_xy, + true_box_wh, + object_mask]) + + """ + Compare each true box to all anchor boxes + """ + wh_scale = tf.exp(true_box_wh) * self.anchors / net_factor + wh_scale = tf.expand_dims(2 - wh_scale[..., 0] * wh_scale[..., 1], axis=4) # the smaller the box, the bigger the scale + + xy_delta = xywh_mask * (pred_box_xy-true_box_xy) * wh_scale * self.xywh_scale + wh_delta = xywh_mask * (pred_box_wh-true_box_wh) * wh_scale * self.xywh_scale + conf_delta = object_mask * (pred_box_conf-true_box_conf) * self.obj_scale + (1-object_mask) * conf_delta * self.noobj_scale + class_delta = object_mask * \ + tf.expand_dims(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class), 4) * \ + self.class_scale + + loss_xy = tf.reduce_sum(tf.square(xy_delta), list(range(1,5))) + loss_wh = tf.reduce_sum(tf.square(wh_delta), list(range(1,5))) + loss_conf = tf.reduce_sum(tf.square(conf_delta), list(range(1,5))) + loss_class = tf.reduce_sum(class_delta, list(range(1,5))) + + loss = loss_xy + loss_wh + loss_conf + loss_class + + loss = tf.Print(loss, [grid_h, avg_obj], message='avg_obj \t\t', summarize=1000) + loss = tf.Print(loss, [grid_h, avg_noobj], message='avg_noobj \t\t', summarize=1000) + loss = tf.Print(loss, [grid_h, avg_iou], message='avg_iou \t\t', summarize=1000) + loss = tf.Print(loss, [grid_h, avg_cat], message='avg_cat \t\t', summarize=1000) + loss = tf.Print(loss, [grid_h, recall50], message='recall50 \t', summarize=1000) + loss = tf.Print(loss, [grid_h, recall75], message='recall75 \t', summarize=1000) + loss = tf.Print(loss, [grid_h, count], message='count \t', summarize=1000) + loss = tf.Print(loss, [grid_h, tf.reduce_sum(loss_xy), + 
tf.reduce_sum(loss_wh), + tf.reduce_sum(loss_conf), + tf.reduce_sum(loss_class)], message='loss xy, wh, conf, class: \t', summarize=1000) + + + return loss*self.grid_scale + + def compute_output_shape(self, input_shape): + return [(None, 1)] + + +def make_yolov3_model(): + input_image = Input(shape=(None, None, 3)) + + # Layer 0 => 4 + x = _conv_block(input_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0}, + {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1}, + {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2}, + {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}]) + + # Layer 5 => 8 + x = _conv_block(x, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5}, + {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6}, + {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}]) + + # Layer 9 => 11 + x = _conv_block(x, [{'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9}, + {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}]) + + # Layer 12 => 15 + x = _conv_block(x, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}]) + + # Layer 16 => 36 + for i in range(7): + x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}]) + + skip_36 = x + + # Layer 37 => 40 + x = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37}, + {'filter': 
256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}]) + + # Layer 41 => 61 + for i in range(7): + x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}]) + + skip_61 = x + + # Layer 62 => 65 + x = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}]) + + # Layer 66 => 74 + for i in range(3): + x = _conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}]) + + # Layer 75 => 79 + x = _conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False) + + # Layer 80 => 82 + yolo_82 = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 80}, + {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81}], skip=False) + + # Layer 83 => 86 + x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False) + x = UpSampling2D(2)(x) + x = concatenate([x, skip_61]) + + # Layer 87 
=> 91 + x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False) + + # Layer 92 => 94 + yolo_94 = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 92}, + {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 93}], skip=False) + + # Layer 95 => 98 + x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 96}], skip=False) + x = UpSampling2D(2)(x) + x = concatenate([x, skip_36]) + + # Layer 99 => 106 + yolo_106 = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 99}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 100}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 101}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 102}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 103}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 104}, + {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False) + + model = Model(input_image, [yolo_82, yolo_94, yolo_106]) + return model + +def dummy_loss(y_true, y_pred): + return tf.sqrt(tf.reduce_sum(y_pred)) + +class YOLO3(): + """ + Encapsulates the YOLO3 model functionality. + The actual Keras model is in the keras_model property. 
+ """ + def __init__(self, mode, config, model_dir): + """ + mode: Either "training" or "inference" + config: A Sub-class of the Config class + model_dir: Directory to save training logs and trained weights + """ + assert mode in ['training', 'inference'] + self.mode = mode + self.config = config + self.model_dir = model_dir + self.set_log_dir() + self.keras_model = self.build(mode=mode, config=config) + + def build(self, mode, config): + """ + Build YOLO3 architecture. + mode: Either ["training", "inference"] + Config: {nb_class,anchors,max_box_per_image, + max_grid,batch_size,warmup_batches,ignore_thresh, + grid_scales,obj_scale,noobj_scale,xywh_scale,class_scale} + """ + assert mode in ['training', 'inference'] + + input_image = KL.Input(shape=(None, None, 3)) # net_h, net_w, 3 + true_boxes = KL.Input( + shape=(1, 1, 1,config.max_box_per_image, 4)) + true_yolo_1 = KL.Input( + shape=(None, None, len(config.anchors)//6, 4+1+config.nb_class)) + # grid_h, grid_w, nb_anchor, 5+nb_class + true_yolo_2 = KL.Input( + shape=(None, None, len(config.anchors)//6, 4+1+config.nb_class)) + # grid_h, grid_w, nb_anchor, 5+nb_class + true_yolo_3 = KL.Input( + shape=(None, None, len(config.anchors)//6, 4+1+config.nb_class)) + # grid_h, grid_w, nb_anchor, 5+nb_class + # Layer 0 => 4 + x = _conv_block(input_image, [ + {'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0}, + {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1}, + {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2}, + {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}]) + # Layer 5 => 8 + x = _conv_block(x, [ + {'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5}, + {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6}, + {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}]) + # 
Layer 9 => 11 + x = _conv_block(x, [ + {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9}, + {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}]) + # Layer 12 => 15 + x = _conv_block(x, [ + {'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}]) + # Layer 16 => 36 + for i in range(7): + x = _conv_block(x, [ + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}]) + skip_36 = x + # Layer 37 => 40 + x = _conv_block(x, [ + {'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}]) + # Layer 41 => 61 + for i in range(7): + x = _conv_block(x, [ + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}]) + skip_61 = x + # Layer 62 => 65 + x = _conv_block(x, [ + {'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}]) + # Layer 66 => 74 + for i in range(3): + x = _conv_block(x, [ + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}]) + # Layer 75 => 79 + x 
= _conv_block(x, [ + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79} + ], do_skip=False) + # Layer 80 => 82 + pred_yolo_1 = _conv_block(x, [ + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 80}, + {'filter': (3*(5+nb_class)), 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81} + ], do_skip=False) + # YOLO Layer + loss_yolo_1 = YoloLayer( + anchors[12:], + [1*num for num in max_grid], + batch_size, + warmup_batches, + ignore_thresh, + grid_scales[0], + obj_scale, + noobj_scale, + xywh_scale, + class_scale)( + [input_image, pred_yolo_1, true_yolo_1, true_boxes]) + # Layer 83 => 86 + x = _conv_block(x, [ + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84} + ], do_skip=False) + x = UpSampling2D(2)(x) + x = concatenate([x, skip_61]) + + # Layer 87 => 91 + x = _conv_block(x, [ + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91} + ], do_skip=False) + + # Layer 92 => 94 + pred_yolo_2 = _conv_block(x, [ + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 92}, + {'filter': (3*(5+nb_class)), 'kernel': 1, 'stride': 1, 'bnorm': False, 
'leaky': False, 'layer_idx': 93} + ], do_skip=False) + loss_yolo_2 = YoloLayer( + anchors[6:12], + [2*num for num in max_grid], + batch_size, + warmup_batches, + ignore_thresh, + grid_scales[1], + obj_scale, + noobj_scale, + xywh_scale, + class_scale)( + [input_image, pred_yolo_2, true_yolo_2, true_boxes]) + # Layer 95 => 98 + x = _conv_block(x, [ + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 96} + ], do_skip=False) + x = UpSampling2D(2)(x) + x = concatenate([x, skip_36]) + # Layer 99 => 106 + pred_yolo_3 = _conv_block(x, [ + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 99}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 100}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 101}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 102}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 103}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 104}, + {'filter': (3*(5+nb_class)), 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105} + ], do_skip=False) + loss_yolo_3 = YoloLayer( + anchors[:6], + [4*num for num in max_grid], + batch_size, + warmup_batches, + ignore_thresh, + grid_scales[2], + obj_scale, + noobj_scale, + xywh_scale, + class_scale)( + [input_image, pred_yolo_3, true_yolo_3, true_boxes]) + + if mode == "training": + model = KM.Model( + [input_image, true_boxes, true_yolo_1, true_yolo_2, true_yolo_3], + [loss_yolo_1, loss_yolo_2, loss_yolo_3]) + elif mode == "inference": + model = KM.Model( + input_image, + [pred_yolo_1, pred_yolo_2, pred_yolo_3]) + # Muti-GPU Model Build Here + if config.GPU_COUNT > 1: + pass + return model + + def compile(self, lr): + assert self.mode in ['training'] + optimizer = Adam(lr=lr, clipnorm=0.001) + self.model.compile(loss=self.dummy_loss, 
optimizer=optimizer) + + def train(self, train_dataset, val_dataset, + learning_rate, epochs, layers,augmentation=None): + assert self.mode == "training", "train" + # Callbacks + callbacks = [ + keras.callbacks.TensorBoard( + log_dir=self.log_dir, histogram_freq=0, write_graph=True, write_images=False), + keras.callbacks.ModelCheckpoint( + self.checkpoint_path, verbose=0, save_weights_only=True) + ] + # Data generators + ################ data_generator need to be impl + ''' + train_generator = data_generator(train_dataset, + self.config, shuffle=True, + augmentation=augmentation, batch_size=self.config.BATCH_SIZE) + val_generator = data_generator(val_dataset, + self.config, shuffle=True, batch_size=self.config.BATCH_SIZE) + ''' + # Train + # log("\nStarting at epoch {}. LR={}\n".format(self.epoch, learning_rate)) + # log("Checkpoint Path: {}".format(self.checkpoint_path)) + # self.set_trainable(layers) + self.compile(self.config.learning_rate) + if os.name is 'nt': + workers = 0 + else: + workers = multiprocessing.cpu_count() + self.keras_model.fit_generator( + train_generator, + steps_per_epoch = len(train_generator) * config['train']['train_times'], + epochs = config['train']['nb_epochs'] + config['train']['warmup_epochs'], + verbose = 2 if config['train']['debug'] else 1, + callbacks = callbacks, + workers = workers, + max_queue_size = 100, + use_multiprocessing=True) + self.epoch = max(self.epoch, epochs) + + def detect(self): + pass + def trainingModel(self): + pass + def inferenceModel(self): + pass + def dummy_loss(y_true, y_pred): + return tf.sqrt(tf.reduce_sum(y_pred)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..0eb506f9c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,30 @@ +absl-py==0.2.0 +astor==0.6.2 +bleach==1.5.0 +certifi==2018.4.16 +cycler==0.10.0 +gast==0.2.0 +grpcio==1.11.0 +h5py==2.7.1 +html5lib==0.9999999 +Keras==2.1.5 +kiwisolver==1.0.1 +Markdown==2.6.11 +matplotlib==2.2.2 +numpy==1.14.2 
+opencv-python==3.4.0.12 +pandas==0.23.0 +Pillow==5.1.0 +protobuf==3.5.2.post1 +pyparsing==2.2.0 +python-dateutil==2.7.2 +pytz==2018.4 +PyYAML==3.12 +scipy==1.0.1 +seaborn==0.8.1 +selenium==3.12.0 +six==1.11.0 +tensorboard==1.7.0 +tensorflow==1.7.0 +termcolor==1.1.0 +Werkzeug==0.14.1 diff --git a/train.py b/train.py index 491289da9..85f9fcdd2 100644 --- a/train.py +++ b/train.py @@ -4,14 +4,14 @@ import os import numpy as np import json -from voc import parse_voc_annotation -from yolo import create_yolov3_model, dummy_loss +from .dataset.voc import parse_voc_annotation +from .model.yolo import create_yolov3_model, dummy_loss from generator import BatchGenerator -from utils.utils import normalize, evaluate, makedirs +from .utils.utils import normalize, evaluate, makedirs from keras.callbacks import EarlyStopping, ReduceLROnPlateau from keras.optimizers import Adam from callbacks import CustomModelCheckpoint, CustomTensorBoard -from utils.multi_gpu_model import multi_gpu_model +from .utils.multi_gpu_model import multi_gpu_model import tensorflow as tf import keras from keras.models import load_model diff --git a/train_openimg.py b/train_openimg.py new file mode 100644 index 000000000..b772002ac --- /dev/null +++ b/train_openimg.py @@ -0,0 +1,282 @@ +#! 
/usr/bin/env python + +import argparse +import os +import numpy as np +import json +from .dataset.openimg import parse_openimg_annotation +from .model.yolo import create_yolov3_model, dummy_loss +from generator import BatchGenerator +from .utils.utils import normalize, evaluate, makedirs +from keras.callbacks import EarlyStopping, ReduceLROnPlateau +from keras.optimizers import Adam +from callbacks import CustomModelCheckpoint, CustomTensorBoard +from .utils.multi_gpu_model import multi_gpu_model +import tensorflow as tf +import keras +from keras.models import load_model + +def create_training_instances( + train_annot_file, + train_image_folder, + label_map, + train_cache, + valid_annot_folder, + valid_image_folder, + valid_cache, + labels, +): + # parse annotations of the training set + train_ints, train_labels = parse_openimg_annotation(train_annot_file, train_image_folder, label_map, train_cache, labels) + + # parse annotations of the validation set, if any, otherwise split the training set + if os.path.exists(valid_annot_folder): + valid_ints, valid_labels = parse_openimg_annotation(valid_annot_file, valid_image_folder, label_map, valid_cache, labels) + else: + print("valid_annot_folder not exists. Spliting the trainining set.") + + train_valid_split = int(0.8*len(train_ints)) + np.random.seed(0) + np.random.shuffle(train_ints) + np.random.seed() + + valid_ints = train_ints[train_valid_split:] + train_ints = train_ints[:train_valid_split] + + # compare the seen labels with the given labels in config.json + if len(labels) > 0: + overlap_labels = set(labels).intersection(set(train_labels.keys())) + + print('Seen labels: \t' + str(train_labels) + '\n') + print('Given labels: \t' + str(labels)) + + # return None, None, None if some given label is not in the dataset + if len(overlap_labels) < len(labels): + print('Some labels have no annotations! Please revise the list of labels in the config.json.') + return None, None, None + else: + print('No labels are provided. 
Train on all seen labels.') + print(train_labels) + labels = train_labels.keys() + + max_box_per_image = max([len(inst['object']) for inst in (train_ints + valid_ints)]) + + return train_ints, valid_ints, sorted(labels), max_box_per_image + +def create_callbacks(saved_weights_name, tensorboard_logs, model_to_save): + makedirs(tensorboard_logs) + + early_stop = EarlyStopping( + monitor = 'loss', + min_delta = 0.01, + patience = 5, + mode = 'min', + verbose = 1 + ) + checkpoint = CustomModelCheckpoint( + model_to_save = model_to_save, + filepath = saved_weights_name,# + '{epoch:02d}.h5', + monitor = 'loss', + verbose = 1, + save_best_only = True, + mode = 'min', + period = 1 + ) + reduce_on_plateau = ReduceLROnPlateau( + monitor = 'loss', + factor = 0.1, + patience = 2, + verbose = 1, + mode = 'min', + epsilon = 0.01, + cooldown = 0, + min_lr = 0 + ) + tensorboard = CustomTensorBoard( + log_dir = tensorboard_logs, + write_graph = True, + write_images = True, + ) + return [early_stop, checkpoint, reduce_on_plateau, tensorboard] + +def create_model( + nb_class, + anchors, + max_box_per_image, + max_grid, batch_size, + warmup_batches, + ignore_thresh, + multi_gpu, + saved_weights_name, + lr, + grid_scales, + obj_scale, + noobj_scale, + xywh_scale, + class_scale +): + if multi_gpu > 1: + with tf.device('/cpu:0'): + template_model, infer_model = create_yolov3_model( + nb_class = nb_class, + anchors = anchors, + max_box_per_image = max_box_per_image, + max_grid = max_grid, + batch_size = batch_size//multi_gpu, + warmup_batches = warmup_batches, + ignore_thresh = ignore_thresh, + grid_scales = grid_scales, + obj_scale = obj_scale, + noobj_scale = noobj_scale, + xywh_scale = xywh_scale, + class_scale = class_scale + ) + else: + template_model, infer_model = create_yolov3_model( + nb_class = nb_class, + anchors = anchors, + max_box_per_image = max_box_per_image, + max_grid = max_grid, + batch_size = batch_size, + warmup_batches = warmup_batches, + ignore_thresh = 
def _main_(args):
    """Train a YOLOv3 model from a JSON configuration file and print its mAP.

    Steps, in order: load the JSON config named by ``args.conf``; build the
    training/validation instances; wrap them in batch generators; create the
    training and inference models; fit the training model; evaluate the
    inference model on the validation generator and print one AP per label
    followed by the mean.

    # Arguments
        args: parsed command-line arguments; only ``args.conf`` (path to the
              JSON configuration file) is read.
    """
    with open(args.conf) as config_buffer:
        config = json.loads(config_buffer.read())

    train_cfg = config['train']
    valid_cfg = config['valid']
    model_cfg = config['model']

    ###############################
    #   Parse the annotations
    ###############################
    train_ints, valid_ints, labels, max_box_per_image = create_training_instances(
        train_cfg['train_annot_file'],
        train_cfg['train_image_folder'],
        train_cfg['label_map'],
        train_cfg['cache_name'],
        valid_cfg['valid_annot_file'],
        valid_cfg['valid_image_folder'],
        valid_cfg['cache_name'],
        model_cfg['labels']
    )
    print('\nTraining on: \t' + str(labels) + '\n')

    ###############################
    #   Create the generators
    ###############################
    # Settings shared by both generators; only the instances and the jitter differ.
    generator_kwargs = dict(
        anchors=model_cfg['anchors'],
        labels=labels,
        downsample=32,  # ratio between network input's size and network output's size, 32 for YOLOv3
        max_box_per_image=max_box_per_image,
        batch_size=train_cfg['batch_size'],
        min_net_size=model_cfg['min_input_size'],
        max_net_size=model_cfg['max_input_size'],
        shuffle=True,
        norm=normalize
    )
    train_generator = BatchGenerator(instances=train_ints, jitter=0.3, **generator_kwargs)
    valid_generator = BatchGenerator(instances=valid_ints, jitter=0.0, **generator_kwargs)

    ###############################
    #   Create the model
    ###############################
    saved_weights_name = train_cfg['saved_weights_name']

    # Resuming from existing weights: skip the warm-up phase entirely.
    if os.path.exists(saved_weights_name):
        train_cfg['warmup_epochs'] = 0
    warmup_batches = train_cfg['warmup_epochs'] * (train_cfg['train_times'] * len(train_generator))

    # 'gpus' is a comma-separated device list; its length selects multi-GPU mode.
    os.environ['CUDA_VISIBLE_DEVICES'] = train_cfg['gpus']
    multi_gpu = len(train_cfg['gpus'].split(','))

    max_input_size = model_cfg['max_input_size']
    train_model, infer_model = create_model(
        nb_class=len(labels),
        anchors=model_cfg['anchors'],
        max_box_per_image=max_box_per_image,
        max_grid=[max_input_size, max_input_size],
        batch_size=train_cfg['batch_size'],
        warmup_batches=warmup_batches,
        ignore_thresh=train_cfg['ignore_thresh'],
        multi_gpu=multi_gpu,
        saved_weights_name=saved_weights_name,
        lr=train_cfg['learning_rate'],
        grid_scales=train_cfg['grid_scales'],
        obj_scale=train_cfg['obj_scale'],
        noobj_scale=train_cfg['noobj_scale'],
        xywh_scale=train_cfg['xywh_scale'],
        class_scale=train_cfg['class_scale'],
    )

    ###############################
    #   Kick off the training
    ###############################
    callbacks = create_callbacks(saved_weights_name, train_cfg['tensorboard_dir'], infer_model)

    if train_cfg['debug']:
        verbosity = 2
    else:
        verbosity = 1

    train_model.fit_generator(
        generator=train_generator,
        steps_per_epoch=len(train_generator) * train_cfg['train_times'],
        epochs=train_cfg['nb_epochs'] + train_cfg['warmup_epochs'],
        verbose=verbosity,
        callbacks=callbacks,
        workers=4,
        max_queue_size=8
    )

    # make a GPU version of infer_model for evaluation
    if multi_gpu > 1:
        infer_model = load_model(saved_weights_name)

    ###############################
    #   Run the evaluation
    ###############################
    # compute one average precision per class index
    average_precisions = evaluate(infer_model, valid_generator)

    # print the per-class scores, then their mean
    for class_index, class_ap in average_precisions.items():
        print(labels[class_index] + ': {:.4f}'.format(class_ap))
    mean_ap = sum(average_precisions.values()) / len(average_precisions)
    print('mAP: {:.4f}'.format(mean_ap))
+ obj_thresh : The threshold used to distinguish between object and non-object + nms_thresh : The threshold used to determine whether two detections are duplicates + net_h : The height of the input image to the model, higher value results in better accuracy + net_w : The width of the input image to the model + save_path : The path to save images with visualized detections to. + # Returns + A dict mapping class names to mAP scores. + """ + # gather all detections and annotations + all_detections = [[None for i in range(generator.num_classes())] for j in range(generator.size())] + all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())] + + for i in range(generator.size()): + raw_image = [generator.load_image(i)] + + # make the boxes and the labels + pred_boxes = get_yolo_boxes(model, raw_image, net_h, net_w, generator.get_anchors(), obj_thresh, nms_thresh)[0] + + score = np.array([box.get_score() for box in pred_boxes]) + pred_labels = np.array([box.label for box in pred_boxes]) + + if len(pred_boxes) > 0: + pred_boxes = np.array([[box.xmin, box.ymin, box.xmax, box.ymax, box.get_score()] for box in pred_boxes]) + else: + pred_boxes = np.array([[]]) + + # sort the boxes and the labels according to scores + score_sort = np.argsort(-score) + pred_labels = pred_labels[score_sort] + pred_boxes = pred_boxes[score_sort] + + # copy detections to all_detections + for label in range(generator.num_classes()): + all_detections[i][label] = pred_boxes[pred_labels == label, :] + + annotations = generator.load_annotation(i) + + # copy detections to all_annotations + for label in range(generator.num_classes()): + all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy() + + # compute mAP by comparing all detections and all annotations + average_precisions = {} + + for label in range(generator.num_classes()): + false_positives = np.zeros((0,)) + true_positives = np.zeros((0,)) + scores = np.zeros((0,)) + num_annotations 
def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
    """Convert letterboxed box coordinates back to pixel coordinates, in place.

    The image is assumed to have been scaled (aspect ratio preserved) to fit
    inside a ``net_h`` x ``net_w`` canvas and centred on it. Each box's
    ``xmin``/``xmax``/``ymin``/``ymax`` are read as fractions of that canvas,
    the padding is removed, and the result is written back as integer pixel
    positions in the original ``image_h`` x ``image_w`` image.

    # Arguments
        boxes   : sequence of objects with mutable ``xmin``, ``xmax``,
                  ``ymin`` and ``ymax`` attributes; modified in place.
        image_h : height of the original image in pixels.
        image_w : width of the original image in pixels.
        net_h   : height of the network input.
        net_w   : width of the network input.
    # Returns
        None. An empty ``boxes`` sequence is a no-op.
    """
    # Size of the resized image inside the letterbox: the tighter dimension
    # fills the canvas, the other one is scaled by the same ratio.
    if (float(net_w) / image_w) < (float(net_h) / image_h):
        new_w = net_w
        new_h = float(image_h * net_w) / image_w
    else:
        # The height fills the canvas here, so it must equal net_h (using
        # net_w is only correct when the network input is square).
        new_h = net_h
        new_w = float(image_w * net_h) / image_h

    # Padding and scale are the same for every box, so compute them once.
    x_offset, x_scale = (net_w - new_w) / 2. / net_w, float(new_w) / net_w
    y_offset, y_scale = (net_h - new_h) / 2. / net_h, float(new_h) / net_h

    for box in boxes:
        box.xmin = int((box.xmin - x_offset) / x_scale * image_w)
        box.xmax = int((box.xmax - x_offset) / x_scale * image_w)
        box.ymin = int((box.ymin - y_offset) / y_scale * image_h)
        box.ymax = int((box.ymax - y_offset) / y_scale * image_h)
def get_yolo_boxes(model, images, net_h, net_w, anchors, obj_thresh, nms_thresh):
    """Run the detector on a batch of images and return the decoded boxes.

    Every image is letterboxed to ``net_h`` x ``net_w`` with
    ``preprocess_input``, the batch is passed through
    ``model.predict_on_batch``, and the three outputs are decoded, mapped back
    to each image's own pixel coordinates and passed through ``do_nms``.

    # Arguments
        model      : object exposing ``predict_on_batch``; its result is
                     indexed as ``[output][image]`` for outputs 0, 1 and 2.
        images     : sequence of images as arrays with a ``shape`` whose first
                     two entries are height and width.
        net_h      : height of the network input.
        net_w      : width of the network input.
        anchors    : flat sequence of anchor sizes; output ``j`` uses the
                     slice ``anchors[(2-j)*6:(3-j)*6]``.
        obj_thresh : threshold forwarded to ``decode_netout``.
        nms_thresh : threshold forwarded to ``do_nms``.
    # Returns
        A list with one list of boxes per input image, in input order.
        An empty ``images`` sequence yields an empty list.
    """
    nb_images = len(images)

    # Nothing to predict; avoid indexing into an empty batch.
    if nb_images == 0:
        return []

    batch_input = np.zeros((nb_images, net_h, net_w, 3))

    # preprocess the input
    for i, image in enumerate(images):
        batch_input[i] = preprocess_input(image, net_h, net_w)

    # run the prediction
    batch_output = model.predict_on_batch(batch_input)
    batch_boxes = [None] * nb_images

    for i, image in enumerate(images):
        # Each image is un-letterboxed with its OWN size; images in a batch
        # are not required to share dimensions.
        image_h, image_w = image.shape[:2]

        yolos = [batch_output[0][i], batch_output[1][i], batch_output[2][i]]
        boxes = []

        # decode the output of the network
        for j, yolo in enumerate(yolos):
            yolo_anchors = anchors[(2 - j) * 6:(3 - j) * 6]  # config['model']['anchors']
            boxes += decode_netout(yolo, yolo_anchors, obj_thresh, net_h, net_w)

        # correct the sizes of the bounding boxes
        correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w)

        # suppress non-maximal boxes
        do_nms(boxes, nms_thresh)

        batch_boxes[i] = boxes

    return batch_boxes
+ Parameters + ---------- + a: (N, 4) ndarray of float + b: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) + + iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) + ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) + + iw = np.maximum(iw, 0) + ih = np.maximum(ih, 0) + + ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih + + ua = np.maximum(ua, np.finfo(float).eps) + + intersection = iw * ih + + return intersection / ua + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. 
class WeightReader:
    """Sequential reader that copies a binary weights file into a Keras model.

    The file is read once on construction: three native int32 header fields
    (major, minor, revision), then a 4- or 8-byte field that is skipped, then
    a flat run of float32 values. The values are handed out in file order
    through a moving cursor (``offset``).
    """

    # Number of layer indices probed for a 'conv_<i>' layer.
    _NB_LAYERS = 106
    # Convolutions that are loaded without batch-normalisation parameters.
    _NO_BNORM_LAYERS = (81, 93, 105)

    def __init__(self, weight_file):
        """Load every float32 value of ``weight_file`` into memory.

        # Arguments
            weight_file: path of the binary weights file.
        """
        with open(weight_file, 'rb') as w_f:
            major, minor, _revision = struct.unpack('3i', w_f.read(12))

            # The field after the version triple is 8 bytes wide for versions
            # >= 0.2 (and sane major/minor values), 4 bytes otherwise.
            if (major*10 + minor) >= 2 and major < 1000 and minor < 1000:
                w_f.read(8)
            else:
                w_f.read(4)

            binary = w_f.read()

        self.offset = 0
        self.all_weights = np.frombuffer(binary, dtype='float32')

    def read_bytes(self, size):
        """Return the next ``size`` float32 values and advance the cursor.

        Despite the name, ``size`` counts values, not bytes.

        # Raises
            ValueError: if fewer than ``size`` values remain; the cursor is
                        left untouched in that case.
        """
        size = int(size)
        end = self.offset + size

        if end > len(self.all_weights):
            raise ValueError(
                "weights file exhausted: requested %d values at offset %d, "
                "but only %d are available"
                % (size, self.offset, len(self.all_weights) - self.offset))

        values = self.all_weights[self.offset:end]
        self.offset = end
        return values

    def load_weights(self, model):
        """Fill the ``conv_<i>`` / ``bnorm_<i>`` layers of ``model`` in index order.

        Layer indices without a ``conv_<i>`` layer are reported and skipped.
        Any other failure (missing batch-norm layer, exhausted file, shape
        mismatch) propagates to the caller instead of being reported as a
        missing convolution.

        # Arguments
            model: object exposing ``get_layer(name)``, which is expected to
                   raise ``ValueError`` for an unknown name; the returned
                   layers must expose ``get_weights``/``set_weights``.
        """
        for i in range(self._NB_LAYERS):
            # Only the lookup is allowed to fail silently.
            try:
                conv_layer = model.get_layer('conv_' + str(i))
            except ValueError:
                print("no convolution #" + str(i))
                continue

            print("loading weights of convolution #" + str(i))

            if i not in self._NO_BNORM_LAYERS:
                norm_layer = model.get_layer('bnorm_' + str(i))

                size = np.prod(norm_layer.get_weights()[0].shape)

                # File order is beta, gamma, mean, variance ...
                beta = self.read_bytes(size)   # bias
                gamma = self.read_bytes(size)  # scale
                mean = self.read_bytes(size)   # mean
                var = self.read_bytes(size)    # variance

                # ... while the layer is given gamma first.
                norm_layer.set_weights([gamma, beta, mean, var])

            conv_weights = conv_layer.get_weights()
            has_bias = len(conv_weights) > 1

            # The bias, when present, precedes the kernel in the file.
            if has_bias:
                bias = self.read_bytes(np.prod(conv_weights[1].shape))

            kernel_shape = conv_weights[0].shape
            kernel = self.read_bytes(np.prod(kernel_shape))

            # The file stores the kernel axes in the reverse order of the
            # layer's kernel; reshape to that layout, then reorder the axes.
            kernel = kernel.reshape(list(reversed(kernel_shape)))
            kernel = kernel.transpose([2, 3, 1, 0])

            if has_bias:
                conv_layer.set_weights([kernel, bias])
            else:
                conv_layer.set_weights([kernel])

    def reset(self):
        """Move the cursor back to the first value."""
        self.offset = 0
-os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" -os.environ["CUDA_VISIBLE_DEVICES"]="0" - -argparser = argparse.ArgumentParser( - description='test yolov3 network with coco weights') - -argparser.add_argument( - '-w', - '--weights', - help='path to weights file') - -argparser.add_argument( - '-i', - '--image', - help='path to image file') - -class WeightReader: - def __init__(self, weight_file): - with open(weight_file, 'rb') as w_f: - major, = struct.unpack('i', w_f.read(4)) - minor, = struct.unpack('i', w_f.read(4)) - revision, = struct.unpack('i', w_f.read(4)) - - if (major*10 + minor) >= 2 and major < 1000 and minor < 1000: - w_f.read(8) - else: - w_f.read(4) - - transpose = (major > 1000) or (minor > 1000) - - binary = w_f.read() - - self.offset = 0 - self.all_weights = np.frombuffer(binary, dtype='float32') - - def read_bytes(self, size): - self.offset = self.offset + size - return self.all_weights[self.offset-size:self.offset] - - def load_weights(self, model): - for i in range(106): - try: - conv_layer = model.get_layer('conv_' + str(i)) - print("loading weights of convolution #" + str(i)) - - if i not in [81, 93, 105]: - norm_layer = model.get_layer('bnorm_' + str(i)) - - size = np.prod(norm_layer.get_weights()[0].shape) - - beta = self.read_bytes(size) # bias - gamma = self.read_bytes(size) # scale - mean = self.read_bytes(size) # mean - var = self.read_bytes(size) # variance - - weights = norm_layer.set_weights([gamma, beta, mean, var]) - - if len(conv_layer.get_weights()) > 1: - bias = self.read_bytes(np.prod(conv_layer.get_weights()[1].shape)) - kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape)) - - kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape))) - kernel = kernel.transpose([2,3,1,0]) - conv_layer.set_weights([kernel, bias]) - else: - kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape)) - kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape))) - kernel = 
kernel.transpose([2,3,1,0]) - conv_layer.set_weights([kernel]) - except ValueError: - print("no convolution #" + str(i)) - - def reset(self): - self.offset = 0 - -class BoundBox: - def __init__(self, xmin, ymin, xmax, ymax, objness = None, classes = None): - self.xmin = xmin - self.ymin = ymin - self.xmax = xmax - self.ymax = ymax - - self.objness = objness - self.classes = classes - - self.label = -1 - self.score = -1 - - def get_label(self): - if self.label == -1: - self.label = np.argmax(self.classes) - - return self.label - - def get_score(self): - if self.score == -1: - self.score = self.classes[self.get_label()] - - return self.score - -def _conv_block(inp, convs, skip=True): - x = inp - count = 0 - - for conv in convs: - if count == (len(convs) - 2) and skip: - skip_connection = x - count += 1 - - if conv['stride'] > 1: x = ZeroPadding2D(((1,0),(1,0)))(x) # peculiar padding as darknet prefer left and top - x = Conv2D(conv['filter'], - conv['kernel'], - strides=conv['stride'], - padding='valid' if conv['stride'] > 1 else 'same', # peculiar padding as darknet prefer left and top - name='conv_' + str(conv['layer_idx']), - use_bias=False if conv['bnorm'] else True)(x) - if conv['bnorm']: x = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(x) - if conv['leaky']: x = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(x) - - return add([skip_connection, x]) if skip else x - -def _interval_overlap(interval_a, interval_b): - x1, x2 = interval_a - x3, x4 = interval_b - - if x3 < x1: - if x4 < x1: - return 0 - else: - return min(x2,x4) - x1 - else: - if x2 < x3: - return 0 - else: - return min(x2,x4) - x3 - -def _sigmoid(x): - return 1. / (1. 
+ np.exp(-x)) - -def bbox_iou(box1, box2): - intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax]) - intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) - - intersect = intersect_w * intersect_h - - w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin - w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin - - union = w1*h1 + w2*h2 - intersect - - return float(intersect) / union - -def make_yolov3_model(): - input_image = Input(shape=(None, None, 3)) - - # Layer 0 => 4 - x = _conv_block(input_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0}, - {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1}, - {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2}, - {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}]) - - # Layer 5 => 8 - x = _conv_block(x, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5}, - {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6}, - {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}]) - - # Layer 9 => 11 - x = _conv_block(x, [{'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9}, - {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}]) - - # Layer 12 => 15 - x = _conv_block(x, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12}, - {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13}, - {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}]) - - # Layer 16 => 36 - for i in range(7): - x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3}, - {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 
'layer_idx': 17+i*3}]) - - skip_36 = x - - # Layer 37 => 40 - x = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37}, - {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38}, - {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}]) - - # Layer 41 => 61 - for i in range(7): - x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3}, - {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}]) - - skip_61 = x - - # Layer 62 => 65 - x = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62}, - {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63}, - {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}]) - - # Layer 66 => 74 - for i in range(3): - x = _conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3}, - {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}]) - - # Layer 75 => 79 - x = _conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75}, - {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76}, - {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77}, - {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78}, - {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False) - - # Layer 80 => 82 - yolo_82 = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 80}, - {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81}], skip=False) - - # Layer 83 => 86 - x = 
_conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False) - x = UpSampling2D(2)(x) - x = concatenate([x, skip_61]) - - # Layer 87 => 91 - x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87}, - {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88}, - {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89}, - {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90}, - {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False) - - # Layer 92 => 94 - yolo_94 = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 92}, - {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 93}], skip=False) - - # Layer 95 => 98 - x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 96}], skip=False) - x = UpSampling2D(2)(x) - x = concatenate([x, skip_36]) - - # Layer 99 => 106 - yolo_106 = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 99}, - {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 100}, - {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 101}, - {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 102}, - {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 103}, - {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 104}, - {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False) - - model = Model(input_image, [yolo_82, yolo_94, yolo_106]) - return model - -def preprocess_input(image, net_h, net_w): - new_h, new_w, _ = 
image.shape - - # determine the new size of the image - if (float(net_w)/new_w) < (float(net_h)/new_h): - new_h = (new_h * net_w)/new_w - new_w = net_w - else: - new_w = (new_w * net_h)/new_h - new_h = net_h - - # resize the image to the new size - resized = cv2.resize(image[:,:,::-1]/255., (int(new_w), int(new_h))) - - # embed the image into the standard letter box - new_image = np.ones((net_h, net_w, 3)) * 0.5 - new_image[int((net_h-new_h)//2):int((net_h+new_h)//2), int((net_w-new_w)//2):int((net_w+new_w)//2), :] = resized - new_image = np.expand_dims(new_image, 0) - - return new_image - -def decode_netout(netout, anchors, obj_thresh, nms_thresh, net_h, net_w): - grid_h, grid_w = netout.shape[:2] - nb_box = 3 - netout = netout.reshape((grid_h, grid_w, nb_box, -1)) - nb_class = netout.shape[-1] - 5 - - boxes = [] - - netout[..., :2] = _sigmoid(netout[..., :2]) - netout[..., 4:] = _sigmoid(netout[..., 4:]) - netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:] - netout[..., 5:] *= netout[..., 5:] > obj_thresh - - for i in range(grid_h*grid_w): - row = i / grid_w - col = i % grid_w - - for b in range(nb_box): - # 4th element is objectness score - objectness = netout[int(row)][int(col)][b][4] - #objectness = netout[..., :4] - - if(objectness.all() <= obj_thresh): continue - - # first 4 elements are x, y, w, and h - x, y, w, h = netout[int(row)][int(col)][b][:4] - - x = (col + x) / grid_w # center position, unit: image width - y = (row + y) / grid_h # center position, unit: image height - w = anchors[2 * b + 0] * np.exp(w) / net_w # unit: image width - h = anchors[2 * b + 1] * np.exp(h) / net_h # unit: image height - - # last elements are class probabilities - classes = netout[int(row)][col][b][5:] - - box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, objectness, classes) - #box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, None, classes) - - boxes.append(box) - - return boxes - -def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w): - if 
(float(net_w)/image_w) < (float(net_h)/image_h): - new_w = net_w - new_h = (image_h*net_w)/image_w - else: - new_h = net_w - new_w = (image_w*net_h)/image_h - - for i in range(len(boxes)): - x_offset, x_scale = (net_w - new_w)/2./net_w, float(new_w)/net_w - y_offset, y_scale = (net_h - new_h)/2./net_h, float(new_h)/net_h - - boxes[i].xmin = int((boxes[i].xmin - x_offset) / x_scale * image_w) - boxes[i].xmax = int((boxes[i].xmax - x_offset) / x_scale * image_w) - boxes[i].ymin = int((boxes[i].ymin - y_offset) / y_scale * image_h) - boxes[i].ymax = int((boxes[i].ymax - y_offset) / y_scale * image_h) - -def do_nms(boxes, nms_thresh): - if len(boxes) > 0: - nb_class = len(boxes[0].classes) - else: - return - - for c in range(nb_class): - sorted_indices = np.argsort([-box.classes[c] for box in boxes]) - - for i in range(len(sorted_indices)): - index_i = sorted_indices[i] - - if boxes[index_i].classes[c] == 0: continue - - for j in range(i+1, len(sorted_indices)): - index_j = sorted_indices[j] - - if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh: - boxes[index_j].classes[c] = 0 - -def draw_boxes(image, boxes, labels, obj_thresh): - for box in boxes: - label_str = '' - label = -1 - - for i in range(len(labels)): - if box.classes[i] > obj_thresh: - label_str += labels[i] - label = i - print(labels[i] + ': ' + str(box.classes[i]*100) + '%') - - if label >= 0: - cv2.rectangle(image, (box.xmin,box.ymin), (box.xmax,box.ymax), (0,255,0), 3) - cv2.putText(image, - label_str + ' ' + str(box.get_score()), - (box.xmin, box.ymin - 13), - cv2.FONT_HERSHEY_SIMPLEX, - 1e-3 * image.shape[0], - (0,255,0), 2) - - return image - -def _main_(args): - weights_path = args.weights - image_path = args.image - - # set some parameters - net_h, net_w = 416, 416 - obj_thresh, nms_thresh = 0.5, 0.45 - anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]] - labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", \ - "boat", 
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", \ - "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", \ - "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", \ - "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", \ - "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", \ - "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", \ - "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", \ - "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", \ - "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] - - # make the yolov3 model to predict 80 classes on COCO - yolov3 = make_yolov3_model() - - # load the weights trained on COCO into the model - weight_reader = WeightReader(weights_path) - weight_reader.load_weights(yolov3) - - # preprocess the image - image = cv2.imread(image_path) - image_h, image_w, _ = image.shape - new_image = preprocess_input(image, net_h, net_w) - - # run the prediction - yolos = yolov3.predict(new_image) - boxes = [] - - for i in range(len(yolos)): - # decode the output of the network - boxes += decode_netout(yolos[i][0], anchors[i], obj_thresh, nms_thresh, net_h, net_w) - - # correct the sizes of the bounding boxes - correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w) - - # suppress non-maximal boxes - do_nms(boxes, nms_thresh) - - # draw bounding boxes on the image using labels - draw_boxes(image, boxes, labels, obj_thresh) - - # write the image with bounding boxes to file - cv2.imwrite(image_path[:-4] + '_detected' + image_path[-4:], (image).astype('uint8')) - -if __name__ == '__main__': - args = argparser.parse_args() - _main_(args)