"""Utility class used in JSFusion model, copied from the original author's code
https://github.com/yj-yu/lsmdc/blob/master/videocap/datasets/data_util.py
"""
import time
import numpy as np
import re


def clean_str(string, downcase=True):
    """Tokenization/string cleaning for strings.

    Taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`(_____)]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower() if downcase else string.strip()
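# Illustrative example (not part of the original file), showing what the cleaning
# above produces for a toy caption:
#   clean_str("He's running, isn't he")  ->  "he 's running , is n't he"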

def recover_word(string):
    string = re.sub(r" \'s", "\'s", string)
    string = re.sub(r" ,", ",", string)
    return string

def clean_blank(blank_sent):
    """Tokenizes a blanked sentence and replaces the blank marker _____ with <START>.
    <START> marks the answer position in the fill-in-the-blank (FIB) task.
    """
    clean_sent = clean_str(blank_sent).split()
    return ['<START>' if x == '_____' else x for x in clean_sent]
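# Illustrative example (not part of the original file):
#   clean_blank("She opens the _____ and leaves.")
#   -> ['she', 'opens', 'the', '<START>', 'and', 'leaves']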


def clean_root(string):
    """Removes unexpected characters from a root (verb) string.
    Currently a placeholder that returns the string unchanged.
    """
    return string


def pad_sequences(sequences, pad_token="[PAD]", pad_location="LEFT", max_length=None):
    """Pads all sequences to the same length.

    If max_length is not given, the length of the longest sequence is used.
    Longer sequences are truncated. Returns the padded sequences.
    """
    if not max_length:
        max_length = max(len(x) for x in sequences)

    result = []
    for i in range(len(sequences)):
        sentence = sequences[i]
        num_padding = max_length - len(sentence)
        if num_padding == 0:
            new_sentence = sentence
        elif num_padding < 0:
            new_sentence = sentence[:num_padding]
        elif pad_location == "RIGHT":
            new_sentence = sentence + [pad_token] * num_padding
        elif pad_location == "LEFT":
            new_sentence = [pad_token] * num_padding + sentence
        else:
            raise ValueError("Invalid pad_location. Specify LEFT or RIGHT.")
        result.append(new_sentence)
    return result
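# Illustrative example (not part of the original file); padding defaults to the LEFT:
#   pad_sequences([['a', 'b', 'c'], ['d']])
#   -> [['a', 'b', 'c'], ['[PAD]', '[PAD]', 'd']]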


def convert_sent_to_index(sentence, word_to_index):
    """Converts a sentence (list of word strings) to a list of vocabulary indices.
    Words that are not in the vocabulary are mapped to index 0.
    """
    return [word_to_index[word] if word in word_to_index else 0 for word in sentence]
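# Illustrative example (not part of the original file), with a toy vocabulary:
#   convert_sent_to_index(['a', 'man', 'skates'], {'a': 1, 'man': 2})  ->  [1, 2, 0]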


def batch_iter(data, batch_size, seed=None, fill=True):
    """Generates a batch iterator for a dataset.
    """
    random = np.random.RandomState(seed)
    data_length = len(data)
    num_batches = int(data_length / batch_size)
    if data_length % batch_size != 0:
        num_batches += 1

    # Shuffle the data at each epoch
    shuffle_indices = random.permutation(np.arange(data_length))
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_length)
        selected_indices = shuffle_indices[start_index:end_index]
        # If we don't have enough data left for a whole batch, fill it randomly
        if fill and end_index >= data_length:
            num_missing = batch_size - len(selected_indices)
            selected_indices = np.concatenate([selected_indices, random.randint(0, data_length, num_missing)])
        yield [data[i] for i in selected_indices]
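# Illustrative usage (toy data, not part of the original file): with 5 items and
# batch_size=2 this yields 3 shuffled batches; because fill=True, the final batch
# is topped up to batch_size items by re-sampling random indices.
#   batches = list(batch_iter(list(range(5)), batch_size=2, seed=0))
#   len(batches) == 3 and all(len(b) == 2 for b in batches)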


def fsr_iter(fsr_data, batch_size, random_seed=42, fill=True):
    """fsr_data: one of LSMDCData.build_data(), [[video_features], [sentences], [roots]]
    Yields per iteration: [[feature] * batch_size, [sentences] * batch_size, [roots] * batch_size]

    Usage:
        train_data, val_data, test_data = LSMDCData.build_data()
        for features, sentences, roots in fsr_iter(train_data, 20, 10):
            feed_dict = {model.video_feature: features,
                         model.sentences: sentences,
                         model.roots: roots}
    """

    train_iter = batch_iter(list(zip(*fsr_data)), batch_size, fill=fill, seed=random_seed)
    return map(lambda batch: zip(*batch), train_iter)


def preprocess_sents(descriptions, word_to_index, max_length):
    descriptions = [clean_str(sent).split() for sent in descriptions]
    # Pad each sentence (on the left, the pad_sequences default) so all have the same length.
    descriptions = pad_sequences(descriptions, max_length=max_length)
    # Convert sentences from lists of strings to lists of indices (int).
    descriptions = [convert_sent_to_index(sent, word_to_index) for sent in descriptions]

    return descriptions
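# Illustrative example (not part of the original file), with a toy vocabulary;
# '[PAD]' is not in the vocabulary, so pad tokens map to index 0:
#   preprocess_sents(["A man runs"], {'a': 1, 'man': 2, 'runs': 3}, max_length=5)
#   -> [[0, 0, 1, 2, 3]]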


def preprocess_roots(roots, word_to_index):
    # Remove punctuation marks and special chars from the root.
    roots = [clean_root(root) for root in roots]
    # Convert each root string to its int index; unknown roots map to 0.
    roots = [word_to_index[root] if root in word_to_index else 0 for root in roots]

    return roots


def pad_video(video_feature, dimension, padded_feature=None):
    """Pads a video feature with zero frames so that all videos have the same length.
    Padding is added on the left:
        video = [pad, ..., pad, frm1, frm2, ..., frmN]
    Videos longer than dimension[0] are uniformly subsampled instead.
    """
    if padded_feature is None:
        padded_feature = np.zeros(dimension, dtype=np.float32)
    max_length = dimension[0]
    current_length = video_feature.shape[0]
    num_padding = max_length - current_length
    if num_padding == 0:
        padded_feature[:] = video_feature
    elif num_padding < 0:
        # Too many frames: pick max_length frames at evenly spaced positions.
        steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
        padded_feature[:] = video_feature[steps]
    else:
        # Copy the frames into the tail of the zero array (author's note: about 0.7 sec).
        padded_feature[num_padding:] = video_feature

    return padded_feature
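# Illustrative example (not part of the original file):
#   pad_video(np.ones((3, 4), dtype=np.float32), (5, 4))
#   -> a (5, 4) array whose first 2 rows are zeros and last 3 rows are ones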

def repeat_pad_video(video_feature, dimension):
    padded_feature = np.zeros(dimension, dtype=np.float32)
    max_length = dimension[0]
    current_length = video_feature.shape[0]

    if current_length == max_length:
        padded_feature[:] = video_feature

    elif current_length < max_length:
        # Tile the clip until it fills the target length; any leftover slots at the
        # front are filled with the clip's last frames.
        tile_num = int(max_length / current_length)
        to_tile = np.ones(len(dimension), dtype=np.int32)
        to_tile[0] = tile_num
        remainder = max_length % current_length
        tiled_vid = np.tile(video_feature, to_tile)
        if remainder > 0:
            padded_feature[0:remainder] = video_feature[-remainder:]
        padded_feature[remainder:] = tiled_vid

    else:
        # Too many frames: pick max_length frames at evenly spaced positions.
        steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
        padded_feature[:] = video_feature[steps]
    return padded_feature
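# Illustrative example (not part of the original file): a 2-frame clip [f1, f2]
# repeat-padded to length 5 becomes [f2, f1, f2, f1, f2] -- the clip is tiled and
# the single leftover slot at the front is filled with the clip's last frame.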

def stretch_pad_video(video_feature, dimension):
    padded_feature = np.zeros(dimension, dtype=np.float32)
    max_length = dimension[0]
    current_length = video_feature.shape[0]

    if current_length == max_length:
        padded_feature[:] = video_feature
    elif current_length < max_length:
        # Stretch the clip by repeating each frame, then sample it back down to max_length.
        repeat_num = int((max_length - 1) / current_length) + 1
        tiled_vid = np.repeat(video_feature, repeat_num, axis=0)
        steps = np.linspace(0, repeat_num * current_length, num=max_length, endpoint=False, dtype=np.int32)
        padded_feature[:] = tiled_vid[steps]
    else:
        # Too many frames: pick max_length frames at evenly spaced positions.
        steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
        padded_feature[:] = video_feature[steps]
    return padded_feature
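# Illustrative example (not part of the original file): a 2-frame clip [f1, f2]
# stretch-padded to length 5 becomes [f1, f1, f1, f2, f2] -- frame order is kept
# and each frame is repeated to fill the target length.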


def fill_mask(max_length, current_length, zero_location='LEFT'):
    num_padding = max_length - current_length
    if num_padding <= 0:
        mask = np.ones(max_length)
    elif zero_location == 'LEFT':
        mask = np.ones(max_length)
        for i in range(num_padding):
            mask[i] = 0
    elif zero_location == 'RIGHT':
        mask = np.zeros(max_length)
        for i in range(current_length):
            mask[i] = 1
    else:
        raise ValueError("Invalid zero_location. Specify LEFT or RIGHT.")

    return mask
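# Illustrative example (not part of the original file):
#   fill_mask(5, 3, zero_location='LEFT')   ->  array([0., 0., 1., 1., 1.])
#   fill_mask(5, 3, zero_location='RIGHT')  ->  array([1., 1., 1., 0., 0.])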