
Commit 440d2e6

added code and dataset
1 parent 2dce0b9 commit 440d2e6

17 files changed, +2528 −4 lines

.env

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
source activate compsumm > /dev/null 2>&1

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
.DS_Store
__pycache__

README.md

Lines changed: 34 additions & 4 deletions
@@ -1,14 +1,44 @@
 # compsumm
 Code, Datasets and Supplementary Appendix for AAAI paper **Comparative Document Summarisation via Classification**

-**Code**: Coming Soon
+**Supplementary Appendix**: [pdf](/appendix.pdf)

-**Dataset**: Coming Soon
+## How to use this repository?

-**Supplementary Appendix**: [pdf](/appendix.pdf)
+### Installing
+If you have miniconda or anaconda, please use `install.sh` to install a new env `compsumm` that has all dependencies; otherwise the dependencies are listed in `environment.yml`

-If you use this dataset, please cite this work at
+### 1. Dataset
+The datasets are in the `datasets` directory in `HDF5` format. There is one file for each of the three news topics used in the paper. Each file has the following structure:
+```
+-- data: averaged GloVe vectors of the title and first 3 sentences, 300-dimensional
+-- y: labels created by dividing time ranges into two groups
+-- yn: labels created using month for beefban and week for capital punishment and guncontrol.
+-- title: title of the article
+-- text: first three sentences
+-- datetime: date of publication
+
+The dataset was split 70-20-10 into train-test-val sets several times.
+-- train_idxs: Matrix with each row i containing the training indexes of split i.
+-- test_idxs: Matrix with each row i containing the test indexes of split i.
+-- val_idxs: Matrix with each row i containing the val indexes of split i.
+```
+Please see `news.py` for an example of loading this dataset.

+### 2. Code
+Please see the [demo notebook](/demo.ipynb) for example usage of `subm.py` and `grad.py`:
+- `subm.py` has utility functions and a greedy optimiser for discrete optimisation.
+- `grad.py` has utility functions and an SGD optimiser for continuous optimisation. The SGD optimiser wasn't used; LBFGS from scipy was used instead.
+
+`models.py` has several models for summarisation as classifiers. Models were abstracted into the `Summ` class. This is particularly useful for creating a common pattern across the different summariser methods and for tuning hyperparameters. Please see the [models notebook](/models.ipynb) for a demo of `news.py` and `models.py`.
+
+`utils.py` has common functions such as `balanced accuracy`, which is used for evaluation.
+
+### 3. Crowd-sourced evaluation results
+The crowd-sourced evaluation results are in the file `crowdflower.csv`. The design and settings for this experiment are explained in the paper.
+
+### 4. Citing
+If you use this dataset, please cite this work at
 ```
 @inproceedings{bista2019compsumm,
   title={Comparative Document Summarisation via Classification},
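The HDF5 layout described in the README above maps directly onto `h5py`. The snippet below is a minimal loading sketch for one topic file and one split; it is illustrative only (it is not the repository's `news.py`, and the key names are taken from the README description, so treat them as assumptions).

```python
import h5py
import numpy as np

# Illustrative sketch: keys follow the structure documented in the README above.
with h5py.File("datasets/guncontrol.h5", "r") as hf:
    X = hf["data"][:]                  # averaged GloVe vectors, shape (n_articles, 300)
    y = np.array(hf["y"][:])           # two-group time-range labels
    train_idxs = hf["train_idxs"][:]   # row i holds the training indexes of split i
    test_idxs = hf["test_idxs"][:]
    val_idxs = hf["val_idxs"][:]

# Use split 0 of the repeated 70-20-10 train-test-val splits.
X_tr, y_tr = X[train_idxs[0]], y[train_idxs[0]]
X_te, y_te = X[test_idxs[0]], y[test_idxs[0]]
X_va, y_va = X[val_idxs[0]], y[val_idxs[0]]
print(X_tr.shape, X_te.shape, X_va.shape)
```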

crowdflower.csv

Lines changed: 555 additions & 0 deletions
Large diffs are not rendered by default.

datasets/beefban.h5

4.27 MB
Binary file not shown.

datasets/capital.h5

25.3 MB
Binary file not shown.

datasets/guncontrol.h5

20.5 MB
Binary file not shown.

demo.ipynb

Lines changed: 320 additions & 0 deletions
Large diffs are not rendered by default.

environment.yml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
name: compsumm
dependencies:
  - python=3
  - numpy
  - joblib
  - pandas
  - scipy
  - jupyter
  - matplotlib
  - scikit-learn
  - h5py
  - seaborn
  - pip:
    - mypy
    - profilehooks

grad.py

Lines changed: 306 additions & 0 deletions
@@ -0,0 +1,306 @@
import scipy as sp
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels
from typing import Tuple, Callable, Any, List
from functools import partial
from profilehooks import profile
from math import ceil
import time

### MMD-Grad ###
# @profile
def ekxs_cost_grad(A: np.ndarray, X: np.ndarray, gamma: float, **kwargs: Any) -> Tuple[float, np.ndarray]:
    '''Cost and gradient of the cross term -2 * mean k(x, a) w.r.t. the prototypes A.'''
    n, d = X.shape
    m = A.shape[0] // d
    A = A.reshape(m, d)
    Kxa = pairwise_kernels(X, A, metric="rbf", gamma=gamma)
    cost = -2 * Kxa.mean()
    Grad = np.zeros((m, d))
    for l in range(A.shape[0]):
        Grad[l] -= ((X - A[l]).T * Kxa[:, l]).T.mean(axis=0)
    return cost, 4 * gamma / m * Grad.flatten()


def mmd_cost_grad(A: np.ndarray, X: np.ndarray, gamma: float, **kwargs: Any) -> Tuple[float, np.ndarray]:
    '''Cost and gradient of MMD^2(X, A) (dropping the constant k(x, x') term) w.r.t. A.'''
    n, d = X.shape
    m = A.shape[0] // d
    A = A.reshape(m, d)
    Kxa = pairwise_kernels(X, A, metric="rbf", gamma=gamma)
    Kaa = pairwise_kernels(A, A, metric="rbf", gamma=gamma)
    cost = -2 * Kxa.mean() + Kaa.mean()
    Grad = np.zeros((m, d))
    for l in range(A.shape[0]):
        Grad[l] -= ((X - A[l]).T * Kxa[:, l]).T.mean(axis=0)
        Grad[l] += ((A - A[l]).T * Kaa[:, l]).T.mean(axis=0)
    return cost, 4 * gamma / m * Grad.flatten()


def ekxs_cost(A: np.ndarray, X: np.ndarray, gamma: float, **kwargs: Any) -> float:
    n, d = X.shape
    return -2 * pairwise_kernels(X, A.reshape(A.shape[0] // d, d), metric="rbf", gamma=gamma).mean()


def mmd_cost(A: np.ndarray, X: np.ndarray, gamma: float, **kwargs: Any) -> float:
    n, d = X.shape
    m = A.shape[0] // d
    A = A.reshape(m, d)
    Kxa = pairwise_kernels(X, A, metric="rbf", gamma=gamma)
    Kaa = pairwise_kernels(A, A, metric="rbf", gamma=gamma)
    return -2 * Kxa.mean() + Kaa.mean()


#### MMD grad with labels: different than other data
# @profile
def mmd_cost_grad_labels(A: np.ndarray, X: np.ndarray, y: np.ndarray,
                         gamma: float, gamma2: float, lambdaa: float = 0.0,
                         diff="data", **kwargs: Any) -> Tuple[float, np.ndarray]:
    cost = 0.0
    n, d = X.shape
    m = A.shape[0] // d
    A = A.reshape(m, d)
    Grad = np.zeros((m, d))
    classes = sorted(set(y))
    perclass = m // len(classes)

    for c, k in enumerate(classes):
        Ac = A[c * perclass:(c + 1) * perclass, :].flatten()
        Xc = X[np.where(y == k)[0], :]
        Xk = X[np.where(y != k)[0], :]
        # pull the prototypes of class k towards the data of class k
        cost_c, Grad_c = mmd_cost_grad(Ac, Xc, gamma)
        cost = cost + cost_c
        Grad[c * perclass:(c + 1) * perclass, :] = Grad_c.reshape(perclass, d)
        if lambdaa > 0:
            # push the prototypes of class k away from the other classes' data
            cost_k, Grad_k = 0.0, 0.0
            if diff == "data":
                cost_k, Grad_k = mmd_cost_grad(Ac, Xk, gamma2)
                Grad_k = Grad_k.reshape(perclass, d)
            elif diff == "EKxs":
                cost_k, Grad_k = ekxs_cost_grad(Ac, Xk, gamma2)
                Grad_k = Grad_k.reshape(perclass, d)
            cost -= lambdaa * cost_k
            Grad[c * perclass:(c + 1) * perclass, :] -= lambdaa * Grad_k
    if diff == "params" and lambdaa > 0:
        cost_k, Grad_k = mmd_cost_grad_params(A.reshape(-1), d, len(classes), gamma2)
        cost -= lambdaa * cost_k
        Grad -= lambdaa * Grad_k.reshape(m, d)
    return cost, Grad.flatten()


### cost only
def mmd_cost_params(A: np.ndarray, d: int, K: int, gamma: float) -> float:
    # print(A.shape, d, K, gamma)
    m = A.shape[0] // d
    A = A.reshape(m, d)
    mk = m // K
    cost = 0
    Kaa = pairwise_kernels(A, metric="rbf", gamma=gamma)

    for k in np.arange(K):
        r = np.arange(k * mk, (k + 1) * mk, 1)
        msk = np.zeros(m, dtype=bool)
        msk[r] = True
        cost_k = Kaa[msk, :][:, msk].mean() + Kaa[~msk, :][:, ~msk].mean() - 2 * Kaa[msk, :][:, ~msk].mean()
        cost += cost_k
    return cost


def mmd_cost_labels(A: np.ndarray, X: np.ndarray, y: np.ndarray,
                    gamma: float, gamma2: float, lambdaa: float = 0.0,
                    diff="data", **kwargs: Any) -> float:
    cost = 0.0
    _, d = X.shape
    m = A.shape[0] // d
    A = A.reshape(m, d)
    classes = sorted(set(y))
    perclass = m // len(classes)

    for c, k in enumerate(classes):
        Ac = A[c * perclass:(c + 1) * perclass, :].flatten()
        Xc = X[np.where(y == k)[0], :]
        Xk = X[np.where(y != k)[0], :]
        cost_c, Grad_c = mmd_cost_grad(Ac, Xc, gamma)
        cost = cost + cost_c
        cost_k = 0.0
        if diff == "data" and lambdaa > 0:
            cost_k = mmd_cost(Ac, Xk, gamma2)
        elif diff == "EKxs":
            cost_k = ekxs_cost(Ac, Xk, gamma2)
        cost -= lambdaa * cost_k
    if diff == "params" and lambdaa > 0:
        cost_k = mmd_cost_params(A.reshape(-1), d, len(classes), gamma2)
        cost -= lambdaa * cost_k
    return cost


## MMD with labels: different prototypes
# @profile
def mmd_cost_grad_params(A: np.ndarray, d: int, K: int, gamma: float) -> Tuple[float, np.ndarray]:
    # print(A.shape, d, K, gamma)
    m = A.shape[0] // d
    A = A.reshape(m, d)
    mk = m // K
    cost = 0
    Grad = np.zeros((m, d))
    Kaa = pairwise_kernels(A, metric="rbf", gamma=gamma)

    for k in np.arange(K):
        r = np.arange(k * mk, (k + 1) * mk, 1)
        msk = np.zeros(m, dtype=bool)
        msk[r] = True
        # MMD^2 between the prototypes of group k and all other prototypes
        cost_k = Kaa[msk, :][:, msk].mean() + Kaa[~msk, :][:, ~msk].mean() - 2 * Kaa[msk, :][:, ~msk].mean()
        cost += cost_k

        for l in range(m):
            if l in r:
                Grad[l] += (4 * gamma / mk * ((A[msk, :] - A[l]).T * Kaa[msk, :][:, l]).T.mean(axis=0))
                Grad[l] -= (4 * gamma / mk * ((A[~msk, :] - A[l]).T * Kaa[~msk, :][:, l]).T.mean(axis=0))
            if l not in r:
                Grad[l] += (4 * gamma / (m - mk) * ((A[~msk, :] - A[l]).T * Kaa[~msk, :][:, l]).T.mean(axis=0))
                Grad[l] -= (4 * gamma / (m - mk) * ((A[msk, :] - A[l]).T * Kaa[msk, :][:, l]).T.mean(axis=0))
    return cost, Grad.flatten()


####################################################################################################
########################################## OPTIMIZATION ############################################
def step_decay(epochs: int, drop: float = 0.5, epochs_drop: int = 10) -> float:
    '''
    step decay of learning rate
    '''
    return drop ** ((1 + epochs) // epochs_drop)


def gd(func: Callable[..., Tuple[float, np.ndarray]],
       param0: np.ndarray, lr: float = 0.1, beta: float = 0.9,
       decay: Callable[[int], float] = partial(step_decay, drop=0.5, epochs_drop=10),
       max_epochs: int = 100, tol: float = 1e-6, **kwargs) -> Tuple[np.ndarray, List[float]]:
    '''
    Gradient descent with momentum
    args:
        - func => f: (param, *args) -> cost, grad
        - param0 => initial guess
        - kwargs => optional arguments to the cost/grad function
        - lr => learning rate
        - beta => momentum parameter
        - max_epochs => maximum number of epochs
        - tol => tolerance of param for stopping criteria
    returns:
        - param: optimised parameter
        - costs: list of costs evaluated in each iteration
    '''
    costs = []
    V = np.zeros_like(param0)
    for epoch in range(max_epochs):
        cost, grad = func(param0, **kwargs)
        V = beta * V + grad
        lr_ = lr * decay(epoch)
        param = param0 - lr_ * V
        costs.append(cost)
        if np.abs(param - param0).sum() <= tol:
            # if i >= 1 and np.abs(costs[-1] - costs[-2]) <= tol:
            break
        param0 = param
    return param, costs


def sgd(func, param0, X, y=None, batch_size=100, lr=0.1, beta=0.9,
        decay=partial(step_decay, drop=0.5, epochs_drop=5), tol=1e-6,
        max_epochs=100, **kwargs):
    '''
    Stochastic gradient descent with momentum
    args:
        - func => f: (param, *args) -> cost, grad
        - param0 => initial guess
        - args => optional arguments to the cost/grad function
        - lr => learning rate
        - beta => momentum parameter
        - max_epochs => maximum number of epochs
        - tol => tolerance of param for stopping criteria
    returns:
        - param: optimised parameter
        - costs: list of costs evaluated in each iteration
    '''
    costs = []
    N, _ = X.shape
    V = np.zeros_like(param0)
    num_batches = ceil(N / batch_size)
    print("starting sgd with {} batches".format(num_batches))
    for epoch in range(max_epochs):
        for i in range(num_batches):
            if y is not None:
                cost, grad = func(param0.flatten(),
                                  X=X[i * batch_size: (i + 1) * batch_size],
                                  y=y[i * batch_size: (i + 1) * batch_size],
                                  **kwargs)
            else:
                cost, grad = func(param0.flatten(),
                                  X=X[i * batch_size: (i + 1) * batch_size],
                                  **kwargs)

            V = beta * V + grad.reshape(param0.shape)
            lr_ = lr * decay(epoch)
            param = param0 - lr_ * V
            costs.append(cost)
            if np.abs(param - param0).sum() <= tol:
                break
            param0 = param
        # print("epoch {} => {:.4f}".format(epoch + 1, cost))
    # print("sgd costs:", costs)
    return param, costs


#############################################################################
#################### Just some tests from here #########################
import h5py


def hdf5(path):

    with h5py.File(path, 'r') as hf:
        X = hf.get("data_pca85")[:]
        y = np.array(hf.get("target")[:], dtype=np.uint8)
        train_idxs = hf.get("train_idxs")[:]
        test_idxs = hf.get('test_idxs')[:]

    return X[train_idxs[0]], y[train_idxs[0]], X[test_idxs[0]], y[test_idxs[0]]


def eval_sgd():
    from scipy.optimize import check_grad, approx_fprime, minimize
    from sklearn.cluster import KMeans
    import os
    X_tr, y_tr, X_te, y_te = hdf5(
        os.environ["HOME"] + "/Nextcloud/datasets/usps.h5"
    )

    m = 4
    gamma = 0.04
    A0 = []
    # initialise two k-means centres (fit on the training data) per class
    for c in sorted(set(y_tr)):
        kmeans = KMeans(n_clusters=2, init="k-means++", random_state=29)
        kmeans.fit(X_tr)
        A0.append(kmeans.cluster_centers_)
    A0 = np.concatenate(A0, axis=0)
    print("kmeans completed")
    A, costs = sgd(mmd_cost_grad_labels, A0, X_tr, y_tr,
                   gamma=gamma, gamma2=gamma, lambdaa=0.1)
    print("sgd", costs)


def eval_lbfgs():
    from scipy.optimize import check_grad, approx_fprime, minimize
    from sklearn.cluster import KMeans
    import os
    X_tr, y_tr, X_te, y_te = hdf5(
        os.environ["HOME"] + "/Nextcloud/datasets/usps.h5"
    )

    m = 4
    gamma = 0.04
    A0 = []
    for c in sorted(set(y_tr)):
        kmeans = KMeans(n_clusters=2, init="k-means++", random_state=29)
        kmeans.fit(X_tr)
        A0.append(kmeans.cluster_centers_)
    A0 = np.concatenate(A0, axis=0)

    print("kmeans completed")
    opt = sp.optimize.minimize(mmd_cost_grad_labels, A0.flatten(),
                               args=(X_tr, y_tr, gamma, gamma, 0.1),
                               method='L-BFGS-B', jac=True, tol=1e-6,
                               options={'maxiter': 100, 'disp': True})
    # A = opt.x.reshape(m, X_tr.shape[1])
    print("LBFGS", opt.x.shape)


def main():
    eval_sgd()
    eval_lbfgs()


if __name__ == '__main__':
    main()
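As a quick sanity check of the analytic gradients in `grad.py`, `mmd_cost_grad` can be compared against a finite-difference estimate on random data. This is a minimal sketch, not part of the commit, and it assumes `grad.py` (and its dependencies such as `profilehooks`) are importable from the working directory:

```python
import numpy as np
from scipy.optimize import check_grad

from grad import mmd_cost, mmd_cost_grad  # assumes grad.py is on the path

rng = np.random.RandomState(0)
X = rng.randn(50, 5)              # 50 data points in 5 dimensions
A0 = rng.randn(2, 5).flatten()    # 2 prototypes, flattened as the optimisers expect
gamma = 0.5

# check_grad returns the norm of the difference between the analytic gradient
# and a finite-difference estimate; it should be small if the two agree.
err = check_grad(lambda a: mmd_cost(a, X, gamma),
                 lambda a: mmd_cost_grad(a, X, gamma)[1],
                 A0)
print("gradient check error:", err)
```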

install.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
#!/bin/bash
echo "Installing/updating conda environment and dependencies"
conda env create -f environment.yml || conda env update
echo "creating .env for autoenv, follow README.md for its installation"
rm .env
echo "source activate compsumm > /dev/null 2>&1" >> .env
echo "Installation completed !!!"

0 commit comments

Comments
 (0)