utils.py

#!/usr/bin/env python3
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Modifications for Guinet et al. 

import io
import numpy as np
import collections
from typing import List, Dict, Tuple

def load_vectors(fname: str,
                maxload: int = 10000,
                norm: bool = True,
                center: bool = False,
                verbose: bool = True) -> Tuple[List,np.array]:
    """Load word vectors from a text file.

    The first line of the file holds two integers (number of vectors and
    dimension); each following line holds a word followed by its
    space-separated components.

    Args:
        fname: Path of the file to load
        maxload: Maximum number of vectors to load; a value <= 0 loads every
            vector announced in the header
        norm: If True, scale every vector to unit L2 norm
        center: If True, subtract the mean vector, then re-normalize
        verbose: If True, print progress messages
    Returns:
        words: List of the loaded words
        x: np.array of shape (len(words), d), one vector per row
    Raises:
        OSError: If the file cannot be opened
        ValueError: If the header or a vector line cannot be parsed
    """
    if verbose:
        print("Loading vectors from %s" % fname)
    words = []
    # The context manager closes the file even when a line fails to parse.
    with io.open(fname, "r", encoding="utf-8", newline="\n", errors="ignore") as fin:
        n, d = map(int, fin.readline().split())
        if maxload > 0:
            n = min(n, maxload)
        x = np.zeros([n, d])
        for i, line in enumerate(fin):
            if i >= n:
                break
            tokens = line.rstrip().split(" ")
            words.append(tokens[0])
            x[i, :] = np.array(tokens[1:], dtype=float)
    # The header may announce more vectors than the file actually contains;
    # drop the unused all-zero rows so that x stays aligned with words.
    if len(words) < n:
        x = x[:len(words)]
    if norm:
        # 1e-8 guards against division by zero for all-zero vectors
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % (len(words)))
    return words, x


def idx(words: List[str]) -> Dict:
    """Build a word -> position lookup table.
    Args:
        words: List of words
    Returns:
        w2i: Dict mapping each distinct word to the index of its first
            occurrence in ``words``
    Raises:

    """
    positions = {}
    for position, word in enumerate(words):
        # setdefault keeps the earliest index when a word is repeated
        positions.setdefault(word, position)
    return positions


def save_vectors(fname: str,
                    x: np.array,
                    words: List[str]):
    """Save words and their embeddings into fname.

    The file starts with a header line "n d" and then holds one line per
    row of x: the word followed by its components printed with four
    decimals, separated by single spaces.

    Args:
        fname: Path where to save the file
        x: 2-D array of embeddings, one row per word
        words: List of words; words[i] labels row i of x
    Returns:
        None
    Raises:
        IndexError: If words holds fewer entries than x has rows
    """
    n, d = x.shape
    # The context manager guarantees the file is flushed and closed even if
    # a write fails part-way through.
    with io.open(fname, "w", encoding="utf-8") as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            components = " ".join("%.4f" % a for a in x[i, :])
            fout.write(words[i] + " " + components + "\n")


def save_matrix(fname: str,
                x: np.array):
    """Save a matrix into fname, usually the orthogonal matrix from the procrustes problem.

    The file starts with a header line "n d" and then holds one line per
    row of x, with the components printed with four decimals and separated
    by single spaces.

    Args:
        fname: Path where to save the file
        x: 2-D matrix to save
    Returns:
        None
    Raises:
        ValueError: If x is not 2-dimensional
    """
    n, d = x.shape
    # The context manager guarantees the file is flushed and closed even if
    # a write fails part-way through.
    with io.open(fname, "w", encoding="utf-8") as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(" ".join("%.4f" % a for a in x[i, :]) + "\n")


def procrustes(X_src: np.array,
                Y_tgt: np.array) -> np.array:
    """Solve an Orthogonal Procrustes problem through an SVD.

    The cross-product Y_tgt^T X_src is decomposed as U diag(s) V and the
    product U V is returned. This orthogonal matrix Q minimizes the
    Frobenius norm of (X_src Q^T - Y_tgt).
    Args:
        X_src, Y_tgt: Matrices from the minimization problem, with the same
            number of rows
    Returns:
        np.dot(U, V): Solution of the problem
    Raises:

    """
    cross_product = Y_tgt.T.dot(X_src)
    # The singular values are not needed, only the two orthogonal factors.
    left, _singular_values, right_t = np.linalg.svd(cross_product)
    return left.dot(right_t)


def select_vectors_from_pairs(x_src: np.array,
                                y_tgt: np.array,
                                pairs: List) -> Tuple[np.array, np.array]:
    """Gather the embeddings referenced by a list of index pairs.
    Args:
        x_src, y_tgt: Arrays of embeddings, one vector per row
        pairs: List of (source index, target index) pairs
    Returns:
        x,y: Two arrays with one row per pair; row k of x is the source
            vector of pair k and row k of y is its target vector
    Raises:

    """
    count, dim = len(pairs), x_src.shape[1]
    picked_src = np.zeros((count, dim))
    picked_tgt = np.zeros((count, dim))
    for row, (src_index, tgt_index) in enumerate(pairs):
        picked_src[row] = x_src[src_index]
        picked_tgt[row] = y_tgt[tgt_index]
    return picked_src, picked_tgt


def load_lexicon(filename: str,
                words_src: List[str],
                words_tgt: List[str],
                verbose: bool = True) -> Tuple[Dict,float]:
    """Creates a lexicon (mapping) from one language to another.

    Each non-blank line of the file holds a source word followed by a
    target word, separated by whitespace.

    Args:
        filename: Path of the bilingual dictionary file
        words_src: Words from language 1
        words_tgt: Words from language 2
        verbose: If True, print the coverage of the source vocabulary
    Returns:
        lexicon: defaultdict mapping the index of a source word to the set
            of indexes of its target translations; only pairs whose two
            words are both known are kept
        float(len(vocab)): Number of distinct source words in the file
    Raises:
        OSError: If the file cannot be opened
        ValueError: If a non-blank line does not hold exactly two words
    """
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    # The context manager closes the file even when a line is malformed.
    with io.open(filename, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            word_src, word_tgt = fields
            if word_src in idx_src and word_tgt in idx_tgt:
                lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
            # every source word counts towards the vocabulary, covered or not
            vocab.add(word_src)
    if verbose:
        # an empty file has no vocabulary: report 0 instead of dividing by 0
        coverage = len(lexicon) / float(len(vocab)) if vocab else 0.0
        print("Coverage of source vocab: %.4f" % (coverage))
    return lexicon, float(len(vocab))

def load_lexicon_reverse(filename: str,
                words_src: List[str],
                words_tgt: List[str],
                verbose: bool = True) -> Tuple[Dict,float]:
    """Creates a lexicon (mapping) from one language to another, reading the
    file with its two columns swapped.

    Each non-blank line of the file holds a target word followed by a
    source word, separated by whitespace.

    Args:
        filename: Path of the bilingual dictionary file
        words_src: Words from language 1 (second column of the file)
        words_tgt: Words from language 2 (first column of the file)
        verbose: If True, print the coverage of the source vocabulary
    Returns:
        lexicon: defaultdict mapping the index of a source word to the set
            of indexes of its target translations; only pairs whose two
            words are both known are kept
        float(len(vocab)): Number of distinct source words in the file
    Raises:
        OSError: If the file cannot be opened
        ValueError: If a non-blank line does not hold exactly two words
    """
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    # The context manager closes the file even when a line is malformed.
    with io.open(filename, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            # columns are swapped with respect to load_lexicon
            word_tgt, word_src = fields
            if word_src in idx_src and word_tgt in idx_tgt:
                lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
            # every source word counts towards the vocabulary, covered or not
            vocab.add(word_src)
    if verbose:
        # an empty file has no vocabulary: report 0 instead of dividing by 0
        coverage = len(lexicon) / float(len(vocab)) if vocab else 0.0
        print("Coverage of source vocab: %.4f" % (coverage))
    return lexicon, float(len(vocab))


def load_pairs(filename: str,
                idx_src,
                idx_tgt,
                verbose: bool = True) -> List:
    """Creates the list of index pairs found in filename whose two words are
    present in the given indexes.

    Each non-blank line of the file holds a source word and a target word
    separated by a single space.

    Args:
        filename: Path of the file listing the word pairs
        idx_src: Mapping from the words of language 1 to their index
        idx_tgt: Mapping from the words of language 2 to their index
        verbose: If True, print the number of pairs found and the coverage
    Returns:
        pairs: List of (source index, target index) tuples, in file order
    Raises:
        OSError: If the file cannot be opened
        ValueError: If a non-blank line does not hold exactly two fields
    """
    pairs = []
    tot = 0
    # The context manager closes the file even when a line is malformed.
    with io.open(filename, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            a, b = stripped.split(" ")
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # an empty file has no pairs: report 0 instead of dividing by 0
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print(
            "Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f"
            % (len(pairs), tot, coverage)
        )
    return pairs

def load_pairs_reverse(filename: str,
                idx_src,
                idx_tgt,
                verbose: bool = True) -> List:
    """Creates the list of index pairs found in filename whose two words are
    present in the given indexes, reading the file with its columns swapped.

    Each non-blank line of the file holds a target word and a source word
    separated by a single space.

    Args:
        filename: Path of the file listing the word pairs
        idx_src: Mapping from the words of language 1 (second column) to
            their index
        idx_tgt: Mapping from the words of language 2 (first column) to
            their index
        verbose: If True, print the number of pairs found and the coverage
    Returns:
        pairs: List of (source index, target index) tuples, in file order
    Raises:
        OSError: If the file cannot be opened
        ValueError: If a non-blank line does not hold exactly two fields
    """
    pairs = []
    tot = 0
    # The context manager closes the file even when a line is malformed.
    with io.open(filename, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            # columns are swapped with respect to load_pairs
            b, a = stripped.split(" ")
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # an empty file has no pairs: report 0 instead of dividing by 0
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print(
            "Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f"
            % (len(pairs), tot, coverage)
        )
    return pairs

def compute_nn_accuracy(x_src: np.array,
                        x_tgt: np.array,
                        lexicon: Dict,
                        bsz: int = 100) -> float:
    """Computes the accuracy of the alignment between x_src & x_tgt, with the nearest neighbours method.

    For every source index in the lexicon, the target vector with the
    highest cosine similarity is retrieved; the prediction counts as
    correct when its index belongs to the lexicon entry of that source
    index. The input arrays are not modified.

    Args:
        x_src: Source embeddings, one vector per row
        x_tgt: Target embeddings, one vector per row
        lexicon: True mapping, from a source index to a collection of
            accepted target indexes
        bsz: Number of source words scored per batch
    Returns:
        acc / lexicon_size: Accuracy of the alignment (0.0 when the lexicon
            is empty)
    Raises:

    """
    lexicon_size = len(lexicon)
    if lexicon_size == 0:
        # nothing to evaluate; avoids a division by zero below
        return 0.0
    idx_src = list(lexicon.keys())

    # Normalize copies rather than dividing in place, so that the caller's
    # arrays are left untouched. Only the source rows that are actually
    # queried need to be normalized.
    queries = x_src[idx_src]
    queries = queries / (np.linalg.norm(queries, axis=1)[:, np.newaxis] + 1e-8)
    targets = x_tgt / (np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8)

    acc = 0.0
    for i in range(0, lexicon_size, bsz):
        e = min(i + bsz, lexicon_size)
        # scores[t, q] is the cosine similarity of target t and query i + q
        scores = np.dot(targets, queries[i:e].T)
        pred = scores.argmax(axis=0)
        for j in range(i, e):
            if pred[j - i] in lexicon[idx_src[j]]:
                acc += 1.0
    return acc / lexicon_size

def compute_csls_accuracy(x_src: np.array,
                            x_tgt: np.array,
                            lexicon: Dict,
                            k: int = 10,
                            bsz: int = 1024) -> float:
    """Computes the accuracy of the alignment between x_src & x_tgt, with the CSLS method.

    The score of a (source, target) pair is twice their cosine similarity
    minus the mean similarity of the target to its k most similar source
    vectors. For every source index in the lexicon the best-scoring target
    is retrieved; the prediction counts as correct when its index belongs
    to the lexicon entry of that source index. The input arrays are not
    modified.

    Args:
        x_src: Source embeddings, one vector per row
        x_tgt: Target embeddings, one vector per row
        lexicon: True mapping, from a source index to a collection of
            accepted target indexes
        k: Number of neighbours; capped at the number of source vectors
        bsz: Number of target vectors processed per batch
    Returns:
        correct / lexicon_size: Accuracy of the alignment (0.0 when the
            lexicon is empty)
    Raises:

    """
    lexicon_size = len(lexicon)
    if lexicon_size == 0:
        # nothing to evaluate; avoids a division by zero below
        return 0.0
    idx_src = list(lexicon.keys())

    # Normalize copies rather than dividing in place, so that the caller's
    # arrays are left untouched.
    src = x_src / (np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8)
    tgt = x_tgt / (np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8)

    # np.partition rejects a k larger than the row length; with fewer than
    # k source vectors, average over all of them instead.
    k = min(k, src.shape[0])

    similarities = 2 * np.dot(src[idx_src], tgt.T)
    # sc2[t]: mean similarity of target t to its k closest source vectors
    sc2 = np.zeros(tgt.shape[0])
    for i in range(0, tgt.shape[0], bsz):
        j = min(i + bsz, tgt.shape[0])
        sc_batch = np.dot(tgt[i:j, :], src.T)
        # partition puts the k largest values of each row in the last k slots
        dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
        sc2[i:j] = np.mean(dotprod, axis=1)
    similarities -= sc2[np.newaxis, :]

    nn = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    for row, src_index in enumerate(idx_src):
        if nn[row] in lexicon[src_index]:
            correct += 1.0
    return correct / lexicon_size