# Source code for vocab.vocab

import sys
from collections import Counter
from copy import deepcopy


class OutOfVocabularyException(Exception):
    """Raised when a word or an index cannot be resolved against a vocabulary."""


class Vocab:
    """
    A vocabulary object for converting between words and numerical indices.

    Attributes:
        _index2word (list): an ordered list of words in the vocabulary.
        _word2index (dict): maps words to their respective indices.
        counts (Counter): the number of times each word has been added to the vocabulary.
        _reserved (set): class-level set of words that are inserted into every
            new instance and are never discarded during pruning.
    """

    _reserved = set()  # words in here will not be discarded during pruning

    def __init__(self, words=()):
        """
        Args:
            words (:obj:`list` of :obj:`str`, optional): words to build vocab from.

        Example:
            >>> Vocab(['initial', 'words', 'for', 'the', 'vocabulary'])
        """
        self._index2word = []
        self._word2index = {}
        self.counts = Counter()
        # NOTE: `_reserved` is a set, so when it holds several words the order
        # in which they receive indices follows set iteration order.
        for w in self._reserved:
            self.word2index(w, train=True)
            # Registering a reserved word should not count as an occurrence.
            self.counts[w] = 0
        if words:
            self.word2index(words, train=True)

    def __len__(self):
        """
        Returns:
            int: number of words in the vocabulary.
        """
        return len(self._index2word)

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, len(self))

    def __eq__(self, another):
        """
        Two vocabs are equal when they are of the same class, have the same
        size, and every non-reserved word sits at the same index with the same
        count in both.
        """
        if self.__class__ != another.__class__:
            return False
        if len(self) != len(another):
            return False
        for i, w in enumerate(self._index2word):
            if w in self._reserved:
                continue
            if w != another._index2word[i] or self.counts[w] != another.counts[w]:
                return False
        return True

    def __ne__(self, another):
        return not self.__eq__(another)

    def contains_same_content(self, another, same_counts=True):
        """
        Args:
            another (Vocab): another vocab to compare against.
            same_counts (:obj:`bool`, optional): whether to also check the counts.

        Returns:
            bool: whether this vocab and `another` contain the same words
            (regardless of their indices) and, if `same_counts` is set, the
            same count for each word.
        """
        if set(self._word2index) != set(another._word2index):
            return False
        if same_counts:
            # Reading a missing key from a Counter yields 0 without inserting it.
            return all(self.counts[w] == another.counts[w] for w in self._word2index)
        return True

    def to_dict(self):
        """
        Returns:
            dict: dictionary of the vocab object. Reserved words are left out
            because they are re-created by the constructor.
        """
        return {
            'index2word': [w for w in self._index2word if w not in self._reserved],
            'counts': {k: v for k, v in self.counts.items() if k not in self._reserved},
        }

    @classmethod
    def from_dict(cls, d):
        """
        Args:
            d (dict): dictionary of the vocab object, as produced by `to_dict`.

        Returns:
            Vocab: vocab object from the given dictionary.
        """
        v = cls()
        for w in d['index2word']:
            v.word2index(w, train=True)
            # Overwrite the increment made by the training lookup above.
            v.counts[w] = d['counts'][w]
        return v

    def copy(self, keep_words=True):
        """
        Args:
            keep_words (bool): whether to copy words in the vocab. Defaults to `True`.

        Returns:
            Vocab: a deep copy of this vocab, or a fresh instance of the same
            class (holding only the reserved words) if `keep_words` is `False`.
        """
        return deepcopy(self) if keep_words else self.__class__()

    def prune_by_count(self, cutoff):
        """
        Args:
            cutoff (int): words occurring less than this number of times are removed from the new vocab.

        Returns:
            Vocab: a copy of this vocab object with words occurring less than `cutoff` times removed.
        """
        another = self.copy(keep_words=False)
        for w, c in self.counts.items():
            if c >= cutoff:
                another.word2index(w, train=True)
                another.counts[w] = c
        return another

    def prune_by_total(self, total):
        """
        Args:
            total (int): maximum number of non-reserved words to keep.

        Returns:
            Vocab: a copy of this vocab with only the top `total` most frequent
            non-reserved words kept. Reserved words are always present.
        """
        another = self.copy(keep_words=False)
        # Drop reserved words *before* truncating so that a frequently counted
        # reserved word cannot take a slot away from a regular word.
        ranked = [w for w, _ in self.counts.most_common() if w not in self._reserved]
        limit = None if total is None else max(total, 0)
        for w in ranked[:limit]:
            another.word2index(w, train=True)
            another.counts[w] = self.counts[w]
        return another

    def word2index(self, word, train=False):
        """
        Args:
            word (str): word to look up index for.
            train (:obj:`bool`, optional): if `True`, then this word will be added to the vocabulary
                (if missing) and its count incremented. Defaults to `False`.

        Returns:
            int: index corresponding to `word`.

            if `word` is a :obj:`list` or :obj:`tuple` of :obj:`str` then this function will be
            applied for each word and the corresponding list of indices is returned.

        Raises:
            OutOfVocabularyException: if `train` is `False` and `word` is not in the vocabulary
        """
        if isinstance(word, (list, tuple)):
            return [self.word2index(w, train=train) for w in word]
        if train:
            # Only touch `counts` while training: `counter[key] += 0` would
            # still create a zero-count entry for words that were merely looked up.
            self.counts[word] += train
            if word not in self._word2index:
                self._word2index[word] = len(self._index2word)
                self._index2word.append(word)
            return self._word2index[word]
        if word in self._word2index:
            return self._word2index[word]
        return self._handle_oov_word(word)

    def _handle_oov_word(self, word):
        """
        What to do when the word is out of vocabulary and not in training mode.
        You should not use this function explicitly.

        Args:
            word (str): word that triggered the OOV exception.
        """
        raise OutOfVocabularyException("Word '{}' is not in the vocabulary".format(word))

    def index2word(self, index):
        """
        Args:
            index (int): index to look up word for.

        Returns:
            str: word corresponding to `index`.

            if `index` is a :obj:`list` or :obj:`tuple` of :obj:`int` then this function will be
            applied for each index and the corresponding list of words is returned.

        Raises:
            OutOfVocabularyException: if `index` is not a valid index to the vocabulary.
        """
        if isinstance(index, (list, tuple)):
            return [self.index2word(i) for i in index]
        if index < 0:
            raise OutOfVocabularyException('Index {} is negative and is not a valid word index'.format(index))
        if index >= len(self):
            raise OutOfVocabularyException('Index {} exceeds vocab size {} and is not a valid word index'.format(index, len(self)))
        return sys.intern(self._index2word[index])

    def word2padded_index(self, lists_of_words, pad='<pad>', train=False, enforce_end_pad=True):
        """
        Args:
            lists_of_words (list): list of lists of words to pad
            pad (:obj:`str`, optional): word to use for padding. Defaults to `'<pad>'`.
            train (:obj:`bool`, optional): whether to add unknown words to the vocabulary. Defaults to `False`.
            enforce_end_pad (:obj:`bool`, optional): whether to always append a pad word to the end of each sentence.

        Returns:
            list: list of lists of word indices that are padded to be a matrix
            list: list of lengths for each valid sequence. Note that if `enforce_end_pad=True`, then the valid sequence includes the additional pad at the end.

            Both lists are empty when `lists_of_words` is empty.

        Raises:
            OutOfVocabularyException: if `lists_of_words` contains words not in the vocabulary and `train=False`.
        """
        if pad not in self._word2index and not train:
            raise OutOfVocabularyException("Pad word '{}' is not in the vocabulary".format(pad))
        if enforce_end_pad:
            # list(s) lets tuple sentences be padded as well.
            seqs = [list(s) + [pad] for s in lists_of_words]
        else:
            seqs = lists_of_words
        lens = [len(s) for s in seqs]
        # default=0 keeps an empty batch from raising ValueError.
        max_len = max(lens, default=0)
        indices = [self.word2index(s, train=train) for s in seqs]
        pad_index = self.word2index(pad, train=train)
        padded_indices = [s + [pad_index] * (max_len - l) for s, l in zip(indices, lens)]
        return padded_indices, lens

    def padded_index2word(self, padded_indices, pad='<pad>'):
        """
        Args:
            padded_indices (list): list of lists of word indices to depad
            pad (:obj:`str`, optional): word to use for padding. Defaults to `'<pad>'`.

        Returns:
            list: list of lists of words that correspond to the depadded `padded_indices`.
            Each sequence is cut at its first pad index; a sequence without
            any pad index is converted in full.

        Raises:
            OutOfVocabularyException: if `padded_indices` contains indices not in the vocabulary or if `pad` is a word not in the vocabulary.
        """
        if pad not in self._word2index:
            raise OutOfVocabularyException("Pad word '{}' is not in the vocabulary".format(pad))
        pad_index = self.word2index(pad)
        depadded = []
        for indices in padded_indices:
            try:
                end = indices.index(pad_index)
            except ValueError:
                # No padding present: keep the whole sequence.
                end = len(indices)
            depadded.append(self.index2word(indices[:end]))
        return depadded