sparse_caption package

Subpackages

Submodules

sparse_caption.opts module

Created on 16 Sep 2020 17:21:25 @author: jiahuei

sparse_caption.opts.parse_opt(arguments=None) -> argparse.Namespace

sparse_caption.tokenizer module

Created on 06 May 2020 14:03:56 @author: jiahuei

Copyright (c) Facebook, Inc. and its affiliates, under the BSD License. Copyright 2018 Google Inc., licensed under the Apache License, Version 2.0.

Google SentencePiece:

https://github.com/google/sentencepiece/blob/v0.1.86/python/sentencepiece_python_module_example.ipynb https://github.com/google/sentencepiece/blob/v0.1.86/python/README.md

TorchText:

https://github.com/pytorch/text/blob/0.6.0/torchtext/data/utils.py#L74

Facebook Research:

https://github.com/facebookresearch/pytext/blob/v0.3.2/pytext/data/tokenizers/tokenizer.py

class sparse_caption.tokenizer.CharacterTokenizer(config)

Bases: sparse_caption.tokenizer.SentencePieceUnigramTokenizer

Character tokenizer implemented using Sentence Piece.

MODEL_TYPE = 'char'
encode(input_str: str, add_bos_eos: bool = True, max_seq_length: int = 60, sampling: bool = False) -> List[int]
class sparse_caption.tokenizer.RadixTokenizer(config)

Bases: sparse_caption.tokenizer.SentencePieceUnigramTokenizer

Radix tokenizer implemented using the Sentence Piece word model.

MODEL_TYPE = 'word'
static add_argparse_args(parser: Union[argparse._ArgumentGroup, argparse.ArgumentParser])
static base_to_decimal(digits, radix)

Converts a vector of non-negative digits in the given radix into a number.

property bos_token_id

Id of the beginning of sentence token in the vocabulary.

static decimal_to_base(n, base)

Function to convert any base-10 integer to base-N, shifted by 1.

decode(input_ids: Union[List[int], torch.Tensor, numpy.ndarray]) -> str
encode(input_str: str, add_bos_eos: bool = True, max_seq_length: int = 30, sampling: bool = False) -> List[int]
encode_tokenized(input_list: List[str], add_bos_eos: bool = True, max_seq_length: int = 30) -> List[int]
property eos_token_id

Id of the end of sentence token in the vocabulary.

static grouper(iterable, group_n, fill_value=1)
id_to_token(token_id)
property mask_token_id

Id of the mask token in the vocabulary.

property pad_token_id

Id of the padding token in the vocabulary.

token_to_id(token)
property unk_token_id

Id of the unknown token in the vocabulary.

class sparse_caption.tokenizer.SentencePieceBPETokenizer(config)

Bases: sparse_caption.tokenizer.SentencePieceUnigramTokenizer

BPE (byte-pair encoding) tokenizer implemented using Sentence Piece.

MODEL_TYPE = 'bpe'
class sparse_caption.tokenizer.SentencePieceUnigramTokenizer(config)

Bases: sparse_caption.tokenizer.Tokenizer

https://github.com/google/sentencepiece/blob/v0.1.86/python/sentencepiece_python_module_example.ipynb

TrainerSpec {

input: train_captions.txt input_format: model_prefix: m model_type: UNIGRAM vocab_size: 2000 self_test_sample_size: 0 character_coverage: 0.9995 input_sentence_size: 0 shuffle_input_sentence: 1 seed_sentencepiece_size: 1000000 shrinking_factor: 0.75 max_sentence_length: 4192 num_threads: 16 num_sub_iterations: 2 max_sentencepiece_length: 16 split_by_unicode_script: 1 split_by_number: 1 split_by_whitespace: 1 treat_whitespace_as_suffix: 0 hard_vocab_limit: 1 use_all_vocab: 0 unk_id: 0 bos_id: 1 eos_id: 2 pad_id: -1 unk_piece: <unk> bos_piece: <s> eos_piece: </s> pad_piece: <pad> unk_surface: ⁇

} NormalizerSpec {

name: nmt_nfkc add_dummy_prefix: 1 remove_extra_whitespaces: 1 escape_whitespaces: 1 normalization_rule_tsv:

}

def set_vocabulary(self, valid_vocab):

return _sentencepiece.SentencePieceProcessor_set_vocabulary(self, valid_vocab)

def reset_vocabulary(self):

return _sentencepiece.SentencePieceProcessor_reset_vocabulary(self)

def load_vocabulary(self, filename, threshold):

return _sentencepiece.SentencePieceProcessor_load_vocabulary(self, filename, threshold)

MODEL_TYPE = 'unigram'
static add_argparse_args(parser: Union[argparse._ArgumentGroup, argparse.ArgumentParser])
property bos_token_id

Id of the beginning of sentence token in the vocabulary.

decode(input_ids: Union[List[int], torch.Tensor, numpy.ndarray]) -> str
encode(input_str: str, add_bos_eos: bool = True, max_seq_length: int = 16, sampling: bool = False) -> List[int]
encode_tokenized(input_list: List[str], add_bos_eos: bool = True, max_seq_length: int = 16) -> List[int]
property eos_token_id

Id of the end of sentence token in the vocabulary.

id_to_token(token_id)
property mask_token_id

Id of the mask token in the vocabulary.

property pad_token_id

Id of the padding token in the vocabulary.

token_to_id(token)
tokenize(input_str: str) -> List[sparse_caption.tokenizer.Token]
train()
property unk_token_id

Id of the unknown token in the vocabulary.

class sparse_caption.tokenizer.Token(value, start, end)

Bases: tuple

property end

Alias for field number 2

property start

Alias for field number 1

property value

Alias for field number 0

class sparse_caption.tokenizer.Tokenizer

Bases: abc.ABC

property bos_token

Beginning of sentence token (string).

abstract property bos_token_id

Id of the beginning of sentence token in the vocabulary.

control_symbols = ()
abstract decode(input_ids: List[int]) -> str
abstract encode(input_str: str, add_bos_eos: bool = True, max_seq_length: int = 16, sampling=False) -> List[int]
abstract encode_tokenized(input_list: List[str], add_bos_eos: bool = True, max_seq_length: int = 16) -> List[int]
property eos_token

End of sentence token (string).

abstract property eos_token_id

Id of the end of sentence token in the vocabulary.

abstract id_to_token(token_id)
property mask_token

Mask token (string). E.g. when training a model with masked-language modeling.

abstract property mask_token_id

Id of the mask token in the vocabulary.

property pad_token

Padding token (string).

abstract property pad_token_id

Id of the padding token in the vocabulary.

static process_tokens(input_str, tokens, token_strip_fn=None)

Calculate the start and end indices of each piece. This roughly doubles the time taken for tokenization, reducing throughput from 896,357.8 tokens/sec to 390,008.4 tokens/sec.

processor = None
special_token_attributes = ('user_defined_symbols', 'bos_token', 'eos_token', 'unk_token', 'pad_token', 'mask_token', 'bos_token_id', 'eos_token_id', 'unk_token_id', 'pad_token_id', 'mask_token_id')
abstract token_to_id(token)
abstract tokenize(input_str: str) -> List[sparse_caption.tokenizer.Token]
property unk_token

Unknown token (string).

abstract property unk_token_id

Id of the unknown token in the vocabulary.

property vocab_size

Vocabulary size, including special tokens.

class sparse_caption.tokenizer.WordTokenizer(config)

Bases: sparse_caption.tokenizer.SentencePieceUnigramTokenizer

Word tokenizer implemented using Sentence Piece.

MODEL_TYPE = 'word'
sparse_caption.tokenizer.get_tokenizer(name: str)
sparse_caption.tokenizer.register_tokenizer(name)

New tokenizers can be added with the register_tokenizer() function decorator.

For example:

@register_tokenizer('CharBPE')
class CharBPEPreTrainedTokenizer:
    (...)
Parameters

name (str) – the name of the tokenizer

sparse_caption.version module

Created on 31 Dec 2020 12:18:55 @author: jiahuei

Module contents

Created on 09 Jul 2020 23:36:50 @author: jiahuei