core.model.transformer.dataset
View Source
import shelve, atexit, os
from torch.utils.data import Dataset as TorchDataset
from abc import ABC, abstractmethod
from core.utils import check_and_create_folder, clear_filename, Random, CacheName
import core.utils.const as const


class Dataset(TorchDataset, ABC):
    """
    Torch Dataset to use with Transformers, representing a `core.corpus.annotated_corpus.AnnotatedCorpus`.
    This class is *abstract*, use subclasses for each specific task!
    """

    MAX_INPUT_LEN = 512
    """ Currently the max input length for BERT (length of "input_ids") """

    BATCH_SIZE_INNER = 1
    """
    The inner batch size (of one item added by self._add(data));
    1 means only one row of "input_ids" per self._add(data)
    """

    def __init__(self, annotated_corpus, tokenizer, memory_only=False):
        """
        Args:
            annotated_corpus (`core.corpus.annotated_corpus.AnnotatedCorpus`): The AnnotatedCorpus to represent
            tokenizer (`transformers.PreTrainedTokenizer`): The tokenizer to use
            memory_only (bool): If True, keep all data in memory only; if False, cache data on disk (saves memory and speeds up repeated use)
        """
        self.annotated_corpus = annotated_corpus
        self.tokenizer = tokenizer
        self.memory_only = memory_only

        if self.annotated_corpus.is_cacheable() and not self.memory_only:
            # shelve-based persistent storage
            check_and_create_folder(const.TOKENIZER_CACHEDIR)

            # open a new, empty shelve and fill in all data
            if not self._is_cached():
                # shelve picks its own file suffix, so drop the ".db" here
                self.storage = shelve.open(self._get_cachefilename()[:-3], flag='n', writeback=False)
                # process all data into the shelve
                self.index = 0
                self._load_and_tokenize()
                self.storage['items_count'] = self.index
                # close the shelve (to make sure everything is written to disk!)
                self.storage.close()

            # now open the shelve read-only
            self.storage = shelve.open(self._get_cachefilename()[:-3], flag='r', writeback=False)
            # make sure to close it at the end!
            atexit.register(lambda: self.storage.close())
        else:
            # RAM-only storage
            self.storage = {}
            self.index = 0
            self._load_and_tokenize()

    @abstractmethod
    def _load_and_tokenize(self):
        '''
        Load the annotated corpus and tokenize all parts correctly as a dataset.
        Add each item via self._add(item)
        '''
        pass

    @abstractmethod
    def compute_metrics(self, eval_result):
        '''
        Function to calculate metrics from the model's returned predictions.
        Use with `transformers.Trainer` as ``compute_metrics=Dataset.compute_metrics``
        '''
        pass

    def _add(self, item):
        ''' Add a new item '''
        self.storage[str(self.index)] = item
        self.index += 1

    def __getitem__(self, index):
        ''' Get an item as specified by the PyTorch protocol '''
        if index >= self.__len__():
            raise IndexError
        return self.storage[str(index)]

    def __len__(self):
        ''' Get the length as specified by the PyTorch protocol '''
        if 'items_count' in self.storage:
            return self.storage['items_count']
        else:
            return len(self.storage)

    def get_cachename(self):
        return str(type(self.tokenizer).__name__).lower() \
            + '-' + str(type(self).__name__).lower() \
            + '-' + self.annotated_corpus.get_cachename() \
            + '_' + str(Random.get_seed()) + '_'

    def _get_cachefilename(self):
        ''' Returns the filename of the dataset's cachefile (pickle file used by shelve) '''
        return CacheName.filename(os.path.join(
            const.TOKENIZER_CACHEDIR,
            clear_filename(self.get_cachename()) + ".db"
        ))

    def _is_cached(self):
        return os.path.isdir(const.TOKENIZER_CACHEDIR) and os.path.isfile(self._get_cachefilename())

    def _tokenized_len(self, sentence):
        '''
        Get the number of tokens needed to encode this sentence
        (given as a string or a list of words).
        '''
        if isinstance(sentence, list):
            input_str = ' '.join(sentence)
        elif isinstance(sentence, str):
            input_str = sentence
        else:
            raise TypeError("Only a list of words or an entire sentence as string is allowed!")
        return len(self.tokenizer.tokenize(input_str))
class Dataset(TorchDataset, ABC):
Torch Dataset to use with Transformers, representing a core.corpus.annotated_corpus.AnnotatedCorpus. This class is abstract; use subclasses for each specific task!
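A concrete subclass has to implement _load_and_tokenize and compute_metrics. The following minimal sketch shows what such a subclass could look like; the class name, the corpus iteration, and the label handling are hypothetical and have to be adapted to the actual core.corpus.annotated_corpus.AnnotatedCorpus interface:

class SentencePairDataset(Dataset):
    ''' Hypothetical example subclass, for illustration only '''

    def _load_and_tokenize(self):
        # Assumption: the corpus yields (text, label) pairs; adapt this
        # to the real AnnotatedCorpus interface.
        for text, label in self.annotated_corpus:
            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=Dataset.MAX_INPUT_LEN,
                padding='max_length',
                return_tensors='pt',
            )
            # one row of "input_ids" per item, matching BATCH_SIZE_INNER = 1
            item = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
            item['labels'] = label
            self._add(item)

    def compute_metrics(self, eval_result):
        # eval_result is a transformers.EvalPrediction with
        # .predictions (logits) and .label_ids
        predictions = eval_result.predictions.argmax(axis=-1)
        return {'accuracy': float((predictions == eval_result.label_ids).mean())}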
View Source
def __init__(self, annotated_corpus, tokenizer, memory_only=False):
    """
    Args:
        annotated_corpus (`core.corpus.annotated_corpus.AnnotatedCorpus`): The AnnotatedCorpus to represent
        tokenizer (`transformers.PreTrainedTokenizer`): The tokenizer to use
        memory_only (bool): If True, keep all data in memory only; if False, cache data on disk (saves memory and speeds up repeated use)
    """
    self.annotated_corpus = annotated_corpus
    self.tokenizer = tokenizer
    self.memory_only = memory_only

    if self.annotated_corpus.is_cacheable() and not self.memory_only:
        # shelve-based persistent storage
        check_and_create_folder(const.TOKENIZER_CACHEDIR)

        # open a new, empty shelve and fill in all data
        if not self._is_cached():
            # shelve picks its own file suffix, so drop the ".db" here
            self.storage = shelve.open(self._get_cachefilename()[:-3], flag='n', writeback=False)
            # process all data into the shelve
            self.index = 0
            self._load_and_tokenize()
            self.storage['items_count'] = self.index
            # close the shelve (to make sure everything is written to disk!)
            self.storage.close()

        # now open the shelve read-only
        self.storage = shelve.open(self._get_cachefilename()[:-3], flag='r', writeback=False)
        # make sure to close it at the end!
        atexit.register(lambda: self.storage.close())
    else:
        # RAM-only storage
        self.storage = {}
        self.index = 0
        self._load_and_tokenize()
Args
- annotated_corpus (core.corpus.annotated_corpus.AnnotatedCorpus): The AnnotatedCorpus to represent
- tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use
- memory_only (bool): If True, keep all data in memory only; if False, cache data on disk (saves memory and speeds up repeated use)
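A minimal usage sketch, assuming the hypothetical SentencePairDataset from above plus an already constructed annotated_corpus; the model name is only an example:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# The first construction tokenizes the whole corpus and, if the corpus is
# cacheable, writes a shelve cache below const.TOKENIZER_CACHEDIR; later
# constructions with the same tokenizer, corpus, and seed reuse that
# cache instead of re-tokenizing.
dataset = SentencePairDataset(annotated_corpus, tokenizer)
print(len(dataset), dataset[0])

# With memory_only=True nothing is written to disk.
dataset_in_ram = SentencePairDataset(annotated_corpus, tokenizer, memory_only=True)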
Currently the max input length for BERT (length of "input_ids")
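Because the tokenizer splits words into subword tokens, a sentence can exceed this limit even with far fewer than 512 words. A small hypothetical helper (not part of this module) that checks a sentence against the limit via _tokenized_len; the +2 accounts for the [CLS] and [SEP] special tokens BERT adds, which tokenizer.tokenize does not count:

def fits_input_limit(dataset, sentence):
    # sentence: a string or a list of words, as accepted by _tokenized_len
    return dataset._tokenized_len(sentence) + 2 <= Dataset.MAX_INPUT_LEN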
The inner batch size (of one item added by self._add(data))
1 means only one row of "input_ids" per self._add(data)
View Source
@abstractmethod
def compute_metrics(self, eval_result):
    '''
    Function to calculate metrics from the model's returned predictions.
    Use with `transformers.Trainer` as ``compute_metrics=Dataset.compute_metrics``
    '''
    pass
Function to calculate metrics from the model's returned predictions.
Use with transformers.Trainer as compute_metrics=Dataset.compute_metrics.
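A sketch of wiring this into transformers.Trainer; model, training_args, and the train/eval datasets are assumed to exist already. Since compute_metrics is an instance method, the bound method of a dataset instance is passed:

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Trainer calls this with a transformers.EvalPrediction
    compute_metrics=eval_dataset.compute_metrics,
)
metrics = trainer.evaluate()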
View Source
def get_cachename(self):
    return str(type(self.tokenizer).__name__).lower() \
        + '-' + str(type(self).__name__).lower() \
        + '-' + self.annotated_corpus.get_cachename() \
        + '_' + str(Random.get_seed()) + '_'
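For example (hypothetical values), a BertTokenizerFast tokenizer, a subclass named SentencePairDataset, a corpus cache name of mycorpus, and seed 42 would yield the cache name berttokenizerfast-sentencepairdataset-mycorpus_42_. Including the seed in the name presumably keeps runs with different random states from sharing a cache.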