core.model.transformer.dataset

import shelve, atexit, os

from torch.utils.data import Dataset as TorchDataset
from abc import ABC, abstractmethod

from core.utils import check_and_create_folder, clear_filename, Random, CacheName
import core.utils.const as const

class Dataset(TorchDataset, ABC):
	"""
		Torch Dataset to use with Transformers representing a `core.corpus.annotated_corpus.AnnotatedCorpus`.

		This class is *abstract*, use sub classes for each specific task!
	"""

	MAX_INPUT_LEN = 512 
	"""
		Currently the max input length for BERT (length of "input_ids")
	"""

	BATCH_SIZE_INNER = 1 
	"""
		The inner batch size (of one item added by self._add(data))  
		1 means only one row of "input_ids" per self._add(data)
	"""

	def __init__(self, annotated_corpus, tokenizer, memory_only=False):
		"""
			Args:
				annotated_corpus (`core.corpus.annotated_corpus.AnnotatedCorpus`): The AnnotatedCorpus to represent
				tokenizer (`transformers.PreTrainedTokenizer`): The tokenizer to use
				memory_only (bool): If True, keep all data in memory only; if False, store data on disk (saves memory and speeds things up when the dataset is used again)
		"""
		self.annotated_corpus = annotated_corpus
		self.tokenizer = tokenizer
		self.memory_only = memory_only
	
		if self.annotated_corpus.is_cacheable() and not self.memory_only: # shelve-based persistent storage
			check_and_create_folder(const.TOKENIZER_CACHEDIR)

			# open a new, empty shelve and fill in all data
			if not self._is_cached():
				# strip the ".db" suffix; shelve appends its own file extension
				self.storage = shelve.open(self._get_cachefilename()[:-3], flag='n', writeback=False)

				# process all data into shelve
				self.index = 0
				self._load_and_tokenize()
				self.storage['items_count'] = self.index

				# close shelve (to make sure all is written to disk!)
				self.storage.close()

			# now open the shelve read-only
			self.storage = shelve.open(self._get_cachefilename()[:-3], flag='r', writeback=False)

			# make sure to close at the end!
			atexit.register(lambda: self.storage.close())

		else: # RAM-only storage
			self.storage = {}
			self.index = 0
			self._load_and_tokenize()

	@abstractmethod
	def _load_and_tokenize(self):
		'''
			Load the annotated corpus and tokenize all parts into dataset items.
			Add each item via `self._add(item)`
		'''
		pass

	@abstractmethod
	def compute_metrics(self, eval_result):
		'''
			Calculate metrics from the predictions returned by the model.
			Use with `transformers.Trainer` as ``compute_metrics=dataset.compute_metrics`` (a bound method of a dataset instance)
		'''
		pass

	def _add(self, item):
		'''
			Add a new item
		'''
		self.storage[str(self.index)] = item
		self.index += 1
	
	def __getitem__(self, index):
		'''
			Get an item, as specified by the PyTorch dataset protocol
		'''
		if index >= self.__len__():
			raise IndexError

		return self.storage[str(index)]

	def __len__(self):
		'''
			Get the length, as specified by the PyTorch dataset protocol
		'''
		if 'items_count' in self.storage:
			return self.storage['items_count']
		else:
			return len(self.storage)

	def get_cachename(self):
		return str(type(self.tokenizer).__name__).lower() \
			+ '-' + str(type(self).__name__).lower() \
			+ '-' + self.annotated_corpus.get_cachename()  \
			+ '_' + str(Random.get_seed()) + '_'

	def _get_cachefilename(self):
		'''
			Returns the filename of this dataset's cache file (the pickle file used by shelve)
		'''
		return CacheName.filename(os.path.join(
			const.TOKENIZER_CACHEDIR,
			clear_filename(self.get_cachename()) + ".db"
		))

	def _is_cached(self):
		return os.path.isdir(const.TOKENIZER_CACHEDIR) and os.path.isfile(self._get_cachefilename())

	def _tokenized_len(self, sentence):
		'''
			Get the number of tokens needed to encode a sentence (given as a string or a list of words).
		'''
		if isinstance(sentence, list):
			input_str = ' '.join(sentence)
		elif isinstance(sentence, str):
			input_str = sentence
		else:
			raise TypeError("Only a list of words or an entire sentence as a string is allowed!")

		return len(self.tokenizer.tokenize(input_str))
	
#   class Dataset(TorchDataset, ABC):

Torch Dataset to use with Transformers, representing a core.corpus.annotated_corpus.AnnotatedCorpus.

This class is abstract; use a subclass for each specific task!
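
For orientation, a minimal subclass could look like the following sketch. The corpus iteration method get_annotated_sentences() and the (sentence, label) item scheme are assumptions for illustration, not part of the documented API:

import numpy as np

class SentenceClassificationDataset(Dataset):
	'''A hypothetical task-specific subclass, for illustration only.'''

	def _load_and_tokenize(self):
		# assumed corpus API: yields (sentence, label) pairs
		for sentence, label in self.annotated_corpus.get_annotated_sentences():
			encoding = self.tokenizer(
				sentence,
				truncation=True,
				max_length=self.MAX_INPUT_LEN,
				padding='max_length',
				return_tensors='pt',
			)
			# one row of "input_ids" per item (BATCH_SIZE_INNER == 1)
			item = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
			item['labels'] = label
			self._add(item)

	def compute_metrics(self, eval_result):
		# eval_result is a transformers.EvalPrediction
		predictions = np.argmax(eval_result.predictions, axis=-1)
		return {'accuracy': float((predictions == eval_result.label_ids).mean())}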

#   Dataset(annotated_corpus, tokenizer, memory_only=False)
Args
  • annotated_corpus (core.corpus.annotated_corpus.AnnotatedCorpus): The AnnotatedCorpus to represent
  • tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use
  • memory_only (bool): If True, keep all data in memory only; if False, store data on disk (saves memory and speeds things up when the dataset is used again)
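
Constructing such a subclass then follows directly; the model name below is only an example:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# corpus: a core.corpus.annotated_corpus.AnnotatedCorpus instance built elsewhere
dataset = SentenceClassificationDataset(corpus, tokenizer, memory_only=True)
print(len(dataset))  # number of items added via _add()
print(dataset[0])    # dict with input_ids, attention_mask, ..., labels
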
#   MAX_INPUT_LEN = 512

Currently the maximum input length for BERT (the length of "input_ids")
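
A quick sanity check of this constant against a concrete tokenizer (the model name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# bert-base-uncased accepts 512 input positions, matching MAX_INPUT_LEN
assert tokenizer.model_max_length == Dataset.MAX_INPUT_LEN == 512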

#   BATCH_SIZE_INNER = 1

The inner batch size (of one item added by self._add(data));
1 means only one row of "input_ids" per self._add(data) call

#  
@abstractmethod
def compute_metrics(self, eval_result):

Calculate metrics from the predictions returned by the model. Use with transformers.Trainer as compute_metrics=dataset.compute_metrics (a bound method of a dataset instance)
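
Hooked into a transformers.Trainer, this looks roughly as follows; the model, the training arguments, and the two dataset instances are placeholders:

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
trainer = Trainer(
	model=model,
	args=TrainingArguments(output_dir='out'),
	train_dataset=train_dataset,  # a Dataset subclass instance
	eval_dataset=eval_dataset,    # another Dataset subclass instance
	compute_metrics=eval_dataset.compute_metrics,  # bound method, see above
)
trainer.train()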

#   def get_cachename(self):
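
Builds the base name of this dataset's cache file by joining the lowercased tokenizer class name, the lowercased dataset class name, the corpus' cache name, and the current random seed, e.g. berttokenizerfast-somedataset-<corpus cachename>_42_ (the concrete values here are illustrative).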