core.model.exec

import os, time

from core.utils import Random, const, write_json_file

from core.corpus.corpus import Corpus
from core.corpus.preprocess import Preprocessor
from core.corpus import ( 
	Wiktionary,
	Quotes,
	MultiAnnotatedCorpus,
	SingleAnnotatedCorpus
)
from core.model.transformer import (
	IsNextSCDBert,
	IsSCDBert,
	SelectSCDBert,
	GivenTextFindSCDBert, GivenSCDFindTextBert
)
from core.model.scdmatrix import (
	iSCDMatrix,
	MPSCDMatrix
)


class Executor():
	"""	
		Wrapper class for model usage and training.

		Just choose a Corpus, an Annotator, and a Model, and let this
		class manage the training and evaluation.

		Example:
			<pre>
			e = Executor(
				IsSCDBert,
				Wiktionary,
				TwentyNews(subgroup=['misc-forsale']),
			)
			e.exec()
			print(e.predict("The bison is cool!"))
			</pre>
			Using the classes `core.model.transformer.models.IsSCDBert`,
			`core.corpus.annotators.Wiktionary`, and `core.corpus.corpora.TwentyNews`.
	"""

	ALL_MODELS = [
		IsNextSCDBert,
		IsSCDBert,
		SelectSCDBert,
		GivenTextFindSCDBert, GivenSCDFindTextBert,
		iSCDMatrix,
		MPSCDMatrix
	]
	"""
		Supported models
	"""

	ALL_ANNOTATORS = [
		Wiktionary,
		Quotes
	]
	"""
		Supported annotators
	"""

	TIME_FORMAT = "%Y-%m-%d_%H-%M-%S"
	"""
		The time format used for the result files.
	"""

	_EVAL_REMAIN = [
		"accuracy",
		"accuracy_include",
		"accuracy_part",
		"precision",
		"recall",
		"f1",
		"avg_similarity",
		"num",
		"num_positive_labels",
		"num_negative_labels",
		"num_positive_predictions",
		"num_negative_predictions"
	]
	_EVAL_PREFIX = 'eval_'

	def __init__(self,
			model_class, annotator_class,
			corpus,
			annotator_preprocessor=None,
			percentage_train=0.8
		):
		"""
			Args:
				model_class (`core.model.base.Model`): The model to use (as **class**, not an instantiated object!)
				annotator_class (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): The annotator to use (as **class**, not an instantiated object!)
				corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): The corpus to use (as **instantiated object**)
				annotator_preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use for the annotator; if `None`, `core.corpus.preprocess.DefaultPreprocessor` is used (no list allowed; the same one is used for all corpora!)
				percentage_train (float): The percentage of the corpus to use for training (the rest is used for evaluation)

			Raises:
				AttributeError: if parameters do not match the required types
		"""
		
		if model_class not in Executor.ALL_MODELS:
			raise AttributeError("The model_class parameter needs a class from Executor.ALL_MODELS!")
		if (not isinstance(annotator_class, list) and annotator_class not in Executor.ALL_ANNOTATORS) \
			or (isinstance(annotator_class, list) and not all(a in Executor.ALL_ANNOTATORS for a in annotator_class)):
			raise AttributeError("The annotator_class parameter needs a single class from Executor.ALL_ANNOTATORS or a list of such classes!")
		if (not isinstance(corpus, list) and not isinstance(corpus, Corpus)) \
			or (isinstance(corpus, list) and not all(isinstance(c, Corpus) for c in corpus)):
			raise AttributeError("The corpus parameter needs an object or a list of objects of type core.corpus.corpus.Corpus!")
		if not isinstance(annotator_preprocessor, Preprocessor) and annotator_preprocessor is not None:
			raise AttributeError("The annotator_preprocessor parameter needs an object of type core.corpus.preprocess.Preprocessor or None!")
		if percentage_train < 0.01 or percentage_train > 0.99:
			raise AttributeError("The percentage_train parameter needs to be a float in [0.01, 0.99]!")

		self.model_class = model_class
		self.annotator_class = annotator_class
		self.annotator_preprocessor = annotator_preprocessor
		self.corpus = corpus
		self.percentages = [percentage_train, round(1.0 - percentage_train, 2)] # two decimals, so e.g. 0.85 yields [0.85, 0.15]

		if isinstance(corpus, list) and isinstance(annotator_class, list):
			if len(corpus) == len(annotator_class) > 1:
				self.is_multiple = True
			else:
				raise AttributeError("Lists corpus and annotator_class have to be same length and contain more than 1 element each!")
		else:
			self.is_multiple = False

			self.annotator_class = [self.annotator_class]
			self.corpus = [self.corpus]

		
		
	def exec(self, save_results=True, split_annotator=True, **kwargs):
		'''
			Executes the model: create, train, and evaluate.  
			See `core.model.base.Model.train()` and `core.model.base.Model.evaluate()`

			Will print some basic results.

			Args:
				save_results (bool): write the results to a JSON file?
				split_annotator (bool): split the annotator to use different annotations for training and evaluation?
				**kwargs: keyword arguments passed to ``model_class.__init__``
			Returns:
				dictionary with results and parameters (the same data written to JSON, if enabled)
		'''
		self.split_annotator = split_annotator

		# split corpora
		split_corpora = []
		for corpus in self.corpus:
			split_corpora.append(corpus.split(percentages=self.percentages))
		c_train, c_eval = zip(*split_corpora)

		# create annotators
		annotators = []
		for annotator_class in self.annotator_class:
			if self.split_annotator:
				annotators.append((
					annotator_class(percentages=self.percentages, part=0, preprocessor=self.annotator_preprocessor), # train
					annotator_class(percentages=self.percentages, part=1, preprocessor=self.annotator_preprocessor) # eval
				))
			else:
				annotators.append((
					annotator_class(preprocessor=self.annotator_preprocessor),
					annotator_class(preprocessor=self.annotator_preprocessor)
				))
		a_train, a_eval = zip(*annotators)
		
		# create annotated corpora
		if self.is_multiple:
			self.ac_train = MultiAnnotatedCorpus(c_train, a_train)
			self.ac_eval = MultiAnnotatedCorpus(c_eval, a_eval)
		else:
			self.ac_train = SingleAnnotatedCorpus(c_train[0], a_train[0])
			self.ac_eval = SingleAnnotatedCorpus(c_eval[0], a_eval[0])

		self.model = self.model_class(self.ac_train, self.ac_eval, **kwargs)
		self.training_metrics = self.model.train()

		results = self.model.evaluate()
		self.results = self._clear_eval(results)

		print(self.results)
		return self._full_results(save_results)

	def predict(self, *args):
		'''
			Run a prediction against the model.  
			See `core.model.base.Model.predict()`
		'''
		return self.model.predict(*args)

	def _clear_eval(self, r):
		'''
			Filters the Trainer evaluation result: strips the `eval_` prefix
			and keeps only the keys listed in `Executor._EVAL_REMAIN`.
			Args:
				r (dict): Trainer evaluation result
			Returns:
				filtered dictionary
		'''
		r_n = {}
		for k,v in r.items():
			if k.startswith(Executor._EVAL_PREFIX):
				k = k[len(Executor._EVAL_PREFIX):]
			
			if k in Executor._EVAL_REMAIN:
				r_n[k] = v
		return r_n

	def _full_results(self, do_save):
		'''
			Create the full result data.
			Args:
				do_save (bool): Also save as json?
			Returns:
				dictionary with results and parameters
		'''

		try:
			# BERT only
			base_model = self.model.pretrained_model
			dataset_train = self.model.data_train.get_cachename()
			dataset_eval = self.model.data_eval.get_cachename()
			bert_training = self.model.training_params_str
		except AttributeError:
			base_model = "none"
			dataset_train = ""
			dataset_eval = ""
			bert_training = ""

		try:
			# iSCD only
			scd_threshold = self.model.threshold
		except AttributeError:
			scd_threshold = "none"

		try:
			# MPSCD only
			scd_mapping = self.model.scd_mapping
		except AttributeError:
			scd_mapping = "none"

		data = {
			'percentages' : self.percentages,
			'classes' : {
				'model' : self.model_class.__name__,
				'annotator' : '-'.join([a.__name__ for a in self.annotator_class]),
				'annotator_preprocessor' : "default" if self.annotator_preprocessor is None else type(self.annotator_preprocessor).__name__,
				'corpus' : '-'.join([type(c).__name__ for c in self.corpus]),
				'corpus_preprocessor' : '-'.join([type(c.preprocessor).__name__ for c in self.corpus])
			},
			'names' : {
				'corpus' : '-'.join([c.get_cachename() for c in self.corpus]),
				'annotated_corpus_train' : self.ac_train.get_cachename(),
				'annotated_corpus_eval' : self.ac_eval.get_cachename(),
				'base_model' : base_model,
				'dataset_train' : dataset_train,
				'dataset_eval' : dataset_eval
			},
			'parameters' : {
				'seed' : Random.get_seed(),
				'batch_sizes' : [ const.TRAIN_BATCH_SIZE, const.EVAL_BATCH_SIZE ],
				'scd_threshold' : scd_threshold,
				'split_annotator' : self.split_annotator,
				'scd_mapping' : scd_mapping,
				'bert_training' : bert_training
			},
			'results' : self.results,
			'training_metrics' : self.training_metrics
		}

		if do_save:
			write_json_file(
				os.path.join(
					const.RESULTS_DIR,
					"execution_" + \
					data['classes']['model'] + '-' + data['classes']['annotator'] + '-' + data['classes']['corpus'] + \
					'_' + time.strftime( Executor.TIME_FORMAT ) + ".json"
				),
				data
			)

		return data
#   class Executor:

Wrapper class for model usage and training.

Just choose a Corpus, an Annotator, and a Model, and let this class manage the training and evaluation.

Example
e = Executor(
        IsSCDBert,
        Wiktionary,
        TwentyNews(subgroup=['misc-forsale']),
)
e.exec()
print(e.predict("The bison is cool!"))

Using the classes core.model.transformer.models.IsSCDBert, core.corpus.annotators.Wiktionary, and core.corpus.corpora.TwentyNews.
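
When corpus and annotator_class are given as lists of equal length (each with more than one element), the Executor pairs them element-wise and builds a MultiAnnotatedCorpus. A minimal sketch of that mode, reusing the classes from the example above; the second subgroup name is illustrative and not taken from the corpus documentation:

e = Executor(
        IsSCDBert,
        [Wiktionary, Quotes],                       # one annotator class per corpus
        [
                TwentyNews(subgroup=['misc-forsale']),  # annotated by Wiktionary
                TwentyNews(subgroup=['rec-autos']),     # annotated by Quotes (illustrative subgroup)
        ],
)
e.exec()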

#   Executor( model_class, annotator_class, corpus, annotator_preprocessor=None, percentage_train=0.8 )
Args
  • model_class (core.model.base.Model): The model to use (as class, not an instantiated object!)
  • annotator_class (core.corpus.annotator.Annotator or list of core.corpus.annotator.Annotator): The annotator to use (as class, not an instantiated object!)
  • corpus (core.corpus.corpus.Corpus or list of core.corpus.corpus.Corpus): The corpus to use (as instantiated object)
  • annotator_preprocessor (core.corpus.preprocess.Preprocessor): The preprocessor to use for the annotator; if None, core.corpus.preprocess.DefaultPreprocessor is used (no list allowed; the same one is used for all corpora!)
  • percentage_train (float): The percentage of the corpus to use for training (the rest is used for evaluation)
Raises
  • AttributeError: if parameters do not match the required types
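
The checks run before anything is trained, so mistakes fail fast. A small sketch of the failure mode, assuming the classes from the class-level example; the out-of-range percentage_train is the deliberate error:

try:
        e = Executor(
                IsSCDBert,                              # the class itself, not IsSCDBert(...)
                Wiktionary,
                TwentyNews(subgroup=['misc-forsale']),
                percentage_train=1.5,                   # outside [0.01, 0.99]
        )
except AttributeError as err:
        print(err)  # explains which parameter was rejected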

#   ALL_MODELS = [IsNextSCDBert, IsSCDBert, SelectSCDBert, GivenTextFindSCDBert, GivenSCDFindTextBert, iSCDMatrix, MPSCDMatrix]

Supported models

#   ALL_ANNOTATORS = [Wiktionary, Quotes]

Supported annotators

#   TIME_FORMAT = '%Y-%m-%d_%H-%M-%S'

The time format used for the result files.
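
The format is consumed by time.strftime when _full_results() builds the output filename; a quick sketch (the timestamp shown is illustrative):

import time

stamp = time.strftime(Executor.TIME_FORMAT)  # e.g. '2024-05-01_13-37-00'
# resulting file: execution_IsSCDBert-Wiktionary-TwentyNews_2024-05-01_13-37-00.json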

#   def exec(self, save_results=True, split_annotator=True, **kwargs):

Executes the model: create, train, and evaluate.
See core.model.base.Model.train() and core.model.base.Model.evaluate()

Will print some basic results.

Args
  • save_results (bool): write the results to a JSON file?
  • split_annotator (bool): split the annotator to use different annotations for training and evaluation?
  • **kwargs: keyword arguments passed to model_class.__init__
Returns

dictionary with results and parameters (the same data written to JSON, if enabled)
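
A hedged usage sketch: save_results and split_annotator are real parameters, while the epochs keyword only illustrates **kwargs and depends on what the chosen model class accepts. Which metrics appear under 'results' also depends on the model:

results = e.exec(save_results=False, split_annotator=False, epochs=3)
print(results['parameters']['seed'])  # run parameters, for reproducibility
print(results['results'])             # filtered metrics, e.g. 'f1' if the model reports it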

#   def predict(self, *args):

Run a prediction against the model.
See core.model.base.Model.predict()
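
The positional arguments are forwarded unchanged, so their number and types depend on the concrete model class. A sketch for a single-text model such as IsSCDBert from the example above (the shape of the return value is model-specific):

prediction = e.predict("The bison is cool!")
print(prediction)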