core.model.exec
View Source
import os
import time

from core.utils import Random, const, write_json_file
from core.corpus.corpus import Corpus
from core.corpus.preprocess import Preprocessor
from core.corpus import (
    Wiktionary, Quotes,
    MultiAnnotatedCorpus, SingleAnnotatedCorpus
)
from core.model.transformer import (
    IsNextSCDBert, IsSCDBert, SelectSCDBert,
    GivenTextFindSCDBert, GivenSCDFindTextBert
)
from core.model.scdmatrix import (
    iSCDMatrix, MPSCDMatrix
)


class Executor():
    """
    Wrapper class for model usage and training.

    Just choose Corpus, Annotator and Model and let this class manage the training and evaluation.

    Example:
        <pre>
        e = Executor(
            IsSCDBert,
            Wiktionary,
            TwentyNews(subgroup=['misc-forsale']),
        )
        e.exec()
        print(e.predict("The bison is cool!"))
        </pre>
        Using the classes `core.model.transformer.models.IsSCDBert`, `core.corpus.annotators.Wiktionary`,
        and `core.corpus.corpora.TwentyNews`.
    """

    ALL_MODELS = [
        IsNextSCDBert, IsSCDBert, SelectSCDBert,
        GivenTextFindSCDBert, GivenSCDFindTextBert,
        iSCDMatrix, MPSCDMatrix
    ]
    """
    Supported models
    """

    ALL_ANNOTATORS = [
        Wiktionary, Quotes
    ]
    """
    Supported annotators
    """

    TIME_FORMAT = "%Y-%m-%d_%H-%M-%S"
    """
    The time format used for the result files.
    """

    # Metric names (after stripping _EVAL_PREFIX) that are kept by _clear_eval().
    _EVAL_REMAIN = [
        "accuracy", "accuracy_include", "accuracy_part",
        "precision", "recall", "f1", "avg_similarity",
        "num", "num_positive_labels", "num_negative_labels",
        "num_positive_predictions", "num_negative_predictions"
    ]
    # Prefix removed from evaluation result keys before filtering.
    _EVAL_PREFIX = 'eval_'

    def __init__(self, model_class, annotator_class, corpus,
            annotator_preprocessor=None, percentage_train=0.8):
        """
        Args:
            model_class (`core.model.base.Model`): The model to use (as **class**, not an instantiated object!)
            annotator_class (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): The annotator to use (as **class**, not an instantiated object!)
            corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): The corpus to use (as **instantiated object**)
            annotator_preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use for annotator, if `None` uses `core.corpus.preprocess.DefaultPreprocessor` (no list allowed, will use same for all corpora!)
            percentage_train (float): The percentage of the corpus to use for training (rest will be used for evaluation)
        Raises:
            AttributeError: if parameters do not match needed types, if only one of
                `corpus` / `annotator_class` is a list, or if both are lists that
                differ in length or hold fewer than 2 elements
        """
        if model_class not in Executor.ALL_MODELS:
            raise AttributeError("The model_class parameter needs class from Executor.ALL_MODELS!")

        annotators_listed = isinstance(annotator_class, list)
        corpora_listed = isinstance(corpus, list)

        if annotators_listed:
            annotators_ok = all(a in Executor.ALL_ANNOTATORS for a in annotator_class)
        else:
            annotators_ok = annotator_class in Executor.ALL_ANNOTATORS
        if not annotators_ok:
            raise AttributeError("The annotator_class parameter needs a single class from Executor.ALL_ANNOTATORS or a list of classes!")

        if corpora_listed:
            corpora_ok = all(isinstance(c, Corpus) for c in corpus)
        else:
            corpora_ok = isinstance(corpus, Corpus)
        if not corpora_ok:
            raise AttributeError("The corpus parameter needs a object or list of objects of type core.corpus.corpus.Corpus!")

        if annotator_preprocessor is not None and not isinstance(annotator_preprocessor, Preprocessor):
            raise AttributeError("The annotator_preprocessor parameter needs a object of type core.corpus.preprocess.Preprocessor or None!")

        if percentage_train < 0.01 or percentage_train > 0.99:
            raise AttributeError("The percentage_train needs to be a floats in [0.01, 0.99]!")

        # A list for only one of the two parameters used to slip through and was
        # wrapped into a nested list, which only failed later inside exec().
        if annotators_listed != corpora_listed:
            raise AttributeError("The corpus and annotator_class parameters have to be either both lists or both single values!")

        self.model_class = model_class
        self.annotator_preprocessor = annotator_preprocessor
        # Round only to strip floating point noise (1 - 0.8 == 0.19999999999999996).
        # Rounding to a single digit would turn e.g. 0.99 into an evaluation share of 0.0.
        self.percentages = [percentage_train, round(1 - percentage_train, 10)]

        if corpora_listed:
            if not (len(corpus) == len(annotator_class) > 1):
                raise AttributeError("Lists corpus and annotator_class have to be same length and contain more than 1 element each!")
            self.is_multiple = True
            self.annotator_class = annotator_class
            self.corpus = corpus
        else:
            # Normalise the single case to one-element lists so exec() can always iterate.
            self.is_multiple = False
            self.annotator_class = [annotator_class]
            self.corpus = [corpus]

    def exec(self, save_results=True, split_annotator=True, **kwargs):
        '''
        Executes the model: Create, Train and Evaluate.
        See `core.model.base.Model.train()` and `core.model.base.Model.evaluate()`

        Will output some basic results.

        Args:
            save_results (bool): write the results to a json file?
            split_annotator (bool): split the annotator to use different annotations for training and evaluation?
            **kwargs: key-value-args passed to the model while ``model_class.__init__``
        Returns:
            dictionary with results and parameters (same written to json, if active)
        '''
        self.split_annotator = split_annotator

        # split corpora; each split is unpacked as a (train, eval) pair
        corpus_parts = [c.split(percentages=self.percentages) for c in self.corpus]
        c_train, c_eval = zip(*corpus_parts)

        # create one (train, eval) annotator pair per annotator class
        if self.split_annotator:
            annotator_pairs = [
                (
                    annotator_class(percentages=self.percentages, part=0, preprocessor=self.annotator_preprocessor),  # train
                    annotator_class(percentages=self.percentages, part=1, preprocessor=self.annotator_preprocessor)  # eval
                )
                for annotator_class in self.annotator_class
            ]
        else:
            annotator_pairs = [
                (
                    annotator_class(preprocessor=self.annotator_preprocessor),
                    annotator_class(preprocessor=self.annotator_preprocessor)
                )
                for annotator_class in self.annotator_class
            ]
        a_train, a_eval = zip(*annotator_pairs)

        # create annotated corpora
        if self.is_multiple:
            self.ac_train = MultiAnnotatedCorpus(c_train, a_train)
            self.ac_eval = MultiAnnotatedCorpus(c_eval, a_eval)
        else:
            self.ac_train = SingleAnnotatedCorpus(c_train[0], a_train[0])
            self.ac_eval = SingleAnnotatedCorpus(c_eval[0], a_eval[0])

        self.model = self.model_class(self.ac_train, self.ac_eval, **kwargs)
        self.training_metrics = self.model.train()

        self.results = self._clear_eval(self.model.evaluate())
        print(self.results)

        return self._full_results(save_results)

    def predict(self, *args):
        '''
        Run a prediction against the model.
        See `core.model.base.Model.predict()`

        The model is only created by `exec()`, so `exec()` has to be called first.
        '''
        return self.model.predict(*args)

    def _clear_eval(self, r):
        '''
        Remove some parts from Trainer evaluation result

        Args:
            r (dict): Trainer evaluation result
        Returns:
            filtered dictionary (keys without the `eval_` prefix, limited to `_EVAL_REMAIN`)
        '''
        prefix = Executor._EVAL_PREFIX
        r_n = {}
        for k, v in r.items():
            if k.startswith(prefix):
                k = k[len(prefix):]
            if k in Executor._EVAL_REMAIN:
                r_n[k] = v
        return r_n

    def _full_results(self, do_save):
        '''
        Create the full result data.

        Args:
            do_save (bool): Also save as json?
        Returns:
            dictionary with results and parameters
        '''
        # The model specific values are collected on a best-effort basis: models
        # that do not provide them get placeholders. `except Exception` keeps that
        # behaviour but no longer swallows KeyboardInterrupt / SystemExit.
        try:
            # BERT only
            base_model = self.model.pretrained_model
            dataset_train = self.model.data_train.get_cachename()
            dataset_eval = self.model.data_eval.get_cachename()
            bert_training = self.model.training_params_str
        except Exception:
            base_model = "none"
            dataset_train = ""
            dataset_eval = ""
            bert_training = ""

        try:
            # iSCD only
            scd_threshold = self.model.threshold
        except Exception:
            scd_threshold = "none"

        try:
            # MPSCD only
            scd_mapping = self.model.scd_mapping
        except Exception:
            scd_mapping = "none"

        data = {
            'percentages' : self.percentages,
            'classes' : {
                'model' : self.model_class.__name__,
                'annotator' : '-'.join([a.__name__ for a in self.annotator_class]),
                'annotator_preprocessor' : "default" if self.annotator_preprocessor is None else type(self.annotator_preprocessor).__name__,
                'corpus' : '-'.join([type(c).__name__ for c in self.corpus]),
                'corpus_preprocessor' : '-'.join([type(c.preprocessor).__name__ for c in self.corpus])
            },
            'names' : {
                'corpus' : '-'.join([c.get_cachename() for c in self.corpus]),
                'annotated_corpus_train' : self.ac_train.get_cachename(),
                'annotated_corpus_eval' : self.ac_eval.get_cachename(),
                'base_model' : base_model,
                'dataset_train' : dataset_train,
                'dataset_eval' : dataset_eval
            },
            'parameters' : {
                'seed' : Random.get_seed(),
                'batch_sizes' : [
                    const.TRAIN_BATCH_SIZE, const.EVAL_BATCH_SIZE
                ],
                'scd_threshold' : scd_threshold,
                'split_annotator' : self.split_annotator,
                'scd_mapping' : scd_mapping,
                'bert_training' : bert_training
            },
            'results' : self.results,
            'training_metrics' : self.training_metrics
        }

        if do_save:
            # File name: execution_<model>-<annotator(s)>-<corpus/corpora>_<timestamp>.json
            run_name = '-'.join([
                data['classes']['model'],
                data['classes']['annotator'],
                data['classes']['corpus']
            ])
            file_name = "execution_" + run_name + '_' + time.strftime(Executor.TIME_FORMAT) + ".json"
            write_json_file(os.path.join(const.RESULTS_DIR, file_name), data)

        return data
View Source
class Executor():
    """
    Wrapper class for model usage and training.

    Just choose Corpus, Annotator and Model and let this class manage the training and evaluation.

    Example:
        <pre>
        e = Executor(
            IsSCDBert,
            Wiktionary,
            TwentyNews(subgroup=['misc-forsale']),
        )
        e.exec()
        print(e.predict("The bison is cool!"))
        </pre>
        Using the classes `core.model.transformer.models.IsSCDBert`, `core.corpus.annotators.Wiktionary`,
        and `core.corpus.corpora.TwentyNews`.
    """

    ALL_MODELS = [
        IsNextSCDBert, IsSCDBert, SelectSCDBert,
        GivenTextFindSCDBert, GivenSCDFindTextBert,
        iSCDMatrix, MPSCDMatrix
    ]
    """
    Supported models
    """

    ALL_ANNOTATORS = [
        Wiktionary, Quotes
    ]
    """
    Supported annotators
    """

    TIME_FORMAT = "%Y-%m-%d_%H-%M-%S"
    """
    The time format used for the result files.
    """

    # Metric names (after stripping _EVAL_PREFIX) that are kept by _clear_eval().
    _EVAL_REMAIN = [
        "accuracy", "accuracy_include", "accuracy_part",
        "precision", "recall", "f1", "avg_similarity",
        "num", "num_positive_labels", "num_negative_labels",
        "num_positive_predictions", "num_negative_predictions"
    ]
    # Prefix removed from evaluation result keys before filtering.
    _EVAL_PREFIX = 'eval_'

    def __init__(self, model_class, annotator_class, corpus,
            annotator_preprocessor=None, percentage_train=0.8):
        """
        Args:
            model_class (`core.model.base.Model`): The model to use (as **class**, not an instantiated object!)
            annotator_class (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): The annotator to use (as **class**, not an instantiated object!)
            corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): The corpus to use (as **instantiated object**)
            annotator_preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use for annotator, if `None` uses `core.corpus.preprocess.DefaultPreprocessor` (no list allowed, will use same for all corpora!)
            percentage_train (float): The percentage of the corpus to use for training (rest will be used for evaluation)
        Raises:
            AttributeError: if parameters do not match needed types, if only one of
                `corpus` / `annotator_class` is a list, or if both are lists that
                differ in length or hold fewer than 2 elements
        """
        if model_class not in Executor.ALL_MODELS:
            raise AttributeError("The model_class parameter needs class from Executor.ALL_MODELS!")

        annotators_listed = isinstance(annotator_class, list)
        corpora_listed = isinstance(corpus, list)

        if annotators_listed:
            annotators_ok = all(a in Executor.ALL_ANNOTATORS for a in annotator_class)
        else:
            annotators_ok = annotator_class in Executor.ALL_ANNOTATORS
        if not annotators_ok:
            raise AttributeError("The annotator_class parameter needs a single class from Executor.ALL_ANNOTATORS or a list of classes!")

        if corpora_listed:
            corpora_ok = all(isinstance(c, Corpus) for c in corpus)
        else:
            corpora_ok = isinstance(corpus, Corpus)
        if not corpora_ok:
            raise AttributeError("The corpus parameter needs a object or list of objects of type core.corpus.corpus.Corpus!")

        if annotator_preprocessor is not None and not isinstance(annotator_preprocessor, Preprocessor):
            raise AttributeError("The annotator_preprocessor parameter needs a object of type core.corpus.preprocess.Preprocessor or None!")

        if percentage_train < 0.01 or percentage_train > 0.99:
            raise AttributeError("The percentage_train needs to be a floats in [0.01, 0.99]!")

        # A list for only one of the two parameters used to slip through and was
        # wrapped into a nested list, which only failed later inside exec().
        if annotators_listed != corpora_listed:
            raise AttributeError("The corpus and annotator_class parameters have to be either both lists or both single values!")

        self.model_class = model_class
        self.annotator_preprocessor = annotator_preprocessor
        # Round only to strip floating point noise (1 - 0.8 == 0.19999999999999996).
        # Rounding to a single digit would turn e.g. 0.99 into an evaluation share of 0.0.
        self.percentages = [percentage_train, round(1 - percentage_train, 10)]

        if corpora_listed:
            if not (len(corpus) == len(annotator_class) > 1):
                raise AttributeError("Lists corpus and annotator_class have to be same length and contain more than 1 element each!")
            self.is_multiple = True
            self.annotator_class = annotator_class
            self.corpus = corpus
        else:
            # Normalise the single case to one-element lists so exec() can always iterate.
            self.is_multiple = False
            self.annotator_class = [annotator_class]
            self.corpus = [corpus]

    def exec(self, save_results=True, split_annotator=True, **kwargs):
        '''
        Executes the model: Create, Train and Evaluate.
        See `core.model.base.Model.train()` and `core.model.base.Model.evaluate()`

        Will output some basic results.

        Args:
            save_results (bool): write the results to a json file?
            split_annotator (bool): split the annotator to use different annotations for training and evaluation?
            **kwargs: key-value-args passed to the model while ``model_class.__init__``
        Returns:
            dictionary with results and parameters (same written to json, if active)
        '''
        self.split_annotator = split_annotator

        # split corpora; each split is unpacked as a (train, eval) pair
        corpus_parts = [c.split(percentages=self.percentages) for c in self.corpus]
        c_train, c_eval = zip(*corpus_parts)

        # create one (train, eval) annotator pair per annotator class
        if self.split_annotator:
            annotator_pairs = [
                (
                    annotator_class(percentages=self.percentages, part=0, preprocessor=self.annotator_preprocessor),  # train
                    annotator_class(percentages=self.percentages, part=1, preprocessor=self.annotator_preprocessor)  # eval
                )
                for annotator_class in self.annotator_class
            ]
        else:
            annotator_pairs = [
                (
                    annotator_class(preprocessor=self.annotator_preprocessor),
                    annotator_class(preprocessor=self.annotator_preprocessor)
                )
                for annotator_class in self.annotator_class
            ]
        a_train, a_eval = zip(*annotator_pairs)

        # create annotated corpora
        if self.is_multiple:
            self.ac_train = MultiAnnotatedCorpus(c_train, a_train)
            self.ac_eval = MultiAnnotatedCorpus(c_eval, a_eval)
        else:
            self.ac_train = SingleAnnotatedCorpus(c_train[0], a_train[0])
            self.ac_eval = SingleAnnotatedCorpus(c_eval[0], a_eval[0])

        self.model = self.model_class(self.ac_train, self.ac_eval, **kwargs)
        self.training_metrics = self.model.train()

        self.results = self._clear_eval(self.model.evaluate())
        print(self.results)

        return self._full_results(save_results)

    def predict(self, *args):
        '''
        Run a prediction against the model.
        See `core.model.base.Model.predict()`

        The model is only created by `exec()`, so `exec()` has to be called first.
        '''
        return self.model.predict(*args)

    def _clear_eval(self, r):
        '''
        Remove some parts from Trainer evaluation result

        Args:
            r (dict): Trainer evaluation result
        Returns:
            filtered dictionary (keys without the `eval_` prefix, limited to `_EVAL_REMAIN`)
        '''
        prefix = Executor._EVAL_PREFIX
        r_n = {}
        for k, v in r.items():
            if k.startswith(prefix):
                k = k[len(prefix):]
            if k in Executor._EVAL_REMAIN:
                r_n[k] = v
        return r_n

    def _full_results(self, do_save):
        '''
        Create the full result data.

        Args:
            do_save (bool): Also save as json?
        Returns:
            dictionary with results and parameters
        '''
        # The model specific values are collected on a best-effort basis: models
        # that do not provide them get placeholders. `except Exception` keeps that
        # behaviour but no longer swallows KeyboardInterrupt / SystemExit.
        try:
            # BERT only
            base_model = self.model.pretrained_model
            dataset_train = self.model.data_train.get_cachename()
            dataset_eval = self.model.data_eval.get_cachename()
            bert_training = self.model.training_params_str
        except Exception:
            base_model = "none"
            dataset_train = ""
            dataset_eval = ""
            bert_training = ""

        try:
            # iSCD only
            scd_threshold = self.model.threshold
        except Exception:
            scd_threshold = "none"

        try:
            # MPSCD only
            scd_mapping = self.model.scd_mapping
        except Exception:
            scd_mapping = "none"

        data = {
            'percentages' : self.percentages,
            'classes' : {
                'model' : self.model_class.__name__,
                'annotator' : '-'.join([a.__name__ for a in self.annotator_class]),
                'annotator_preprocessor' : "default" if self.annotator_preprocessor is None else type(self.annotator_preprocessor).__name__,
                'corpus' : '-'.join([type(c).__name__ for c in self.corpus]),
                'corpus_preprocessor' : '-'.join([type(c.preprocessor).__name__ for c in self.corpus])
            },
            'names' : {
                'corpus' : '-'.join([c.get_cachename() for c in self.corpus]),
                'annotated_corpus_train' : self.ac_train.get_cachename(),
                'annotated_corpus_eval' : self.ac_eval.get_cachename(),
                'base_model' : base_model,
                'dataset_train' : dataset_train,
                'dataset_eval' : dataset_eval
            },
            'parameters' : {
                'seed' : Random.get_seed(),
                'batch_sizes' : [
                    const.TRAIN_BATCH_SIZE, const.EVAL_BATCH_SIZE
                ],
                'scd_threshold' : scd_threshold,
                'split_annotator' : self.split_annotator,
                'scd_mapping' : scd_mapping,
                'bert_training' : bert_training
            },
            'results' : self.results,
            'training_metrics' : self.training_metrics
        }

        if do_save:
            # File name: execution_<model>-<annotator(s)>-<corpus/corpora>_<timestamp>.json
            run_name = '-'.join([
                data['classes']['model'],
                data['classes']['annotator'],
                data['classes']['corpus']
            ])
            file_name = "execution_" + run_name + '_' + time.strftime(Executor.TIME_FORMAT) + ".json"
            write_json_file(os.path.join(const.RESULTS_DIR, file_name), data)

        return data
Wrapper class for model usage and training.
Just choose Corpus, Annotator and Model and let this class manage the training and evaluation.
Example
e = Executor(
    IsSCDBert,
    Wiktionary,
    TwentyNews(subgroup=['misc-forsale']),
)
e.exec()
print(e.predict("The bison is cool!"))
Using the classes core.model.transformer.models.IsSCDBert, core.corpus.annotators.Wiktionary, and core.corpus.corpora.TwentyNews.
#  
Executor(
model_class,
annotator_class,
corpus,
annotator_preprocessor=None,
percentage_train=0.8
)
View Source
def __init__(self, model_class, annotator_class, corpus,
        annotator_preprocessor=None, percentage_train=0.8):
    """
    Args:
        model_class (`core.model.base.Model`): The model to use (as **class**, not an instantiated object!)
        annotator_class (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): The annotator to use (as **class**, not an instantiated object!)
        corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): The corpus to use (as **instantiated object**)
        annotator_preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use for annotator, if `None` uses `core.corpus.preprocess.DefaultPreprocessor` (no list allowed, will use same for all corpora!)
        percentage_train (float): The percentage of the corpus to use for training (rest will be used for evaluation)
    Raises:
        AttributeError: if parameters do not match needed types, if only one of
            `corpus` / `annotator_class` is a list, or if both are lists that
            differ in length or hold fewer than 2 elements
    """
    if model_class not in Executor.ALL_MODELS:
        raise AttributeError("The model_class parameter needs class from Executor.ALL_MODELS!")

    annotators_listed = isinstance(annotator_class, list)
    corpora_listed = isinstance(corpus, list)

    if annotators_listed:
        annotators_ok = all(a in Executor.ALL_ANNOTATORS for a in annotator_class)
    else:
        annotators_ok = annotator_class in Executor.ALL_ANNOTATORS
    if not annotators_ok:
        raise AttributeError("The annotator_class parameter needs a single class from Executor.ALL_ANNOTATORS or a list of classes!")

    if corpora_listed:
        corpora_ok = all(isinstance(c, Corpus) for c in corpus)
    else:
        corpora_ok = isinstance(corpus, Corpus)
    if not corpora_ok:
        raise AttributeError("The corpus parameter needs a object or list of objects of type core.corpus.corpus.Corpus!")

    if annotator_preprocessor is not None and not isinstance(annotator_preprocessor, Preprocessor):
        raise AttributeError("The annotator_preprocessor parameter needs a object of type core.corpus.preprocess.Preprocessor or None!")

    if percentage_train < 0.01 or percentage_train > 0.99:
        raise AttributeError("The percentage_train needs to be a floats in [0.01, 0.99]!")

    # A list for only one of the two parameters used to slip through and was
    # wrapped into a nested list, which only failed later when the corpora
    # were iterated.
    if annotators_listed != corpora_listed:
        raise AttributeError("The corpus and annotator_class parameters have to be either both lists or both single values!")

    self.model_class = model_class
    self.annotator_preprocessor = annotator_preprocessor
    # Round only to strip floating point noise (1 - 0.8 == 0.19999999999999996).
    # Rounding to a single digit would turn e.g. 0.99 into an evaluation share of 0.0.
    self.percentages = [percentage_train, round(1 - percentage_train, 10)]

    if corpora_listed:
        if not (len(corpus) == len(annotator_class) > 1):
            raise AttributeError("Lists corpus and annotator_class have to be same length and contain more than 1 element each!")
        self.is_multiple = True
        self.annotator_class = annotator_class
        self.corpus = corpus
    else:
        # Normalise the single case to one-element lists so later code can always iterate.
        self.is_multiple = False
        self.annotator_class = [annotator_class]
        self.corpus = [corpus]
Args
- model_class (core.model.base.Model): The model to use (as class, not an instantiated object!)
- annotator_class (core.corpus.annotator.Annotator or list of core.corpus.annotator.Annotator): The annotator to use (as class, not an instantiated object!)
- corpus (core.corpus.corpus.Corpus or list of core.corpus.corpus.Corpus): The corpus to use (as instantiated object)
- annotator_preprocessor (core.corpus.preprocess.Preprocessor): The preprocessor to use for annotator, if None uses core.corpus.preprocess.DefaultPreprocessor (no list allowed, will use same for all corpora!)
- percentage_train (float): The percentage of the corpus to use for training (rest will be used for evaluation)
Raises
- AttributeError: if parameters do not match needed types
#  
ALL_MODELS = [<class 'core.model.transformer.models.IsNextSCDBert'>, <class 'core.model.transformer.models.IsSCDBert'>, <class 'core.model.transformer.models.SelectSCDBert'>, <class 'core.model.transformer.models.GivenTextFindSCDBert'>, <class 'core.model.transformer.models.GivenSCDFindTextBert'>, <class 'core.model.scdmatrix.models.iSCDMatrix'>, <class 'core.model.scdmatrix.models.MPSCDMatrix'>]
Supported models
#  
ALL_ANNOTATORS = [<class 'core.corpus.annotators.Wiktionary'>, <class 'core.corpus.annotators.Quotes'>]
Supported annotators
The time format used for the result files.
View Source
def exec(self, save_results=True, split_annotator=True, **kwargs):
    '''
    Run the full pipeline: build the annotated corpora, create the model, train and evaluate it.
    See `core.model.base.Model.train()` and `core.model.base.Model.evaluate()`

    The filtered evaluation results are printed.

    Args:
        save_results (bool): write the results to a json file?
        split_annotator (bool): give training and evaluation different parts of the
            annotations (annotators are built with `part=0` / `part=1`)?
        **kwargs: key-value-args passed on to ``model_class.__init__``
    Returns:
        dictionary with results and parameters (same written to json, if active)
    '''
    self.split_annotator = split_annotator

    # Split every corpus first, then regroup the pairs into
    # (all training parts) and (all evaluation parts).
    corpus_parts = [c.split(percentages=self.percentages) for c in self.corpus]
    c_train, c_eval = zip(*corpus_parts)

    # Build one (train, eval) annotator pair per annotator class.
    if self.split_annotator:
        annotator_pairs = [
            (
                make_annotator(percentages=self.percentages, part=0, preprocessor=self.annotator_preprocessor),
                make_annotator(percentages=self.percentages, part=1, preprocessor=self.annotator_preprocessor),
            )
            for make_annotator in self.annotator_class
        ]
    else:
        annotator_pairs = [
            (
                make_annotator(preprocessor=self.annotator_preprocessor),
                make_annotator(preprocessor=self.annotator_preprocessor),
            )
            for make_annotator in self.annotator_class
        ]
    a_train, a_eval = zip(*annotator_pairs)

    # Several corpora get the multi wrapper with the full tuples; a single corpus
    # gets the single wrapper with the only element of each tuple.
    if self.is_multiple:
        wrap = MultiAnnotatedCorpus
        train_args, eval_args = (c_train, a_train), (c_eval, a_eval)
    else:
        wrap = SingleAnnotatedCorpus
        train_args, eval_args = (c_train[0], a_train[0]), (c_eval[0], a_eval[0])
    self.ac_train = wrap(*train_args)
    self.ac_eval = wrap(*eval_args)

    self.model = self.model_class(self.ac_train, self.ac_eval, **kwargs)
    self.training_metrics = self.model.train()

    raw_results = self.model.evaluate()
    self.results = self._clear_eval(raw_results)
    print(self.results)

    return self._full_results(save_results)
Executes the model: Create, Train and Evaluate.
See core.model.base.Model.train()
and core.model.base.Model.evaluate()
Will output some basic results.
Args
- save_results (bool): write the results to a json file?
- split_annotator (bool): split the annotator to use different annotations for training and evaluation?
- **kwargs: keyword arguments passed on to the model's constructor,
model_class.__init__
Returns
dictionary with results and parameters (same written to json, if active)
View Source
def predict(self, *args):
    '''
    Forward a prediction request to the wrapped model.
    See `core.model.base.Model.predict()`

    Args:
        *args: positional arguments handed unchanged to the model's `predict()`
    Returns:
        whatever the model's `predict()` returns
    '''
    model = self.model
    return model.predict(*args)
Run a prediction against the model.
See core.model.base.Model.predict()