core.corpus.annotator
View Source
import os
from core.utils import Random
from abc import ABC, abstractmethod
from core.corpus.preprocess import DefaultPreprocessor
from core.utils import read_json_file, write_json_file, clear_filename, check_and_create_folder, CacheName
import core.utils.const as const


class Annotator(ABC):
    '''
        An annotator creates some type of annotations for given sentences.
        A `core.corpus.corpus.Corpus` may be iterated with annotations from an `Annotator` using
        the `core.corpus.annotated_corpus.AnnotatedCorpus`.
    '''

    # annotators are always cacheable (they cache their data, not the sentences annotated)

    def __init__(self, percentages=[1], part=0, preprocessor=None):
        '''
            Args:
                percentages (array of float): Annotators do not support splitting, but one may select only a subset
                    of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one
                    using 70% one would write ``percentages=[0.3, 0.7]``. The values have to sum up to 1
                    (checked with a small tolerance for floating point rounding).
                part (int): Select one of the percentages defined by ``percentages``. For ``percentages=[0.3, 0.7]``
                    setting ``part=0`` would select 30%, ``part=1`` 70%.
                preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use, if `None`
                    uses `core.corpus.preprocess.DefaultPreprocessor`

            Raises:
                AttributeError: If ``percentages`` does not sum up to 1 or ``part`` is not a valid 0 based index.
        '''
        # Compare with a tolerance instead of `!= 1`: fractions like [0.1] * 10
        # sum to 0.9999999999999999 and would otherwise be rejected.
        # (The default list is only read, never mutated.)
        tolerance = 1e-9
        if abs(sum(percentages) - 1) > tolerance:
            raise AttributeError("Percentages have to sum up to 1!")
        if part < 0 or part >= len(percentages):
            raise AttributeError("Parts selects a percentage by index (0 based)!")

        self.len_percentages = len(percentages)  # number of percentages given
        self.use_percentages = self.len_percentages > 1  # more than one part requested?
        self.part_len = percentages[part]  # the length (percentage) of the part
        self.part_start = sum(percentages[:part])  # the length (percentages) before the part

        # Identity check: a preprocessor with a custom __eq__ must not be
        # mistaken for "no preprocessor given".
        if preprocessor is None:
            self.preprocessor = DefaultPreprocessor()
        else:
            self.preprocessor = preprocessor
        self.preprocessor_name = type(self.preprocessor).__name__

        self.random_seed = Random.get_seed()
        self.random = Random.get_generator()

    @abstractmethod
    def _get_annotations(self, sentence, n):
        pass

    @abstractmethod
    def _get_non_annotations(self, sentence, n):
        pass

    @abstractmethod
    def _get_cachename(self):
        '''
            Internal cachename, does not take care of seed!
        '''
        pass

    def get_annotations(self, sentence, n=1):
        '''
            Get array of *right*/ *matching* scds for given sentence (list of words)

            Args:
                sentence (array): the sentence
                n (int): max number of scds to return; -1 for all (limited to 1000)
        '''
        return self._get_annotations(sentence, n)

    def get_non_annotations(self, sentence, n=1):
        '''
            Get array of *wrong*/ *non matching* scds for given sentence (list of words)

            Args:
                sentence (array): the sentence
                n (int): max number of scds to return; -1 for all (limited to 1000)
        '''
        return self._get_non_annotations(sentence, n)

    def _get_cachefilename(self, suffix=''):
        '''
            Returns the path (string) to the annotators' cachefile

            Args:
                suffix (string): Suffix in file, if multiple files (distinguished by `suffix`) needed to cache file
        '''
        # Built from the internal cachename, i.e. without seed and percentages.
        return CacheName.filename(os.path.join(
            const.CORPUS_CACHEDIR,
            clear_filename(self._get_cachename()) + (("_" + suffix) if len(suffix) > 0 else "") + ".json"
        ))

    def get_cachename(self):
        '''
            External cachename, also takes care of seed and percentages (internal does not!)
        '''
        if self.use_percentages:
            percentage_str = str(self.len_percentages) + "-" + str(self.part_start) + "-" + str(self.part_len) + "-"
        else:
            percentage_str = ""

        return self._get_cachename() + "_" + percentage_str + str(self.random_seed) + "_"

    def _is_cached(self, suffix=''):
        return os.path.isdir(const.CORPUS_CACHEDIR) and os.path.isfile(self._get_cachefilename(suffix=suffix))

    def _get_cached(self, suffix=''):
        return read_json_file(self._get_cachefilename(suffix=suffix))

    def _set_cached(self, data, suffix=''):
        # make sure the cache directory exists before writing into it
        check_and_create_folder(const.CORPUS_CACHEDIR)
        write_json_file(self._get_cachefilename(suffix=suffix), data)

    @abstractmethod
    def get_inverse_annotator(self):
        """
            Get an instance of `core.corpus.annotator.InverseAnnotator` for the Annotator.
        """
        pass


class InverseAnnotator(ABC):
    """
        To check predicted annotations it is sometimes necessary to map back from an annotation to the text.
        This class makes it possible to get the similarity between a sentence and an scd for some annotator.

        Subclasses are expected to provide a class attribute ``ANNOTATOR_CLASS``; it is
        instantiated in `__init__` with the given preprocessor.
    """

    def __init__(self, preprocessor):
        """
            Args:
                preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor used by the Annotator!
        """
        self.annotator = type(self).ANNOTATOR_CLASS(preprocessor=preprocessor)

        # reuse the annotator's cache (suffix 'inversed') or build and store the data
        if self.annotator._is_cached(suffix='inversed'):
            cache_data = self.annotator._get_cached(suffix='inversed')
        else:
            cache_data = self._create_cache_data()
            self.annotator._set_cached(cache_data, suffix='inversed')

        self._init(cache_data)

    @abstractmethod
    def _init(self, cache_data):
        pass

    @abstractmethod
    def _create_cache_data(self):
        pass

    def is_similar_annotation(self, sentence, annotation, other_annotation):
        """
            Checks if the sentence could get both annotations. I.e. both annotations are similar
            in the context of the sentence.

            Args:
                sentence (list of str): The sentence annotated
                annotation (list of str): An annotation given for sentence
                other_annotation (list of str): Another annotation given for sentence
            Returns:
                bool, both annotations are possible annotations
        """
        return self.is_annotation(sentence, annotation) and self.is_annotation(sentence, other_annotation)

    @abstractmethod
    def is_annotation(self, sentence, annotation):
        """
            Checks if the sentence could get the annotation.

            Args:
                sentence (list of str): The sentence annotated
                annotation (list of str): The annotation given for sentence
            Returns:
                bool, is possible annotation
        """
        pass
View Source
class Annotator(ABC):
    '''
        An annotator creates some type of annotations for given sentences.
        A `core.corpus.corpus.Corpus` may be iterated with annotations from an `Annotator` using
        the `core.corpus.annotated_corpus.AnnotatedCorpus`.
    '''

    # annotators are always cacheable (they cache their data, not the sentences annotated)

    def __init__(self, percentages=[1], part=0, preprocessor=None):
        '''
            Args:
                percentages (array of float): Annotators do not support splitting, but one may select only a subset
                    of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one
                    using 70% one would write ``percentages=[0.3, 0.7]``. The values have to sum up to 1
                    (checked with a small tolerance for floating point rounding).
                part (int): Select one of the percentages defined by ``percentages``. For ``percentages=[0.3, 0.7]``
                    setting ``part=0`` would select 30%, ``part=1`` 70%.
                preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use, if `None`
                    uses `core.corpus.preprocess.DefaultPreprocessor`

            Raises:
                AttributeError: If ``percentages`` does not sum up to 1 or ``part`` is not a valid 0 based index.
        '''
        # Compare with a tolerance instead of `!= 1`: fractions like [0.1] * 10
        # sum to 0.9999999999999999 and would otherwise be rejected.
        # (The default list is only read, never mutated.)
        tolerance = 1e-9
        if abs(sum(percentages) - 1) > tolerance:
            raise AttributeError("Percentages have to sum up to 1!")
        if part < 0 or part >= len(percentages):
            raise AttributeError("Parts selects a percentage by index (0 based)!")

        self.len_percentages = len(percentages)  # number of percentages given
        self.use_percentages = self.len_percentages > 1  # more than one part requested?
        self.part_len = percentages[part]  # the length (percentage) of the part
        self.part_start = sum(percentages[:part])  # the length (percentages) before the part

        # Identity check: a preprocessor with a custom __eq__ must not be
        # mistaken for "no preprocessor given".
        if preprocessor is None:
            self.preprocessor = DefaultPreprocessor()
        else:
            self.preprocessor = preprocessor
        self.preprocessor_name = type(self.preprocessor).__name__

        self.random_seed = Random.get_seed()
        self.random = Random.get_generator()

    @abstractmethod
    def _get_annotations(self, sentence, n):
        pass

    @abstractmethod
    def _get_non_annotations(self, sentence, n):
        pass

    @abstractmethod
    def _get_cachename(self):
        '''
            Internal cachename, does not take care of seed!
        '''
        pass

    def get_annotations(self, sentence, n=1):
        '''
            Get array of *right*/ *matching* scds for given sentence (list of words)

            Args:
                sentence (array): the sentence
                n (int): max number of scds to return; -1 for all (limited to 1000)
        '''
        return self._get_annotations(sentence, n)

    def get_non_annotations(self, sentence, n=1):
        '''
            Get array of *wrong*/ *non matching* scds for given sentence (list of words)

            Args:
                sentence (array): the sentence
                n (int): max number of scds to return; -1 for all (limited to 1000)
        '''
        return self._get_non_annotations(sentence, n)

    def _get_cachefilename(self, suffix=''):
        '''
            Returns the path (string) to the annotators' cachefile

            Args:
                suffix (string): Suffix in file, if multiple files (distinguished by `suffix`) needed to cache file
        '''
        # Built from the internal cachename, i.e. without seed and percentages.
        return CacheName.filename(os.path.join(
            const.CORPUS_CACHEDIR,
            clear_filename(self._get_cachename()) + (("_" + suffix) if len(suffix) > 0 else "") + ".json"
        ))

    def get_cachename(self):
        '''
            External cachename, also takes care of seed and percentages (internal does not!)
        '''
        if self.use_percentages:
            percentage_str = str(self.len_percentages) + "-" + str(self.part_start) + "-" + str(self.part_len) + "-"
        else:
            percentage_str = ""

        return self._get_cachename() + "_" + percentage_str + str(self.random_seed) + "_"

    def _is_cached(self, suffix=''):
        return os.path.isdir(const.CORPUS_CACHEDIR) and os.path.isfile(self._get_cachefilename(suffix=suffix))

    def _get_cached(self, suffix=''):
        return read_json_file(self._get_cachefilename(suffix=suffix))

    def _set_cached(self, data, suffix=''):
        # make sure the cache directory exists before writing into it
        check_and_create_folder(const.CORPUS_CACHEDIR)
        write_json_file(self._get_cachefilename(suffix=suffix), data)

    @abstractmethod
    def get_inverse_annotator(self):
        """
            Get an instance of `core.corpus.annotator.InverseAnnotator` for the Annotator.
        """
        pass
An annotator creates some type of annotations for given sentences.
A core.corpus.corpus.Corpus
may be iterated with annotations from an Annotator
using
the core.corpus.annotated_corpus.AnnotatedCorpus
.
View Source
def __init__(self, percentages=[1], part=0, preprocessor=None):
    '''
        Args:
            percentages (array of float): Annotators do not support splitting, but one may select only a subset
                of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one
                using 70% one would write ``percentages=[0.3, 0.7]``. The values have to sum up to 1
                (checked with a small tolerance for floating point rounding).
            part (int): Select one of the percentages defined by ``percentages``. For ``percentages=[0.3, 0.7]``
                setting ``part=0`` would select 30%, ``part=1`` 70%.
            preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use, if `None`
                uses `core.corpus.preprocess.DefaultPreprocessor`

        Raises:
            AttributeError: If ``percentages`` does not sum up to 1 or ``part`` is not a valid 0 based index.
    '''
    # Compare with a tolerance instead of `!= 1`: fractions like [0.1] * 10
    # sum to 0.9999999999999999 and would otherwise be rejected.
    # (The default list is only read, never mutated.)
    tolerance = 1e-9
    if abs(sum(percentages) - 1) > tolerance:
        raise AttributeError("Percentages have to sum up to 1!")
    if part < 0 or part >= len(percentages):
        raise AttributeError("Parts selects a percentage by index (0 based)!")

    self.len_percentages = len(percentages)  # number of percentages given
    self.use_percentages = self.len_percentages > 1  # more than one part requested?
    self.part_len = percentages[part]  # the length (percentage) of the part
    self.part_start = sum(percentages[:part])  # the length (percentages) before the part

    # Identity check: a preprocessor with a custom __eq__ must not be
    # mistaken for "no preprocessor given".
    if preprocessor is None:
        self.preprocessor = DefaultPreprocessor()
    else:
        self.preprocessor = preprocessor
    self.preprocessor_name = type(self.preprocessor).__name__

    self.random_seed = Random.get_seed()
    self.random = Random.get_generator()
Args
- percentages (array of float): Annotators do not support splitting, but one may select only a subset
of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one
using 70% one would write
percentages=[0.3, 0.7]
. - part (int): Select one of the percentages defined by
percentages
. For percentages=[0.3, 0.7]
setting part=0
would select 30%, part=1
70%. - preprocessor (
core.corpus.preprocess.Preprocessor
): The preprocessor to use, if None
uses core.corpus.preprocess.DefaultPreprocessor
View Source
def get_annotations(self, sentence, n=1):
    '''
        Return the *right*/ *matching* scds for a sentence (list of words).

        Public entry point that hands the request over to `_get_annotations`.

        Args:
            sentence (array): the sentence
            n (int): max number of scds to return; -1 for all (limited to 1000)
    '''
    matching = self._get_annotations(sentence, n)
    return matching
Get array of right/ matching scds for given sentence (list of words)
Args
- sentence (array): the sentence
- n (int): max number of scds to return; -1 for all (limited to 1000)
View Source
def get_non_annotations(self, sentence, n=1):
    '''
        Return the *wrong*/ *non matching* scds for a sentence (list of words).

        Public entry point that hands the request over to `_get_non_annotations`.

        Args:
            sentence (array): the sentence
            n (int): max number of scds to return; -1 for all (limited to 1000)
    '''
    non_matching = self._get_non_annotations(sentence, n)
    return non_matching
Get array of wrong/ non matching scds for given sentence (list of words)
Args
- sentence (array): the sentence
- n (int): max number of scds to return; -1 for all (limited to 1000)
View Source
def get_cachename(self):
    '''
        External cachename: the internal cachename extended by the percentage
        selection (only if more than one percentage is in use) and the random seed.
    '''
    # fields describing the selected part; empty when percentages are not used
    split_fields = ()
    if self.use_percentages:
        split_fields = (self.len_percentages, self.part_start, self.part_len)
    percentage_str = "".join(str(field) + "-" for field in split_fields)

    base_name = self._get_cachename()
    return "".join((base_name, "_", percentage_str, str(self.random_seed), "_"))
External cachename, also takes care of seed and percentages (internal does not!)
View Source
@abstractmethod
def get_inverse_annotator(self):
    """
        Abstract hook: return an instance of `core.corpus.annotator.InverseAnnotator`
        belonging to this Annotator.
    """
Get an instance of core.corpus.annotator.InverseAnnotator
for the
Annotator.
View Source
class InverseAnnotator(ABC):
    """
        To check predicted annotations it is sometimes necessary to map back from an annotation to the text.
        This class makes it possible to get the similarity between a sentence and an scd for some annotator.

        The annotator class is read from the class attribute ``ANNOTATOR_CLASS`` of the
        concrete type, so subclasses are expected to define it.
    """

    def __init__(self, preprocessor):
        """
            Args:
                preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor used by the Annotator!
        """
        annotator = type(self).ANNOTATOR_CLASS(preprocessor=preprocessor)
        self.annotator = annotator

        # The inverse data lives in the annotator's cache under its own suffix;
        # build and store it when it is missing, otherwise load it.
        if not annotator._is_cached(suffix='inversed'):
            cache_data = self._create_cache_data()
            annotator._set_cached(cache_data, suffix='inversed')
        else:
            cache_data = annotator._get_cached(suffix='inversed')

        self._init(cache_data)

    @abstractmethod
    def _init(self, cache_data):
        """ Set up the instance from the (loaded or freshly created) cache data. """

    @abstractmethod
    def _create_cache_data(self):
        """ Create the data to cache; used when no cache file exists. """

    def is_similar_annotation(self, sentence, annotation, other_annotation):
        """
            Checks if the sentence could get both annotations. I.e. both annotations are similar
            in the context of the sentence.

            Args:
                sentence (list of str): The sentence annotated
                annotation (list of str): An annotation given for sentence
                other_annotation (list of str): Another annotation given for sentence
            Returns:
                bool, both annotations are possible annotations
        """
        first_result = self.is_annotation(sentence, annotation)
        if not first_result:
            # the second annotation is not checked once the first one failed
            return first_result
        return self.is_annotation(sentence, other_annotation)

    @abstractmethod
    def is_annotation(self, sentence, annotation):
        """
            Checks if the sentence could get the annotation.

            Args:
                sentence (list of str): The sentence annotated
                annotation (list of str): The annotation given for sentence
            Returns:
                bool, is possible annotation
        """
To check predicted annotations it is sometimes necessary to map back from an annotation to the text.
This class makes it possible to get the similarity between a sentence and an scd for some annotator.
View Source
def __init__(self, preprocessor):
    """
        Creates the annotator given by the class attribute ``ANNOTATOR_CLASS`` and
        initializes this instance from the annotator's cached inverse data.

        Args:
            preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor used by the Annotator!
    """
    annotator = type(self).ANNOTATOR_CLASS(preprocessor=preprocessor)
    self.annotator = annotator

    # The inverse data lives in the annotator's cache under its own suffix;
    # build and store it when it is missing, otherwise load it.
    if not annotator._is_cached(suffix='inversed'):
        cache_data = self._create_cache_data()
        annotator._set_cached(cache_data, suffix='inversed')
    else:
        cache_data = annotator._get_cached(suffix='inversed')

    self._init(cache_data)
Args
- preprocessor (
core.corpus.preprocess.Preprocessor
): The preprocessor used by the Annotator!
View Source
def is_similar_annotation(self, sentence, annotation, other_annotation):
    """
        Checks if the sentence could get both annotations. I.e. both annotations are similar
        in the context of the sentence.

        Args:
            sentence (list of str): The sentence annotated
            annotation (list of str): An annotation given for sentence
            other_annotation (list of str): Another annotation given for sentence
        Returns:
            bool, both annotations are possible annotations
    """
    first_result = self.is_annotation(sentence, annotation)
    if not first_result:
        # the second annotation is not checked once the first one failed
        return first_result
    return self.is_annotation(sentence, other_annotation)
Checks if the sentence could get both annotations. I.e. both annotations are similar in the context of the sentence.
Args
- sentence (list of str): The sentence annotated
- annotation (list of str): An annotation given for sentence
- other_annotation (list of str): Another annotation given for sentence
Returns
bool, both annotations are possible annotations
View Source
@abstractmethod
def is_annotation(self, sentence, annotation):
    """
        Abstract hook: checks if the sentence could get the annotation.

        Args:
            sentence (list of str): The sentence annotated
            annotation (list of str): The annotation given for sentence
        Returns:
            bool, is possible annotation
    """
Checks if the sentence could get the annotation.
Args
- sentence (list of str): The sentence annotated
- annotation (list of str): The annotation given for sentence
Returns
bool, is possible annotation