core.corpus.annotator

View Source
import os
from core.utils import Random

from abc import ABC, abstractmethod

from core.corpus.preprocess import DefaultPreprocessor

from core.utils import read_json_file, write_json_file, clear_filename, check_and_create_folder, CacheName
import core.utils.const as const


class Annotator(ABC):
	'''
		An annotator creates some type of annotations for given sentences.
		A `core.corpus.corpus.Corpus` may be iterated with annotations from an `Annotator` using
		the `core.corpus.annotated_corpus.AnnotatedCorpus`.

		Subclasses have to implement `_get_annotations`, `_get_non_annotations`,
		`_get_cachename` and `get_inverse_annotator`.
	'''
	# annotators are always cacheable (they cache their data, not the sentences annotated)

	def __init__(self, percentages=[1], part=0, preprocessor=None):
		'''
			Args:
				percentages (array of float): Annotators do not support splitting, but one may select only a subset
					of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one
					using 70% one would write ``percentages=[0.3, 0.7]``.
				part (int): Select one of the percentages defined by ``percentages``. For ``percentages=[0.3, 0.7]`` setting
					``part=0`` would select 30%, ``part=1`` 70%.
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use, if `None` uses
					`core.corpus.preprocess.DefaultPreprocessor`
			Raises:
				AttributeError: If ``percentages`` do not sum up to 1 (within floating point tolerance)
					or ``part`` is not a valid index into ``percentages``.
		'''
		# function-scope import, the module header does not import math
		import math

		# NOTE: the list default of ``percentages`` is only read, never mutated, so sharing it is harmless.
		# Compare with a tolerance: an exact ``!= 1`` rejects valid input such as [0.7, 0.2, 0.1],
		# whose plain float sum is 0.9999999999999999.
		if not math.isclose(math.fsum(percentages), 1.0, rel_tol=1e-9):
			raise AttributeError("Percentages have to sum up to 1!")
		if part < 0 or part >= len(percentages):
			raise AttributeError("Parts selects a percentage by index (0 based)!")

		self.len_percentages = len(percentages) # number of percentages given
		self.use_percentages = self.len_percentages > 1 # used percentages here?
		self.part_len = percentages[part] # the length (percentage) of the part
		# the length (percentages) before the part; plain sum() is kept on purpose,
		# because the value becomes part of the name built by get_cachename()
		self.part_start = sum(percentages[:part])

		if preprocessor is None:
			self.preprocessor = DefaultPreprocessor()
		else:
			self.preprocessor = preprocessor
		self.preprocessor_name = type(self.preprocessor).__name__

		self.random_seed = Random.get_seed()
		self.random = Random.get_generator()

	@abstractmethod
	def _get_annotations(self, sentence, n):
		'''
			Implementation hook called by `get_annotations` with the same arguments.
		'''
		pass

	@abstractmethod
	def _get_non_annotations(self, sentence, n):
		'''
			Implementation hook called by `get_non_annotations` with the same arguments.
		'''
		pass

	@abstractmethod
	def _get_cachename(self):
		'''
			Internal cachename, does not take care of seed!
		'''
		pass

	def get_annotations(self, sentence, n=1):
		'''
			Get array of *right*/ *matching* scds for given sentence (list of words)

			Args:
				sentence (array): the sentence
				n (int): max number of scds to return; -1 for all (limited to 1000)
			Returns:
				whatever the subclass' `_get_annotations` returns
		'''
		return self._get_annotations(sentence, n)

	def get_non_annotations(self, sentence, n=1):
		'''
			Get array of *wrong*/ *non matching* scds for given sentence (list of words)

			Args:
				sentence (array): the sentence
				n (int): max number of scds to return; -1 for all (limited to 1000)
			Returns:
				whatever the subclass' `_get_non_annotations` returns
		'''
		return self._get_non_annotations(sentence, n)

	def _get_cachefilename(self, suffix=''):
		'''
			Returns the path (string) to the annotators' cachefile
			Args:
				suffix (string): Suffix in file, if multiple files (distinguished by `suffix`) needed to cache file
		'''
		# only a non-empty suffix is appended (separated by an underscore)
		suffix_part = ("_" + suffix) if suffix else ""
		return CacheName.filename(os.path.join(
			const.CORPUS_CACHEDIR,
			clear_filename(self._get_cachename()) + suffix_part + ".json"
		))

	def get_cachename(self):
		'''
			External cachename, also takes care of seed and percentages (internal does not!)

			Returns:
				string, the internal cachename followed by ``_``, the percentage selection
				(only if more than one percentage was given), the random seed and a trailing ``_``
		'''
		if self.use_percentages:
			percentage_str = str(self.len_percentages) + "-" + str(self.part_start) + "-" + str(self.part_len) + "-"
		else:
			percentage_str = ""
		return self._get_cachename() + "_" + percentage_str + str(self.random_seed) + "_"

	def _is_cached(self, suffix=''):
		'''
			Checks if the cache directory and the cachefile for `suffix` exist.
		'''
		return os.path.isdir(const.CORPUS_CACHEDIR) and os.path.isfile(self._get_cachefilename(suffix=suffix))

	def _get_cached(self, suffix=''):
		'''
			Reads the cachefile for `suffix` as JSON and returns its content.
		'''
		return read_json_file(self._get_cachefilename(suffix=suffix))

	def _set_cached(self, data, suffix=''):
		'''
			Writes `data` as JSON to the cachefile for `suffix`, creating the cache directory if needed.
		'''
		check_and_create_folder(const.CORPUS_CACHEDIR)
		write_json_file(self._get_cachefilename(suffix=suffix), data)

	@abstractmethod
	def get_inverse_annotator(self):
		"""
			Get an instance of `core.corpus.annotator.InverseAnnotator` for the
			Annotator.
		"""
		pass

class InverseAnnotator(ABC):
	"""
		Maps back from an annotation to the text, which is sometimes
		necessary to check predicted annotations.

		Offers checks whether a sentence and an scd fit together
		for some annotator.

		Subclasses provide the class attribute ``ANNOTATOR_CLASS`` (instantiated in
		``__init__``) and implement `_init`, `_create_cache_data` and `is_annotation`.
	"""

	def __init__(self, preprocessor):
		"""
			Creates the annotator and loads the inverse data from its cache,
			creating and storing the data first if it is not cached yet.

			Args:
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor used by the Annotator!
		"""
		cache_suffix = 'inversed'
		self.annotator = type(self).ANNOTATOR_CLASS(preprocessor=preprocessor)

		if not self.annotator._is_cached(suffix=cache_suffix):
			# nothing stored yet: build the data and persist it via the annotator
			cache_data = self._create_cache_data()
			self.annotator._set_cached(cache_data, suffix=cache_suffix)
		else:
			cache_data = self.annotator._get_cached(suffix=cache_suffix)

		self._init(cache_data)

	@abstractmethod
	def _init(self, cache_data):
		"""
			Called at the end of ``__init__`` with the cache data (loaded or freshly created).
		"""

	@abstractmethod
	def _create_cache_data(self):
		"""
			Called by ``__init__`` if no cache exists; the result is stored via
			the annotator's ``_set_cached`` and passed to `_init`.
		"""

	def is_similar_annotation(self, sentence, annotation, other_annotation):
		"""
			Tells whether both annotations are possible for the sentence,
			i.e. whether they are similar in the context of the sentence.

			Args:
				sentence (list of str): The sentence annotated
				annotation (list of str): An annotation given for sentence
				other_annotation (list of str): Another annotation given for sentence
			Returns:
				bool, both annotations are possible annotations
		"""
		first_result = self.is_annotation(sentence, annotation)
		if not first_result:
			# the second annotation is not checked if the first one already fails
			return first_result
		return self.is_annotation(sentence, other_annotation)

	@abstractmethod
	def is_annotation(self, sentence, annotation):
		"""
			Tells whether the annotation is a possible annotation of the sentence.

			Args:
				sentence (list of str): The sentence annotated
				annotation (list of str): The annotation given for sentence
			Returns:
				bool, is possible annotation
		"""

		
		
#   class Annotator(abc.ABC):
View Source
class Annotator(ABC):
	'''
		An annotator creates some type of annotations for given sentences.
		A `core.corpus.corpus.Corpus` may be iterated with annotations from an `Annotator` using
		the `core.corpus.annotated_corpus.AnnotatedCorpus`.

		Subclasses have to implement `_get_annotations`, `_get_non_annotations`,
		`_get_cachename` and `get_inverse_annotator`.
	'''
	# annotators are always cacheable (they cache their data, not the sentences annotated)

	def __init__(self, percentages=[1], part=0, preprocessor=None):
		'''
			Args:
				percentages (array of float): Annotators do not support splitting, but one may select only a subset
					of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one
					using 70% one would write ``percentages=[0.3, 0.7]``.
				part (int): Select one of the percentages defined by ``percentages``. For ``percentages=[0.3, 0.7]`` setting
					``part=0`` would select 30%, ``part=1`` 70%.
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use, if `None` uses
					`core.corpus.preprocess.DefaultPreprocessor`
			Raises:
				AttributeError: If ``percentages`` do not sum up to 1 (within floating point tolerance)
					or ``part`` is not a valid index into ``percentages``.
		'''
		# function-scope import, the module header does not import math
		import math

		# NOTE: the list default of ``percentages`` is only read, never mutated, so sharing it is harmless.
		# Compare with a tolerance: an exact ``!= 1`` rejects valid input such as [0.7, 0.2, 0.1],
		# whose plain float sum is 0.9999999999999999.
		if not math.isclose(math.fsum(percentages), 1.0, rel_tol=1e-9):
			raise AttributeError("Percentages have to sum up to 1!")
		if part < 0 or part >= len(percentages):
			raise AttributeError("Parts selects a percentage by index (0 based)!")

		self.len_percentages = len(percentages) # number of percentages given
		self.use_percentages = self.len_percentages > 1 # used percentages here?
		self.part_len = percentages[part] # the length (percentage) of the part
		# the length (percentages) before the part; plain sum() is kept on purpose,
		# because the value becomes part of the name built by get_cachename()
		self.part_start = sum(percentages[:part])

		if preprocessor is None:
			self.preprocessor = DefaultPreprocessor()
		else:
			self.preprocessor = preprocessor
		self.preprocessor_name = type(self.preprocessor).__name__

		self.random_seed = Random.get_seed()
		self.random = Random.get_generator()

	@abstractmethod
	def _get_annotations(self, sentence, n):
		'''
			Implementation hook called by `get_annotations` with the same arguments.
		'''
		pass

	@abstractmethod
	def _get_non_annotations(self, sentence, n):
		'''
			Implementation hook called by `get_non_annotations` with the same arguments.
		'''
		pass

	@abstractmethod
	def _get_cachename(self):
		'''
			Internal cachename, does not take care of seed!
		'''
		pass

	def get_annotations(self, sentence, n=1):
		'''
			Get array of *right*/ *matching* scds for given sentence (list of words)

			Args:
				sentence (array): the sentence
				n (int): max number of scds to return; -1 for all (limited to 1000)
			Returns:
				whatever the subclass' `_get_annotations` returns
		'''
		return self._get_annotations(sentence, n)

	def get_non_annotations(self, sentence, n=1):
		'''
			Get array of *wrong*/ *non matching* scds for given sentence (list of words)

			Args:
				sentence (array): the sentence
				n (int): max number of scds to return; -1 for all (limited to 1000)
			Returns:
				whatever the subclass' `_get_non_annotations` returns
		'''
		return self._get_non_annotations(sentence, n)

	def _get_cachefilename(self, suffix=''):
		'''
			Returns the path (string) to the annotators' cachefile
			Args:
				suffix (string): Suffix in file, if multiple files (distinguished by `suffix`) needed to cache file
		'''
		# only a non-empty suffix is appended (separated by an underscore)
		suffix_part = ("_" + suffix) if suffix else ""
		return CacheName.filename(os.path.join(
			const.CORPUS_CACHEDIR,
			clear_filename(self._get_cachename()) + suffix_part + ".json"
		))

	def get_cachename(self):
		'''
			External cachename, also takes care of seed and percentages (internal does not!)

			Returns:
				string, the internal cachename followed by ``_``, the percentage selection
				(only if more than one percentage was given), the random seed and a trailing ``_``
		'''
		if self.use_percentages:
			percentage_str = str(self.len_percentages) + "-" + str(self.part_start) + "-" + str(self.part_len) + "-"
		else:
			percentage_str = ""
		return self._get_cachename() + "_" + percentage_str + str(self.random_seed) + "_"

	def _is_cached(self, suffix=''):
		'''
			Checks if the cache directory and the cachefile for `suffix` exist.
		'''
		return os.path.isdir(const.CORPUS_CACHEDIR) and os.path.isfile(self._get_cachefilename(suffix=suffix))

	def _get_cached(self, suffix=''):
		'''
			Reads the cachefile for `suffix` as JSON and returns its content.
		'''
		return read_json_file(self._get_cachefilename(suffix=suffix))

	def _set_cached(self, data, suffix=''):
		'''
			Writes `data` as JSON to the cachefile for `suffix`, creating the cache directory if needed.
		'''
		check_and_create_folder(const.CORPUS_CACHEDIR)
		write_json_file(self._get_cachefilename(suffix=suffix), data)

	@abstractmethod
	def get_inverse_annotator(self):
		"""
			Get an instance of `core.corpus.annotator.InverseAnnotator` for the
			Annotator.
		"""
		pass

An annotator creates some type of annotations for given sentences. A core.corpus.corpus.Corpus may be iterated with annotations from an Annotator using the core.corpus.annotated_corpus.AnnotatedCorpus.

#   Annotator(percentages=[1], part=0, preprocessor=None)
View Source
	def __init__(self, percentages=[1], part=0, preprocessor=None):
		'''
			Args:
				percentages (array of float): Annotators do not support splitting, but one may select only a subset
					of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one 
					using 70% one would write ``percentages=[0.3, 0.7]``.
				part (int): Select one of the percentages defined by ``percentages``. For ``percentages=[0.3, 0.7]`` setting
					``part=0`` would select 30%, ``part=1`` 70%.
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use, if `None` uses
					`core.corpus.preprocess.DefaultPreprocessor` 
		'''
		if sum(percentages) != 1:
			raise AttributeError("Percentages have to sum up to 1!")
		if part < 0 or part >= len(percentages):
			raise AttributeError("Parts selects a percentage by index (0 based)!")

		self.len_percentages = len(percentages) # number of percentages given
		self.use_percentages = self.len_percentages > 1 # used percentaged here?
		self.part_len = percentages[part] # the length (percentage) of the part
		self.part_start = sum(percentages[:part]) # the length (percentages) before the part 

		if preprocessor == None:
			self.preprocessor = DefaultPreprocessor()
		else:
			self.preprocessor = preprocessor
		self.preprocessor_name = type(self.preprocessor).__name__

		self.random_seed = Random.get_seed()
		self.random = Random.get_generator()
Args
  • percentages (array of float): Annotators do not support splitting, but one may select only a subset of the possible annotations by giving the percentage. E.g. to get one subset using 30% and one using 70% one would write percentages=[0.3, 0.7].
  • part (int): Select one of the percentages defined by percentages. For percentages=[0.3, 0.7] setting part=0 would select 30%, part=1 70%.
  • preprocessor (core.corpus.preprocess.Preprocessor): The preprocessor to use, if None uses core.corpus.preprocess.DefaultPreprocessor
#   def get_annotations(self, sentence, n=1):
View Source
	def get_annotations(self, sentence, n=1):
		'''
			Returns the *right*/ *matching* scds for the sentence (list of words)
			by delegating to `_get_annotations`.

			Args:
				sentence (array): the sentence
				n (int): max number of scds to return; -1 for all (limited to 1000)
		'''
		matching = self._get_annotations(sentence, n)
		return matching

Get array of right/ matching scds for given sentence (list of words)

Args
  • sentence (array): the sentence
  • n (int): max number of scds to return; -1 for all (limited to 1000)
#   def get_non_annotations(self, sentence, n=1):
View Source
	def get_non_annotations(self, sentence, n=1):
		'''
			Returns the *wrong*/ *non matching* scds for the sentence (list of words)
			by delegating to `_get_non_annotations`.

			Args:
				sentence (array): the sentence
				n (int): max number of scds to return; -1 for all (limited to 1000)
		'''
		non_matching = self._get_non_annotations(sentence, n)
		return non_matching

Get array of wrong/ non matching scds for given sentence (list of words)

Args
  • sentence (array): the sentence
  • n (int): max number of scds to return; -1 for all (limited to 1000)
#   def get_cachename(self):
View Source
	def get_cachename(self):
		'''
			External cachename: the internal cachename extended by the percentage
			selection (only if percentages are used) and the random seed
			(the internal cachename covers neither of them!)
		'''
		name_parts = []
		if self.use_percentages:
			# number of percentages, start of the part, length of the part
			name_parts.extend([self.len_percentages, self.part_start, self.part_len])
		name_parts.append(self.random_seed)
		return self._get_cachename() + "_" + "-".join(map(str, name_parts)) + "_"

External cachename, also takes care of seed and percentages (internal does not!)

#  
@abstractmethod
def get_inverse_annotator(self):
View Source
	@abstractmethod
	def get_inverse_annotator(self):
		"""
			Returns the `core.corpus.annotator.InverseAnnotator` instance
			belonging to this Annotator (to be implemented by subclasses).
		"""

Get an instance of core.corpus.annotator.InverseAnnotator for the
Annotator.

#   class InverseAnnotator(abc.ABC):
View Source
class InverseAnnotator(ABC):
	"""
		Maps back from an annotation to the text, which is sometimes
		necessary to check predicted annotations.

		Offers checks whether a sentence and an scd fit together
		for some annotator.

		Subclasses provide the class attribute ``ANNOTATOR_CLASS`` (instantiated in
		``__init__``) and implement `_init`, `_create_cache_data` and `is_annotation`.
	"""

	def __init__(self, preprocessor):
		"""
			Creates the annotator and loads the inverse data from its cache,
			creating and storing the data first if it is not cached yet.

			Args:
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor used by the Annotator!
		"""
		cache_suffix = 'inversed'
		self.annotator = type(self).ANNOTATOR_CLASS(preprocessor=preprocessor)

		if not self.annotator._is_cached(suffix=cache_suffix):
			# nothing stored yet: build the data and persist it via the annotator
			cache_data = self._create_cache_data()
			self.annotator._set_cached(cache_data, suffix=cache_suffix)
		else:
			cache_data = self.annotator._get_cached(suffix=cache_suffix)

		self._init(cache_data)

	@abstractmethod
	def _init(self, cache_data):
		"""
			Called at the end of ``__init__`` with the cache data (loaded or freshly created).
		"""

	@abstractmethod
	def _create_cache_data(self):
		"""
			Called by ``__init__`` if no cache exists; the result is stored via
			the annotator's ``_set_cached`` and passed to `_init`.
		"""

	def is_similar_annotation(self, sentence, annotation, other_annotation):
		"""
			Tells whether both annotations are possible for the sentence,
			i.e. whether they are similar in the context of the sentence.

			Args:
				sentence (list of str): The sentence annotated
				annotation (list of str): An annotation given for sentence
				other_annotation (list of str): Another annotation given for sentence
			Returns:
				bool, both annotations are possible annotations
		"""
		first_result = self.is_annotation(sentence, annotation)
		if not first_result:
			# the second annotation is not checked if the first one already fails
			return first_result
		return self.is_annotation(sentence, other_annotation)

	@abstractmethod
	def is_annotation(self, sentence, annotation):
		"""
			Tells whether the annotation is a possible annotation of the sentence.

			Args:
				sentence (list of str): The sentence annotated
				annotation (list of str): The annotation given for sentence
			Returns:
				bool, is possible annotation
		"""

To check predicted annotations it is sometimes necessary to map back from an annotation to the text.

This class allows getting the similarity between a sentence and an scd for some annotator.

#   InverseAnnotator(preprocessor)
View Source
	def __init__(self, preprocessor):
		"""
			Args:
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor used by the Annotator!
		"""
		self.annotator = type(self).ANNOTATOR_CLASS(preprocessor=preprocessor)

		if self.annotator._is_cached(suffix='inversed'):
			cache_data = self.annotator._get_cached(suffix='inversed')
		else:
			cache_data = self._create_cache_data()
			self.annotator._set_cached(cache_data, suffix='inversed')
		
		self._init(cache_data)
Args
  • preprocessor (core.corpus.preprocess.Preprocessor): The preprocessor used by the Annotator!
#   def is_similar_annotation(self, sentence, annotation, other_annotation):
View Source
	def is_similar_annotation(self, sentence, annotation, other_annotation):
		"""
			Tells whether both annotations are possible for the sentence,
			i.e. whether they are similar in the context of the sentence.

			Args:
				sentence (list of str): The sentence annotated
				annotation (list of str): An annotation given for sentence
				other_annotation (list of str): Another annotation given for sentence
			Returns:
				bool, both annotations are possible annotations
		"""
		first_result = self.is_annotation(sentence, annotation)
		if not first_result:
			# the second annotation is not checked if the first one already fails
			return first_result
		return self.is_annotation(sentence, other_annotation)

Checks if the sentence could get both annotations, i.e. both annotations are similar in the context of the sentence.

Args
  • sentence (list of str): The sentence annotated
  • annotation (list of str): An annotation given for sentence
  • other_annotation (list of str): Another annotation given for sentence
Returns

bool, both annotations are possible annotations

#  
@abstractmethod
def is_annotation(self, sentence, annotation):
View Source
	@abstractmethod
	def is_annotation(self, sentence, annotation):
		"""
			Tells whether the annotation is a possible annotation of the sentence
			(to be implemented by subclasses).

			Args:
				sentence (list of str): The sentence annotated
				annotation (list of str): The annotation given for sentence
			Returns:
				bool, is possible annotation
		"""

Checks if the sentence could get the annotation.

Args
  • sentence (list of str): The sentence annotated
  • annotation (list of str): The annotation given for sentence
Returns

bool, is possible annotation