core.corpus.annotated_corpus

View Source
from abc import ABC, abstractmethod

from core.corpus.corpus import Corpus
from core.corpus.annotator import Annotator

class AnnotatedCorpus(ABC):
	'''
		Abstract pairing of corpus and annotator; allows iterating, e.g., over sentences together with their scds.

		Two subclasses are provided: `core.corpus.annotated_corpora.SingleAnnotatedCorpus` and
		`core.corpus.annotated_corpora.MultiAnnotatedCorpus`.
	'''

	def __init__(self, corpus, annotator):
		'''
			Validate the arguments against the concrete class and store them on the instance.

			Args:
				corpus (`core.corpus.corpus.Corpus` or list/tuple of `core.corpus.corpus.Corpus`): source of the texts (sentences) to iterate over
				annotator (`core.corpus.annotator.Annotator` or list/tuple of `core.corpus.annotator.Annotator`): creates the annotations for the sentences of the corpus

			A class named `SingleAnnotatedCorpus` takes exactly one corpus and one annotator,
			which are stored as `self.corpus` and `self.annotator`.

			A class named `MultiAnnotatedCorpus` takes two or more corpora and equally many annotators,
			which are stored as `self.corpora` and `self.annotators`. It forms a *context-sensitive annotated corpus*
			(corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...), i.e., different context and annotations per corpus.

			Raises:
				AttributeError: if the arguments do not fit the concrete class

			Also see `core.corpus.corpora` and `core.corpus.annotators`.
		'''
		# The concrete class is recognised by its name only.
		class_name = type(self).__name__
		sequence_types = (list, tuple)

		multi_requested = (
			isinstance(corpus, sequence_types)
			and isinstance(annotator, sequence_types)
			and class_name == 'MultiAnnotatedCorpus'
		)
		if multi_requested:
			# Both sequences must have the same length and hold at least two items.
			if not (len(corpus) == len(annotator) > 1):
				raise AttributeError("MultiAnnotatedCorpus needs same length lists of corpora and annotators! For only one corpus and one annotator use SingleAnnotatedCorpus.")
			only_corpora = all(isinstance(item, Corpus) for item in corpus)
			if not (only_corpora and all(isinstance(item, Annotator) for item in annotator)):
				raise AttributeError("MultiAnnotatedCorpus needs lists of Corpus and Annotator objects!")
			self.corpora = corpus
			self.annotators = annotator
			return

		single_requested = (
			isinstance(corpus, Corpus)
			and isinstance(annotator, Annotator)
			and class_name == 'SingleAnnotatedCorpus'
		)
		if not single_requested:
			raise AttributeError("SingleAnnotatedCorpus needs one corpus and one annotator, MultiAnnotatedCorpus lists of both!")
		self.annotator = annotator
		self.corpus = corpus

	@abstractmethod
	def get_cachename(self):
		'''
			Abstract, has to be implemented by subclasses.

			Presumably returns a name identifying this annotated corpus in a cache (TODO confirm against the subclasses).
		'''

	@abstractmethod
	def is_cacheable(self):
		'''
			Abstract, has to be implemented by subclasses.

			Presumably tells whether this annotated corpus can be cached (TODO confirm against the subclasses).
		'''

	@abstractmethod
	def get_num_sentences(self):
		'''
			Return the number of sentences in this corpus.
		'''

	@abstractmethod
	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		'''
			Call the inverse annotator of the used annotator(s) and return like
			`core.corpus.annotator.InverseAnnotator.is_similar_annotation()`.
		'''

	@abstractmethod
	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		'''
			Generate inline SCD texts.

			Works as generator (yield)  
				Each generated item is a tuple ``<array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>``

			Args:
				do_scd (int): add scds for all sentences (=1), every second (=2), ...
				n_scd (int): number of scds to add per sentence
		'''

	@abstractmethod
	def iterate_sentence_scds(self, n=1):
		'''
			Generate a list (length n) of possible scds for each sentence.

			Works as generator (yield)  
				Each item is a tuple ``<sentence(array of words)>,<array of possible scds(array of words)>``

			Args:
				n (int): number of scds per sentence
		'''

	@abstractmethod
	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		'''
			Generate lists of possible and of not possible scds for each sentence.

			Works as generator (yield)  
				Each item is a tuple ``<sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>``

			Args:
				matching_n (int): number of possible scds
				non_matching_n (int): number of not possible scds
		'''

	@abstractmethod
	def iterate_assign_scds_text(self):
		'''
			Generate pairs of lists from iSCD texts: one list contains all sentences, the other one all scds  
				=> Goal is to select the SCD for a sentence from the text of SCDs, or the other way round (select the sentence from the text for a single SCD)

			Works as generator (yield)  
				Each item is a tuple ``<array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
					<array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>``
		'''
#   class AnnotatedCorpus(abc.ABC):
View Source
class AnnotatedCorpus(ABC):
	'''
		Abstract pairing of corpus and annotator; allows iterating, e.g., over sentences together with their scds.

		Two subclasses are provided: `core.corpus.annotated_corpora.SingleAnnotatedCorpus` and
		`core.corpus.annotated_corpora.MultiAnnotatedCorpus`.
	'''

	def __init__(self, corpus, annotator):
		'''
			Validate the arguments against the concrete class and store them on the instance.

			Args:
				corpus (`core.corpus.corpus.Corpus` or list/tuple of `core.corpus.corpus.Corpus`): source of the texts (sentences) to iterate over
				annotator (`core.corpus.annotator.Annotator` or list/tuple of `core.corpus.annotator.Annotator`): creates the annotations for the sentences of the corpus

			A class named `SingleAnnotatedCorpus` takes exactly one corpus and one annotator,
			which are stored as `self.corpus` and `self.annotator`.

			A class named `MultiAnnotatedCorpus` takes two or more corpora and equally many annotators,
			which are stored as `self.corpora` and `self.annotators`. It forms a *context-sensitive annotated corpus*
			(corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...), i.e., different context and annotations per corpus.

			Raises:
				AttributeError: if the arguments do not fit the concrete class

			Also see `core.corpus.corpora` and `core.corpus.annotators`.
		'''
		# The concrete class is recognised by its name only.
		class_name = type(self).__name__
		sequence_types = (list, tuple)

		multi_requested = (
			isinstance(corpus, sequence_types)
			and isinstance(annotator, sequence_types)
			and class_name == 'MultiAnnotatedCorpus'
		)
		if multi_requested:
			# Both sequences must have the same length and hold at least two items.
			if not (len(corpus) == len(annotator) > 1):
				raise AttributeError("MultiAnnotatedCorpus needs same length lists of corpora and annotators! For only one corpus and one annotator use SingleAnnotatedCorpus.")
			only_corpora = all(isinstance(item, Corpus) for item in corpus)
			if not (only_corpora and all(isinstance(item, Annotator) for item in annotator)):
				raise AttributeError("MultiAnnotatedCorpus needs lists of Corpus and Annotator objects!")
			self.corpora = corpus
			self.annotators = annotator
			return

		single_requested = (
			isinstance(corpus, Corpus)
			and isinstance(annotator, Annotator)
			and class_name == 'SingleAnnotatedCorpus'
		)
		if not single_requested:
			raise AttributeError("SingleAnnotatedCorpus needs one corpus and one annotator, MultiAnnotatedCorpus lists of both!")
		self.annotator = annotator
		self.corpus = corpus

	@abstractmethod
	def get_cachename(self):
		'''
			Abstract, has to be implemented by subclasses.

			Presumably returns a name identifying this annotated corpus in a cache (TODO confirm against the subclasses).
		'''

	@abstractmethod
	def is_cacheable(self):
		'''
			Abstract, has to be implemented by subclasses.

			Presumably tells whether this annotated corpus can be cached (TODO confirm against the subclasses).
		'''

	@abstractmethod
	def get_num_sentences(self):
		'''
			Return the number of sentences in this corpus.
		'''

	@abstractmethod
	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		'''
			Call the inverse annotator of the used annotator(s) and return like
			`core.corpus.annotator.InverseAnnotator.is_similar_annotation()`.
		'''

	@abstractmethod
	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		'''
			Generate inline SCD texts.

			Works as generator (yield)  
				Each generated item is a tuple ``<array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>``

			Args:
				do_scd (int): add scds for all sentences (=1), every second (=2), ...
				n_scd (int): number of scds to add per sentence
		'''

	@abstractmethod
	def iterate_sentence_scds(self, n=1):
		'''
			Generate a list (length n) of possible scds for each sentence.

			Works as generator (yield)  
				Each item is a tuple ``<sentence(array of words)>,<array of possible scds(array of words)>``

			Args:
				n (int): number of scds per sentence
		'''

	@abstractmethod
	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		'''
			Generate lists of possible and of not possible scds for each sentence.

			Works as generator (yield)  
				Each item is a tuple ``<sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>``

			Args:
				matching_n (int): number of possible scds
				non_matching_n (int): number of not possible scds
		'''

	@abstractmethod
	def iterate_assign_scds_text(self):
		'''
			Generate pairs of lists from iSCD texts: one list contains all sentences, the other one all scds  
				=> Goal is to select the SCD for a sentence from the text of SCDs, or the other way round (select the sentence from the text for a single SCD)

			Works as generator (yield)  
				Each item is a tuple ``<array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
					<array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>``
		'''

Annotated corpus: combines an annotator and a corpus and allows iterating, e.g., over sentences with their scds.

This class is abstract and provides two subclasses core.corpus.annotated_corpora.SingleAnnotatedCorpus and core.corpus.annotated_corpora.MultiAnnotatedCorpus.

#   AnnotatedCorpus(corpus, annotator)
View Source
	def __init__(self, corpus, annotator):
		"""
			Args:
				corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): to iterate over and get the texts (sentences) 
				annotator (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): to create annotations for each of the sentences of the corpus

			The class `core.corpus.annotated_corpora.SingleAnnotatedCorpus` needs one corpus and one annotator.

			The class `core.corpus.annotated_corpora.MultiAnnotatedCorpus` needs two or more corpora and one annotators, 
			it will create a *context-sensitive annotated corpus* (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...).
			It thus allows to create context dependent annotations (different context and annotations per corpus).

			Also see `core.corpus.corpora` and `core.corpus.annotators`.
		"""
		if isinstance(corpus, (list,tuple)) and isinstance(annotator, (list,tuple)) and type(self).__name__ == 'MultiAnnotatedCorpus':
			if len(corpus) == len(annotator) > 1:
				if all([isinstance(c, Corpus) for c in corpus]) and all([isinstance(a, Annotator) for a in annotator]):
					self.corpora = corpus
					self.annotators = annotator
				else:
					raise AttributeError("MultiAnnotatedCorpus needs lists of Corpus and Annotator objects!")
			else:
				raise AttributeError("MultiAnnotatedCorpus needs same length lists of corpora and annotators! For only one corpus and one annotator use SingleAnnotatedCorpus.")
		elif isinstance(corpus, Corpus) and isinstance(annotator, Annotator) and type(self).__name__ == 'SingleAnnotatedCorpus':
			self.annotator = annotator
			self.corpus = corpus
		else:
			raise AttributeError("SingleAnnotatedCorpus needs one corpus and one annotator, MultiAnnotatedCorpus lists of both!")
Args

The class core.corpus.annotated_corpora.SingleAnnotatedCorpus needs one corpus and one annotator.

The class core.corpus.annotated_corpora.MultiAnnotatedCorpus needs two or more corpora and the same number of annotators; it will create a context-sensitive annotated corpus (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...). It thus allows creating context-dependent annotations (different context and annotations per corpus).

Also see core.corpus.corpora and core.corpus.annotators.

#  
@abstractmethod
def get_cachename(self):
View Source
	@abstractmethod
	def get_cachename(self):
		'''
			Abstract, has to be implemented by subclasses.

			Presumably returns a name identifying this annotated corpus in a cache (TODO confirm against the subclasses).
		'''
#  
@abstractmethod
def is_cacheable(self):
View Source
	@abstractmethod
	def is_cacheable(self):
		'''
			Abstract, has to be implemented by subclasses.

			Presumably tells whether this annotated corpus can be cached (TODO confirm against the subclasses).
		'''
#  
@abstractmethod
def get_num_sentences(self):
View Source
	@abstractmethod
	def get_num_sentences(self):
		'''
			Return the number of sentences in this corpus.

			Abstract, has to be implemented by subclasses.
		'''

Returns the number of sentences in this corpus.

#  
@abstractmethod
def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
View Source
	@abstractmethod
	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		'''
			Call the inverse annotator of the used annotator(s) and return like
			`core.corpus.annotator.InverseAnnotator.is_similar_annotation()`.

			Abstract, has to be implemented by subclasses.
		'''

Calls the inverse annotator of the used annotator(s) and returns the same kind of result as core.corpus.annotator.InverseAnnotator.is_similar_annotation().

#  
@abstractmethod
def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
View Source
	@abstractmethod
	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		'''
			Generate inline SCD texts (abstract, has to be implemented by subclasses).

			Works as generator (yield)  
				Each generated item is a tuple ``<array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>``

			Args:
				do_scd (int): add scds for all sentences (=1), every second (=2), ...
				n_scd (int): number of scds to add per sentence
		'''

Generate an inline SCD Text

Works as generator (yield)
Each generated item is a tuple <array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>

Args
  • do_scd (int): add scds for all sentences (=1), every second (=2), ...
  • n_scd (int): number of scds to add per sentence
#  
@abstractmethod
def iterate_sentence_scds(self, n=1):
View Source
	@abstractmethod
	def iterate_sentence_scds(self, n=1):
		'''
			Generate a list (length n) of possible scds for each sentence (abstract, has to be implemented by subclasses).

			Works as generator (yield)  
				Each item is a tuple ``<sentence(array of words)>,<array of possible scds(array of words)>``

			Args:
				n (int): number of scds per sentence
		'''

Generate a list (length n) of possible scds per sentence.

Works as generator (yield)
Each item is a tuple <sentence(array of words)>,<array of possible scds(array of words)>

Args
  • n (int): Number of scd per sentence
#  
@abstractmethod
def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
View Source
	@abstractmethod
	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		'''
			Generate lists of possible and of not possible scds for each sentence (abstract, has to be implemented by subclasses).

			Works as generator (yield)  
				Each item is a tuple ``<sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>``

			Args:
				matching_n (int): number of possible scds
				non_matching_n (int): number of not possible scds
		'''

Generate lists of possible (matching_n entries) and not possible (non_matching_n entries) scds per sentence.

Works as generator (yield)
Each item is a tuple <sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>

Args
  • matching_n (int): Number of possible scds
  • non_matching_n (int): Number of not possible scds
#  
@abstractmethod
def iterate_assign_scds_text(self):
View Source
	@abstractmethod
	def iterate_assign_scds_text(self):
		'''
			Generate pairs of lists from iSCD texts: one list contains all sentences, the other one all scds  
				=> Goal is to select the SCD for a sentence from the text of SCDs, or the other way round (select the sentence from the text for a single SCD)

			Abstract, has to be implemented by subclasses.

			Works as generator (yield)  
				Each item is a tuple ``<array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
					<array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>``
		'''

Generate pairs of lists from iSCD texts, but yielding two lists one containing all sentences and one of all scds
=> Goal is to select SCD for a sentence from text of SCDs or other way round (select sentence from text for single SCD)

Works as generator (yield)
Each item is a tuple <array of sentences (array of words) from text>, <array of sentences (array of words) from scds>, <array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>