core.corpus.annotated_corpora

View Source
import math

from core.corpus.annotated_corpus import AnnotatedCorpus

from core.utils import Random

class SingleAnnotatedCorpus(AnnotatedCorpus):
	'''
		Annotated corpus that combines one annotator with one corpus and allows iterating, e.g., over sentences with their SCDs.

		See `MultiAnnotatedCorpus` to combine multiple annotators and corpora.
	'''

	def __init__(self, *args, non_annotator = None):
		super().__init__(*args)
		self.non_annotator = non_annotator
		self.inverse_annotator = None

	def get_cachename(self):
		return self.corpus.get_cachename() + "-" + self.annotator.get_cachename() + "_" + str(Random.get_seed()) + "_"

	def is_cacheable(self):
		# annotators are always cacheable (they cache their data, not the sentences annotated)
		return self.corpus.is_cacheable()

	def get_num_sentences(self):
		return self.corpus.get_num_sentences()

	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		if self.inverse_annotator == None:
			self.inverse_annotator = self.annotator.get_inverse_annotator()
		return self.inverse_annotator.is_similar_annotation(sentence, correct_scd, predicted_scd)

	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		for text in self.corpus.iterate_texts():
			sentences = []
			is_scds = []
			for i,sentence in enumerate(text):
				sentences.append(sentence)
				is_scds.append(False)

				if i % do_scd == 0: 
					for scd in self.annotator.get_annotations(sentence, n=n_scd):
						sentences.append(scd)
						is_scds.append(True)

			yield sentences, is_scds

	def iterate_sentence_scds(self, n=1):
		for sentence in self.corpus.iterate_sentences():
			yield sentence, self.annotator.get_annotations(sentence, n=n)

	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		for sentence in self.corpus.iterate_sentences():
			if self.non_annotator == None:
				nons = self.annotator.get_non_annotations(sentence, n=non_matching_n)
			else:
				nons = self.annotator.get_non_annotations(sentence, n=math.ceil(non_matching_n/2)) + \
					self.non_annotator.get_annotations(sentence, n=math.floor(non_matching_n/2))

				if len(nons) < non_matching_n:
					nons += self.non_annotator.get_non_annotations(sentence, n=non_matching_n-len(nons))

			yield sentence, \
				self.annotator.get_annotations(sentence,n=matching_n), \
				nons


	def iterate_assign_scds_text(self):
		local_random = Random.get_generator()

		for sentences in self.corpus.iterate_texts():
			text = []
			scds = []
			for i,sentence in enumerate(sentences):
				scd = self.annotator.get_annotations(sentence, n=1)
				if len(scd) == 1:
					text.append((i, sentence))
					scds.append((i, scd[0]))

			local_random.shuffle(scds)
			scd_index, _ = zip(*scds)
			scd_index_swapped = { v:k for k,v in enumerate(scd_index) } 

			local_random.shuffle(text)
			text_index, _ = zip(*text)
			text_index_swapped = { v:k for k,v in enumerate(text_index) } 

			text_scd_map = [ scd_index_swapped[i]  for i in text_index ]
			scd_text_map = [ text_index_swapped[i]  for i in scd_index ]
			
			yield list(map(lambda t: t[1], text)), \
				list(map(lambda t: t[1], scds)), \
				text_scd_map, scd_text_map

class MultiAnnotatedCorpus(AnnotatedCorpus):
	'''
		Annotated corpus that combines multiple annotators with multiple corpora and allows iterating, e.g., over sentences with their SCDs.

		See `SingleAnnotatedCorpus` to combine one annotator and one corpus.
	'''

	def __init__(self, *args):
		super().__init__(*args)

		i = 1
		self.a_c_list = []
		for c,a in zip(self.corpora, self.annotators):
			self.a_c_list.append(SingleAnnotatedCorpus(
					c, a,
					non_annotator=self.annotators[i%len(self.annotators)]
				))
			i += 1
		
		self.inverse_annotators = None

	def get_cachename(self):
		return '-'.join([c.get_cachename() for c in self.corpora]) + '-' + \
			'-'.join([a.get_cachename() for a in self.annotators]) + '_' + \
			str(Random.get_seed()) + "_"

	def is_cacheable(self):
		# annotators are always cacheable (they cache their data, not the sentences annotated)
		return all([c.is_cacheable() for c in self.corpora])

	def get_num_sentences(self):
		return sum([c.get_num_sentences() for c in self.corpora])

	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		if self.inverse_annotators == None:
			self.inverse_annotators = [a.get_inverse_annotator() for a in self.annotators]
		return any([ia.is_similar_annotation(sentence, correct_scd, predicted_scd) for ia in self.inverse_annotators])

	def _yield_generator_list(self, generators):
		local_random = Random.get_generator()

		while(len(generators) > 0):
			if len(generators) > 1:
				local_random.shuffle(generators)

			empty_gen = []
			for generator in generators:
				try:
					yield next(generator)
				except StopIteration:
					empty_gen.append(generator)
			
			for generator in empty_gen:
				generators.remove(generator)

	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_inline_scd_texts(do_scd=do_scd, n_scd=n_scd))

		yield from self._yield_generator_list(generators)

	def iterate_sentence_scds(self, n=1):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_sentence_scds(n=n))

		yield from self._yield_generator_list(generators)

	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_sentence_scds_non_scds(matching_n=matching_n, non_matching_n=non_matching_n))

		yield from self._yield_generator_list(generators)

	def iterate_assign_scds_text(self):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_assign_scds_text())
		generator = self._yield_generator_list(generators)

		while(True):
			# always get two set of answers and put them together
			one_set = False
			try:
				one = next(generator)
				one_set = True
				two = next(generator)
			except StopIteration:
				# stop, if no more or less than two
				if one_set:
					yield one
				break
		
			# len of first, as offset of second
			one_l = len(one[0])

			# yield text, scds, text_scd_map, scd_text_map
			yield one[0] + two[0], \
				one[1] + two[1], \
				one[2] + [i + one_l for i in two[2]], \
				one[3] + [i + one_l for i in two[3]]
#   class SingleAnnotatedCorpus(core.corpus.annotated_corpus.AnnotatedCorpus):

Annotated corpus that combines one annotator with one corpus and allows iterating, e.g., over sentences with their SCDs.

See MultiAnnotatedCorpus to combine multiple annotators and corpora.

#   SingleAnnotatedCorpus(*args, non_annotator=None)
View Source
	def __init__(self, *args, non_annotator = None):
		super().__init__(*args)
		self.non_annotator = non_annotator
		self.inverse_annotator = None
Args

The class core.corpus.annotated_corpora.SingleAnnotatedCorpus needs one corpus and one annotator.

The class core.corpus.annotated_corpora.MultiAnnotatedCorpus needs two or more corpora and a matching number of annotators; it creates a context-sensitive annotated corpus (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...). It thus allows creating context-dependent annotations (a different context and different annotations per corpus).

Also see core.corpus.corpora and core.corpus.annotators.
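
A minimal construction sketch. my_corpus, my_annotator and my_other_annotator are hypothetical placeholders for objects built via core.corpus.corpora and core.corpus.annotators; the optional non_annotator is a second annotator used only as an extra source of non-matching SCDs in iterate_sentence_scds_non_scds().

from core.corpus.annotated_corpora import SingleAnnotatedCorpus

# hypothetical placeholder objects, see core.corpus.corpora / core.corpus.annotators
annotated = SingleAnnotatedCorpus(my_corpus, my_annotator)

# optionally pass a second annotator that contributes additional non-matching SCDs
annotated_with_non = SingleAnnotatedCorpus(
	my_corpus, my_annotator, non_annotator=my_other_annotator
)

print(annotated.get_num_sentences())  # number of sentences in the corpus
print(annotated.get_cachename())      # cache key, includes the random seed
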

#   def get_cachename(self):
View Source
	def get_cachename(self):
		return self.corpus.get_cachename() + "-" + self.annotator.get_cachename() + "_" + str(Random.get_seed()) + "_"
#   def is_cacheable(self):
View Source
	def is_cacheable(self):
		# annotators are always cacheable (they cache their data, not the sentences annotated)
		return self.corpus.is_cacheable()
#   def get_num_sentences(self):
View Source
	def get_num_sentences(self):
		return self.corpus.get_num_sentences()

Returns the number of sentences in this corpus.

#   def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
View Source
	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		if self.inverse_annotator == None:
			self.inverse_annotator = self.annotator.get_inverse_annotator()
		return self.inverse_annotator.is_similar_annotation(sentence, correct_scd, predicted_scd)

Calls the inverse annotator of the used annotator and returns the result of core.corpus.annotator.InverseAnnotator.is_similar_annotation().

#   def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
View Source
	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		for text in self.corpus.iterate_texts():
			sentences = []
			is_scds = []
			for i,sentence in enumerate(text):
				sentences.append(sentence)
				is_scds.append(False)

				if i % do_scd == 0: 
					for scd in self.annotator.get_annotations(sentence, n=n_scd):
						sentences.append(scd)
						is_scds.append(True)

			yield sentences, is_scds

Generates an inline SCD text.

Works as a generator (yield).
Each generated item is a tuple: <array of sentences (each an array of words) forming the "Text+iSCD" sequence>, <array of labels "is SCD?" per sentence>.

Args
  • do_scd (int): add SCDs for every sentence (=1), every second sentence (=2), ...
  • n_scd (int): number of SCDs to add per sentence
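
A short usage sketch. The annotated object is the hypothetical SingleAnnotatedCorpus from the construction sketch above; sentences and SCDs are assumed to be lists of word strings.

# add one SCD after every second sentence of each text
for sentences, is_scds in annotated.iterate_inline_scd_texts(do_scd=2, n_scd=1):
	for sentence, is_scd in zip(sentences, is_scds):
		prefix = "iSCD" if is_scd else "text"
		print(prefix, " ".join(sentence))
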
#   def iterate_sentence_scds(self, n=1):
View Source
	def iterate_sentence_scds(self, n=1):
		for sentence in self.corpus.iterate_sentences():
			yield sentence, self.annotator.get_annotations(sentence, n=n)

Generates a list (length n) of possible SCDs per sentence.

Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>.

Args
  • n (int): number of SCDs per sentence
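
Usage sketch (again using the hypothetical annotated object from above):

for sentence, scds in annotated.iterate_sentence_scds(n=3):
	print("sentence:", " ".join(sentence))
	for scd in scds:  # up to n=3 candidate SCDs
		print("    scd:", " ".join(scd))
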
#   def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
View Source
	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		for sentence in self.corpus.iterate_sentences():
			if self.non_annotator == None:
				nons = self.annotator.get_non_annotations(sentence, n=non_matching_n)
			else:
				nons = self.annotator.get_non_annotations(sentence, n=math.ceil(non_matching_n/2)) + \
					self.non_annotator.get_annotations(sentence, n=math.floor(non_matching_n/2))

				if len(nons) < non_matching_n:
					nons += self.non_annotator.get_non_annotations(sentence, n=non_matching_n-len(nons))

			yield sentence, \
				self.annotator.get_annotations(sentence,n=matching_n), \
				nons

Generates a list of possible and not possible SCDs per sentence.

Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>, <array of *not* possible SCDs (each an array of words)>.

Args
  • matching_n (int): number of possible SCDs
  • non_matching_n (int): number of not possible SCDs
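
Usage sketch, e.g. for building labeled (sentence, SCD, label) training triples; the annotated object is the hypothetical one from above.

triples = []
for sentence, scds, non_scds in annotated.iterate_sentence_scds_non_scds(matching_n=1, non_matching_n=5):
	triples += [(sentence, scd, 1) for scd in scds]              # matching SCDs
	triples += [(sentence, non_scd, 0) for non_scd in non_scds]  # non-matching SCDs
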
#   def iterate_assign_scds_text(self):
View Source
	def iterate_assign_scds_text(self):
		local_random = Random.get_generator()

		for sentences in self.corpus.iterate_texts():
			text = []
			scds = []
			for i,sentence in enumerate(sentences):
				scd = self.annotator.get_annotations(sentence, n=1)
				if len(scd) == 1:
					text.append((i, sentence))
					scds.append((i, scd[0]))

			local_random.shuffle(scds)
			scd_index, _ = zip(*scds)
			scd_index_swapped = { v:k for k,v in enumerate(scd_index) } 

			local_random.shuffle(text)
			text_index, _ = zip(*text)
			text_index_swapped = { v:k for k,v in enumerate(text_index) } 

			text_scd_map = [ scd_index_swapped[i]  for i in text_index ]
			scd_text_map = [ text_index_swapped[i]  for i in scd_index ]
			
			yield list(map(lambda t: t[1], text)), \
				list(map(lambda t: t[1], scds)), \
				text_scd_map, scd_text_map

Generates, per text, a pair of shuffled lists: one containing all sentences of the text and one containing the matching SCDs.
The goal is to select the matching SCD for a sentence from the list of SCDs, or the other way round (select the matching sentence for a single SCD).

Works as a generator (yield).
Each item is a tuple: <array of sentences (each an array of words) from the text>, <array of SCDs (each an array of words)>, <array mapping each index in the text array to the matching SCD in the SCD array>, <array mapping each index in the SCD array to the matching sentence in the text array>.
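
Usage sketch showing how the two index maps relate the shuffled lists (hypothetical annotated object as above):

for text, scds, text_scd_map, scd_text_map in annotated.iterate_assign_scds_text():
	for i, sentence in enumerate(text):
		matching_scd = scds[text_scd_map[i]]  # the SCD belonging to text[i]
		print(" ".join(sentence), "->", " ".join(matching_scd))
	# the two maps are inverse permutations of each other
	assert all(scd_text_map[text_scd_map[i]] == i for i in range(len(text)))
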

#   class MultiAnnotatedCorpus(core.corpus.annotated_corpus.AnnotatedCorpus):

Annotated corpus that combines multiple annotators with multiple corpora and allows iterating, e.g., over sentences with their SCDs.

See SingleAnnotatedCorpus to combine one annotator and one corpus.

#   MultiAnnotatedCorpus(*args)
View Source
	def __init__(self, *args):
		super().__init__(*args)

		i = 1
		self.a_c_list = []
		for c,a in zip(self.corpora, self.annotators):
			self.a_c_list.append(SingleAnnotatedCorpus(
					c, a,
					non_annotator=self.annotators[i%len(self.annotators)]
				))
			i += 1
		
		self.inverse_annotators = None
Args

The class core.corpus.annotated_corpora.SingleAnnotatedCorpus needs one corpus and one annotator.

The class core.corpus.annotated_corpora.MultiAnnotatedCorpus needs two or more corpora and a matching number of annotators; it creates a context-sensitive annotated corpus (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...). It thus allows creating context-dependent annotations (a different context and different annotations per corpus).

Also see core.corpus.corpora and core.corpus.annotators.
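
A hedged construction sketch: corpus_a, corpus_b, annotator_a and annotator_b are hypothetical placeholders, and the exact argument layout is defined by the AnnotatedCorpus base class (not shown on this page). The sketch assumes the corpora and annotators are passed as two sequences so that self.corpora and self.annotators end up paired as described above.

from core.corpus.annotated_corpora import MultiAnnotatedCorpus

# hypothetical placeholders, see core.corpus.corpora / core.corpus.annotators
# assumption: the base class stores self.corpora == [corpus_a, corpus_b]
# and self.annotators == [annotator_a, annotator_b]
multi = MultiAnnotatedCorpus([corpus_a, corpus_b], [annotator_a, annotator_b])

# pairing from __init__: corpus_a is annotated by annotator_a, corpus_b by
# annotator_b; each pair's non_annotator is the next annotator (cyclically),
# so non-matching SCDs for corpus_a come from annotator_b and vice versa
for sentence, scds, non_scds in multi.iterate_sentence_scds_non_scds():
	print(len(scds), "matching and", len(non_scds), "non-matching SCDs")
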

#   def get_cachename(self):
View Source
	def get_cachename(self):
		return '-'.join([c.get_cachename() for c in self.corpora]) + '-' + \
			'-'.join([a.get_cachename() for a in self.annotators]) + '_' + \
			str(Random.get_seed()) + "_"
#   def is_cacheable(self):
View Source
	def is_cacheable(self):
		# annotators are always cacheable (they cache their data, not the sentences annotated)
		return all([c.is_cacheable() for c in self.corpora])
#   def get_num_sentences(self):
View Source
	def get_num_sentences(self):
		return sum([c.get_num_sentences() for c in self.corpora])

Returns the number of sentences in this corpus.

#   def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
View Source
	def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
		if self.inverse_annotators == None:
			self.inverse_annotators = [a.get_inverse_annotator() for a in self.annotators]
		return any([ia.is_similar_annotation(sentence, correct_scd, predicted_scd) for ia in self.inverse_annotators])

Calls the inverse annotators of all used annotators and returns True if core.corpus.annotator.InverseAnnotator.is_similar_annotation() holds for any of them.

#   def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
View Source
	def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_inline_scd_texts(do_scd=do_scd, n_scd=n_scd))

		yield from self._yield_generator_list(generators)

Generates an inline SCD text.

Works as a generator (yield).
Each generated item is a tuple: <array of sentences (each an array of words) forming the "Text+iSCD" sequence>, <array of labels "is SCD?" per sentence>.

Args
  • do_scd (int): add SCDs for every sentence (=1), every second sentence (=2), ...
  • n_scd (int): number of SCDs to add per sentence
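
In the multi-corpus variant the texts of all corpus/annotator pairs are merged by _yield_generator_list: each round draws one text from every remaining pair, in shuffled order, so consecutive items may come from different corpora. A small sketch, reusing the hypothetical multi object from the construction sketch above:

for sentences, is_scds in multi.iterate_inline_scd_texts(do_scd=1, n_scd=1):
	# each yielded text belongs to exactly one of the underlying corpora
	n_inline_scds = sum(is_scds)
	print(len(sentences) - n_inline_scds, "text sentences,", n_inline_scds, "inline SCDs")
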
#   def iterate_sentence_scds(self, n=1):
View Source
	def iterate_sentence_scds(self, n=1):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_sentence_scds(n=n))

		yield from self._yield_generator_list(generators)

Generates a list (length n) of possible SCDs per sentence.

Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>.

Args
  • n (int): number of SCDs per sentence
#   def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
View Source
	def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_sentence_scds_non_scds(matching_n=matching_n, non_matching_n=non_matching_n))

		yield from self._yield_generator_list(generators)

Generates a list of possible and not possible SCDs per sentence.

Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>, <array of *not* possible SCDs (each an array of words)>.

Args
  • matching_n (int): number of possible SCDs
  • non_matching_n (int): number of not possible SCDs
#   def iterate_assign_scds_text(self):
View Source
	def iterate_assign_scds_text(self):
		generators = []
		for a_c in self.a_c_list:
			generators.append(a_c.iterate_assign_scds_text())
		generator = self._yield_generator_list(generators)

		while(True):
			# always get two set of answers and put them together
			one_set = False
			try:
				one = next(generator)
				one_set = True
				two = next(generator)
			except StopIteration:
				# stop, if no more or less than two
				if one_set:
					yield one
				break
		
			# len of first, as offset of second
			one_l = len(one[0])

			# yield text, scds, text_scd_map, scd_text_map
			yield one[0] + two[0], \
				one[1] + two[1], \
				one[2] + [i + one_l for i in two[2]], \
				one[3] + [i + one_l for i in two[3]]

Generates, per text, a pair of shuffled lists: one containing all sentences of the text and one containing the matching SCDs.
The goal is to select the matching SCD for a sentence from the list of SCDs, or the other way round (select the matching sentence for a single SCD).
In the multi-corpus variant, two consecutively generated texts (possibly from different corpora) are concatenated into a single item, with the index maps of the second text offset accordingly.

Works as a generator (yield).
Each item is a tuple: <array of sentences (each an array of words) from the text>, <array of SCDs (each an array of words)>, <array mapping each index in the text array to the matching SCD in the SCD array>, <array mapping each index in the SCD array to the matching sentence in the text array>.
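
The multi-corpus variant concatenates two consecutively generated texts into one item and offsets the index maps of the second text by the length of the first, so the maps remain consistent for the merged lists. A small sketch, reusing the hypothetical multi object from above:

for text, scds, text_scd_map, scd_text_map in multi.iterate_assign_scds_text():
	# even for indices belonging to the second merged text, text[i] is still
	# matched by scds[text_scd_map[i]] (and scds[j] by text[scd_text_map[j]])
	assert all(scd_text_map[text_scd_map[i]] == i for i in range(len(text)))
	assert all(text_scd_map[scd_text_map[j]] == j for j in range(len(scds)))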