core.corpus.annotated_corpora
View Source
import math

from core.corpus.annotated_corpus import AnnotatedCorpus
from core.utils import Random


class SingleAnnotatedCorpus(AnnotatedCorpus):
    '''
        Annotated corpus, combines one annotator with one corpus and allows
        iterating e.g. over sentences with their SCDs.

        See `MultiAnnotatedCorpus` to combine multiple annotators and corpora.
    '''

    def __init__(self, *args, non_annotator=None):
        super().__init__(*args)
        self.non_annotator = non_annotator
        self.inverse_annotator = None

    def get_cachename(self):
        return self.corpus.get_cachename() + "-" + self.annotator.get_cachename() + "_" + str(Random.get_seed()) + "_"

    def is_cacheable(self):
        # annotators are always cacheable (they cache their data, not the sentences annotated)
        return self.corpus.is_cacheable()

    def get_num_sentences(self):
        return self.corpus.get_num_sentences()

    def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
        if self.inverse_annotator == None:
            self.inverse_annotator = self.annotator.get_inverse_annotator()
        return self.inverse_annotator.is_similar_annotation(sentence, correct_scd, predicted_scd)

    def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
        for text in self.corpus.iterate_texts():
            sentences = []
            is_scds = []
            for i, sentence in enumerate(text):
                sentences.append(sentence)
                is_scds.append(False)
                if i % do_scd == 0:
                    for scd in self.annotator.get_annotations(sentence, n=n_scd):
                        sentences.append(scd)
                        is_scds.append(True)
            yield sentences, is_scds

    def iterate_sentence_scds(self, n=1):
        for sentence in self.corpus.iterate_sentences():
            yield sentence, self.annotator.get_annotations(sentence, n=n)

    def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
        for sentence in self.corpus.iterate_sentences():
            if self.non_annotator == None:
                nons = self.annotator.get_non_annotations(sentence, n=non_matching_n)
            else:
                nons = self.annotator.get_non_annotations(sentence, n=math.ceil(non_matching_n/2)) + \
                    self.non_annotator.get_annotations(sentence, n=math.floor(non_matching_n/2))
                if len(nons) < non_matching_n:
                    nons += self.non_annotator.get_non_annotations(sentence, n=non_matching_n-len(nons))
            yield sentence, \
                self.annotator.get_annotations(sentence, n=matching_n), \
                nons

    def iterate_assign_scds_text(self):
        local_random = Random.get_generator()
        for sentences in self.corpus.iterate_texts():
            text = []
            scds = []
            for i, sentence in enumerate(sentences):
                scd = self.annotator.get_annotations(sentence, n=1)
                if len(scd) == 1:
                    text.append((i, sentence))
                    scds.append((i, scd[0]))

            local_random.shuffle(scds)
            scd_index, _ = zip(*scds)
            scd_index_swapped = { v: k for k, v in enumerate(scd_index) }

            local_random.shuffle(text)
            text_index, _ = zip(*text)
            text_index_swapped = { v: k for k, v in enumerate(text_index) }

            text_scd_map = [ scd_index_swapped[i] for i in text_index ]
            scd_text_map = [ text_index_swapped[i] for i in scd_index ]

            yield list(map(lambda t: t[1], text)), \
                list(map(lambda t: t[1], scds)), \
                text_scd_map, scd_text_map


class MultiAnnotatedCorpus(AnnotatedCorpus):
    '''
        Annotated corpus, combines multiple annotators with multiple corpora and
        allows iterating e.g. over sentences with their SCDs.

        See `SingleAnnotatedCorpus` to combine one annotator and one corpus.
    '''

    def __init__(self, *args):
        super().__init__(*args)
        i = 1
        self.a_c_list = []
        for c, a in zip(self.corpora, self.annotators):
            self.a_c_list.append(SingleAnnotatedCorpus(
                c, a,
                non_annotator=self.annotators[i % len(self.annotators)]
            ))
            i += 1
        self.inverse_annotators = None

    def get_cachename(self):
        return '-'.join([c.get_cachename() for c in self.corpora]) + '-' + \
            '-'.join([a.get_cachename() for a in self.annotators]) + '_' + \
            str(Random.get_seed()) + "_"

    def is_cacheable(self):
        # annotators are always cacheable (they cache their data, not the sentences annotated)
        return all([c.is_cacheable() for c in self.corpora])

    def get_num_sentences(self):
        return sum([c.get_num_sentences() for c in self.corpora])

    def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
        if self.inverse_annotators == None:
            self.inverse_annotators = [a.get_inverse_annotator() for a in self.annotators]
        return any([ia.is_similar_annotation(sentence, correct_scd, predicted_scd) for ia in self.inverse_annotators])

    def _yield_generator_list(self, generators):
        local_random = Random.get_generator()
        while len(generators) > 0:
            if len(generators) > 1:
                local_random.shuffle(generators)
            empty_gen = []
            for generator in generators:
                try:
                    yield next(generator)
                except StopIteration:
                    empty_gen.append(generator)
            for generator in empty_gen:
                generators.remove(generator)

    def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
        generators = []
        for a_c in self.a_c_list:
            generators.append(a_c.iterate_inline_scd_texts(do_scd=do_scd, n_scd=n_scd))
        yield from self._yield_generator_list(generators)

    def iterate_sentence_scds(self, n=1):
        generators = []
        for a_c in self.a_c_list:
            generators.append(a_c.iterate_sentence_scds(n=n))
        yield from self._yield_generator_list(generators)

    def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
        generators = []
        for a_c in self.a_c_list:
            generators.append(a_c.iterate_sentence_scds_non_scds(matching_n=matching_n, non_matching_n=non_matching_n))
        yield from self._yield_generator_list(generators)

    def iterate_assign_scds_text(self):
        generators = []
        for a_c in self.a_c_list:
            generators.append(a_c.iterate_assign_scds_text())
        generator = self._yield_generator_list(generators)

        while True:
            # always get two sets of answers and put them together
            one_set = False
            try:
                one = next(generator)
                one_set = True
                two = next(generator)
            except StopIteration:
                # stop, if no more or less than two
                if one_set:
                    yield one
                break

            # len of first, as offset of second
            one_l = len(one[0])

            # yield text, scds, text_scd_map, scd_text_map
            yield one[0] + two[0], \
                one[1] + two[1], \
                one[2] + [i + one_l for i in two[2]], \
                one[3] + [i + one_l for i in two[3]]
class SingleAnnotatedCorpus(AnnotatedCorpus):
Annotated corpus that combines one annotator with one corpus and allows iterating, e.g., over sentences with their SCDs.
See MultiAnnotatedCorpus to combine multiple annotators and corpora.
View Source
def __init__(self, *args, non_annotator=None):
    super().__init__(*args)
    self.non_annotator = non_annotator
    self.inverse_annotator = None
Args
- corpus (core.corpus.corpus.Corpus or list of core.corpus.corpus.Corpus): to iterate over and get the texts (sentences)
- annotator (core.corpus.annotator.Annotator or list of core.corpus.annotator.Annotator): to create annotations for each of the sentences of the corpus
The class core.corpus.annotated_corpora.SingleAnnotatedCorpus needs one corpus and one annotator.
The class core.corpus.annotated_corpora.MultiAnnotatedCorpus needs two or more corpora and annotators (one annotator per corpus); it creates a context-sensitive annotated corpus (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...). This allows context-dependent annotations (a different context and different annotations per corpus).
Also see core.corpus.corpora and core.corpus.annotators.
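A minimal construction sketch, assuming the base AnnotatedCorpus takes the corpus and the annotator as positional arguments (as the Args above suggest); my_corpus, my_annotator, and my_other_annotator are hypothetical instances built as described in core.corpus.corpora and core.corpus.annotators:

from core.corpus.annotated_corpora import SingleAnnotatedCorpus

# my_corpus / my_annotator / my_other_annotator are hypothetical, pre-built instances.
annotated = SingleAnnotatedCorpus(my_corpus, my_annotator)

# Optionally, a second annotator can be passed; its annotations are mixed into the
# non-matching SCDs produced by iterate_sentence_scds_non_scds().
annotated_with_non = SingleAnnotatedCorpus(my_corpus, my_annotator,
                                           non_annotator=my_other_annotator)

print(annotated.get_num_sentences())  # number of sentences of the wrapped corpus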
View Source
def get_cachename(self):
    return self.corpus.get_cachename() + "-" + self.annotator.get_cachename() + "_" + str(Random.get_seed()) + "_"
View Source
def is_cacheable(self):
    # annotators are always cacheable (they cache their data, not the sentences annotated)
    return self.corpus.is_cacheable()
View Source
def get_num_sentences(self):
    return self.corpus.get_num_sentences()
Returns the number of sentences in this corpus.
View Source
def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
    if self.inverse_annotator == None:
        self.inverse_annotator = self.annotator.get_inverse_annotator()
    return self.inverse_annotator.is_similar_annotation(sentence, correct_scd, predicted_scd)
Calls the inverse annotator of the used annotator(s) and returns the result of core.corpus.annotator.InverseAnnotator.is_similar_annotation().
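A usage sketch for this check, e.g. when evaluating a model's predicted SCD against the annotated one; annotated is the instance from the construction sketch above, and sentence, correct_scd, and predicted_scd are hypothetical arrays of words:

# True if the predicted SCD is considered similar to the correct SCD for this sentence.
if annotated.inverse_annotator_is_similar_annotation(sentence, correct_scd, predicted_scd):
    print("prediction accepted as similar")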
View Source
def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
    for text in self.corpus.iterate_texts():
        sentences = []
        is_scds = []
        for i, sentence in enumerate(text):
            sentences.append(sentence)
            is_scds.append(False)
            if i % do_scd == 0:
                for scd in self.annotator.get_annotations(sentence, n=n_scd):
                    sentences.append(scd)
                    is_scds.append(True)
        yield sentences, is_scds
Generates an inline SCD text.
Works as a generator (yield).
Each generated item is a tuple: <array of sentences (each an array of words) containing the text plus the inline SCDs>, <array of labels "is SCD?" per sentence>.
Args
- do_scd (int): add SCDs for every sentence (=1), every second sentence (=2), ...
- n_scd (int): number of SCDs to add per sentence
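A usage sketch, assuming annotated from the construction sketch above and sentences given as arrays of word strings:

# Add one SCD after every second sentence of each text.
for sentences, is_scds in annotated.iterate_inline_scd_texts(do_scd=2, n_scd=1):
    for words, is_scd in zip(sentences, is_scds):
        tag = "SCD " if is_scd else "TEXT"
        print(tag, " ".join(words))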
View Source
def iterate_sentence_scds(self, n=1):
    for sentence in self.corpus.iterate_sentences():
        yield sentence, self.annotator.get_annotations(sentence, n=n)
Generates a list (length n) of possible SCDs per sentence.
Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>.
Args
- n (int): number of SCDs per sentence
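A usage sketch, again assuming annotated from above:

# Up to 3 candidate SCDs per sentence.
for sentence, scds in annotated.iterate_sentence_scds(n=3):
    print("sentence:", " ".join(sentence))
    for scd in scds:
        print("    scd:", " ".join(scd))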
View Source
def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
    for sentence in self.corpus.iterate_sentences():
        if self.non_annotator == None:
            nons = self.annotator.get_non_annotations(sentence, n=non_matching_n)
        else:
            nons = self.annotator.get_non_annotations(sentence, n=math.ceil(non_matching_n/2)) + \
                self.non_annotator.get_annotations(sentence, n=math.floor(non_matching_n/2))
            if len(nons) < non_matching_n:
                nons += self.non_annotator.get_non_annotations(sentence, n=non_matching_n-len(nons))
        yield sentence, \
            self.annotator.get_annotations(sentence, n=matching_n), \
            nons
Generates, per sentence, a list of possible (matching) SCDs and a list of not possible (non-matching) SCDs.
Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>, <array of *not* possible SCDs (each an array of words)>.
Args
- matching_n (int): number of possible SCDs
- non_matching_n (int): number of not possible SCDs
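This iterator is convenient for building labeled pairs, e.g. for a matching/non-matching classifier; a sketch assuming annotated from above:

# One matching and five non-matching SCDs per sentence, turned into (sentence, scd, label) samples.
samples = []
for sentence, scds, non_scds in annotated.iterate_sentence_scds_non_scds(matching_n=1, non_matching_n=5):
    samples += [(sentence, scd, 1) for scd in scds]
    samples += [(sentence, non_scd, 0) for non_scd in non_scds]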
View Source
def iterate_assign_scds_text(self):
    local_random = Random.get_generator()
    for sentences in self.corpus.iterate_texts():
        text = []
        scds = []
        for i, sentence in enumerate(sentences):
            scd = self.annotator.get_annotations(sentence, n=1)
            if len(scd) == 1:
                text.append((i, sentence))
                scds.append((i, scd[0]))

        local_random.shuffle(scds)
        scd_index, _ = zip(*scds)
        scd_index_swapped = { v: k for k, v in enumerate(scd_index) }

        local_random.shuffle(text)
        text_index, _ = zip(*text)
        text_index_swapped = { v: k for k, v in enumerate(text_index) }

        text_scd_map = [ scd_index_swapped[i] for i in text_index ]
        scd_text_map = [ text_index_swapped[i] for i in scd_index ]

        yield list(map(lambda t: t[1], text)), \
            list(map(lambda t: t[1], scds)), \
            text_scd_map, scd_text_map
Generates, per iSCD text, a pair of lists: one containing all sentences of the text and one containing all SCDs, each shuffled independently.
The goal is to select the matching SCD for a sentence from the list of SCDs, or the other way round (select the matching sentence for a single SCD).
Works as a generator (yield).
Each item is a tuple: <array of sentences (arrays of words) from the text>, <array of SCDs (arrays of words)>, <array mapping indices from the text array to the matching SCD in the SCD array>, <array mapping indices from the SCD array to the matching sentence in the text array>.
class MultiAnnotatedCorpus(AnnotatedCorpus):
Annotated corpus that combines multiple annotators with multiple corpora and allows iterating, e.g., over sentences with their SCDs.
See SingleAnnotatedCorpus to combine one annotator and one corpus.
View Source
def __init__(self, *args):
    super().__init__(*args)
    i = 1
    self.a_c_list = []
    for c, a in zip(self.corpora, self.annotators):
        self.a_c_list.append(SingleAnnotatedCorpus(
            c, a,
            non_annotator=self.annotators[i % len(self.annotators)]
        ))
        i += 1
    self.inverse_annotators = None
Args
- corpus (core.corpus.corpus.Corpus or list of core.corpus.corpus.Corpus): to iterate over and get the texts (sentences)
- annotator (core.corpus.annotator.Annotator or list of core.corpus.annotator.Annotator): to create annotations for each of the sentences of the corpus
The class core.corpus.annotated_corpora.SingleAnnotatedCorpus needs one corpus and one annotator.
The class core.corpus.annotated_corpora.MultiAnnotatedCorpus needs two or more corpora and annotators (one annotator per corpus); it creates a context-sensitive annotated corpus (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...). This allows context-dependent annotations (a different context and different annotations per corpus).
Also see core.corpus.corpora and core.corpus.annotators.
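A construction sketch, assuming the base AnnotatedCorpus accepts a list of corpora and a list of annotators (as the Args above suggest); corpus_a, corpus_b, annotator_a, and annotator_b are hypothetical instances:

from core.corpus.annotated_corpora import MultiAnnotatedCorpus

# corpus_a is annotated by annotator_a and corpus_b by annotator_b; internally, the
# annotator of the next corpus also serves as non_annotator of the current one.
multi = MultiAnnotatedCorpus([corpus_a, corpus_b], [annotator_a, annotator_b])

print(multi.get_num_sentences())  # sum over all wrapped corpora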
View Source
def get_cachename(self):
    return '-'.join([c.get_cachename() for c in self.corpora]) + '-' + \
        '-'.join([a.get_cachename() for a in self.annotators]) + '_' + \
        str(Random.get_seed()) + "_"
View Source
def is_cacheable(self):
    # annotators are always cacheable (they cache their data, not the sentences annotated)
    return all([c.is_cacheable() for c in self.corpora])
View Source
def get_num_sentences(self):
    return sum([c.get_num_sentences() for c in self.corpora])
Returns the number of sentences in this corpus.
View Source
def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
    if self.inverse_annotators == None:
        self.inverse_annotators = [a.get_inverse_annotator() for a in self.annotators]
    return any([ia.is_similar_annotation(sentence, correct_scd, predicted_scd) for ia in self.inverse_annotators])
Calls the inverse annotators of the used annotators and returns the result of core.corpus.annotator.InverseAnnotator.is_similar_annotation() (True if any of them considers the annotation similar).
View Source
def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
    generators = []
    for a_c in self.a_c_list:
        generators.append(a_c.iterate_inline_scd_texts(do_scd=do_scd, n_scd=n_scd))
    yield from self._yield_generator_list(generators)
Generates an inline SCD text.
Works as a generator (yield).
Each generated item is a tuple: <array of sentences (each an array of words) containing the text plus the inline SCDs>, <array of labels "is SCD?" per sentence>.
Args
- do_scd (int): add SCDs for every sentence (=1), every second sentence (=2), ...
- n_scd (int): number of SCDs to add per sentence
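Note that the Multi variant draws from the per-corpus generators in randomly shuffled rounds (see _yield_generator_list), so items from different corpora arrive interleaved. A sketch, assuming multi from the construction sketch above:

# Texts from all corpora, interleaved in (seeded) random order.
for sentences, is_scds in multi.iterate_inline_scd_texts(do_scd=1, n_scd=1):
    print(len(sentences), sum(is_scds))  # total sentences incl. inline SCDs, and number of SCDs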
View Source
def iterate_sentence_scds(self, n=1):
    generators = []
    for a_c in self.a_c_list:
        generators.append(a_c.iterate_sentence_scds(n=n))
    yield from self._yield_generator_list(generators)
Generates a list (length n) of possible SCDs per sentence.
Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>.
Args
- n (int): number of SCDs per sentence
View Source
def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
    generators = []
    for a_c in self.a_c_list:
        generators.append(a_c.iterate_sentence_scds_non_scds(matching_n=matching_n, non_matching_n=non_matching_n))
    yield from self._yield_generator_list(generators)
Generates, per sentence, a list of possible (matching) SCDs and a list of not possible (non-matching) SCDs.
Works as a generator (yield).
Each item is a tuple: <sentence (array of words)>, <array of possible SCDs (each an array of words)>, <array of *not* possible SCDs (each an array of words)>.
Args
- matching_n (int): number of possible SCDs
- non_matching_n (int): number of not possible SCDs
View Source
def iterate_assign_scds_text(self):
    generators = []
    for a_c in self.a_c_list:
        generators.append(a_c.iterate_assign_scds_text())
    generator = self._yield_generator_list(generators)

    while True:
        # always get two sets of answers and put them together
        one_set = False
        try:
            one = next(generator)
            one_set = True
            two = next(generator)
        except StopIteration:
            # stop, if no more or less than two
            if one_set:
                yield one
            break

        # len of first, as offset of second
        one_l = len(one[0])

        # yield text, scds, text_scd_map, scd_text_map
        yield one[0] + two[0], \
            one[1] + two[1], \
            one[2] + [i + one_l for i in two[2]], \
            one[3] + [i + one_l for i in two[3]]
Generates, per iSCD text, a pair of lists: one containing all sentences of the text and one containing all SCDs, each shuffled independently.
The goal is to select the matching SCD for a sentence from the list of SCDs, or the other way round (select the matching sentence for a single SCD).
Works as a generator (yield).
Each item is a tuple: <array of sentences (arrays of words) from the text>, <array of SCDs (arrays of words)>, <array mapping indices from the text array to the matching SCD in the SCD array>, <array mapping indices from the SCD array to the matching sentence in the text array>.