core.corpus.annotated_corpus
View Source
from abc import ABC, abstractmethod

from core.corpus.corpus import Corpus
from core.corpus.annotator import Annotator


class AnnotatedCorpus(ABC):
    """
    Annotated corpus: combines an annotator and a corpus and allows iterating,
    e.g., over sentences with their SCDs.

    This class is abstract and has two subclasses,
    `core.corpus.annotated_corpora.SingleAnnotatedCorpus` and
    `core.corpus.annotated_corpora.MultiAnnotatedCorpus`.
    """

    def __init__(self, corpus, annotator):
        """
        Validate the corpus/annotator pairing and store it on the instance.

        Args:
            corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): to iterate over and get the texts (sentences)
            annotator (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): to create annotations for each of the sentences of the corpus

        The class `core.corpus.annotated_corpora.SingleAnnotatedCorpus` needs one corpus and one annotator;
        they are stored as ``self.corpus`` and ``self.annotator``.

        The class `core.corpus.annotated_corpora.MultiAnnotatedCorpus` needs two or more corpora and the
        same number of annotators (lists or tuples); they are stored as ``self.corpora`` and ``self.annotators``.
        It will create a *context-sensitive annotated corpus* (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...).
        It thus allows creating context-dependent annotations (different context and annotations per corpus).

        Also see `core.corpus.corpora` and `core.corpus.annotators`.

        Raises:
            AttributeError: if the arguments do not fit the concrete subclass
                (wrong types, lists of different length, or fewer than two
                entries for a MultiAnnotatedCorpus).
        """
        # The concrete kind is recognised by class name. Looking the name up
        # along the whole MRO (not only on the most-derived class) lets further
        # subclasses of Single-/MultiAnnotatedCorpus be constructed as well.
        # NOTE(review): the name-based check presumably avoids importing the
        # subclasses from `annotated_corpora` here - confirm.
        kind_names = {klass.__name__ for klass in type(self).__mro__}
        got_sequences = isinstance(corpus, (list, tuple)) and isinstance(annotator, (list, tuple))

        if got_sequences and 'MultiAnnotatedCorpus' in kind_names:
            if not (len(corpus) == len(annotator) > 1):
                raise AttributeError("MultiAnnotatedCorpus needs same length lists of corpora and annotators! For only one corpus and one annotator use SingleAnnotatedCorpus.")
            if not (all(isinstance(c, Corpus) for c in corpus) and all(isinstance(a, Annotator) for a in annotator)):
                raise AttributeError("MultiAnnotatedCorpus needs lists of Corpus and Annotator objects!")
            self.corpora = corpus
            self.annotators = annotator
        elif isinstance(corpus, Corpus) and isinstance(annotator, Annotator) and 'SingleAnnotatedCorpus' in kind_names:
            self.annotator = annotator
            self.corpus = corpus
        else:
            raise AttributeError("SingleAnnotatedCorpus needs one corpus and one annotator, MultiAnnotatedCorpus lists of both!")

    @abstractmethod
    def get_cachename(self):
        """
        Abstract; must be implemented by subclasses.

        NOTE(review): presumably returns the name under which this annotated
        corpus is cached - confirm against the subclasses.
        """

    @abstractmethod
    def is_cacheable(self):
        """
        Abstract; must be implemented by subclasses.

        NOTE(review): presumably tells whether this annotated corpus may be
        cached - confirm against the subclasses.
        """

    @abstractmethod
    def get_num_sentences(self):
        """
        Returns the number of sentences in this corpus.
        """

    @abstractmethod
    def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
        """
        Calls the inverse annotator of the used annotator(s) and returns the same as
        `core.corpus.annotator.InverseAnnotator.is_similar_annotation()`.
        """

    @abstractmethod
    def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
        """
        Generate an inline SCD text.

        Works as generator (yield).

        Each generated item is a tuple
        ``<array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>``

        Args:
            do_scd (int): add scds for all sentences (=1), every second (=2), ...
            n_scd (int): number of scds to add per sentence
        """

    @abstractmethod
    def iterate_sentence_scds(self, n=1):
        """
        Generate a list (length n) of possible scds per sentence.

        Works as generator (yield).

        Each item is a tuple
        ``<sentence(array of words)>,<array of possible scds(array of words)>``

        Args:
            n (int): Number of scd per sentence
        """

    @abstractmethod
    def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
        """
        Generate lists of possible (length matching_n) and not possible
        (length non_matching_n) scds per sentence.

        Works as generator (yield).

        Each item is a tuple
        ``<sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>``

        Args:
            matching_n (int): Number of possible scds
            non_matching_n (int): Number of not possible scds
        """

    @abstractmethod
    def iterate_assign_scds_text(self):
        """
        Generate pairs of lists from iSCD texts, but yielding two lists, one containing all sentences and one of all scds.
        => Goal is to select SCD for a sentence from text of SCDs or other way round (select sentence from text for single SCD)

        Works as generator (yield).

        Each item is a tuple
        ``<array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
        <array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>``
        """
View Source
class AnnotatedCorpus(ABC):
    """
    Annotated corpus: combines an annotator and a corpus and allows iterating,
    e.g., over sentences with their SCDs.

    This class is abstract and has two subclasses,
    `core.corpus.annotated_corpora.SingleAnnotatedCorpus` and
    `core.corpus.annotated_corpora.MultiAnnotatedCorpus`.
    """

    def __init__(self, corpus, annotator):
        """
        Validate the corpus/annotator pairing and store it on the instance.

        Args:
            corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): to iterate over and get the texts (sentences)
            annotator (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): to create annotations for each of the sentences of the corpus

        The class `core.corpus.annotated_corpora.SingleAnnotatedCorpus` needs one corpus and one annotator;
        they are stored as ``self.corpus`` and ``self.annotator``.

        The class `core.corpus.annotated_corpora.MultiAnnotatedCorpus` needs two or more corpora and the
        same number of annotators (lists or tuples); they are stored as ``self.corpora`` and ``self.annotators``.
        It will create a *context-sensitive annotated corpus* (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...).
        It thus allows creating context-dependent annotations (different context and annotations per corpus).

        Also see `core.corpus.corpora` and `core.corpus.annotators`.

        Raises:
            AttributeError: if the arguments do not fit the concrete subclass
                (wrong types, lists of different length, or fewer than two
                entries for a MultiAnnotatedCorpus).
        """
        # The concrete kind is recognised by class name. Looking the name up
        # along the whole MRO (not only on the most-derived class) lets further
        # subclasses of Single-/MultiAnnotatedCorpus be constructed as well.
        # NOTE(review): the name-based check presumably avoids importing the
        # subclasses from `annotated_corpora` here - confirm.
        kind_names = {klass.__name__ for klass in type(self).__mro__}
        got_sequences = isinstance(corpus, (list, tuple)) and isinstance(annotator, (list, tuple))

        if got_sequences and 'MultiAnnotatedCorpus' in kind_names:
            if not (len(corpus) == len(annotator) > 1):
                raise AttributeError("MultiAnnotatedCorpus needs same length lists of corpora and annotators! For only one corpus and one annotator use SingleAnnotatedCorpus.")
            if not (all(isinstance(c, Corpus) for c in corpus) and all(isinstance(a, Annotator) for a in annotator)):
                raise AttributeError("MultiAnnotatedCorpus needs lists of Corpus and Annotator objects!")
            self.corpora = corpus
            self.annotators = annotator
        elif isinstance(corpus, Corpus) and isinstance(annotator, Annotator) and 'SingleAnnotatedCorpus' in kind_names:
            self.annotator = annotator
            self.corpus = corpus
        else:
            raise AttributeError("SingleAnnotatedCorpus needs one corpus and one annotator, MultiAnnotatedCorpus lists of both!")

    @abstractmethod
    def get_cachename(self):
        """
        Abstract; must be implemented by subclasses.

        NOTE(review): presumably returns the name under which this annotated
        corpus is cached - confirm against the subclasses.
        """

    @abstractmethod
    def is_cacheable(self):
        """
        Abstract; must be implemented by subclasses.

        NOTE(review): presumably tells whether this annotated corpus may be
        cached - confirm against the subclasses.
        """

    @abstractmethod
    def get_num_sentences(self):
        """
        Returns the number of sentences in this corpus.
        """

    @abstractmethod
    def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
        """
        Calls the inverse annotator of the used annotator(s) and returns the same as
        `core.corpus.annotator.InverseAnnotator.is_similar_annotation()`.
        """

    @abstractmethod
    def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
        """
        Generate an inline SCD text.

        Works as generator (yield).

        Each generated item is a tuple
        ``<array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>``

        Args:
            do_scd (int): add scds for all sentences (=1), every second (=2), ...
            n_scd (int): number of scds to add per sentence
        """

    @abstractmethod
    def iterate_sentence_scds(self, n=1):
        """
        Generate a list (length n) of possible scds per sentence.

        Works as generator (yield).

        Each item is a tuple
        ``<sentence(array of words)>,<array of possible scds(array of words)>``

        Args:
            n (int): Number of scd per sentence
        """

    @abstractmethod
    def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
        """
        Generate lists of possible (length matching_n) and not possible
        (length non_matching_n) scds per sentence.

        Works as generator (yield).

        Each item is a tuple
        ``<sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>``

        Args:
            matching_n (int): Number of possible scds
            non_matching_n (int): Number of not possible scds
        """

    @abstractmethod
    def iterate_assign_scds_text(self):
        """
        Generate pairs of lists from iSCD texts, but yielding two lists, one containing all sentences and one of all scds.
        => Goal is to select SCD for a sentence from text of SCDs or other way round (select sentence from text for single SCD)

        Works as generator (yield).

        Each item is a tuple
        ``<array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
        <array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>``
        """
Annotated corpus: combines an annotator and a corpus and allows iterating, e.g., over sentences with their SCDs.
This class is abstract and has two subclasses: core.corpus.annotated_corpora.SingleAnnotatedCorpus and core.corpus.annotated_corpora.MultiAnnotatedCorpus.
View Source
def __init__(self, corpus, annotator):
    """
    Validate the corpus/annotator pairing and store it on the instance.

    Args:
        corpus (`core.corpus.corpus.Corpus` or list of `core.corpus.corpus.Corpus`): to iterate over and get the texts (sentences)
        annotator (`core.corpus.annotator.Annotator` or list of `core.corpus.annotator.Annotator`): to create annotations for each of the sentences of the corpus

    The class `core.corpus.annotated_corpora.SingleAnnotatedCorpus` needs one corpus and one annotator;
    they are stored as ``self.corpus`` and ``self.annotator``.

    The class `core.corpus.annotated_corpora.MultiAnnotatedCorpus` needs two or more corpora and the
    same number of annotators (lists or tuples); they are stored as ``self.corpora`` and ``self.annotators``.
    It will create a *context-sensitive annotated corpus* (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...).
    It thus allows creating context-dependent annotations (different context and annotations per corpus).

    Also see `core.corpus.corpora` and `core.corpus.annotators`.

    Raises:
        AttributeError: if the arguments do not fit the concrete subclass
            (wrong types, lists of different length, or fewer than two
            entries for a MultiAnnotatedCorpus).
    """
    # The concrete kind is recognised by class name. Looking the name up
    # along the whole MRO (not only on the most-derived class) lets further
    # subclasses of Single-/MultiAnnotatedCorpus be constructed as well.
    # NOTE(review): the name-based check presumably avoids importing the
    # subclasses from `annotated_corpora` here - confirm.
    kind_names = {klass.__name__ for klass in type(self).__mro__}
    got_sequences = isinstance(corpus, (list, tuple)) and isinstance(annotator, (list, tuple))

    if got_sequences and 'MultiAnnotatedCorpus' in kind_names:
        if not (len(corpus) == len(annotator) > 1):
            raise AttributeError("MultiAnnotatedCorpus needs same length lists of corpora and annotators! For only one corpus and one annotator use SingleAnnotatedCorpus.")
        if not (all(isinstance(c, Corpus) for c in corpus) and all(isinstance(a, Annotator) for a in annotator)):
            raise AttributeError("MultiAnnotatedCorpus needs lists of Corpus and Annotator objects!")
        self.corpora = corpus
        self.annotators = annotator
    elif isinstance(corpus, Corpus) and isinstance(annotator, Annotator) and 'SingleAnnotatedCorpus' in kind_names:
        self.annotator = annotator
        self.corpus = corpus
    else:
        raise AttributeError("SingleAnnotatedCorpus needs one corpus and one annotator, MultiAnnotatedCorpus lists of both!")
Args
- corpus (core.corpus.corpus.Corpus or list of core.corpus.corpus.Corpus): to iterate over and get the texts (sentences)
- annotator (core.corpus.annotator.Annotator or list of core.corpus.annotator.Annotator): to create annotations for each of the sentences of the corpus
The class core.corpus.annotated_corpora.SingleAnnotatedCorpus needs one corpus and one annotator.
The class core.corpus.annotated_corpora.MultiAnnotatedCorpus needs two or more corpora and the same number of annotators;
it will create a context-sensitive annotated corpus (corpus[0] uses annotator[0], corpus[1] uses annotator[1], ...).
It thus allows creating context-dependent annotations (different context and annotations per corpus).
Also see core.corpus.corpora and core.corpus.annotators.
View Source
@abstractmethod
def get_cachename(self):
    """
    Abstract; must be implemented by subclasses.

    NOTE(review): presumably returns the name under which this annotated
    corpus is cached - confirm against the subclasses.
    """
View Source
@abstractmethod
def is_cacheable(self):
    """
    Abstract; must be implemented by subclasses.

    NOTE(review): presumably tells whether this annotated corpus may be
    cached - confirm against the subclasses.
    """
View Source
@abstractmethod
def get_num_sentences(self):
    """
    Returns the number of sentences in this corpus.

    Abstract; must be implemented by subclasses.
    """
Returns the number of sentences in this corpus.
View Source
@abstractmethod
def inverse_annotator_is_similar_annotation(self, sentence, correct_scd, predicted_scd):
    """
    Calls the inverse annotator of the used annotator(s) and returns the same as
    `core.corpus.annotator.InverseAnnotator.is_similar_annotation()`.

    Abstract; must be implemented by subclasses.
    """
Calls the inverse annotator of the used annotator(s) and returns the same result as core.corpus.annotator.InverseAnnotator.is_similar_annotation().
View Source
@abstractmethod
def iterate_inline_scd_texts(self, do_scd=1, n_scd=1):
    """
    Generate an inline SCD text.

    Works as generator (yield).

    Each generated item is a tuple
    ``<array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>``

    Abstract; must be implemented by subclasses.

    Args:
        do_scd (int): add scds for all sentences (=1), every second (=2), ...
        n_scd (int): number of scds to add per sentence
    """
Generate an inline SCD Text
Works as generator (yield)
Each generated item is a tuple <array of sentences(array of words) with "Text+iSCD">,<array of labels "is scd?" per sentence>
Args
- do_scd (int): add scds for all sentences (=1), every second (=2), ...
- n_scd (int): number of scds to add per sentence
View Source
@abstractmethod
def iterate_sentence_scds(self, n=1):
    """
    Generate a list (length n) of possible scds per sentence.

    Works as generator (yield).

    Each item is a tuple
    ``<sentence(array of words)>,<array of possible scds(array of words)>``

    Abstract; must be implemented by subclasses.

    Args:
        n (int): Number of scd per sentence
    """
Generate a list (length n) of possible scds per sentence.
Works as generator (yield)
Each item is a tuple <sentence(array of words)>,<array of possible scds(array of words)>
Args
- n (int): Number of scd per sentence
View Source
@abstractmethod
def iterate_sentence_scds_non_scds(self, matching_n=1, non_matching_n=5):
    """
    Generate lists of possible (length matching_n) and not possible
    (length non_matching_n) scds per sentence.

    Works as generator (yield).

    Each item is a tuple
    ``<sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>``

    Abstract; must be implemented by subclasses.

    Args:
        matching_n (int): Number of possible scds
        non_matching_n (int): Number of not possible scds
    """
Generate lists of possible (length matching_n) and not possible (length non_matching_n) scds per sentence.
Works as generator (yield)
Each item is a tuple <sentence(array of words)>,<array of possible scds(array of words)>, <array of *not* possible scds(array of words)>
Args
- matching_n (int): Number of possible scds
- non_matching_n (int): Number of not possible scds
View Source
@abstractmethod
def iterate_assign_scds_text(self):
    """
    Generate pairs of lists from iSCD texts, but yielding two lists, one containing all sentences and one of all scds.
    => Goal is to select SCD for a sentence from text of SCDs or other way round (select sentence from text for single SCD)

    Works as generator (yield).

    Each item is a tuple
    ``<array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
    <array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>``

    Abstract; must be implemented by subclasses.
    """
Generate pairs of lists from iSCD texts, but yielding two lists one containing all sentences and one of all scds
=> Goal is to select SCD for a sentence from text of SCDs or other way round (select sentence from text for single SCD)
Works as generator (yield)
Each item is a tuple <array of sentences (array of words) from text>, <array of sentences (array of words) from scds>,
<array mapping indices from text array to matching scd in scd array>, <array mapping indices from scd array to matching sentence in text array>