core.corpus.corpus
import json, os, math
from abc import ABC, abstractmethod

from core.corpus.preprocess import DefaultPreprocessor
from core.utils import clear_filename, write_json_file, read_json_file, check_and_create_folder
import core.utils.const as const
from core.utils import Random, CacheName


class Corpus(ABC):
    '''
    Each subclass represents a specific corpus; this class provides the general interface.
    '''

    def __init__(self, ignore_cache=False, memory_only=False, preprocessor=None):
        '''
        Args:
            ignore_cache (bool): Ignore an existing cache and overwrite it
            memory_only (bool): Only work in memory; do not use or write a cache (preprocessing has to be rerun every time)
            preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use; if `None`, uses `core.corpus.preprocess.DefaultPreprocessor`
        '''

        # properties
        self.ignore_cache = ignore_cache
        self.memory_only = memory_only or not self.is_cacheable()

        if preprocessor is None:
            self.preprocessor = DefaultPreprocessor()
        else:
            self.preprocessor = preprocessor
        self.preprocessor_name = type(self.preprocessor).__name__

        if not self.memory_only and (self.ignore_cache or not self._has_cache()):
            self._create_cache()
        elif not self.memory_only:
            # load from cache
            self.meta = read_json_file(self._get_cachefilename(suffix='meta'))
        else:
            # ram only (computed on the fly when needed)
            self.meta = {}

    @abstractmethod
    def _texts_generator(self):
        '''
        Load texts using `yield text` for each text (string); defined in a subclass for a custom data set
        '''
        pass

    @abstractmethod
    def is_cacheable(self):
        '''
        Defines whether a corpus is cacheable

        Returns:
            bool
        '''
        pass

    @abstractmethod
    def get_cachename(self):
        '''
        Returns a string naming the corpus (also used as the name of its cache)
        '''
        pass

    def _get_cachefilename(self, suffix='data'):
        '''
        Returns the path (string) to the corpus' cache file

        Args:
            suffix (string): Suffix in the file name, if multiple files (distinguished by `suffix`) are needed for the cache
        '''
        return CacheName.filename(os.path.join(
            const.CORPUS_CACHEDIR,
            clear_filename(self.get_cachename()) + "_" + suffix + ".json"
        ))

    def _has_cache(self):
        '''
        Checks if the cache file exists
        '''
        return os.path.isdir(const.CORPUS_CACHEDIR) and os.path.isfile(self._get_cachefilename())

    def _preprocessed_texts_generator(self):
        '''
        Yields the preprocessed texts
        '''
        for text in self._texts_generator():
            yield self.preprocessor.preprocess_document(text)

    def _create_cache(self):
        '''
        Writes the corpus' cache file
        '''
        check_and_create_folder(const.CORPUS_CACHEDIR)

        self.meta = {
            'num_texts': 0,
            'num_sentences': 0
        }

        with open(self._get_cachefilename(), "w", errors='ignore') as f:
            for text in self._preprocessed_texts_generator():
                f.write(json.dumps(text) + "\n")
                self.meta['num_texts'] += 1
                self.meta['num_sentences'] += len(text)

        write_json_file(self._get_cachefilename(suffix='meta'), self.meta)

    def split(self, percentages=[0.8, 0.2]):
        '''
        Split a corpus by given percentages of texts.

        Args:
            percentages (array of float): The relative size of each corpus returned

        Returns:
            The `len(percentages)` corpora, each containing the given percentage of the overall texts:
            ``train, test = c.split()``
        '''
        seeded_random = Random.get_generator()

        # use isclose to avoid float rounding issues, e.g. 0.7 + 0.2 + 0.1 != 1
        if not math.isclose(sum(percentages), 1):
            raise AttributeError("Percentages have to sum up to 1!")

        # get number of texts
        num_texts = self.get_num_texts()

        splitted_corpora = [
            {
                'name': self.get_cachename() + '_' + str(i) + '-' + str(p) + '-' + str(Random.get_seed()) + '_',  # name for caching
                'text_ids': [False for _ in range(num_texts)],  # bitmap to select texts
            }
            for i, p in enumerate(percentages)
        ]

        available_texts = [i for i in range(num_texts)]
        seeded_random.shuffle(available_texts)

        start = 0
        for i, p in enumerate(percentages):
            end = start + math.floor(num_texts * p)
            for text_id in available_texts[start:end]:
                splitted_corpora[i]['text_ids'][text_id] = True
            start = end

        # texts left over due to flooring are assigned to the first part
        if num_texts - end >= 1:
            for text_id in available_texts[end:]:
                splitted_corpora[0]['text_ids'][text_id] = True

        return [CorpusPart(self, s_c['text_ids'], s_c['name']) for s_c in splitted_corpora]

    def iterate_sentences(self):
        '''
        Iterate sentence by sentence (generator)
        '''
        for text in self.iterate_texts():
            for sentence in text:
                yield sentence

    def iterate_texts(self):
        '''
        Iterate text by text (generator)
        '''
        if self.memory_only:
            for text in self._preprocessed_texts_generator():
                yield text
        else:
            # use cache
            with open(self._get_cachefilename(), "r", errors='ignore') as f:
                for line in f:
                    yield json.loads(line)

    def get_num_sentences(self):
        """
        Get the number of sentences in this corpus.

        Returns:
            integer
        """
        if 'num_sentences' not in self.meta:  # will happen when using memory only
            self.meta['num_sentences'] = sum(len(t) for t in self._preprocessed_texts_generator())
        return self.meta['num_sentences']

    def get_num_texts(self):
        """
        Get the number of texts (sequences of sentences) in this corpus.

        Returns:
            integer
        """
        if 'num_texts' not in self.meta:  # will happen when using memory only
            self.meta['num_texts'] = sum(1 for _ in self._texts_generator())
        return self.meta['num_texts']


class StringCorpus(Corpus):
    '''
    Simple corpus to preprocess a single text or multiple texts;
    transfers string(s) into a corpus object
    '''

    def __init__(self, text='', texts=[], **kwargs):
        '''
        Args:
            text (string): The text the corpus should contain
            texts (array of string): The texts the corpus should contain

        *Only use ``text`` or ``texts``, never both!*
        '''
        if len(text) > 0 and len(texts) == 0:
            self.data = [text]
        elif len(text) == 0 and len(texts) > 0:
            self.data = texts
        else:
            raise AttributeError("Give one of text='' or texts=[]")

        super().__init__(**kwargs)

    def _texts_generator(self):
        for text in self.data:
            yield text

    def is_cacheable(self):
        return False

    def get_cachename(self):
        return "temporary-stringcorpus"


class CorpusPart(Corpus):
    '''
    **Internal Class**
    Part of a Corpus, used for splitting corpora!
    '''

    def __init__(self, super_corpus, text_ids, name):
        '''
        Args:
            super_corpus (`Corpus`): the corpus that was split
            text_ids (array of bool): for each text in ``super_corpus``, whether it belongs to this part
            name (string): The name of the part
        '''
        self.name = name
        self.text_ids = text_ids
        self.super_corpus = super_corpus

        super().__init__(
            ignore_cache=self.super_corpus.ignore_cache,
            memory_only=self.super_corpus.memory_only,
            preprocessor=self.super_corpus.preprocessor
        )

    def _preprocessed_texts_generator(self):
        # the super corpus' texts are already preprocessed; select them via the bitmap
        index = 0
        for text in self.super_corpus.iterate_texts():
            if self.text_ids[index]:
                yield text
            index += 1

    def _texts_generator(self):
        # rebuild plain strings from the preprocessed token lists
        for text in self._preprocessed_texts_generator():
            yield '. '.join([' '.join(t) for t in text])

    def is_cacheable(self):
        return self.super_corpus.is_cacheable()

    def get_cachename(self):
        return self.name
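The cache layout follows directly from _create_cache: the data file is JSON-lines, one preprocessed text per line, where each text is a list of sentences and (assuming the default preprocessor tokenizes into token lists) each sentence is a list of strings; the meta file is a single JSON object with the counts. An illustrative sketch with made-up names and content (the exact path also depends on CacheName.filename):

    # <const.CORPUS_CACHEDIR>/mycorpus_data.json
    [["this", "is", "the", "first", "sentence"], ["and", "the", "second"]]
    [["one", "more", "text"]]

    # <const.CORPUS_CACHEDIR>/mycorpus_meta.json
    {"num_texts": 2, "num_sentences": 3}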
class Corpus(ABC)
Each subclass represents a specific corpus; this class provides the general interface.
def __init__(self, ignore_cache=False, memory_only=False, preprocessor=None)
Args
- ignore_cache (bool): Ignore an existing cache and overwrite it
- memory_only (bool): Only work in memory; do not use or write a cache (preprocessing has to be rerun every time)
- preprocessor (core.corpus.preprocess.Preprocessor): The preprocessor to use; if None, uses core.corpus.preprocess.DefaultPreprocessor
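Since the three abstract methods define the whole subclass contract, adding a custom data set is mostly boilerplate. A minimal sketch under stated assumptions: LineFileCorpus and its path argument are hypothetical, not part of this module. Note that self.path must be set before calling super().__init__, because the base constructor may already run preprocessing and write the cache:

    import os
    from core.corpus.corpus import Corpus

    class LineFileCorpus(Corpus):
        ''' Hypothetical corpus: one raw text per line of a plain-text file '''

        def __init__(self, path, **kwargs):
            self.path = path  # must exist before super().__init__ triggers _create_cache()
            super().__init__(**kwargs)

        def _texts_generator(self):
            # yield plain strings; the base class applies the preprocessor
            with open(self.path, "r", errors='ignore') as f:
                for line in f:
                    yield line.strip()

        def is_cacheable(self):
            return True  # file contents are stable, so the cache stays valid

        def get_cachename(self):
            return "linefile-" + os.path.basename(self.path)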
@abstractmethod
def is_cacheable(self)
Defines whether a corpus is cacheable
Returns
bool
@abstractmethod
def get_cachename(self)
Returns a string naming the corpus (also used as the name of its cache)
def split(self, percentages=[0.8, 0.2])
Split a corpus by given percentages of texts.
Args
- percentages (array of float): The relative size of each corpus returned
Returns
The len(percentages) corpora, each containing the given percentage of the overall texts:
train, test = c.split()
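A usage sketch; StringCorpus is used here only because it needs no data set on disk, and the percentage values are illustrative. The shuffle comes from the seeded generator in core.utils.Random, so the same seed reproduces the same split, and texts left over by the math.floor rounding end up in the first part:

    from core.corpus.corpus import StringCorpus

    c = StringCorpus(texts=["First text. Two sentences.", "Second text.",
                            "Third text.", "Fourth text."])

    train, test = c.split()                        # default [0.8, 0.2]
    train, dev, test = c.split([0.5, 0.25, 0.25])  # any sizes summing to 1

    # the returned parts behave like any other Corpus
    for sentence in test.iterate_sentences():
        print(sentence)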
def iterate_sentences(self)
Iterate sentence by sentence (generator)
def iterate_texts(self)
Iterate text by text (generator)
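Both generators run over the preprocessed representation: each text is a list of sentences and (assuming the default preprocessor's tokenization) each sentence is a list of tokens. A sketch, where corpus stands for any Corpus instance:

    # per-text view: a text is a list of sentences
    for text in corpus.iterate_texts():
        print(len(text), "sentences")

    # per-sentence view: a sentence is a list of tokens
    for sentence in corpus.iterate_sentences():
        print(" ".join(sentence))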
def get_num_sentences(self)
Get the number of sentences in this corpus.
Returns
integer
def get_num_texts(self)
Get the number of texts (sequences of sentences) in this corpus.
Returns
integer
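Both counts are read from the cached meta file when one exists; in memory-only mode they are computed on first access (get_num_sentences even re-runs preprocessing over the whole corpus once) and then memoized in self.meta:

    # cheap after the first call, or when a cache exists
    print(corpus.get_num_texts(), "texts,", corpus.get_num_sentences(), "sentences")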
class StringCorpus(Corpus)
Simple corpus to preprocess a single text or multiple texts; transfers string(s) into a corpus object
def __init__(self, text='', texts=[], **kwargs)
Args
- text (string): The text the corpus should contain
- texts (array of string): The texts the corpus should contain
Only use ``text`` or ``texts``, never both!
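A usage sketch; the texts are illustrative, and since is_cacheable() returns False the corpus always stays memory-only:

    from core.corpus.corpus import StringCorpus

    c1 = StringCorpus(text="A single text. With two sentences.")
    c2 = StringCorpus(texts=["First text.", "Second text."])

    for sentence in c1.iterate_sentences():
        print(sentence)  # a token list; exact output depends on the preprocessor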
def is_cacheable(self):
    return False
Defines whether a corpus is cacheable
Returns
bool
def get_cachename(self): return "temporary-stringcorpus"
Returns a string naming the corpus (also used as the name of its cache)
class CorpusPart(Corpus)
Internal Class
Part of a Corpus, used for splitting corpora!
def __init__(self, super_corpus, text_ids, name)
Args
- super_corpus (Corpus): the corpus that was split
- text_ids (array of bool): for each text in super_corpus, whether it belongs to this part
- name (string): The name of the part
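CorpusPart objects are normally produced by Corpus.split() rather than constructed by hand. A sketch of the bitmap semantics, with hypothetical values:

    from core.corpus.corpus import StringCorpus, CorpusPart

    c = StringCorpus(texts=["Text zero.", "Text one.", "Text two."])
    # select texts 0 and 2 of the super corpus for this part
    part = CorpusPart(c, text_ids=[True, False, True], name="demo-part_")
    print(part.get_num_texts())  # 2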
def is_cacheable(self):
    return self.super_corpus.is_cacheable()
Defines whether a corpus is cacheable
Returns
bool
def get_cachename(self):
    return self.name
Returns a string naming the corpus (also used as the name of its cache)