core.corpus.corpus

import json, os, math

from abc import ABC, abstractmethod

from core.corpus.preprocess import DefaultPreprocessor

from core.utils import clear_filename, write_json_file, read_json_file, check_and_create_folder
import core.utils.const as const
from core.utils import Random, CacheName

class Corpus(ABC):
	'''
		Each subclass represents a specific corpus; this class provides the general interface.
	'''

	def __init__(self, ignore_cache=False, memory_only=False, preprocessor=None):
		'''
			Args:
				ignore_cache (bool): Ignore an existing cache and overwrite it
				memory_only (bool): Only work in memory; don't use or write a cache (preprocessing has to rerun every time)
				preprocessor (`core.corpus.preprocess.Preprocessor`): The preprocessor to use; if `None`, `core.corpus.preprocess.DefaultPreprocessor` is used
		'''

		# properties
		self.ignore_cache = ignore_cache
		self.memory_only = memory_only or not self.is_cacheable()
		if preprocessor is None:
			self.preprocessor = DefaultPreprocessor()
		else:
			self.preprocessor = preprocessor
		self.preprocessor_name = type(self.preprocessor).__name__

		if not self.memory_only and (self.ignore_cache or not self._has_cache()):
			self._create_cache()
		elif not self.memory_only: # load from cache
			self.meta = read_json_file(self._get_cachefilename(suffix='meta'))
		else: # ram only (on the fly when needed)
			self.meta = {}
		
	@abstractmethod
	def _texts_generator(self):
		'''
			Loads the texts: `yield` each text (a string); implemented in a subclass for a custom data set
		'''
		pass

	@abstractmethod
	def is_cacheable(self):
		'''
			Defines whether a corpus is cacheable
			Returns:
				bool
		'''
		pass

	@abstractmethod
	def get_cachename(self):
		'''
			Returns the name of the corpus (string), also used as the name of the cache
		'''
		pass

	def _get_cachefilename(self, suffix='data'):
		'''
			Returns the path (string) to the corpus' cache file
			Args:
				suffix (string): Suffix appended to the file name; used when a corpus needs multiple cache files (distinguished by `suffix`)
		'''
		return CacheName.filename(os.path.join(
			const.CORPUS_CACHEDIR,
			clear_filename(self.get_cachename()) + "_" + suffix + ".json"
		))

	# checks for cachefile
	def _has_cache(self):
		'''
			Checks if cache exists
		'''
		return os.path.isdir(const.CORPUS_CACHEDIR) and os.path.isfile(self._get_cachefilename())

	def _preprocessed_texts_generator(self):
		'''
			Yields the preprocessed texts
		'''
		for text in self._texts_generator():
			yield self.preprocessor.preprocess_document(text)

	def _create_cache(self):
		'''
			Writes the corpus' cache file
		'''
		check_and_create_folder(const.CORPUS_CACHEDIR)

		self.meta = {
			'num_texts' : 0,
			'num_sentences' : 0
		}

		with open(self._get_cachefilename(), "w", errors='ignore') as f:
			for text in self._preprocessed_texts_generator():
				f.write(json.dumps(text) + "\n")
				self.meta['num_texts'] += 1
				self.meta['num_sentences'] += len(text)

		write_json_file(self._get_cachefilename(suffix='meta'), self.meta)
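		# resulting cache layout (a sketch; exact names depend on get_cachename()):
		#   <name>_data.json : one preprocessed text per line as JSON, e.g.
		#                      [["first", "sentence"], ["second", "one"]]
		#   <name>_meta.json : {"num_texts": ..., "num_sentences": ...}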

	def split(self, percentages=[0.8, 0.2]):
		'''
			Split a corpus into parts by the given percentages of texts.

			Args:
				percentages (array of float): The relative size of each corpus returned

			Returns:
				The corpora, one per percentage, each containing the given fraction of all texts
				``train, test = c.split()``
		'''
		seeded_random = Random.get_generator()

		if not math.isclose(sum(percentages), 1.0): # float-safe, e.g. 0.7 + 0.15 + 0.15 may not sum to exactly 1.0
			raise AttributeError("Percentages have to sum up to 1!")

		# get number of texts
		num_texts = self.get_num_texts()

		splitted_corpora = [
			{
				'name' : self.get_cachename() + '_' + str(i) + '-' + str(p) + '-' + str(Random.get_seed()) + '_', # name for caching
				'text_ids' : [False for _ in range(num_texts)], # bitmap to select texts
			} for i,p in enumerate(percentages)
		]

		available_texts = list(range(num_texts))
		seeded_random.shuffle(available_texts)
		
		start = 0
		for i,p in enumerate(percentages):
			end = start + math.floor(num_texts * p)
			for text_id in available_texts[start:end]:
				splitted_corpora[i]['text_ids'][text_id] = True
			start = end

		if num_texts - end >= 1: # flooring can leave texts unassigned, give the remainder to the first part
			for text_id in available_texts[end:]:
				splitted_corpora[0]['text_ids'][text_id] = True

		return [CorpusPart(self, s_c['text_ids'], s_c['name']) for s_c in splitted_corpora]

	
	def iterate_sentences(self):
		'''
			Iterate sentence by sentence (generator)
		'''
		for text in self.iterate_texts():
			for sentence in text:
				yield sentence

	def iterate_texts(self):
		'''
			Iterate text by text (generator)
		'''
		if self.memory_only:
			for text in self._preprocessed_texts_generator():
				yield text 
		else: # use cache
			with open(self._get_cachefilename(), "r", errors='ignore') as f:
				for line in f:
					yield json.loads(line)

	def get_num_sentences(self):
		"""
			Get the number of sentences in this corpus.

			Returns:
				integer
		"""
		if 'num_sentences' not in self.meta: # will happen when using memory only
			self.meta['num_sentences'] = sum( len(t) for t in self._preprocessed_texts_generator() )
		
		return self.meta['num_sentences']

	def get_num_texts(self):
		"""
			Get the number of texts (sequence of sentences) in this corpus.

			Returns:
				integer
		"""
		if 'num_texts' not in self.meta: # will happen when using memory only
			self.meta['num_texts'] = sum(1 for _ in self._texts_generator())
		
		return self.meta['num_texts']
		

class StringCorpus(Corpus):
	'''
		Simple corpus to preprocess a single text or multiple texts;
		wraps the given string(s) in a corpus object
	'''

	def __init__(self, text='', texts=[], **kwargs):
		'''
			Args:
				text (string): The text the corpus should contain
				texts (array of string): The texts the corpus should contain

			*Only use ``text`` or ``texts``, never both!* 
		'''
		if len(text) > 0 and len(texts) == 0:
			self.data = [text]
		elif len(text) == 0 and len(texts) > 0:
			self.data = texts
		else:
			raise AttributeError("Give one of text='' or texts=[]")

		super().__init__(**kwargs)

	def _texts_generator(self):
		for text in self.data:
			yield text

	def is_cacheable(self):
		return False

	def get_cachename(self):
		return "temporary-stringcorpus" 

class CorpusPart(Corpus):
	'''
		**Internal Class**

		Part of a Corpus; created when splitting corpora.
	'''

	def __init__(self, super_corpus, text_ids, name):
		'''
			Args:
				super_corpus (`Corpus`): the corpus that was split
				text_ids (array of bool): for each text in ``super_corpus``, whether it belongs to this part
				name (string): The name of the part

		'''
		self.name = name
		self.text_ids = text_ids
		self.super_corpus = super_corpus
		
		super().__init__(
				ignore_cache=self.super_corpus.ignore_cache,
				memory_only=self.super_corpus.memory_only,
				preprocessor=self.super_corpus.preprocessor
			)

	def _preprocessed_texts_generator(self):
		index = 0
		for text in self.super_corpus.iterate_texts():
			if self.text_ids[index]:
				yield text
			index += 1

	def _texts_generator(self):
		for text in self._preprocessed_texts_generator():
			yield '. '.join(' '.join(t) for t in text)

	def is_cacheable(self):
		return self.super_corpus.is_cacheable()

	def get_cachename(self):
		return self.name
#   class Corpus(abc.ABC):

Each subclass represents a specific corpus; this class provides the general interface.
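
To implement a concrete corpus, a subclass only has to provide the three abstract methods. A minimal sketch, assuming a hypothetical line-per-document text file (the class and path are illustrative, not part of the library):

import os
from core.corpus.corpus import Corpus

class LineFileCorpus(Corpus):
	'''
		Hypothetical example: each non-empty line of a plain text file is one text.
	'''

	def __init__(self, path, **kwargs):
		# set own state before super().__init__, which may already run preprocessing
		self.path = path
		super().__init__(**kwargs)

	def _texts_generator(self):
		with open(self.path, 'r', errors='ignore') as f:
			for line in f:
				if line.strip():
					yield line.strip()

	def is_cacheable(self):
		return True # the file is static, so caching the preprocessed texts is safe

	def get_cachename(self):
		# should be unique per corpus, it becomes part of the cache file name
		return 'linefile-' + os.path.basename(self.path)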

#   Corpus(ignore_cache=False, memory_only=False, preprocessor=None)
Args
  • ignore_cache (bool): Ignore an existing cache and overwrite it
  • memory_only (bool): Only work in memory; don't use or write a cache (preprocessing has to rerun every time)
  • preprocessor (core.corpus.preprocess.Preprocessor): The preprocessor to use; if None, core.corpus.preprocess.DefaultPreprocessor is used
#  
@abstractmethod
def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#  
@abstractmethod
def get_cachename(self):

Returns the name of the corpus (string), also used as the name of the cache

#   def split(self, percentages=[0.8, 0.2]):

Split a corpus into parts by the given percentages of texts.

Args
  • percentages (array of float): The relative size of each corpus returned
Returns

The corpora, one per percentage, each containing the given fraction of all texts
train, test = c.split()
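
A usage sketch (c stands for any concrete Corpus instance; the returned parts support the full Corpus interface):

train, test = c.split([0.8, 0.2]) # same as the default
print(train.get_num_texts(), test.get_num_texts())
for sentence in train.iterate_sentences():
	pass # e.g. feed the training part to a model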

#   def iterate_sentences(self):

Iterate sentence by sentence (generator)

#   def iterate_texts(self):

Iterate text by text (generator)
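
Judging from the join in CorpusPart._texts_generator, a preprocessed text is a list of sentences and each sentence is a list of token strings. A consumption sketch (c is any corpus instance):

for text in c.iterate_texts(): # text: list of sentences
	for sentence in text: # sentence: list of tokens (strings)
		print(' '.join(sentence))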

#   def get_num_sentences(self):

Get the number of sentences in this corpus.

Returns

integer

#   def get_num_texts(self):

Get the number of texts (sequence of sentences) in this corpus.

Returns

integer
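
Both counters are read from the cached meta file when a cache exists; in memory-only mode the first call iterates the whole corpus once. A sketch:

print(c.get_num_texts(), c.get_num_sentences()) # cheap with a cache, one full pass without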

#   class StringCorpus(Corpus):

Simple corpus to preprocess a single text or multiple texts; wraps the given string(s) in a corpus object

#   StringCorpus(text='', texts=[], **kwargs)
Args
  • text (string): The text the corpus should contain
  • texts (array of string): The texts the corpus should contain

Only use text or texts, never both!
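
A short sketch of both construction variants (pass exactly one of them; since is_cacheable() returns False, a StringCorpus always works in memory):

from core.corpus.corpus import StringCorpus

c1 = StringCorpus(text='A single document. It has two sentences.')
c2 = StringCorpus(texts=['First document.', 'Second document.'])

print(c2.get_num_texts()) # 2
for sentence in c1.iterate_sentences():
	print(sentence) # one preprocessed sentence at a time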

#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the name of the corpus (string), also used as the name of the cache

#   class CorpusPart(Corpus):

Internal Class

Part of a Corpus; created when splitting corpora.

#   CorpusPart(super_corpus, text_ids, name)
Args
  • super_corpus (Corpus): the corpus that was split
  • text_ids (array of bool): for each text in super_corpus, whether it belongs to this part
  • name (string): The name of the part
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the name of the corpus (string), also used as the name of the cache