core.model.scdmatrix.model

import os, warnings, time

from abc import abstractmethod
from ast import literal_eval
from multiprocessing import Queue, Pool

warnings.filterwarnings(action='ignore', category=UserWarning) # some annoying warning
from gensim.corpora import Dictionary
warnings.filterwarnings(action='default', category=UserWarning) # reset defaults

import numpy as np
from scipy.sparse import dok_matrix, save_npz, load_npz, csr_matrix
from tqdm import tqdm

from core.model import Model 
from core.utils import const, clear_filename, check_and_create_folder, write_json_file, read_json_file, CacheName

class SCDMatrix(Model):
	'''
		Represents a model containing a trained SCD matrix.  
		See [To Extend or not to Extend? Context-specific Corpus Enrichment](http://ifis.uni-luebeck.de/uploads/tx_wapublications/public_AI2019_paper_79.pdf) for more information.

		This class is abstract; use the subclasses `core.model.scdmatrix.models.iSCDMatrix` or `core.model.scdmatrix.models.MPSCDMatrix`.

		**This model does not use a GPU and never will. The calculation of SCD similarity values via `_get_scds` (used
		e.g. by `train()` and `evaluate()`) uses multiple cores by default.**
	'''

	SENTENCE_BATCH_SIZE = 250
	'''
		Batch size for multicore usage (number of sentences per batch)
	'''

	def __init__(self,
			annotated_corpus_train, annotated_corpus_eval,
			num_scds_train=5, num_processes=-1,
			**kwargs
		):
		"""
			Args:
				num_scds_train (int): Number of SCDs per sentence to train the model on
				num_processes (int): The number of cores/processes to use for parallel sentence estimation (values smaller than 1 use `os.cpu_count()`)
		"""
		self.num_scds_train = num_scds_train
		self.num_processes = os.cpu_count() if num_processes < 1 else num_processes

		if self.num_processes > os.cpu_count():
			self.num_processes = os.cpu_count()

		self.scd_id_map = None

		super().__init__(annotated_corpus_train, annotated_corpus_eval, **kwargs)

	@staticmethod
	def is_gpu_optimized():
		return False

	def _prepare_training(self):
		self.cache_name = os.path.join(
			const.MATRIX_MODELDIR,
			clear_filename(self.annotated_corpus_train.get_cachename() + '-' + str(self.num_scds_train))
		)
		self.dict_cache_name = CacheName.filename(self.cache_name + ".dict")
		self.matrix_cache_name = CacheName.filename(self.cache_name + ".npz")
		self.map_cache_name = CacheName.filename(self.cache_name + "_map.json")
		self.len_cache_name = CacheName.filename(self.cache_name + "_len.json")

	def _is_cached(self):
		for f in [self.dict_cache_name, self.matrix_cache_name, self.map_cache_name, self.len_cache_name]:
			if not os.path.isfile(f):
				return False
		return True

	def _load_cached(self):
		self.dict = Dictionary.load(self.dict_cache_name)
		self.scd_map = read_json_file(self.map_cache_name)
		self.scd_matrix = load_npz(self.matrix_cache_name)
		self.scd_lengths = np.array(read_json_file(self.len_cache_name))

		self.query_object = self.get_query_object()

		self._init_subclass()

	def _train(self):
		dict_start = time.time()
		print("Training Step 1/4:")
		# create dictionary of words
		self.dict = Dictionary()
		for i, (sentence, scds) in enumerate(self.annotated_corpus_train.iterate_sentence_scds(n=self.num_scds_train)):
			if i % 10000 == 0:
				self.dict.filter_extremes(no_below=5, no_above=0.9, keep_n=None, keep_tokens=None)

			self.dict.doc2bow(sentence, allow_update=True)
			self.dict.add_documents(scds, prune_at=None)			

		map_start = time.time()
		print("Training Step 2/4:")
		# create scd => id mapping
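		# keys are stringified tuples of dictionary word ids (so the map stays
		# JSON-serializable); values are consecutive row indices into the SCD matrix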
		self.scd_map = {}
		num_scds, num_iter = 0, 0
		for _, scds in self.annotated_corpus_train.iterate_sentence_scds(n=self.num_scds_train):
			for scd in scds:
				idx = str(tuple(self.dict.doc2idx(scd)))
				if idx not in self.scd_map:
					self.scd_map[idx] = num_scds
					num_scds += 1

			num_iter += 1

		matrix_start = time.time()
		print("Training Step 3/4:")
		# train matrix
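		# rows are SCDs, columns are dictionary words; entry (scd, word) counts
		# how often the word occurs in sentences annotated with that SCD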
		matrix = dok_matrix((num_scds, len(self.dict)), 'int')
		with tqdm(total=num_iter) as timeline:
			for sentence, scds in self.annotated_corpus_train.iterate_sentence_scds(n=self.num_scds_train):
				words = self.dict.doc2idx(sentence)
				for scd in scds:
					scd_id = self.scd_map[str(tuple(self.dict.doc2idx(scd)))]
					for word in words:
						if word >= 0:
							matrix[scd_id, word] += 1

				timeline.update(1)

		# optimize format
		self.scd_matrix = matrix.tocsr()

		# Euclidean norms per SCD
		self.scd_lengths = SCDMatrixQuery.matrix_row_length(self.scd_matrix)

		all_end = time.time()

		# write model data
		if self.write_model:
			check_and_create_folder(const.MATRIX_MODELDIR)

			self.dict.save(self.dict_cache_name)
			write_json_file(self.map_cache_name, self.scd_map)
			save_npz(self.matrix_cache_name, self.scd_matrix)
			write_json_file(self.len_cache_name, self.scd_lengths.tolist())

		# create query (MPSCD) object
		self.query_object = self.get_query_object()

		# train, if defined by concrete subclass
		subclass_start = time.time()
		self._init_subclass()
		subclass_runtime = time.time() - subclass_start

		return {
			'runtime' : all_end - dict_start,
			'runtime_dict' : map_start - dict_start,
			'runtime_map' : matrix_start - map_start,
			'runtime_matrix' : all_end - matrix_start,
			'runtime_subclass' : subclass_runtime
		}

	@abstractmethod
	def _init_subclass(self):
		"""
			Train for subclasses or load from cache.
		"""
		pass

	@abstractmethod
	def _evaluate(self):
		pass

	def get_query_object(self):
		"""
			Gets a SCD Similarity Query object of the model.

			Returns:
				The SCD Similarity Query object `SCDMatrixQuery`
		"""
		return SCDMatrixQuery(
				self.dict,
				self.scd_matrix,
				self.scd_lengths
			)

	def _get_scd_text(self, scd_id):
		if self.scd_id_map is None:
			self.scd_id_map = [None for _ in range(len(self.scd_map))]
			for k,v in self.scd_map.items():
				self.scd_id_map[v] = k
		
		return [self.dict[i] for i in filter(lambda i: i >= 0, literal_eval(self.scd_id_map[scd_id]))]

	def _get_scds(self, sentence_generator, timeline=None):
		"""
			Similar to `_get_scd()`, but may use multiple processes.

			Args:
				sentence_generator (callable returning a generator/iterable of `(sentence, value)` tuples): Each sentence is passed to `SCDMatrixQuery.get_scd(sentence, value=value)`.
					The generator has to yield `(sentence, value)` tuples because the order of elements in the result may not be stable.
				timeline (tqdm, optional): A `tqdm` object to display progress; has to be initialized with the number of sentences in the generator
			Returns:
				List of `(similarity value, scd_id, value)` tuples, as returned from `SCDMatrixQuery.get_scd(sentence, value=value)`
		"""
		has_timeline = timeline is not None

		results = []
		if self.num_processes == 1: # only one process, use self!
			for sentence, value in sentence_generator():
				results.append(self.query_object.get_scd(sentence, value=value))
				if has_timeline:
					timeline.update(1)
		else:
			tasks_queue = Queue(self.num_processes*2) # at most 2 batches per process queued at a time
			results_queue = Queue()

			# create the worker each with the matrix
			with Pool(self.num_processes, QueryWorker.main, (self.get_query_object(), tasks_queue, results_queue)) as pool:

				# get all sentences and submit them (chunk wise to save memory and sync-costs!)
				sentences_batch = []
				for sentence_value in sentence_generator():
					sentences_batch.append(sentence_value)

					if len(sentences_batch) >= SCDMatrix.SENTENCE_BATCH_SIZE:
						tasks_queue.put(sentences_batch, block=True)
						sentences_batch = []

						if has_timeline:
							timeline.update(results_queue.qsize() * SCDMatrix.SENTENCE_BATCH_SIZE - timeline.n)

				# remaining sentences (fewer than SCDMatrix.SENTENCE_BATCH_SIZE)
				tasks_queue.put(sentences_batch)

				# mark end of tasks for each process
				for _ in range(self.num_processes):
					tasks_queue.put(None)

				# mark tasks queue as done
				tasks_queue.close()
				tasks_queue.join_thread()

				# wait for all processes to end
				pool.close()

				# collect the results
				none_count = 0
				while True:
					result = results_queue.get(block=True)
					if result is None: # each process adds None as its last element!
						none_count += 1
						if none_count >= self.num_processes:
							break
						continue
					
					results.extend(result)

				results_queue.close()
				results_queue.join_thread()

		# finish timeline
		if has_timeline:
			timeline.update(timeline.total - timeline.n)

		return results

	def _get_scd(self, sentence, n=1):
		return self.query_object.get_scd(sentence, n=n)

	@abstractmethod
	def _predict(self, *args):
		pass

class SCDMatrixQuery:
	'''
		Queries a SCD Matrix given a sentence for the most probable scd.

		Create from a SCD Matrix model via `SCDMatrix.get_query_object()`
	'''

	def __init__(self, word_dict, scd_matrix, scd_lengths):
		"""
			Args:
				word_dict (`gensim.corpora.Dictionary`): The dictionary to translate words into indices
				scd_matrix (`scipy.sparse.csr_matrix`): The SCD matrix as sparse matrix
				scd_lengths (`numpy.ndarray`): The length of each SCD word vector in the matrix
		"""
		self.dict = word_dict
		self.scd_matrix = scd_matrix
		self.scd_lengths = scd_lengths

	def get_scd(self, sentence, n=1, value=None):
		"""
			Get the best SCD (as scd_id) and its similarity value for a given sentence

			Args:
				sentence (list of str): The sentence to predict a SCD for
				n (int): Return top-n best results
				value (any, optional): A value which will be passed back in "return"
			Returns:
				similarity value, the id of the SCD [, list of (id, similarity) tuples *if n > 1*] [, value *if value is not None*]
		"""
		words = np.array(self.dict.doc2idx(sentence))
		words = words[(words >= 0)] # remove all unknown

		sentence_matrix = csr_matrix((
					np.ones(len(words)), # each word occurs once => value 1 per word
					words, # the words are their indices
					[0, len(words)] # we have one row, therefore all values there
				), shape=(1, len(self.dict))
			)
		
		# cosine similarity: sim(scd, s) = (scd · s) / (||scd|| * ||s||)
		# 	Euclidean norm of sentence
		sentence_length = SCDMatrixQuery.matrix_row_length(sentence_matrix)
		# 	all dot products
		dot_products = np.squeeze((self.scd_matrix * sentence_matrix.T).toarray())
		#	product of length values
		dividers = self.scd_lengths * sentence_length
		# 	calculate the similarities
		similarities = np.divide(
				dot_products, dividers,
				out=np.zeros_like(dot_products),
				where=(dividers!=0)
			)

		# get best
		scd_id = np.argmax(similarities)
		sim = similarities[scd_id]

		# add top-n list and/or value?
		if n > 1:
			best_idx = np.argsort(-similarities, axis=-1)
			best_val = np.take_along_axis(similarities, best_idx, axis=-1)
			if value is None:
				return sim, scd_id, list(zip(best_idx, best_val))[:n]
			else:
				return sim, scd_id, list(zip(best_idx, best_val))[:n], value
		else:
			if value is None:
				return sim, scd_id
			else:
				return sim, scd_id, value

	@staticmethod
	def matrix_row_length(matrix):
		"""
			Calculate the Euclidean norm (length) of each row of the matrix and return a vector

			Args:
				matrix (`scipy.sparse.csr_matrix`): The matrix to calculate row lengths for
			Returns:
				Vector (`numpy.ndarray`) of Euclidean norm values
		"""
		return np.sqrt( # root per row
				np.squeeze(np.asarray( # make matrix a vector
					matrix.copy().power(2).sum(axis=1) # elementwise ** 2 and sum rows
				))
			)

class QueryWorker():
	"""
		The query worker used by each process spawned in `SCDMatrix._get_scds()`.
	"""

	@staticmethod
	def main(scd_matrix_query, tasks_queue, results_queue):
		while True:
			sentences_batch = tasks_queue.get(block=True)
			if sentences_batch is None:
				results_queue.put(None) # add "done mark"
				results_queue.close()
				results_queue.join_thread()
				break

			results = []
			for sentence, value in sentences_batch:
				results.append(scd_matrix_query.get_scd(sentence, value=value))

			results_queue.put(results)
#   class SCDMatrix(core.model.base.Model):

Represents a model containing a trained SCD matrix.
See To Extend or not to Extend? Context-specific Corpus Enrichment for more information.

This class is abstract; use the subclasses core.model.scdmatrix.models.iSCDMatrix or core.model.scdmatrix.models.MPSCDMatrix.

This model does not use a GPU and never will. The calculation of SCD similarity values via _get_scds (used e.g. by train() and evaluate()) uses multiple cores by default.

#   SCDMatrix( annotated_corpus_train, annotated_corpus_eval, num_scds_train=5, num_processes=-1, **kwargs )
Args
  • num_scds_train (int): Number of SCDs per sentence to train the model on
  • num_processes (int): The number of cores/processes to use for parallel sentence estimation (values smaller than 1 use os.cpu_count())
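
A minimal usage sketch, assuming the concrete subclass core.model.scdmatrix.models.MPSCDMatrix and the train() method referenced in the class docstring; the corpus construction is project-specific, so load_annotated_corpus below is a hypothetical placeholder:

	from core.model.scdmatrix.models import MPSCDMatrix

	# hypothetical helpers returning annotated-corpus objects that provide
	# iterate_sentence_scds(n=...) and get_cachename()
	train_corpus = load_annotated_corpus("train")
	eval_corpus = load_annotated_corpus("eval")

	model = MPSCDMatrix(
		train_corpus, eval_corpus,
		num_scds_train=5,   # SCDs per sentence used for training
		num_processes=-1,   # < 1 => use os.cpu_count() processes
	)
	model.train()
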
#   SENTENCE_BATCH_SIZE = 250

Batch size for multicore usage (number of sentences per batch)

#   def is_gpu_optimized():

Check if a model is optimized for GPU usage!

Returns

bool

#   def get_query_object(self):

Gets a SCD Similarity Query object of the model.

Returns

The SCD Similarity Query object SCDMatrixQuery
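
A short sketch of using the query object on a trained model (the token list is illustrative):

	query = model.get_query_object()
	sim, scd_id = query.get_scd(["the", "quick", "brown", "fox"])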

#   class SCDMatrixQuery:

Queries a SCD Matrix given a sentence for the most probable scd.

Create from a SCD Matrix model via SCDMatrix.get_query_object()

#   SCDMatrixQuery(word_dict, scd_matrix, scd_lengths)
Args
  • word_dict (gensim.corpora.Dictionary): The dictionary to translate words into indices
  • scd_matrix (scipy.sparse.csr_matrix): The SCD matrix as sparse matrix
  • scd_lengths (numpy.ndarray): The length of each SCD word vector in the matrix
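
A self-contained toy sketch of building a query object by hand; the vocabulary and counts are invented for illustration (training normally produces them):

	import numpy as np
	from gensim.corpora import Dictionary
	from scipy.sparse import csr_matrix

	from core.model.scdmatrix.model import SCDMatrixQuery

	word_dict = Dictionary([["cat", "dog"], ["dog", "bird"]])  # ids: cat=0, dog=1, bird=2

	# two SCD rows over the vocabulary, one word count per column
	scd_matrix = csr_matrix(np.array([
		[2, 1, 0],   # SCD 0: about "cat"/"dog"
		[0, 1, 3],   # SCD 1: about "bird"
	]))
	scd_lengths = SCDMatrixQuery.matrix_row_length(scd_matrix)

	query = SCDMatrixQuery(word_dict, scd_matrix, scd_lengths)
	sim, scd_id = query.get_scd(["cat", "dog"])  # scd_id == 0
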
#   def get_scd(self, sentence, n=1, value=None):

Get the best SCD (as scd_id) and its similarity value for a given sentence

Args
  • sentence (list of str): The sentence to predict a SCD for
  • n (int): Return the top-n best results
  • value (any, optional): A value which will be passed back in the return tuple
Returns

similarity value, the id of the SCD [, list of (id, similarity) tuples if n > 1] [, value if value is not None]
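
A sketch of the possible return shapes (query as constructed in the toy example above):

	sim, scd_id = query.get_scd(["cat", "dog"])                    # n == 1
	sim, scd_id, top = query.get_scd(["cat", "dog"], n=2)          # plus top-n list
	sim, scd_id, tag = query.get_scd(["cat", "dog"], value="s1")   # plus passed-back value
	sim, scd_id, top, tag = query.get_scd(["cat", "dog"], n=2, value="s1")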

#   def matrix_row_length(matrix):

Calculate the Euclidean norm (length) of each row of the matrix and return a vector

Args
  • matrix (scipy.sparse.csr_matrix): The matrix to calculate row lengths for
Returns

Vector (numpy.ndarray) of Euclidean norm values
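
A quick sketch; the result equals the L2 norm of each row (toy matrix invented for illustration):

	import numpy as np
	from scipy.sparse import csr_matrix

	m = csr_matrix(np.array([[3, 4], [0, 5]]))
	SCDMatrixQuery.matrix_row_length(m)        # array([5., 5.])
	np.linalg.norm(m.toarray(), axis=1)        # dense reference, same values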

#   class QueryWorker:

The query worker used by each process spawned in SCDMatrix._get_scds().

#   QueryWorker()
#   def main(scd_matrix_query, tasks_queue, results_queue):
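
A minimal sketch of the queue protocol main() implements, driven in a single process for illustration (inside SCDMatrix._get_scds() this runs in a multiprocessing.Pool; query is an SCDMatrixQuery as sketched above):

	from multiprocessing import Queue

	tasks, results = Queue(), Queue()
	tasks.put([(["cat", "dog"], "sentence-1")])  # one batch of (sentence, value) pairs
	tasks.put(None)                              # end-of-tasks sentinel, one per worker

	QueryWorker.main(query, tasks, results)

	batch = results.get()          # list of get_scd() results for the batch
	assert results.get() is None   # each worker ends with a None "done mark"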