core.model.scdmatrix.model
```python
import os, warnings, time
from abc import abstractmethod
from ast import literal_eval
from multiprocessing import Queue, Pool

warnings.filterwarnings(action='ignore', category=UserWarning)  # some annoying warning
from gensim.corpora import Dictionary
warnings.filterwarnings(action='default', category=UserWarning)  # reset defaults

import numpy as np
from scipy.sparse import dok_matrix, save_npz, load_npz, csr_matrix
from tqdm import tqdm

from core.model import Model
from core.utils import const, clear_filename, check_and_create_folder, write_json_file, read_json_file, CacheName


class SCDMatrix(Model):
    '''
        Represents a model containing a trained SCD matrix.
        See [To Extend or not to Extend? Context-specific Corpus Enrichment](http://ifis.uni-luebeck.de/uploads/tx_wapublications/public_AI2019_paper_79.pdf)
        for more information.

        This class is abstract; use the subclasses `core.model.scdmatrix.models.iSCDMatrix`
        or `core.model.scdmatrix.models.MPSCDMatrix`.

        **This model does not use a GPU and never will. The calculation of SCD similarity
        values via `_get_scds` (used by e.g. `train()` and `evaluate()`) uses multiple
        cores by default.**
    '''

    SENTENCE_BATCH_SIZE = 250
    ''' Batch size for multicore usage (number of sentences per batch) '''

    def __init__(self,
            annotated_corpus_train, annotated_corpus_eval,
            num_scds_train=5, num_processes=-1,
            **kwargs):
        """
            Args:
                num_scds_train (int): Number of SCDs per sentence to train the model on
                num_processes (int): The number of cores/processes to use for parallel
                    sentence estimation (values smaller than 1 use `os.cpu_count()`)
        """
        self.num_scds_train = num_scds_train

        # resolve the number of worker processes, capped at the core count
        self.num_processes = os.cpu_count() if num_processes < 1 else num_processes
        if self.num_processes > os.cpu_count():
            self.num_processes = os.cpu_count()

        self.scd_id_map = None

        super().__init__(annotated_corpus_train, annotated_corpus_eval, **kwargs)

    def is_gpu_optimized():
        return False

    def _prepare_training(self):
        self.cache_name = os.path.join(
            const.MATRIX_MODELDIR,
            clear_filename(self.annotated_corpus_train.get_cachename() + '-' + str(self.num_scds_train))
        )
        self.dict_cache_name = CacheName.filename(self.cache_name + ".dict")
        self.matrix_cache_name = CacheName.filename(self.cache_name + ".npz")
        self.map_cache_name = CacheName.filename(self.cache_name + "_map.json")
        self.len_cache_name = CacheName.filename(self.cache_name + "_len.json")

    def _is_cached(self):
        for f in [self.dict_cache_name, self.matrix_cache_name, self.map_cache_name, self.len_cache_name]:
            if not os.path.isfile(f):
                return False
        return True

    def _load_cached(self):
        self.dict = Dictionary.load(self.dict_cache_name)
        self.scd_map = read_json_file(self.map_cache_name)
        self.scd_matrix = load_npz(self.matrix_cache_name)
        self.scd_lengths = np.array(read_json_file(self.len_cache_name))
        self.query_object = self.get_query_object()
        self._init_subclass()

    def _train(self):
        dict_start = time.time()
        print("Training Step 1/4:")

        # create dictionary of words
        self.dict = Dictionary()
        for i, (sentence, scds) in enumerate(self.annotated_corpus_train.iterate_sentence_scds(n=self.num_scds_train)):
            if i % 10000 == 0:
                self.dict.filter_extremes(no_below=5, no_above=0.9, keep_n=None, keep_tokens=None)
            self.dict.doc2bow(sentence, allow_update=True)
            self.dict.add_documents(scds, prune_at=None)

        map_start = time.time()
        print("Training Step 2/4:")

        # create scd => id mapping
        self.scd_map = {}
        num_scds, num_iter = 0, 0
        for _, scds in self.annotated_corpus_train.iterate_sentence_scds(n=self.num_scds_train):
            for scd in scds:
                idx = str(tuple(self.dict.doc2idx(scd)))
                if idx not in self.scd_map:
                    self.scd_map[idx] = num_scds
                    num_scds += 1
                num_iter += 1

        matrix_start = time.time()
        print("Training Step 3/4:")

        # train matrix
        matrix = dok_matrix((num_scds, len(self.dict)), 'int')
        with tqdm(total=num_iter) as timeline:
            for sentence, scds in self.annotated_corpus_train.iterate_sentence_scds(n=self.num_scds_train):
                words = self.dict.doc2idx(sentence)
                for scd in scds:
                    scd_id = self.scd_map[str(tuple(self.dict.doc2idx(scd)))]
                    for word in words:
                        if word >= 0:
                            matrix[scd_id, word] += 1
                    timeline.update(1)

        # optimize format
        self.scd_matrix = matrix.tocsr()

        # Euclidean norms per scd
        self.scd_lengths = SCDMatrixQuery.matrix_row_length(self.scd_matrix)

        all_end = time.time()

        # write model data
        if self.write_model:
            check_and_create_folder(const.MATRIX_MODELDIR)
            self.dict.save(self.dict_cache_name)
            write_json_file(self.map_cache_name, self.scd_map)
            save_npz(self.matrix_cache_name, self.scd_matrix)
            write_json_file(self.len_cache_name, self.scd_lengths.tolist())

        # create query (MPSCD) object
        self.query_object = self.get_query_object()

        # train, if defined by concrete subclass
        subclass_start = time.time()
        self._init_subclass()
        subclass_runtime = time.time() - subclass_start

        return {
            'runtime': all_end - dict_start,
            'runtime_dict': map_start - dict_start,
            'runtime_map': matrix_start - map_start,
            'runtime_matrix': all_end - matrix_start,
            'runtime_subclass': subclass_runtime
        }

    @abstractmethod
    def _init_subclass(self):
        """ Train for subclasses or load from cache. """
        pass

    @abstractmethod
    def _evaluate(self):
        pass

    def get_query_object(self):
        """
            Gets an SCD Similarity Query object of the model.

            Returns:
                The SCD Similarity Query object `SCDMatrixQuery`
        """
        return SCDMatrixQuery(
            self.dict, self.scd_matrix, self.scd_lengths
        )

    def _get_scd_text(self, scd_id):
        if self.scd_id_map is None:
            # build the reverse mapping scd_id => word-index tuple once
            self.scd_id_map = [None for _ in range(len(self.scd_map))]
            for k, v in self.scd_map.items():
                self.scd_id_map[v] = k
        return [self.dict[i] for i in filter(lambda i: i >= 0, literal_eval(self.scd_id_map[scd_id]))]

    def _get_scds(self, sentence_generator, timeline=None):
        """
            Similar to `_get_scd()`, but may use multiple processes.

            Args:
                sentence_generator (generator/iterable of list of words): Generator yielding the
                    sentences, each passed to `SCDMatrixQuery.get_scd(sentence, value=value)`.
                    The generator has to yield `(sentence, value)` tuples, because the order of
                    elements in the result may not be stable.
                timeline (tqdm, optional): A `tqdm` object to display a timeline; has to be
                    initialized with the number of sentences in the generator

            Returns:
                List of tuples (similarity value, scd_id, value), as returned from
                `SCDMatrixQuery.get_scd(sentence, value=value)`
        """
        has_timeline = timeline is not None

        results = []
        if self.num_processes == 1:
            # only one process, use self!
            for sentence, value in sentence_generator():
                results.append(self.query_object.get_scd(sentence, value=value))
                if has_timeline:
                    timeline.update(1)
        else:
            tasks_queue = Queue(self.num_processes * 2)  # max. 2 tasks per core queued at a time
            results_queue = Queue()

            # create the workers, each with the matrix
            with Pool(self.num_processes, QueryWorker.main, (self.get_query_object(), tasks_queue, results_queue)) as pool:
                # get all sentences and submit them (chunk-wise to save memory and sync costs!)
                sentences_batch = []
                for sentence_value in sentence_generator():
                    sentences_batch.append(sentence_value)
                    if len(sentences_batch) >= SCDMatrix.SENTENCE_BATCH_SIZE:
                        tasks_queue.put(sentences_batch, block=True)
                        sentences_batch = []
                        if has_timeline:
                            timeline.update(results_queue.qsize() * SCDMatrix.SENTENCE_BATCH_SIZE - timeline.n)

                # leftover sentences (fewer than SCDMatrix.SENTENCE_BATCH_SIZE)
                tasks_queue.put(sentences_batch)

                # mark end of tasks for each process
                for _ in range(self.num_processes):
                    tasks_queue.put(None)

                # mark tasks queue as done
                tasks_queue.close()
                tasks_queue.join_thread()

                # wait for all processes to end
                pool.close()

                # collect the results
                none_count = 0
                while True:
                    result = results_queue.get(block=True)
                    if result is None:  # each process adds None as its last element!
                        none_count += 1
                        if none_count >= self.num_processes:
                            break
                        continue
                    results.extend(result)

                results_queue.close()
                results_queue.join_thread()

        # finish timeline
        if has_timeline:
            timeline.update(timeline.total - timeline.n)

        return results

    def _get_scd(self, sentence, n=1):
        return self.query_object.get_scd(sentence, n=n)

    @abstractmethod
    def _predict(self, *args):
        pass


class SCDMatrixQuery:
    '''
        Queries an SCD matrix, given a sentence, for the most probable SCD.

        Create from an SCD matrix model via `SCDMatrix.get_query_object()`
    '''

    def __init__(self, word_dict, scd_matrix, scd_lengths):
        """
            Args:
                word_dict (`gensim.corpora.Dictionary`): The dictionary used to translate words into indices
                scd_matrix (`scipy.sparse.csr_matrix`): The SCD matrix as a sparse matrix
                scd_lengths (`numpy.ndarray`): The length of each SCD word vector in the matrix
        """
        self.dict = word_dict
        self.scd_matrix = scd_matrix
        self.scd_lengths = scd_lengths

    def get_scd(self, sentence, n=1, value=None):
        """
            Get the best SCD (as scd_id) and its similarity value for a given sentence

            Args:
                sentence (list of str): The sentence to predict an SCD for
                n (int): Return the top-n best results
                value (any, optional): A value which will be passed back in the return value

            Returns:
                similarity value, the id of the SCD
                [, list of tuple(id, similarity) *if n > 1*]
                [, value *if value is not None*]
        """
        words = np.array(self.dict.doc2idx(sentence))
        words = words[(words >= 0)]  # remove all unknown words

        sentence_matrix = csr_matrix(
            (
                np.ones(len(words)),  # each word occurs once => value 1 per word
                words,                # the words are their indices
                [0, len(words)]       # we have one row, therefore all values are in it
            ),
            shape=(1, len(self.dict))
        )

        # cosine calculation

        # Euclidean norm of the sentence
        sentence_length = SCDMatrixQuery.matrix_row_length(sentence_matrix)

        # all dot products
        dot_products = np.squeeze((self.scd_matrix * sentence_matrix.T).toarray())

        # products of the length values
        dividers = self.scd_lengths * sentence_length

        # calculate the similarities
        similarities = np.divide(
            dot_products, dividers,
            out=np.zeros_like(dot_products), where=(dividers != 0)
        )

        # get the best
        scd_id = np.argmax(similarities)
        sim = similarities[scd_id]

        # add top-n list and/or value?
        if n > 1:
            best_idx = np.argsort(-similarities, axis=-1)
            best_val = np.take_along_axis(similarities, best_idx, axis=-1)
            if value is None:
                return sim, scd_id, list(zip(best_idx, best_val))[:n]
            else:
                return sim, scd_id, list(zip(best_idx, best_val))[:n], value
        else:
            if value is None:
                return sim, scd_id
            else:
                return sim, scd_id, value

    def matrix_row_length(matrix):
        """
            Calculate the Euclidean norm (length) of each row of the matrix and return the values as a vector

            Args:
                matrix (`scipy.sparse.csr_matrix`): The matrix whose row lengths are calculated

            Returns:
                Vector (`numpy.array`) of Euclidean norm values
        """
        return np.sqrt(                  # root per row
            np.squeeze(np.asarray(       # make the matrix a vector
                matrix.copy().power(2).sum(axis=1)  # elementwise ** 2, then sum each row
            ))
        )


class QueryWorker():
    """
        The query worker used by each process spawned in `SCDMatrix._get_scds()`.
    """

    def main(scd_matrix_query, tasks_queue, results_queue):
        while True:
            sentences_batch = tasks_queue.get(block=True)
            if sentences_batch is None:
                results_queue.put(None)  # add "done mark"
                results_queue.close()
                results_queue.join_thread()
                break
            results = []
            for sentence, value in sentences_batch:
                results.append(scd_matrix_query.get_scd(sentence, value=value))
            results_queue.put(results)
```
`class SCDMatrix(Model)`
Represents a model containing a trained SCD matrix.
See [To Extend or not to Extend? Context-specific Corpus Enrichment](http://ifis.uni-luebeck.de/uploads/tx_wapublications/public_AI2019_paper_79.pdf) for more information.
This class is abstract; use the subclasses `core.model.scdmatrix.models.iSCDMatrix` or `core.model.scdmatrix.models.MPSCDMatrix`.
This model does not use a GPU and never will. The calculation of SCD similarity values via `_get_scds` (used by e.g. `train()` and `evaluate()`) uses multiple cores by default.
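For orientation, a minimal training sketch; `train_corpus` and `eval_corpus` are placeholders for annotated corpus objects whose construction is out of scope here:

```python
# Sketch only: the two corpus variables are hypothetical placeholders;
# MPSCDMatrix, train() and the constructor arguments come from this package.
from core.model.scdmatrix.models import MPSCDMatrix

model = MPSCDMatrix(
    train_corpus, eval_corpus,   # annotated corpora (placeholders)
    num_scds_train=5,            # SCDs per sentence used for training
    num_processes=4,             # worker processes for _get_scds()
)
model.train()  # steps 1-3: dictionary, SCD map, matrix; step 4: subclass-specific _init_subclass()
```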
`def __init__(self, annotated_corpus_train, annotated_corpus_eval, num_scds_train=5, num_processes=-1, **kwargs)`
Args
- num_scds_train (int): Number of SCDs per sentence to train the model on
- num_processes (int): The number of cores/processes to use for parallel sentence estimation (values smaller than 1 use `os.cpu_count()`)
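The effective process count is clamped to the machine's core count. Restated outside the constructor (the helper name is made up for illustration):

```python
import os

def effective_num_processes(num_processes: int) -> int:
    # Mirrors the constructor logic: values < 1 mean "use every core",
    # and explicit values are capped at os.cpu_count().
    n = os.cpu_count() if num_processes < 1 else num_processes
    return min(n, os.cpu_count())

assert effective_num_processes(-1) == os.cpu_count()
assert effective_num_processes(10**6) == os.cpu_count()
```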
`SENTENCE_BATCH_SIZE = 250`
Batch size for multicore usage (number of sentences per batch)
`def is_gpu_optimized()`
Check if a model is optimized for GPU usage; always `False` for `SCDMatrix`.
Returns
bool
`def get_query_object(self)`
Gets an SCD Similarity Query object of the model.
Returns
The SCD Similarity Query object `SCDMatrixQuery`
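Typical use, assuming `model` is a trained instance of a concrete subclass:

```python
query = model.get_query_object()  # an SCDMatrixQuery over the trained matrix
sim, scd_id = query.get_scd(["a", "tokenized", "sentence"])
```

The query object is also what `_get_scds()` hands to each worker process, so it carries everything needed to answer queries on its own.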
`class SCDMatrixQuery`
Queries an SCD matrix, given a sentence, for the most probable SCD.
Create one from an SCD matrix model via `SCDMatrix.get_query_object()`.
`def __init__(self, word_dict, scd_matrix, scd_lengths)`
Args
- word_dict (`gensim.corpora.Dictionary`): The dictionary used to translate words into indices
- scd_matrix (`scipy.sparse.csr_matrix`): The SCD matrix as a sparse matrix
- scd_lengths (`numpy.ndarray`): The length of each SCD word vector in the matrix
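Since the constructor only stores its three arguments, a query object can also be built by hand, which makes the expected data layout concrete. A self-contained sketch with a made-up four-word vocabulary:

```python
import numpy as np
from gensim.corpora import Dictionary
from scipy.sparse import csr_matrix
from core.model.scdmatrix.model import SCDMatrixQuery

word_dict = Dictionary([["cats", "purr"], ["dogs", "bark"]])  # 4 words, ids 0..3

# one row per SCD, one column per word in the dictionary; entries are word counts
scd_matrix = csr_matrix(np.array([
    [2, 1, 0, 0],
    [0, 0, 3, 1],
], dtype=float))
scd_lengths = SCDMatrixQuery.matrix_row_length(scd_matrix)  # Euclidean norm per row

query = SCDMatrixQuery(word_dict, scd_matrix, scd_lengths)
sim, scd_id = query.get_scd(["cats", "purr", "meow"])  # "meow" is unknown and dropped
```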
`def get_scd(self, sentence, n=1, value=None)`
Get the best SCD (as scd_id) and its similarity value for a given sentence
Args
- sentence (list of str): The sentence to predict an SCD for
- n (int): Return the top-n best results
- value (any, optional): A value which will be passed back in the return value
Returns
similarity value, the id of the SCD [, list of tuple(id, similarity) *if n > 1*] [, value *if value is not None*]
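The arity of the return value depends on `n` and `value`; the three shapes, using `query` from the sketch above (`tokens` is any list of words):

```python
sim, scd_id = query.get_scd(tokens)                        # n == 1, no value
sim, scd_id, top = query.get_scd(tokens, n=3)              # top: [(scd_id, sim), ...], best 3
sim, scd_id, tag = query.get_scd(tokens, value="doc-42")   # value is passed back unchanged
```

Internally, the similarity is the cosine between the sentence's bag-of-words vector and each matrix row: the dot product divided by the product of the two Euclidean norms, defined as 0 wherever a norm is 0.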
`def matrix_row_length(matrix)`
Calculate the Euclidean norm (length) of each row of the matrix and return the values as a vector
Args
- matrix (`scipy.sparse.csr_matrix`): The matrix whose row lengths are calculated
Returns
Vector (`numpy.array`) of Euclidean norm values
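A quick check with a tiny matrix:

```python
import numpy as np
from scipy.sparse import csr_matrix
from core.model.scdmatrix.model import SCDMatrixQuery

m = csr_matrix(np.array([[3.0, 4.0], [0.0, 0.0]]))
print(SCDMatrixQuery.matrix_row_length(m))  # [5. 0.] -- sqrt(3**2 + 4**2), and an all-zero row
```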
`class QueryWorker`
The query worker used by each process spawned in `SCDMatrix._get_scds()`.
`def main(scd_matrix_query, tasks_queue, results_queue)`
Worker loop: takes sentence batches from `tasks_queue`, answers each sentence via `SCDMatrixQuery.get_scd()`, and puts the result lists on `results_queue`. A `None` task is the shutdown signal; the worker echoes it as a `None` result (the "done mark") and exits.