core.download

Corpus Download

This module downloads the data sets needed by the core.corpus module. The pretrained models needed by core.model.transformer can also be downloaded.

It also downloads the files needed for preprocessing.

Run via core.setup!

View Source
"""
# Corpus Download

This module downloads the data sets needed by the
`core.corpus` module. The pretrained models needed by
`core.model.transformer` can also be downloaded.

It also downloads the files needed for preprocessing.

Run via `core.setup`!
"""

import core.utils.const as const

def init_nltk():
	'''
		Fetch the NLTK resources required for text preprocessing
		(needed by `core.corpus.corpus.Preprocessor`).

		Every resource is downloaded into `const.NLTK_DATA`.
	'''
	import nltk

	resources = (
		'punkt',      # used for tokenize
		'stopwords',  # used for stopwords list
		'wordnet',    # used for lemmatizing
	)
	for resource in resources:
		nltk.download(resource, download_dir=const.NLTK_DATA)

def init_5d7():
	'''
		Load and verify the data sets used.

		Builds a `CorpusDownload` with `const.DATASET_DIR` and requests
		each entry of `const.DATA_SETS` from it, in order.
	'''
	from core.download.cloud_5d7 import CorpusDownload

	downloader = CorpusDownload(const.DATASET_DIR)
	for name in const.DATA_SETS:
		downloader.request(name)

def init_transformers():
	'''
		Load (and cache) the pretrained tokenizer and model for every
		entry of `const.BERT_MODELS`, as used by the transformer system.
	'''
	from transformers import BertTokenizer, BertModel

	# Per model name: tokenizer first, then the model itself.
	loaders = (BertTokenizer, BertModel)
	for model_name in const.BERT_MODELS:
		for loader in loaders:
			loader.from_pretrained(model_name)
#   def init_nltk():
View Source
def init_nltk():
	'''
		Fetch the NLTK resources required for text preprocessing
		(needed by `core.corpus.corpus.Preprocessor`).

		Every resource is downloaded into `const.NLTK_DATA`.
	'''
	import nltk

	resources = (
		'punkt',      # used for tokenize
		'stopwords',  # used for stopwords list
		'wordnet',    # used for lemmatizing
	)
	for resource in resources:
		nltk.download(resource, download_dir=const.NLTK_DATA)

Set up NLTK for preprocessing of texts, needed by core.corpus.corpus.Preprocessor.

#   def init_5d7():
View Source
def init_5d7():
	'''
		Load and verify the data sets used.

		Builds a `CorpusDownload` with `const.DATASET_DIR` and requests
		each entry of `const.DATA_SETS` from it, in order.
	'''
	from core.download.cloud_5d7 import CorpusDownload

	downloader = CorpusDownload(const.DATASET_DIR)
	for name in const.DATA_SETS:
		downloader.request(name)

Load and verify data sets used.

#   def init_transformers():
View Source
def init_transformers():
	'''
		Load (and cache) the pretrained tokenizer and model for every
		entry of `const.BERT_MODELS`, as used by the transformer system.
	'''
	from transformers import BertTokenizer, BertModel

	# Per model name: tokenizer first, then the model itself.
	loaders = (BertTokenizer, BertModel)
	for model_name in const.BERT_MODELS:
		for loader in loaders:
			loader.from_pretrained(model_name)

Load (and cache) the models used by the transformer system