core.download
Corpus Download
This module downloads the data sets needed by the
core.corpus
module. The pretrained models needed by
core.model.transformer
can also be downloaded,
as can the files needed for preprocessing.
Run via core.setup!
View Source
""" # Corpus Download This module provides the possibility to download the needed data sets for the `core.corpus` module. Also needed pretrained modules for `core.model.transformer` may be also downloaded. And downloads files needed for preprocessing. Run via `core.setup`! """ import core.utils.const as const def init_nltk(): ''' Set up NTLK for preprocessing of texts, needed by `core.corpus.corpus.Preprocessor`. ''' import nltk nltk.download('punkt', download_dir=const.NLTK_DATA) # used for tokenize nltk.download('stopwords', download_dir=const.NLTK_DATA) # used for stopwords list nltk.download('wordnet', download_dir=const.NLTK_DATA) # used for lemmatizing def init_5d7(): ''' Load and verify data sets used. ''' from core.download.cloud_5d7 import CorpusDownload c = CorpusDownload(const.DATASET_DIR) for dataset in const.DATA_SETS: c.request(dataset) def init_transformers(): ''' Load (and cache) the models used by the transformer system ''' from transformers import BertTokenizer, BertModel for bert_model in const.BERT_MODELS: BertTokenizer.from_pretrained(bert_model) BertModel.from_pretrained(bert_model)
View Source
def init_nltk():
    '''
    Set up NLTK for the preprocessing of texts,
    as needed by `core.corpus.corpus.Preprocessor`.

    Every resource is downloaded into `const.NLTK_DATA`.
    '''
    import nltk

    resources = (
        'punkt',      # used for tokenizing
        'stopwords',  # used for the stop word list
        'wordnet',    # used for lemmatizing
    )
    for resource in resources:
        nltk.download(resource, download_dir=const.NLTK_DATA)
Set up NLTK for the preprocessing of texts,
needed by core.corpus.corpus.Preprocessor.
View Source
def init_5d7():
    '''
    Load and verify the data sets used.

    Each entry of `const.DATA_SETS` is requested, in order, through a
    `CorpusDownload` created for `const.DATASET_DIR`.
    '''
    from core.download.cloud_5d7 import CorpusDownload

    downloader = CorpusDownload(const.DATASET_DIR)
    for name in const.DATA_SETS:
        downloader.request(name)
Load and verify data sets used.
View Source
def init_transformers():
    '''
    Load (and cache) the models used by the transformer system.

    For every name in `const.BERT_MODELS` the tokenizer is loaded
    first, then the model itself.
    '''
    from transformers import BertModel, BertTokenizer

    for model_name in const.BERT_MODELS:
        # Keep the original order: tokenizer, then model.
        for loader in (BertTokenizer, BertModel):
            loader.from_pretrained(model_name)
Load (and cache) the models used by the transformer system