core.utils.const

Directories and other parameters are globally defined here, configured to work with the docker-compose setup!

See source for values!

"""
	Directories and other parameters are globally
	defined here, configured to work with the docker-compose setup!

	See source for values!
"""

import os 
import torch



########
# directories and paths
########
#	used by nltk to store data for stemming etc.
NLTK_DATA = os.environ.get('NLTK_DATA') # defined in Dockerfile
# 	corpora and caches
DATASET_DIR = '/home/user/data/'
CORPUS_CACHEDIR = '/home/user/data/cache/'
TRANSFORMERS_CACHE = os.environ.get('TRANSFORMERS_CACHE') # defined in Dockerfile
TOKENIZER_CACHEDIR = '/home/user/models/tokenizer_cache/'
#	results and reports
RESULTS_DIR = '/home/user/results/'
TRAINING_LOGFILES = os.path.join(RESULTS_DIR, "logs")
FINETUNED_MODELDIR = '/home/user/models/finetuned/'
MATRIX_MODELDIR = '/home/user/models/matrix/'
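#	Illustrative sketch (an addition, not in the original source): the paths
#	above assume the docker-compose volume layout; creating the writable
#	directories on import keeps later file writes from failing.
for _path in (CORPUS_CACHEDIR, TOKENIZER_CACHEDIR, RESULTS_DIR,
		TRAINING_LOGFILES, FINETUNED_MODELDIR, MATRIX_MODELDIR):
	os.makedirs(_path, exist_ok=True)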

########
# data sets
########
# 	data sets to download by downloader
DATA_SETS = [
	#'arxiv',
	'twentynews',
	'wiktionary',
	'quotes',
	'manuscriptcultures'
]
# 	pretrained models to download by downloader
BERT_MODELS = [
	'bert-base-uncased'
]
# 	default BERT model to use
BERT_MODEL_DEFAULT = BERT_MODELS[0]
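#	Hypothetical usage sketch (not in the original source): this identifier
#	would typically be handed to transformers, e.g.
#		AutoTokenizer.from_pretrained(BERT_MODEL_DEFAULT,
#			cache_dir=TOKENIZER_CACHEDIR)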
#	prevent transformers from checking for updates of the cached pretrained
#	models before each run
#		core.download.init_transformers() will always check for newer models
TRANSFORMERS_NO_NETWORK = True
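#	Illustrative sketch (an assumption, not in the original source): the
#	transformers library honours the TRANSFORMERS_OFFLINE environment
#	variable, so one way to enforce the flag is:
if TRANSFORMERS_NO_NETWORK:
	os.environ.setdefault('TRANSFORMERS_OFFLINE', '1')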

########
# global parameters
########
#	do we have a GPU?
RUNNING_ON_GPU = torch.cuda.is_available()
# 	size of batches (each batch element is a single row of input_ids)
if RUNNING_ON_GPU: # using a 40 GB NVIDIA A100
	TRAIN_BATCH_SIZE = 40
	EVAL_BATCH_SIZE = 40
else: # using a machine with 16 GB RAM
	TRAIN_BATCH_SIZE = 4
	EVAL_BATCH_SIZE = 4
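
#	Illustrative smoke test (an addition, not in the original source): shows
#	how the batch sizes and the RUNNING_ON_GPU flag are typically consumed.
if __name__ == '__main__':
	from torch.utils.data import DataLoader, TensorDataset

	device = torch.device('cuda' if RUNNING_ON_GPU else 'cpu')
	dummy = TensorDataset(torch.zeros(8, 512, dtype=torch.long))	# stand-in for input_ids
	loader = DataLoader(dummy, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
	for (batch,) in loader:
		print(batch.to(device).shape)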