core.corpus.corpora

import os, json

from core.corpus.corpus import Corpus
import core.utils.const as const


class TwentyNews(Corpus):
	'''
		TwentyNewsgroups corpus

		Make sure to run `core.download.init_5d7()` to
		download the needed resources.

		Dataset source: <http://qwone.com/~jason/20Newsgroups/>
	'''

	BASEDIR = os.path.join(const.DATASET_DIR, "twentynews")

	def __init__(self, subgroups=[], **kwargs):
		'''
			Args:
				subgroups (array): select one or more subgroups, empty selects all
		'''
		# scan subgroups
		self.all_subgroups = []
		for name in os.listdir(TwentyNews.BASEDIR):
			filename = os.path.join(TwentyNews.BASEDIR, name)
			if os.path.isfile(filename) and name.endswith(".json"):
				self.all_subgroups.append(name[:-5])

		# keep only the requested subgroups (an empty list selects all)
		if len(subgroups) > 0:
			self.subgroups = [s for s in self.all_subgroups if s in subgroups]
		else:
			self.subgroups = self.all_subgroups

		self.subgroups.sort()

		super().__init__(**kwargs)	

	def _texts_generator(self):
		for subgroup in self.subgroups:
			# use a context manager so the file handle is closed deterministically
			with open(os.path.join(TwentyNews.BASEDIR, subgroup + ".json"), "r") as f:
				for document in json.load(f):
					yield document

	def is_cacheable(self):
		return True

	def get_cachename(self):
		# abbreviate each subgroup to its initials, e.g. "sci-med" -> "sm"
		name = "twentynews-" + self.preprocessor_name + "-"
		for s in self.subgroups:
			name += ''.join([p[0] for p in s.split('-')]) + "-"
		return name[0:-1]  # drop the trailing "-"

class ArXiv(Corpus):
	'''
		ArXiv corpus. Filter one or more categories via ``categories=[]``; an empty list selects all.

		Make sure to run `core.download.init_5d7()` to
		download the needed resources.

		Dataset source: <https://www.kaggle.com/Cornell-University/arxiv>
	'''

	DATAFILE = os.path.join(const.DATASET_DIR, "arxiv", "data.json")

	def __init__(self, categories=[], offset=0, limit=-1, **kwargs):
		'''
			Args:
				categories (array): filter categories of the abstracts, empty means no filter (filtering is done via `startswith`)
				offset (int): The offset to start at (skip ``offset`` abstracts)
				limit (int): The maximum number of abstracts to return, -1 means no limit
		'''

		self.limit = limit
		self.offset = offset
		self.categories = sorted(categories)  # sorted copy; does not mutate the caller's list

		super().__init__(**kwargs)	

	def _texts_generator(self):
		with open(ArXiv.DATAFILE, "r", errors='ignore') as f:
			position, elements = 0, 0
		
			for line in f:
				if self.offset > position:
					position += 1
				else:
					line = json.loads(line)

					if len(self.categories) > 0:
						# keep the abstract if any of its categories starts with a requested prefix
						matches_cats = any(
							a_c.startswith(c)
							for a_c in line['categories'].split(' ')
							for c in self.categories
						)
					else:
						matches_cats = True
					
					if matches_cats:
						elements += 1
						yield line['abstract']
				
				if self.limit != -1 and elements >= self.limit:
					break

	def is_cacheable(self):
		return True

	def get_cachename(self):
		return "arxiv-" + self.preprocessor_name + "-" \
			+ str(self.offset) \
			+ ( ("-" + str(self.limit)) if self.limit != -1 else "" ) \
			+ ( ('-' +'-'.join(self.categories)) if len(self.categories) > 0 else "")

class Dummy(Corpus):
	'''
		Very very small dummy corpus for testing!

		Dataset source: <https://en.wikipedia.org/wiki/European_bison>, <https://en.wikipedia.org/wiki/American_bison>
	'''

	def _texts_generator(self):
		data = [
			[
				"The European bison (Bison bonasus) or the European wood bison, also known as the wisent, or the zubr, is a European species of bison.",
				"It is one of two extant species of bison, alongside the American bison.",
				"The European bison is the heaviest wild land animal in Europe and individuals in the past may have been even larger than modern animals."
			],
			[
				"The American bison or simply bison (Bison bison), also commonly known as the American buffalo or simply buffalo, is an American species of bison that once roamed North America in vast herds.",
				"It nearly became extinct by a combination of commercial hunting and slaughter in the 19th century and introduction of bovine diseases from domestic cattle.",
				"With a population in excess of 60 million in the late 18th century, the species was down to just 541 animals by 1889."
			]
		]
		yield ' '.join(data[0])
		yield ' '.join(data[1])

	def is_cacheable(self):
		return False
		
	def get_cachename(self):
		return "dummy-bison-" + self.preprocessor_name + "-"

class ManuscriptCultures(Corpus):
	'''
		ManuscriptCultures corpus.

		Make sure to run `core.download.init_5d7()` to
		download the needed resources.

		Dataset source: <https://www.csmc.uni-hamburg.de/publications/mc.html>
	'''

	DATAFILE = os.path.join(const.DATASET_DIR, "manuscriptcultures", "data.json")

	def _texts_generator(self):
		with open(ManuscriptCultures.DATAFILE, "r", errors='ignore') as f:
			for line in f:
				line = json.loads(line)
				# prefer the articles; fall back to the "others" entries when a line has none
				key = 'articles' if len(line['articles']) > 0 else 'others'
				for article in line[key]:
					yield article

	def is_cacheable(self):
		return True

	def get_cachename(self):
		return "mc-" + self.preprocessor_name + "-"
#   class TwentyNews(core.corpus.corpus.Corpus):

TwentyNewsgroups corpus

Make sure to run core.download.init_5d7() to download the needed resources.

Dataset source: http://qwone.com/~jason/20Newsgroups/

#   TwentyNews(subgroups=[], **kwargs)
Args
  • subgroups (array): select one or more subgroups, empty selects all
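
A minimal usage sketch, assuming the dataset has been fetched via core.download.init_5d7(); the subgroup names are made up, and anything the Corpus base class itself requires would be passed through **kwargs:

from core.corpus.corpora import TwentyNews

# every subgroup found in BASEDIR
full = TwentyNews()

# only two (hypothetical) subgroups; names not on disk are silently dropped
subset = TwentyNews(subgroups=["rec-autos", "sci-med"])
print(subset.subgroups)  # sorted, e.g. ['rec-autos', 'sci-med']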
#   BASEDIR = '/home/user/data/twentynews'
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)
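
To make the naming scheme concrete, a standalone sketch of the initials abbreviation used above (the subgroup names and the preprocessor name are made up):

def abbreviate(subgroup):
	# first letter of each dash-separated part, as get_cachename does
	return ''.join(p[0] for p in subgroup.split('-'))

assert abbreviate("sci-med") == "sm"
assert abbreviate("rec-autos") == "ra"
# subgroups ['rec-autos', 'sci-med'] with a preprocessor named "plain"
# would be cached as "twentynews-plain-ra-sm"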

#   class ArXiv(core.corpus.corpus.Corpus):

ArXiv corpus. Filter one or more categories via categories=[]; an empty list selects all.

Make sure to run core.download.init_5d7() to download the needed resources.

Dataset source: https://www.kaggle.com/Cornell-University/arxiv

#   ArXiv(categories=[], offset=0, limit=-1, **kwargs)
Args
  • categories (array): filter categories of the abstracts, empty means no filter (filtering is done via startswith)
  • offset (int): The offset to start at (skip offset abstracts)
  • limit (int): The maximum number of abstracts to return, -1 means no limit
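
A sketch of the startswith matching on a single record; the record layout (a space-separated categories string plus an abstract field) mirrors what _texts_generator reads, while the concrete values are invented:

import json

# one invented line of data.json, in the shape _texts_generator expects
record = json.loads('{"categories": "cs.CL stat.ML", "abstract": "..."}')

categories = ["cs"]  # hypothetical filter prefixes
matches = any(
	a_c.startswith(c)
	for a_c in record['categories'].split(' ')
	for c in categories
)
print(matches)  # True: "cs.CL" starts with "cs"

With that filter, ArXiv(categories=["cs"], offset=100, limit=1000) would skip the first 100 lines of the file and then yield at most 1000 matching abstracts.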
#   DATAFILE = '/home/user/data/arxiv/data.json'
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)
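
For example, with a preprocessor named "plain" (an invented name), ArXiv(categories=['cs', 'math'], offset=100, limit=1000) would be cached as arxiv-plain-100-1000-cs-math, while the defaults yield just arxiv-plain-0.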

#   class Dummy(core.corpus.corpus.Corpus):

Very very small dummy corpus for testing!

Dataset source: https://en.wikipedia.org/wiki/European_bison, https://en.wikipedia.org/wiki/American_bison
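
Dummy also doubles as the smallest template for writing a corpus of your own: subclass Corpus and implement the same three methods. A sketch (the class name and texts are made up; anything Corpus.__init__ itself requires is forwarded via **kwargs):

from core.corpus.corpus import Corpus

class MyTinyCorpus(Corpus):
	'''Two hard-coded texts, following the same protocol as Dummy.'''

	def _texts_generator(self):
		yield "First made-up document."
		yield "Second made-up document."

	def is_cacheable(self):
		# nothing to gain from caching two strings
		return False

	def get_cachename(self):
		return "mytiny-" + self.preprocessor_name + "-"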

#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)

#   class ManuscriptCultures(core.corpus.corpus.Corpus):

ManuscriptCultures corpus.

Make sure to run core.download.init_5d7() to download the needed resources.

Dataset source: https://www.csmc.uni-hamburg.de/publications/mc.html
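
The reader treats data.json as JSON Lines, one issue per line, each with articles and others arrays; a sketch over one invented line:

import json

# one invented line in the shape _texts_generator expects
line = json.loads('{"articles": ["Full article text ..."], "others": ["Editorial note ..."]}')

# prefer the articles; fall back to "others" when there are none
key = 'articles' if len(line['articles']) > 0 else 'others'
for text in line[key]:
	print(text)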

#   DATAFILE = '/home/user/data/manuscriptcultures/data.json'
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)