core.corpus.corpora

import os, json

from core.corpus.corpus import Corpus
import core.utils.const as const


class TwentyNews(Corpus):
	'''
		TwentyNewsgroups corpus

		Make sure to run `core.download.init_5d7()` to
		download the needed resources.

		Dataset source: <http://qwone.com/~jason/20Newsgroups/>
	'''

	BASEDIR = os.path.join(const.DATASET_DIR, "twentynews")

	def __init__(self, subgroups=[], **kwargs):
		'''
			Args:
				subgroups (array): select one or more subgroups, empty selects all
		'''
		# scan subgroups
		self.all_subgroups = []
		for name in os.listdir(TwentyNews.BASEDIR):
			filename = os.path.join(TwentyNews.BASEDIR, name)
			if os.path.isfile(filename) and name.endswith(".json"):
				self.all_subgroups.append(name[:-5])

		# keep only the requested subgroups (an empty list selects all)
		if len(subgroups) > 0:
			self.subgroups = [s for s in self.all_subgroups if s in subgroups]
		else:
			self.subgroups = self.all_subgroups

		self.subgroups.sort()

		super().__init__(**kwargs)	

	def _texts_generator(self):
		for subgroup in self.subgroups:
			# use a context manager so the file handle is closed deterministically
			with open(os.path.join(TwentyNews.BASEDIR, subgroup + ".json"), "r") as f:
				for document in json.load(f):
					yield document

	def is_cacheable(self):
		return True

	def get_cachename(self):
		# abbreviate each subgroup to its initials, e.g. "sci-med" -> "sm"
		name = "twentynews-" + self.preprocessor_name + "-"
		for s in self.subgroups:
			name += ''.join([p[0] for p in s.split('-')]) + "-"
		return name[0:-1]  # drop the trailing "-"

class ArXiv(Corpus):
	'''
		ArXiv corpus. Filter one or more categories via ``categories=[]``; an empty list selects all.

		Make sure to run `core.download.init_5d7()` to
		download the needed resources.

		Dataset source: <https://www.kaggle.com/Cornell-University/arxiv>
	'''

	DATAFILE = os.path.join(const.DATASET_DIR, "arxiv", "data.json")

	def __init__(self, categories=[], offset=0, limit=-1, **kwargs):
		'''
			Args:
				categories (array): filter categories of the abstracts, empty means no filter (filtering is done via `startswith`)
				offset (int): The offset to start at (skip ``offset`` abstracts)
				limit (int): The maximum number of abstracts to return, -1 means no limit
		'''

		self.limit = limit
		self.offset = offset
		self.categories = sorted(categories)  # sorted copy; does not mutate the caller's list

		super().__init__(**kwargs)	

	def _texts_generator(self):
		with open(ArXiv.DATAFILE, "r", errors='ignore') as f:
			position, elements = 0, 0
		
			for line in f:
				if self.offset > position:
					position += 1
				else:
					line = json.loads(line)

					if len(self.categories) > 0:
						# keep the abstract if any of its categories starts with a requested prefix
						matches_cats = any(
							a_c.startswith(c)
							for a_c in line['categories'].split(' ')
							for c in self.categories
						)
					else:
						matches_cats = True
					
					if matches_cats:
						elements += 1
						yield line['abstract']
				
				if self.limit != -1 and elements >= self.limit:
					break

	def is_cacheable(self):
		return True

	def get_cachename(self):
		return "arxiv-" + self.preprocessor_name + "-" \
			+ str(self.offset) \
			+ ( ("-" + str(self.limit)) if self.limit != -1 else "" ) \
			+ ( ('-' +'-'.join(self.categories)) if len(self.categories) > 0 else "")

class Dummy(Corpus):
	'''
		Very very small dummy corpus for testing!

		Dataset source: <https://en.wikipedia.org/wiki/European_bison>, <https://en.wikipedia.org/wiki/American_bison>
	'''

	def _texts_generator(self):
		data = [
			[
				"The European bison (Bison bonasus) or the European wood bison, also known as the wisent, or the zubr, is a European species of bison.",
				"It is one of two extant species of bison, alongside the American bison.",
				"The European bison is the heaviest wild land animal in Europe and individuals in the past may have been even larger than modern animals."
			],
			[
				"The American bison or simply bison (Bison bison), also commonly known as the American buffalo or simply buffalo, is an American species of bison that once roamed North America in vast herds.",
				"It nearly became extinct by a combination of commercial hunting and slaughter in the 19th century and introduction of bovine diseases from domestic cattle.",
				"With a population in excess of 60 million in the late 18th century, the species was down to just 541 animals by 1889."
			]
		]
		yield ' '.join(data[0])
		yield ' '.join(data[1])

	def is_cacheable(self):
		return False
		
	def get_cachename(self):
		return "dummy-bison-" + self.preprocessor_name + "-"

class ManuscriptCultures(Corpus):
	'''
		ManuscriptCultures corpus.

		Make sure to run `core.download.init_5d7()` to
		download the needed resources.

		Dataset source: <https://www.csmc.uni-hamburg.de/publications/mc.html>
	'''

	DATAFILE = os.path.join(const.DATASET_DIR, "manuscriptcultures", "data.json")

	def _texts_generator(self):
		with open(ManuscriptCultures.DATAFILE, "r", errors='ignore') as f:
			for line in f:
				line = json.loads(line)
				# prefer the articles; fall back to the "others" entries when a line has none
				key = 'articles' if len(line['articles']) > 0 else 'others'
				for article in line[key]:
					yield article

	def is_cacheable(self):
		return True

	def get_cachename(self):
		return "mc-" + self.preprocessor_name + "-"
#   class TwentyNews(core.corpus.corpus.Corpus):

TwentyNewsgroups corpus

Make sure to run core.download.init_5d7() to download the needed resources.

Dataset source: http://qwone.com/~jason/20Newsgroups/

#   TwentyNews(subgroups=[], **kwargs)
Args
  • subgroups (array): select one or more subgroups, empty selects all
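
A minimal usage sketch, assuming the dataset has been fetched via core.download.init_5d7(); the subgroup names are made up, and anything the Corpus base class itself requires would be passed through **kwargs:

from core.corpus.corpora import TwentyNews

# every subgroup found in BASEDIR
full = TwentyNews()

# only two (hypothetical) subgroups; names not on disk are silently dropped
subset = TwentyNews(subgroups=["rec-autos", "sci-med"])
print(subset.subgroups)  # sorted, e.g. ['rec-autos', 'sci-med']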
#   BASEDIR = '/home/user/data/twentynews'
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)
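
To make the naming scheme concrete, a standalone sketch of the initials abbreviation used above (the subgroup names and the preprocessor name are made up):

def abbreviate(subgroup):
	# first letter of each dash-separated part, as get_cachename does
	return ''.join(p[0] for p in subgroup.split('-'))

assert abbreviate("sci-med") == "sm"
assert abbreviate("rec-autos") == "ra"
# subgroups ['rec-autos', 'sci-med'] with a preprocessor named "plain"
# would be cached as "twentynews-plain-ra-sm"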

#   class ArXiv(core.corpus.corpus.Corpus):

ArXiv corpus. Filter one or more categories via categories=[]; an empty list selects all.

Make sure to run core.download.init_5d7() to download the needed resources.

Dataset source: https://www.kaggle.com/Cornell-University/arxiv

#   ArXiv(categories=[], offset=0, limit=-1, **kwargs)
Args
  • categories (array): filter categories of the abstracts, empty means no filter (filtering is done via startswith)
  • offset (int): The offset to start at (skip offset abstracts)
  • limit (int): The maximum number of abstracts to return, -1 means no limit
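
A sketch of the startswith matching on a single record; the record layout (a space-separated categories string plus an abstract field) mirrors what _texts_generator reads, while the concrete values are invented:

import json

# one invented line of data.json, in the shape _texts_generator expects
record = json.loads('{"categories": "cs.CL stat.ML", "abstract": "..."}')

categories = ["cs"]  # hypothetical filter prefixes
matches = any(
	a_c.startswith(c)
	for a_c in record['categories'].split(' ')
	for c in categories
)
print(matches)  # True: "cs.CL" starts with "cs"

With that filter, ArXiv(categories=["cs"], offset=100, limit=1000) would skip the first 100 lines of the file and then yield at most 1000 matching abstracts.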
#   DATAFILE = '/home/user/data/arxiv/data.json'
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)
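
For example, with a preprocessor named "plain" (an invented name), ArXiv(categories=['cs', 'math'], offset=100, limit=1000) would be cached as arxiv-plain-100-1000-cs-math, while the defaults yield just arxiv-plain-0.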

#   class Dummy(core.corpus.corpus.Corpus):

Very very small dummy corpus for testing!

Dataset source: https://en.wikipedia.org/wiki/European_bison, https://en.wikipedia.org/wiki/American_bison
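
Dummy also doubles as the smallest template for writing a corpus of your own: subclass Corpus and implement the same three methods. A sketch (the class name and texts are made up; anything Corpus.__init__ itself requires is forwarded via **kwargs):

from core.corpus.corpus import Corpus

class MyTinyCorpus(Corpus):
	'''Two hard-coded texts, following the same protocol as Dummy.'''

	def _texts_generator(self):
		yield "First made-up document."
		yield "Second made-up document."

	def is_cacheable(self):
		# nothing to gain from caching two strings
		return False

	def get_cachename(self):
		return "mytiny-" + self.preprocessor_name + "-"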

#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)

#   class ManuscriptCultures(core.corpus.corpus.Corpus):

ManuscriptCultures corpus.

Make sure to run core.download.init_5d7() to download the needed resources.

Dataset source: https://www.csmc.uni-hamburg.de/publications/mc.html
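
The reader treats data.json as JSON Lines, one issue per line, each with articles and others arrays; a sketch over one invented line:

import json

# one invented line in the shape _texts_generator expects
line = json.loads('{"articles": ["Full article text ..."], "others": ["Editorial note ..."]}')

# prefer the articles; fall back to "others" when there are none
key = 'articles' if len(line['articles']) > 0 else 'others'
for text in line[key]:
	print(text)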

#   DATAFILE = '/home/user/data/manuscriptcultures/data.json'
#   def is_cacheable(self):

Defines whether a corpus is cacheable

Returns

bool

#   def get_cachename(self):

Returns the string the corpus should be called (also used as the name of the cache)