core.corpus.corpora
View Source
import os
import json

from core.corpus.corpus import Corpus
import core.utils.const as const


class TwentyNews(Corpus):
    '''
    TwentyNewsgroups corpus

    Make sure to run `core.download.init_5d7()` to download needed resources.

    Dataset source: <http://qwone.com/~jason/20Newsgroups/>
    '''

    # directory holding one ``<subgroup>.json`` file per newsgroup
    BASEDIR = os.path.join(const.DATASET_DIR, "twentynews")

    def __init__(self, subgroups=None, **kwargs):
        '''
        Args:
            subgroups (array): select one or more subgroups, empty selects all
        '''
        # scan available subgroups (one .json file per subgroup)
        self.all_subgroups = []
        for name in os.listdir(TwentyNews.BASEDIR):
            filename = os.path.join(TwentyNews.BASEDIR, name)
            if os.path.isfile(filename) and name.endswith(".json"):
                self.all_subgroups.append(name[:-5])

        # only keep requested subgroups; None/empty selects all
        # (None default avoids the shared mutable-default pitfall)
        if subgroups:
            self.subgroups = [s for s in self.all_subgroups if s in subgroups]
        else:
            self.subgroups = self.all_subgroups
        self.subgroups.sort()
        super().__init__(**kwargs)

    def _texts_generator(self):
        '''Yield every document of every selected subgroup.'''
        for subgroup in self.subgroups:
            path = os.path.join(TwentyNews.BASEDIR, subgroup + ".json")
            # context manager closes the handle deterministically
            # (the original json.load(open(...)) leaked it)
            with open(path, "r") as f:
                for document in json.load(f):
                    yield document

    def is_cacheable(self):
        '''This corpus may be cached.'''
        return True

    def get_cachename(self):
        '''Cache name: preprocessor plus abbreviated subgroup names.'''
        name = "twentynews-" + self.preprocessor_name + "-"
        for s in self.subgroups:
            name += ''.join([p[0] for p in s.split('-')]) + "-"
        return name[0:-1]


class ArXiv(Corpus):
    '''
    ArXiv corpus, filter one or more categories via ``categories=[]``, empty selects all

    Make sure to run `core.download.init_5d7()` to download needed resources.

    Dataset source: <https://www.kaggle.com/Cornell-University/arxiv>
    '''

    DATAFILE = os.path.join(const.DATASET_DIR, "arxiv", "data.json")

    def __init__(self, categories=None, offset=0, limit=-1, **kwargs):
        '''
        Args:
            categories (array): filter categories of the abstracts, empty means no filter
                (filtering is done via `startswith`)
            offset (int): The offset to start at (skip ``offset`` abstracts)
            limit (int): The maximum number of abstracts to return, -1 means no limit
        '''
        self.limit = limit
        self.offset = offset
        # sorted() copies -- the original sorted the caller's list in place
        self.categories = sorted(categories) if categories else []
        super().__init__(**kwargs)

    def _texts_generator(self):
        '''Yield up to ``limit`` abstracts matching the category filter.'''
        with open(ArXiv.DATAFILE, "r", errors='ignore') as f:
            position, elements = 0, 0
            for raw in f:
                # skip the first ``offset`` records
                if self.offset > position:
                    position += 1
                    continue
                record = json.loads(raw)
                if self.categories:
                    # prefix-match any record category against any filter
                    matches_cats = any(
                        a_c.startswith(c)
                        for a_c in record['categories'].split(' ')
                        for c in self.categories
                    )
                else:
                    matches_cats = True
                if matches_cats:
                    elements += 1
                    yield record['abstract']
                    if self.limit != -1 and elements >= self.limit:
                        break

    def is_cacheable(self):
        '''This corpus may be cached.'''
        return True

    def get_cachename(self):
        '''Cache name: preprocessor, offset, optional limit and categories.'''
        return "arxiv-" + self.preprocessor_name + "-" \
            + str(self.offset) \
            + (("-" + str(self.limit)) if self.limit != -1 else "") \
            + (('-' + '-'.join(self.categories)) if len(self.categories) > 0 else "")


class Dummy(Corpus):
    '''
    Very very small dummy corpus for testing!

    Dataset source: <https://en.wikipedia.org/wiki/European_bison>, <https://en.wikipedia.org/wiki/American_bison>
    '''

    def _texts_generator(self):
        '''Yield two short texts (one per bison species).'''
        data = [
            [
                "The European bison (Bison bonasus) or the European wood bison, also known as the wisent, or the zubr, is a European species of bison.",
                "It is one of two extant species of bison, alongside the American bison.",
                "The European bison is the heaviest wild land animal in Europe and individuals in the past may have been even larger than modern animals."
            ],
            [
                "The American bison or simply bison (Bison bison), also commonly known as the American buffalo or simply buffalo, is an American species of bison that once roamed North America in vast herds.",
                "It nearly became extinct by a combination of commercial hunting and slaughter in the 19th century and introduction of bovine diseases from domestic cattle.",
                "With a population in excess of 60 million in the late 18th century, the species was down to just 541 animals by 1889."
            ]
        ]
        yield ' '.join(data[0])
        yield ' '.join(data[1])

    def is_cacheable(self):
        '''Caching is disabled for the dummy corpus.'''
        return False

    def get_cachename(self):
        '''Cache name (unused because caching is disabled).'''
        return "dummy-bison-" + self.preprocessor_name + "-"


class ManuscriptCultures(Corpus):
    '''
    ManuscriptCultures corpus.

    Make sure to run `core.download.init_5d7()` to download needed resources.

    Dataset source: <https://www.csmc.uni-hamburg.de/publications/mc.html>
    '''

    DATAFILE = os.path.join(const.DATASET_DIR, "manuscriptcultures", "data.json")

    def _texts_generator(self):
        '''Yield each issue's articles; fall back to 'others' when empty.'''
        with open(ManuscriptCultures.DATAFILE, "r", errors='ignore') as f:
            for raw in f:
                record = json.loads(raw)
                key = 'articles' if record['articles'] else 'others'
                for article in record[key]:
                    yield article

    def is_cacheable(self):
        '''This corpus may be cached.'''
        return True

    def get_cachename(self):
        '''Cache name: fixed prefix plus preprocessor name.'''
        return "mc-" + self.preprocessor_name + "-"
View Source
class TwentyNews(Corpus):
    '''
    TwentyNewsgroups corpus

    Make sure to run `core.download.init_5d7()` to download needed resources.

    Dataset source: <http://qwone.com/~jason/20Newsgroups/>
    '''

    # directory holding one ``<subgroup>.json`` file per newsgroup
    BASEDIR = os.path.join(const.DATASET_DIR, "twentynews")

    def __init__(self, subgroups=None, **kwargs):
        '''
        Args:
            subgroups (array): select one or more subgroups, empty selects all
        '''
        # scan available subgroups (one .json file per subgroup)
        self.all_subgroups = []
        for name in os.listdir(TwentyNews.BASEDIR):
            filename = os.path.join(TwentyNews.BASEDIR, name)
            if os.path.isfile(filename) and name.endswith(".json"):
                self.all_subgroups.append(name[:-5])

        # only keep requested subgroups; None/empty selects all
        # (None default avoids the shared mutable-default pitfall)
        if subgroups:
            self.subgroups = [s for s in self.all_subgroups if s in subgroups]
        else:
            self.subgroups = self.all_subgroups
        self.subgroups.sort()
        super().__init__(**kwargs)

    def _texts_generator(self):
        '''Yield every document of every selected subgroup.'''
        for subgroup in self.subgroups:
            path = os.path.join(TwentyNews.BASEDIR, subgroup + ".json")
            # context manager closes the handle deterministically
            # (the original json.load(open(...)) leaked it)
            with open(path, "r") as f:
                for document in json.load(f):
                    yield document

    def is_cacheable(self):
        '''This corpus may be cached.'''
        return True

    def get_cachename(self):
        '''Cache name: preprocessor plus abbreviated subgroup names.'''
        name = "twentynews-" + self.preprocessor_name + "-"
        for s in self.subgroups:
            name += ''.join([p[0] for p in s.split('-')]) + "-"
        return name[0:-1]
TwentyNewsgroups corpus
Make sure to run core.download.init_5d7()
to
download needed resources.
Dataset source: http://qwone.com/~jason/20Newsgroups/
View Source
def __init__(self, subgroups=None, **kwargs):
    '''
    Args:
        subgroups (array): select one or more subgroups, empty selects all
    '''
    # scan available subgroups (one .json file per subgroup)
    self.all_subgroups = []
    for name in os.listdir(TwentyNews.BASEDIR):
        filename = os.path.join(TwentyNews.BASEDIR, name)
        if os.path.isfile(filename) and name.endswith(".json"):
            self.all_subgroups.append(name[:-5])

    # only keep requested subgroups; None/empty selects all
    # (None default avoids the shared mutable-default pitfall)
    if subgroups:
        self.subgroups = [s for s in self.all_subgroups if s in subgroups]
    else:
        self.subgroups = self.all_subgroups
    self.subgroups.sort()
    super().__init__(**kwargs)
Args
- subgroups (array): select one or more subgroups, empty selects all
View Source
def is_cacheable(self):
    '''Caching is enabled for this corpus.'''
    return True
Defines if a corpus is cacheable
Returns
bool
View Source
def get_cachename(self):
    '''Build the cache name from the preprocessor and abbreviated subgroups.'''
    parts = ["twentynews", self.preprocessor_name]
    for subgroup in self.subgroups:
        parts.append(''.join(word[0] for word in subgroup.split('-')))
    return '-'.join(parts)
Returns string what the corpus should be called (and used for name of cache)
Inherited Members
View Source
class ArXiv(Corpus):
    '''
    ArXiv corpus, filter one or more categories via ``categories=[]``, empty selects all

    Make sure to run `core.download.init_5d7()` to download needed resources.

    Dataset source: <https://www.kaggle.com/Cornell-University/arxiv>
    '''

    DATAFILE = os.path.join(const.DATASET_DIR, "arxiv", "data.json")

    def __init__(self, categories=None, offset=0, limit=-1, **kwargs):
        '''
        Args:
            categories (array): filter categories of the abstracts, empty means no filter
                (filtering is done via `startswith`)
            offset (int): The offset to start at (skip ``offset`` abstracts)
            limit (int): The maximum number of abstracts to return, -1 means no limit
        '''
        self.limit = limit
        self.offset = offset
        # sorted() copies -- the original sorted the caller's list in place
        self.categories = sorted(categories) if categories else []
        super().__init__(**kwargs)

    def _texts_generator(self):
        '''Yield up to ``limit`` abstracts matching the category filter.'''
        with open(ArXiv.DATAFILE, "r", errors='ignore') as f:
            position, elements = 0, 0
            for raw in f:
                # skip the first ``offset`` records
                if self.offset > position:
                    position += 1
                    continue
                record = json.loads(raw)
                if self.categories:
                    # prefix-match any record category against any filter
                    matches_cats = any(
                        a_c.startswith(c)
                        for a_c in record['categories'].split(' ')
                        for c in self.categories
                    )
                else:
                    matches_cats = True
                if matches_cats:
                    elements += 1
                    yield record['abstract']
                    if self.limit != -1 and elements >= self.limit:
                        break

    def is_cacheable(self):
        '''This corpus may be cached.'''
        return True

    def get_cachename(self):
        '''Cache name: preprocessor, offset, optional limit and categories.'''
        return "arxiv-" + self.preprocessor_name + "-" \
            + str(self.offset) \
            + (("-" + str(self.limit)) if self.limit != -1 else "") \
            + (('-' + '-'.join(self.categories)) if len(self.categories) > 0 else "")
ArXiv corpus, filter one or more categories via categories=[]
, empty selects all
Make sure to run core.download.init_5d7()
to
download needed resources.
Dataset source: https://www.kaggle.com/Cornell-University/arxiv
View Source
def __init__(self, categories=None, offset=0, limit=-1, **kwargs):
    '''
    Args:
        categories (array): filter categories of the abstracts, empty means no filter
            (filtering is done via `startswith`)
        offset (int): The offset to start at (skip ``offset`` abstracts)
        limit (int): The maximum number of abstracts to return, -1 means no limit
    '''
    self.limit = limit
    self.offset = offset
    # sorted() copies -- the original sorted the caller's list in place,
    # and the [] default was a shared mutable-default pitfall
    self.categories = sorted(categories) if categories else []
    super().__init__(**kwargs)
Args
- categories (array): filter categories of the abstracts, empty means no filter (filtering is done via
startswith
) - offset (int): The offset to start at (skip
offset
abstracts) - limit (int): The maximum number of abstracts to return, -1 means no limit
View Source
def is_cacheable(self):
    '''Caching is enabled for this corpus.'''
    return True
Defines if a corpus is cacheable
Returns
bool
View Source
def get_cachename(self):
    '''Build the cache name from preprocessor, offset, limit and categories.'''
    name = "arxiv-" + self.preprocessor_name + "-" + str(self.offset)
    if self.limit != -1:
        name += "-" + str(self.limit)
    if self.categories:
        name += '-' + '-'.join(self.categories)
    return name
Returns string what the corpus should be called (and used for name of cache)
Inherited Members
View Source
class Dummy(Corpus):
    '''
    Very very small dummy corpus for testing!

    Dataset source: <https://en.wikipedia.org/wiki/European_bison>, <https://en.wikipedia.org/wiki/American_bison>
    '''

    def _texts_generator(self):
        '''Yield one short text per bison species.'''
        paragraphs = (
            [
                "The European bison (Bison bonasus) or the European wood bison, also known as the wisent, or the zubr, is a European species of bison.",
                "It is one of two extant species of bison, alongside the American bison.",
                "The European bison is the heaviest wild land animal in Europe and individuals in the past may have been even larger than modern animals."
            ],
            [
                "The American bison or simply bison (Bison bison), also commonly known as the American buffalo or simply buffalo, is an American species of bison that once roamed North America in vast herds.",
                "It nearly became extinct by a combination of commercial hunting and slaughter in the 19th century and introduction of bovine diseases from domestic cattle.",
                "With a population in excess of 60 million in the late 18th century, the species was down to just 541 animals by 1889."
            ],
        )
        for sentences in paragraphs:
            yield ' '.join(sentences)

    def is_cacheable(self):
        '''The dummy corpus is never cached.'''
        return False

    def get_cachename(self):
        '''Cache name (unused since caching is off).'''
        return "dummy-bison-" + self.preprocessor_name + "-"
Very very small dummy corpus for testing!
Dataset source: https://en.wikipedia.org/wiki/European_bison, https://en.wikipedia.org/wiki/American_bison
View Source
def is_cacheable(self):
    '''Caching is disabled for the dummy corpus.'''
    return False
Defines if a corpus is cacheable
Returns
bool
View Source
def get_cachename(self):
    '''Cache name for the dummy corpus (unused since caching is off).'''
    return f"dummy-bison-{self.preprocessor_name}-"
Returns string what the corpus should be called (and used for name of cache)
Inherited Members
View Source
class ManuscriptCultures(Corpus):
    '''
    ManuscriptCultures corpus.

    Make sure to run `core.download.init_5d7()` to download needed resources.

    Dataset source: <https://www.csmc.uni-hamburg.de/publications/mc.html>
    '''

    DATAFILE = os.path.join(const.DATASET_DIR, "manuscriptcultures", "data.json")

    def _texts_generator(self):
        '''Yield each issue's articles; fall back to 'others' when empty.'''
        with open(ManuscriptCultures.DATAFILE, "r", errors='ignore') as f:
            for raw in f:
                record = json.loads(raw)
                key = 'articles' if record['articles'] else 'others'
                for article in record[key]:
                    yield article

    def is_cacheable(self):
        '''This corpus may be cached.'''
        return True

    def get_cachename(self):
        '''Cache name: fixed prefix plus preprocessor name.'''
        return "mc-" + self.preprocessor_name + "-"
ManuscriptCultures corpus.
Make sure to run core.download.init_5d7()
to
download needed resources.
Dataset source: https://www.csmc.uni-hamburg.de/publications/mc.html
View Source
def is_cacheable(self):
    '''Caching is enabled for this corpus.'''
    return True
Defines if a corpus is cacheable
Returns
bool
View Source
def get_cachename(self):
    '''Cache name for the ManuscriptCultures corpus.'''
    return f"mc-{self.preprocessor_name}-"
Returns string what the corpus should be called (and used for name of cache)