core.corpus.preprocess
View Source
import re
from abc import ABC, abstractmethod

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize


class Preprocessor(ABC):
    '''
    Preprocessor class used to preprocess corpora.

    See `core.corpus.preprocess.DefaultPreprocessor`
    and `core.corpus.preprocess.MinimalPreprocessor`
    for non abstract subclasses.

    Make sure to run `core.download.init_nltk()`
    to download needed resources.
    '''

    def __init__(self):
        pass

    def preprocess_document(self, document):
        '''
        Preprocess a document with multiple sentences.

        Args:
            document (string): the document (multiple sentences) to preprocess

        Returns:
            array of sentences (of words); sentences that become empty
            after preprocessing are dropped
        '''
        sentences = []
        for sentence in sent_tokenize(document):
            sentence = self.preprocess_words(sentence)
            if len(sentence) > 0:
                sentences.append(sentence)
        return sentences

    def preprocess_words(self, words):
        '''
        Preprocess a sentence with multiple words.

        Called by `core.corpus.preprocess.Preprocessor.preprocess_document()`
        on each sentence.

        Args:
            words (string): the words (one sentence) to preprocess

        Returns:
            array of words; words that become empty after preprocessing
            are dropped
        '''
        prepro_words = []
        for word in word_tokenize(words):
            prepro_word = self.preprocess_word(word)
            if len(prepro_word) > 0:
                prepro_words.append(prepro_word)
        return prepro_words

    @abstractmethod
    def preprocess_word(self, word):
        '''
        Preprocess a single word.

        Called by `core.corpus.preprocess.Preprocessor.preprocess_words()`
        on each word.

        Args:
            word (string): the word to preprocess

        Returns:
            the word or empty string
        '''
        pass


class DefaultPreprocessor(Preprocessor):
    '''
    This Preprocessor uses NLTK to tokenize words and sentences.
    It also stems words and removes stop words.
    '''

    def __init__(self):
        # note: cache these once; without these attributes the full stop word
        # list would be loaded again for every single word
        self.stopword_list = stopwords.words('english')
        self.stemmer = SnowballStemmer("english")

    def preprocess_word(self, word):
        '''
        Preprocess a single word by keeping only the letters a-z
        (after transforming to lower case) and stemming it.

        Args:
            word (string): the word to preprocess

        Returns:
            the stemmed word, or the empty string for stop words and for
            words with no remaining letters
        '''
        # NOTE: [^a-z] removes digits as well, not only punctuation
        word = re.sub(r'[^a-z]', '', word.lower())
        if not word or word in self.stopword_list:
            return ''
        return self.stemmer.stem(word)


class MinimalPreprocessor(Preprocessor):
    '''
    This Preprocessor only uses NLTK to tokenize words and sentences.
    It leaves all words unchanged!
    '''

    def preprocess_word(self, word):
        return word
View Source
class Preprocessor(ABC):
    '''
    Base class for corpus preprocessing.

    Non-abstract subclasses: `core.corpus.preprocess.DefaultPreprocessor`
    and `core.corpus.preprocess.MinimalPreprocessor`.

    Run `core.download.init_nltk()` first to download the required resources.
    '''

    def __init__(self):
        pass

    def preprocess_document(self, document):
        '''
        Preprocess a document consisting of multiple sentences.

        Args:
            document (string): the document (multiple sentences) to preprocess

        Returns:
            array of sentences (of words)
        '''
        tokenized = (self.preprocess_words(s) for s in sent_tokenize(document))
        return [sentence for sentence in tokenized if len(sentence) > 0]

    def preprocess_words(self, words):
        '''
        Preprocess one sentence of words.

        Called by `core.corpus.preprocess.Preprocessor.preprocess_document()`
        on each sentence.

        Args:
            words (string): the words (one sentence) to preprocess

        Returns:
            array of words
        '''
        processed = (self.preprocess_word(w) for w in word_tokenize(words))
        return [word for word in processed if len(word) > 0]

    @abstractmethod
    def preprocess_word(self, word):
        '''
        Preprocess a single word.

        Called by `core.corpus.preprocess.Preprocessor.preprocess_words()`
        on each word.

        Args:
            word (string): the word to preprocess

        Returns:
            the word or empty string
        '''
        pass
Preprocessor class used to preprocess corpora.
See core.corpus.preprocess.DefaultPreprocessor
and core.corpus.preprocess.MinimalPreprocessor
for non abstract subclasses.
Make sure to run core.download.init_nltk()
to
download needed resources.
View Source
def __init__(self):
    # No state to initialize in the abstract base class.
    pass
View Source
def preprocess_document(self, document):
    '''
    Preprocess a document with multiple sentences.

    Args:
        document (string): the document (multiple sentences) to preprocess

    Returns:
        array of sentences (of words); empty sentences are dropped
    '''
    result = []
    for raw_sentence in sent_tokenize(document):
        tokens = self.preprocess_words(raw_sentence)
        if len(tokens) > 0:
            result.append(tokens)
    return result
preprocess a document with multiple sentences
Args
- document (string): the document (multiple sentences) to preprocess
Returns
array of sentences (of words)
View Source
def preprocess_words(self, words):
    '''
    Preprocess a sentence with multiple words.

    Called by `core.corpus.preprocess.Preprocessor.preprocess_document()`
    on each sentence.

    Args:
        words (string): the words (one sentence) to preprocess

    Returns:
        array of words; empty results are dropped
    '''
    candidates = (self.preprocess_word(token) for token in word_tokenize(words))
    return [word for word in candidates if len(word) > 0]
preprocess a sentence with multiple words
Called by core.corpus.preprocess.Preprocessor.preprocess_document()
on each sentence.
Args
- words (string): the words (one sentence) to preprocess
Returns
array of words
View Source
@abstractmethod
def preprocess_word(self, word):
    '''
    preprocess a single word

    Called by `core.corpus.preprocess.Preprocessor.preprocess_words()`
    on each word

    Args:
        word (string): the word to preprocess

    Returns:
        the word or empty string
    '''
    pass
preprocess a single word
Called by core.corpus.preprocess.Preprocessor.preprocess_words()
on each word
Args
- word (string): the word to preprocess
Returns
the word or empty string
View Source
class DefaultPreprocessor(Preprocessor):
    '''
    This Preprocessor uses NLTK to tokenize words and sentences.
    It also stems words and removes stop words.
    '''

    def __init__(self):
        # note: cache the stop word list once; without this attribute it
        # would be loaded again for every single word
        self.stopword_list = stopwords.words('english')
        self.stemmer = SnowballStemmer("english")

    def preprocess_word(self, word):
        '''
        Preprocess a single word by keeping only the letters a-z
        (after transforming to lower case) and stemming it.

        Args:
            word (string): the word to preprocess

        Returns:
            the stemmed word, or the empty string for stop words and for
            words with no remaining letters
        '''
        # NOTE: [^a-z] removes digits as well, not only punctuation
        word = re.sub(r'[^a-z]', '', word.lower())
        if not word or word in self.stopword_list:
            return ''
        return self.stemmer.stem(word)
This Preprocessor uses NLTK to tokenize words and sentences. It also stems words and removes stop words.
View Source
def __init__(self):
    '''Load the English stemmer and stop word list once.'''
    # Caching both as attributes avoids reloading the whole stop word
    # list for every single word that gets preprocessed.
    self.stemmer = SnowballStemmer("english")
    self.stopword_list = stopwords.words('english')
View Source
def preprocess_word(self, word):
    '''
    Preprocess a single word by keeping only the letters a-z
    (after transforming to lower case) and stemming it.

    Args:
        word (string): the word to preprocess

    Returns:
        the stemmed word, or the empty string for stop words and for
        words with no remaining letters
    '''
    # NOTE: [^a-z] removes digits as well, not only punctuation
    word = re.sub(r'[^a-z]', '', word.lower())
    if not word or word in self.stopword_list:
        return ''
    return self.stemmer.stem(word)
preprocess a single word by removing all non alphanumerical chars, transforming to lower case and stemming it
Args
- word (string): the word to preprocess
Returns
the word or empty string
Inherited Members
View Source
class MinimalPreprocessor(Preprocessor):
    '''
    This Preprocessor only uses NLTK to tokenize words and sentences.
    It leaves all words unchanged!
    '''

    def preprocess_word(self, word):
        # Identity transform: every token passes through untouched.
        return word
This Preprocessor only uses NLTK to tokenize words and sentences. It leaves all words unchanged!
View Source
def preprocess_word(self, word):
    '''Return the word unchanged (identity preprocessing).'''
    return word
preprocess a single word
Called by core.corpus.preprocess.Preprocessor.preprocess_words()
on each word
Args
- word (string): the word to preprocess
Returns
the word or empty string