core.corpus.preprocess

View Source
import re

from abc import ABC, abstractmethod

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

class Preprocessor(ABC):
	'''
		Abstract base class for corpus preprocessors.

		A document is split into sentences, every sentence into word tokens and
		every token is handed to `preprocess_word()`, which concrete subclasses
		have to implement.

		See `core.corpus.preprocess.DefaultPreprocessor` and
		`core.corpus.preprocess.MinimalPreprocessor` for non-abstract subclasses.

		Make sure to run `core.download.init_nltk()` to
		download the needed resources.
	'''

	def __init__(self):
		# nothing to set up in the base class
		pass

	def preprocess_document(self, document):
		'''
			split a document into sentences and preprocess each of them

			Args:
				document (string): the document (multiple sentences) to preprocess

			Returns:
				list of sentences, each one a list of preprocessed words;
				sentences that end up without any word are left out
		'''
		processed = (self.preprocess_words(raw) for raw in sent_tokenize(document))
		return [tokens for tokens in processed if len(tokens) > 0]

	def preprocess_words(self, words):
		'''
			split one sentence into words and preprocess each of them

			Called by `core.corpus.preprocess.Preprocessor.preprocess_document()` on each sentence.

			Args:
				words (string): the words (one sentence) to preprocess

			Returns:
				list of preprocessed words; words that are preprocessed
				to an empty result are left out
		'''
		candidates = map(self.preprocess_word, word_tokenize(words))
		return [token for token in candidates if len(token) > 0]

	@abstractmethod
	def preprocess_word(self, word):
		'''
			preprocess a single word (to be implemented by subclasses)

			Called by `core.corpus.preprocess.Preprocessor.preprocess_words()` on each word

			Args:
				word (string): the word to preprocess

			Returns:
				the word or empty string
		'''

class DefaultPreprocessor(Preprocessor):
	'''
		This Preprocessor uses NLTK to tokenize words and sentences.
		It also lower-cases words, keeps only the letters a-z, removes
		English stop words and stems what is left.
	'''

	# compiled once for the class instead of being looked up for every word
	_NON_LETTERS = re.compile(r'[^a-z]')

	def __init__(self):
		super().__init__()
		# note: the stop word list is loaded once here; without this variable
		# the whole list would be loaded again for each word
		self.stopword_list = stopwords.words('english')
		self.stemmer = SnowballStemmer("english")

	def preprocess_word(self, word):
		'''
			preprocess a single word by transforming it to lower case, removing
			every character that is not a letter a-z (digits and punctuation
			included), dropping stop words and stemming it

			Args:
				word (string): the word to preprocess

			Returns:
				the stemmed word, or an empty string if nothing is left after
				cleaning or the cleaned word is a stop word
		'''
		word = self._NON_LETTERS.sub('', word.lower())
		# tokens made only of punctuation or digits are empty by now; leave
		# early so they cause neither a stop word scan nor a stemmer call
		if not word or word in self.stopword_list:
			return ''
		return self.stemmer.stem(word)

class MinimalPreprocessor(Preprocessor):
	'''
		This Preprocessor only uses NLTK to tokenize words and sentences.
		It leaves all words unchanged!
	'''

	def preprocess_word(self, word):
		'''
			return the word exactly as it was passed in

			Args:
				word (string): the word to preprocess

			Returns:
				the unchanged word
		'''
		return word
#   class Preprocessor(abc.ABC):
View Source
class Preprocessor(ABC):
	'''
		Abstract base class for corpus preprocessors.

		A document is split into sentences, every sentence into word tokens and
		every token is handed to `preprocess_word()`, which concrete subclasses
		have to implement.

		See `core.corpus.preprocess.DefaultPreprocessor` and
		`core.corpus.preprocess.MinimalPreprocessor` for non-abstract subclasses.

		Make sure to run `core.download.init_nltk()` to
		download the needed resources.
	'''

	def __init__(self):
		# nothing to set up in the base class
		pass

	def preprocess_document(self, document):
		'''
			split a document into sentences and preprocess each of them

			Args:
				document (string): the document (multiple sentences) to preprocess

			Returns:
				list of sentences, each one a list of preprocessed words;
				sentences that end up without any word are left out
		'''
		processed = (self.preprocess_words(raw) for raw in sent_tokenize(document))
		return [tokens for tokens in processed if len(tokens) > 0]

	def preprocess_words(self, words):
		'''
			split one sentence into words and preprocess each of them

			Called by `core.corpus.preprocess.Preprocessor.preprocess_document()` on each sentence.

			Args:
				words (string): the words (one sentence) to preprocess

			Returns:
				list of preprocessed words; words that are preprocessed
				to an empty result are left out
		'''
		candidates = map(self.preprocess_word, word_tokenize(words))
		return [token for token in candidates if len(token) > 0]

	@abstractmethod
	def preprocess_word(self, word):
		'''
			preprocess a single word (to be implemented by subclasses)

			Called by `core.corpus.preprocess.Preprocessor.preprocess_words()` on each word

			Args:
				word (string): the word to preprocess

			Returns:
				the word or empty string
		'''

Preprocessor class used to preprocess corpora.

See core.corpus.preprocess.DefaultPreprocessor and core.corpus.preprocess.MinimalPreprocessor for non-abstract subclasses.

Make sure to run core.download.init_nltk() to download the needed resources.

#   Preprocessor()
View Source
	def __init__(self):
		'''
			create the preprocessor; the base class sets no attributes here
		'''
		pass
#   def preprocess_document(self, document):
View Source
	def preprocess_document(self, document):
		'''
			split a document into sentences and preprocess each of them

			Args:
				document (string): the document (multiple sentences) to preprocess

			Returns:
				list of sentences, each one a list of preprocessed words;
				sentences that end up without any word are left out
		'''
		processed = (self.preprocess_words(raw) for raw in sent_tokenize(document))
		return [tokens for tokens in processed if len(tokens) > 0]

preprocess a document with multiple sentences

Args
  • document (string): the document (multiple sentences) to preprocess
Returns

array of sentences (of words)

#   def preprocess_words(self, words):
View Source
	def preprocess_words(self, words):
		'''
			split one sentence into words and preprocess each of them

			Called by `core.corpus.preprocess.Preprocessor.preprocess_document()` on each sentence.

			Args:
				words (string): the words (one sentence) to preprocess

			Returns:
				list of preprocessed words; words that are preprocessed
				to an empty result are left out
		'''
		candidates = map(self.preprocess_word, word_tokenize(words))
		return [token for token in candidates if len(token) > 0]

preprocess a sentence with multiple words

Called by core.corpus.preprocess.Preprocessor.preprocess_document() on each sentence.

Args
  • words (string): the words (one sentence) to preprocess
Returns

array of words

#  
@abstractmethod
def preprocess_word(self, word):
View Source
	@abstractmethod
	def preprocess_word(self, word):
		'''
			preprocess a single word (to be implemented by subclasses)

			Called by `core.corpus.preprocess.Preprocessor.preprocess_words()` on each word

			Args:
				word (string): the word to preprocess

			Returns:
				the word or empty string
		'''
		pass

preprocess a single word

Called by core.corpus.preprocess.Preprocessor.preprocess_words() on each word

Args
  • word (string): the word to preprocess
Returns

the word or empty string

#   class DefaultPreprocessor(Preprocessor):
View Source
class DefaultPreprocessor(Preprocessor):
	'''
		This Preprocessor uses NLTK to tokenize words and sentences.
		It also lower-cases words, keeps only the letters a-z, removes
		English stop words and stems what is left.
	'''

	# compiled once for the class instead of being looked up for every word
	_NON_LETTERS = re.compile(r'[^a-z]')

	def __init__(self):
		super().__init__()
		# note: the stop word list is loaded once here; without this variable
		# the whole list would be loaded again for each word
		self.stopword_list = stopwords.words('english')
		self.stemmer = SnowballStemmer("english")

	def preprocess_word(self, word):
		'''
			preprocess a single word by transforming it to lower case, removing
			every character that is not a letter a-z (digits and punctuation
			included), dropping stop words and stemming it

			Args:
				word (string): the word to preprocess

			Returns:
				the stemmed word, or an empty string if nothing is left after
				cleaning or the cleaned word is a stop word
		'''
		word = self._NON_LETTERS.sub('', word.lower())
		# tokens made only of punctuation or digits are empty by now; leave
		# early so they cause neither a stop word scan nor a stemmer call
		if not word or word in self.stopword_list:
			return ''
		return self.stemmer.stem(word)

This Preprocessor uses NLTK to tokenize words and sentences. It also stems words and removes stop words.

#   DefaultPreprocessor()
View Source
	def __init__(self):
		'''
			load the English stop word list and create the English Snowball stemmer
		'''
		# run the base class initializer too, so anything it sets up is not skipped
		super().__init__()
		# note: the stop word list is loaded once here; without this variable
		# the whole list would be loaded again for each word
		self.stopword_list = stopwords.words('english')
		self.stemmer = SnowballStemmer("english")
#   def preprocess_word(self, word):
View Source
	def preprocess_word(self, word):
		'''
			preprocess a single word by transforming it to lower case, removing
			every character that is not a letter a-z (digits and punctuation
			included), dropping stop words and stemming it

			Args:
				word (string): the word to preprocess

			Returns:
				the stemmed word, or an empty string if nothing is left after
				cleaning or the cleaned word is a stop word
		'''
		word = re.sub(r'[^a-z]', '', word.lower())
		# tokens made only of punctuation or digits are empty by now; leave
		# early so they cause neither a stop word scan nor a stemmer call
		if not word or word in self.stopword_list:
			return ''
		return self.stemmer.stem(word)

preprocess a single word by transforming it to lower case, removing every character other than the letters a-z, dropping stop words and stemming it

Args
  • word (string): the word to preprocess
Returns

the word or empty string

#   class MinimalPreprocessor(Preprocessor):
View Source
class MinimalPreprocessor(Preprocessor):
	'''
		This Preprocessor only uses NLTK to tokenize words and sentences.
		It leaves all words unchanged!
	'''

	def preprocess_word(self, word):
		'''
			return the word exactly as it was passed in

			Args:
				word (string): the word to preprocess

			Returns:
				the unchanged word
		'''
		return word

This Preprocessor only uses NLTK to tokenize words and sentences. It leaves all words unchanged!

#   def preprocess_word(self, word):
View Source
	def preprocess_word(self, word):
		'''
			return the word exactly as it was passed in

			Args:
				word (string): the word to preprocess

			Returns:
				the unchanged word
		'''
		return word

preprocess a single word

Called by core.corpus.preprocess.Preprocessor.preprocess_words() on each word

Args
  • word (string): the word to preprocess
Returns

the word or empty string