88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
import re
|
|
from Sastrawi.Stemmer.Context.Visitor.VisitorProvider import VisitorProvider
|
|
from Sastrawi.Stemmer.Filter import TextNormalizer
|
|
from Sastrawi.Stemmer.Context.Context import Context
|
|
|
|
class Stemmer(object):
    """Indonesian Stemmer.

    Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS.

    @link https://github.com/sastrawi/sastrawi/wiki/Resources
    """

    # Suffixes that may be attached to a word with a hyphen (e.g. nikmat-Ku).
    # Single source of truth for both the plural check and the plural stemmer.
    _HYPHENATED_SUFFIXES = ('ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun')

    # "<anything>-<suffix>": group(1) is the part before the trailing suffix.
    _SUFFIX_TAIL = re.compile(
        r'^(.*)-(' + '|'.join(_HYPHENATED_SUFFIXES) + r')$'
    )

    # Splits on the LAST hyphen: the first group is greedy, so group(2)
    # never contains a hyphen.
    _LAST_HYPHEN = re.compile(r'^(.*)-(.*)$')

    def __init__(self, dictionary):
        """Create a stemmer backed by *dictionary*.

        The dictionary is handed to every stemming Context and is queried
        through its ``contains(word)`` method.
        """
        self.dictionary = dictionary
        self.visitor_provider = VisitorProvider()

    def get_dictionary(self):
        """Return the dictionary this stemmer was constructed with."""
        return self.dictionary

    def stem(self, text):
        """Stem a text string to its common stem form.

        The text is normalized with ``TextNormalizer.normalize_text``, split
        on single spaces, each piece is stemmed with ``stem_word``, and the
        stems are joined back with single spaces.
        """
        normalized_text = TextNormalizer.normalize_text(text)
        return ' '.join(
            self.stem_word(word) for word in normalized_text.split(' ')
        )

    def stem_word(self, word):
        """Stem a word to its common stem form.

        Dispatches to the plural or the singular stemmer depending on
        ``is_plural``.
        """
        if self.is_plural(word):
            return self.stem_plural_word(word)
        return self.stem_singular_word(word)

    def is_plural(self, word):
        """Return True when *word* is treated as a plural (hyphenated) form.

        A trailing hyphenated suffix (-ku, -mu, -nya, -lah, -kah, -tah, -pun)
        does not count as a plural marker on its own: in that case the word
        is plural only if the part before the suffix still contains a hyphen.
        """
        # -ku|-mu|-nya, e.g. nikmat-Ku
        tail = self._SUFFIX_TAIL.match(word)
        if tail:
            return '-' in tail.group(1)

        return '-' in word

    def stem_plural_word(self, plural):
        """Stem a plural word to its common stem form.

        Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 76-77.

        @link http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf

        The word is split at its last hyphen and both halves are stemmed as
        singular words. If the two roots agree, that root is returned;
        otherwise *plural* is returned unchanged. A word without a hyphen is
        also returned unchanged.
        """
        halves = self._LAST_HYPHEN.match(plural)
        # Translated from the PHP check
        # `if (!isset($words[1]) || !isset($words[2]))`: nothing to split.
        if not halves:
            return plural

        first, second = halves.group(1), halves.group(2)

        # malaikat-malaikat-nya -> malaikat malaikat-nya
        # When the last piece is only a suffix, re-split the remainder and
        # keep the suffix attached to the second half.
        inner = self._LAST_HYPHEN.match(first)
        if second in self._HYPHENATED_SUFFIXES and inner:
            first = inner.group(1)
            second = inner.group(2) + '-' + second

        # berbalas-balasan -> balas
        first_root = self.stem_singular_word(first)
        second_root = self.stem_singular_word(second)

        # meniru-nirukan -> tiru
        # The second half is not a dictionary word and stemming left it
        # untouched: retry with the "me" prefix put back in front of it.
        if not self.dictionary.contains(second) and second_root == second:
            second_root = self.stem_singular_word('me' + second)

        if first_root == second_root:
            return first_root
        return plural

    def stem_singular_word(self, word):
        """Stem a singular word to its common stem form.

        Builds a Context for *word* with this stemmer's dictionary and
        visitor provider, executes it, and returns the context's ``result``.
        """
        context = Context(word, self.dictionary, self.visitor_provider)
        context.execute()

        return context.result
|
|
|