# NIM_E31221299/scripts/venv/Lib/site-packages/Sastrawi/Stemmer/Stemmer.py

import re
from Sastrawi.Stemmer.Context.Visitor.VisitorProvider import VisitorProvider
from Sastrawi.Stemmer.Filter import TextNormalizer
from Sastrawi.Stemmer.Context.Context import Context

class Stemmer(object):
    """Stemmer for Indonesian text.

    Algorithms referenced by the original authors: Nazief & Adriani,
    CS Stemmer, ECS Stemmer, Improved ECS.

    @link https://github.com/sastrawi/sastrawi/wiki/Resources
    """

    def __init__(self, dictionary):
        """Store the root-word dictionary and create a VisitorProvider.

        `dictionary` must offer a `contains(word)` method (used by
        `stem_plural_word`); it is also handed to every Context.
        """
        self.dictionary = dictionary
        self.visitor_provider = VisitorProvider()

    def get_dictionary(self):
        """Return the dictionary this stemmer was constructed with."""
        return self.dictionary

    def stem(self, text):
        """Normalize `text`, stem each space-separated token, and re-join.

        The normalized text is split on single spaces and the stemmed
        tokens are joined back with single spaces.
        """
        normalized = TextNormalizer.normalize_text(text)
        return ' '.join([self.stem_word(token) for token in normalized.split(' ')])

    def stem_word(self, word):
        """Stem one word, choosing the plural or singular routine."""
        handler = self.stem_plural_word if self.is_plural(word) else self.stem_singular_word
        return handler(word)

    def is_plural(self, word):
        """Tell whether `word` is treated as a plural (hyphenated) form.

        A single trailing particle such as -ku, -mu or -nya (e.g.
        nikmat-Ku) does not count as a plural marker on its own: when the
        word ends with one of the listed particles, only the part before
        that particle is inspected for a hyphen.
        """
        particle_match = re.match(r'^(.*)-(ku|mu|nya|lah|kah|tah|pun)$', word)
        candidate = particle_match.group(1) if particle_match else word
        return '-' in candidate

    def stem_plural_word(self, plural):
        """Reduce a hyphenated plural word to its common stem.

        Returns the shared root when both halves stem to the same root,
        otherwise returns `plural` unchanged.

        Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 76-77.

        @link   http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
        """
        # The greedy first group means the split happens at the LAST hyphen.
        # (Ported from the PHP check: !isset($words[1]) || !isset($words[2]).)
        halves = re.match(r'^(.*)-(.*)$', plural)
        if halves is None:
            return plural
        head, tail = halves.group(1), halves.group(2)

        # malaikat-malaikat-nya -> malaikat malaikat-nya
        # When the last segment is only a particle, split the head once more
        # and glue the particle back onto the new second half.
        if tail in ('ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun'):
            inner = re.match(r'^(.*)-(.*)$', head)
            if inner:
                head = inner.group(1)
                tail = inner.group(2) + '-' + tail

        # berbalas-balasan -> balas
        root_head = self.stem_singular_word(head)
        root_tail = self.stem_singular_word(tail)

        # meniru-nirukan -> tiru
        # If the second half is not a dictionary word and stemming left it
        # untouched, retry it with a 'me' prefix attached.
        tail_unknown = not self.dictionary.contains(tail)
        if tail_unknown and root_tail == tail:
            root_tail = self.stem_singular_word('me' + tail)

        return root_head if root_head == root_tail else plural

    def stem_singular_word(self, word):
        """Stem a singular word by running a Context over it.

        A fresh Context is built from the word, the dictionary and the
        visitor provider; after `execute()` its `result` is returned.
        """
        ctx = Context(word, self.dictionary, self.visitor_provider)
        ctx.execute()
        return ctx.result