first commit
This commit is contained in:
parent
36d3c88aae
commit
c9035a115e
Binary file not shown.
|
@ -0,0 +1,47 @@
|
|||
from flask import Flask, request, render_template
|
||||
import pickle
|
||||
from preprocessing import preprocess_text # pastikan file preprocessing.py ada di folder yang sama
|
||||
|
||||
app = Flask(__name__)


def _load_model(path):
    """Deserialize a pickled scikit-learn model from *path*."""
    # NOTE(review): pickle.load is only safe on trusted, locally-produced
    # model files — never load user-supplied pickles.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Pre-trained classifiers, loaded once at import time.
model_SVM = _load_model('model_SVM.pkl')
model_NB = _load_model('model_NB.pkl')
|
||||
|
||||
def classify_text(processed_text):
    """Predict the sentiment of *processed_text* with both loaded models.

    Args:
        processed_text: fully preprocessed (stemmed) input string.

    Returns:
        Tuple ``(svm_label, nb_label)`` — the first prediction of each model.
    """
    sample = [processed_text]
    return model_SVM.predict(sample)[0], model_NB.predict(sample)[0]
|
||||
|
||||
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the main page; on POST, preprocess and classify the input.

    Template context:
        input_text      -- raw text submitted by the user ('' on GET)
        processed_steps -- dict of intermediate preprocessing results
        result_svm      -- label predicted by the SVM pipeline
        result_nb       -- label predicted by the Naive Bayes model
        tfidf_dict      -- {feature: rounded tf-idf weight} for non-zero terms
    """
    input_text = ''
    processed_steps = {}
    result_svm = ''
    result_nb = ''
    tfidf_dict = {}

    if request.method == 'POST':
        # .get() avoids a KeyError/400 if the 'text' field is absent from the form.
        input_text = request.form.get('text', '')
        processed_steps = preprocess_text(input_text)
        # Use the final (stemmed) result for prediction and the TF-IDF display.
        final_text = processed_steps.get("stemming", "")
        result_svm, result_nb = classify_text(final_text)

        # Pull the fitted vectorizer out of the SVM pipeline to expose
        # the TF-IDF weights of the submitted text.
        vectorizer = model_SVM.named_steps['vectorizer']
        tfidf_vector = vectorizer.transform([final_text])
        dense = tfidf_vector.todense().tolist()[0]
        features = vectorizer.get_feature_names_out()
        tfidf_dict = {feature: round(value, 3)
                      for feature, value in zip(features, dense)
                      if value > 0}

    return render_template('index.html',
                           input_text=input_text,
                           processed_steps=processed_steps,
                           result_svm=result_svm,
                           result_nb=result_nb,
                           tfidf_dict=tfidf_dict)
|
||||
|
||||
if __name__ == '__main__':
    # Development entry point: Flask's built-in debug server.
    # debug=True enables the reloader/debugger — not for production use.
    app.run(debug=True)
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,105 @@
|
|||
import re
|
||||
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
||||
|
||||
def clean_twitter_data(text):
    """Strip Twitter artifacts from *text*.

    Removes @mentions, #hashtags, retweet markers, URLs and every
    non-alphanumeric character, then collapses whitespace runs into
    single spaces and trims the ends.
    """
    substitutions = (
        (r'@[A-Za-z0-9_]+', ''),   # @mentions
        (r'#\w+', ''),             # #hashtags
        (r'RT[\s]+', ''),          # retweet marker
        (r'https?://\S+', ''),     # URLs
        (r'[^A-Za-z0-9 ]', ''),    # punctuation / symbols
        (r'\s+', ' '),             # collapse whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
||||
|
||||
def normalize_text(text):
    """Replace informal/abbreviated Indonesian words with standard forms.

    Each whitespace-separated word is looked up in a small normalization
    dictionary; unknown words pass through unchanged. Words that map to an
    empty string (filler words such as 'dong') are dropped entirely —
    previously they left empty tokens that produced double spaces.
    """
    kamus_normalisasi = {
        'bi': 'bank',
        'ri': 'indonesia',
        'akn': 'akan',
        'gmn': 'bagaimana',
        'ga': 'tidak',
        'gak': 'tidak',
        'nggak': 'tidak',
        'yg': 'yang',
        'kalo': 'kalau',
        'aja': 'saja',
        'nih': 'ini',
        'dong': '',
        'banget': 'sangat',
        'bro': 'teman',
        'sis': 'teman',
        'dgn': 'dengan',
        'bgt': 'sangat',
        'blm': 'belum',
        'jgn': 'jangan',
        'tdk': 'tidak',
    }
    normalized_words = [kamus_normalisasi.get(word, word) for word in text.split()]
    # Filter out empty replacements so they don't leave gaps in the output.
    return ' '.join(word for word in normalized_words if word)
|
||||
|
||||
def convert_negation(text):
    """Merge each negation word with the token that follows it.

    e.g. 'tidak bagus' -> 'tidak-bagus', so the downstream vectorizer
    sees the negated phrase as one feature. A negation word that ends
    the text (no following token) is kept as-is — previously it was
    silently dropped from the output.
    """
    negation_words = {'tidak', 'bukan', 'tak', 'jangan', 'belum'}
    new_tokens = []
    negate = False
    negation_word = ''
    for token in text.split():
        if token in negation_words:
            # Remember the negation; it attaches to the next token.
            negate = True
            negation_word = token
        elif negate:
            new_tokens.append(f"{negation_word}-{token}")
            negate = False
        else:
            new_tokens.append(token)
    if negate:
        # Trailing negation word had nothing to attach to; keep it.
        new_tokens.append(negation_word)
    return ' '.join(new_tokens)
|
||||
|
||||
def remove_stopwords(text):
    """Drop common Indonesian stopwords from *text* (case-insensitive)."""
    stopwords = {
        'yang', 'di', 'ke', 'dari', 'dan', 'atau', 'itu', 'ini', 'dengan',
        'pada', 'untuk', 'ada', 'sangat', 'dalam', 'oleh', 'karena',
    }
    kept = [word for word in text.split() if word.lower() not in stopwords]
    return ' '.join(kept)
|
||||
|
||||
def stemming(tokenized_text):
    """Stem every token with Sastrawi's Indonesian stemmer; return them space-joined.

    Args:
        tokenized_text: iterable of word tokens.
    """
    # NOTE(review): the stemmer is rebuilt on every call; hoisting it to
    # module scope would avoid repeated construction cost — confirm and refactor.
    stemmer = StemmerFactory().create_stemmer()
    return " ".join(stemmer.stem(token) for token in tokenized_text)
|
||||
|
||||
def preprocess_text(text):
    """Run the full preprocessing pipeline, recording every intermediate step.

    Args:
        text: raw input string.

    Returns:
        Dict with keys ``original_text``, ``case_folding``, ``cleansing``,
        ``normalization``, ``convert_negation``, ``stopwords``,
        ``tokenizing`` (list of tokens) and ``stemming`` (final string).
    """
    steps = {"original_text": text}

    # 1. Case folding: lowercase everything.
    steps["case_folding"] = text.lower()

    # 2. Cleansing: drop mentions, hashtags, URLs, punctuation.
    steps["cleansing"] = clean_twitter_data(steps["case_folding"])

    # 3. Normalization: map slang/abbreviations to standard words.
    steps["normalization"] = normalize_text(steps["cleansing"])

    # 4. Negation conversion: join negation words with the next token.
    steps["convert_negation"] = convert_negation(steps["normalization"])

    # 5. Stopword removal.
    steps["stopwords"] = remove_stopwords(steps["convert_negation"])

    # 6. Tokenizing: split on whitespace.
    steps["tokenizing"] = steps["stopwords"].split()

    # 7. Stemming each token; this is the text used for classification.
    steps["stemming"] = stemming(steps["tokenizing"])

    return steps
|
|
@ -0,0 +1,90 @@
|
|||
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sentimen Analysis Lengkap</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-100">
    <div class="container mx-auto p-6">
        <h1 class="text-3xl font-bold text-center text-blue-600 mb-6">
            Sentimen Analysis dengan SVM & Naive Bayes
        </h1>
        {# Input form: posts back to the same route; textarea keeps the last input. #}
        <form method="post" class="bg-white p-6 rounded-lg shadow-md">
            <div class="mb-4">
                <textarea name="text" rows="4" class="w-full p-2 border rounded-lg"
                    placeholder="Masukkan teks disini...">{{ input_text }}</textarea>
            </div>
            <button type="submit" class="w-full bg-blue-500 text-white py-2 rounded-lg hover:bg-blue-600">
                Analisis
            </button>
        </form>

        {# Results are only rendered after a non-empty submission. #}
        {% if input_text %}
        <!-- Hasil Prediksi -->
        <div class="mt-6 flex space-x-6">
            <div class="flex-1 bg-white p-4 rounded-lg shadow-md border border-gray-300">
                <h3 class="text-lg font-semibold text-center">Hasil Naive Bayes</h3>
                <p class="text-xl text-center text-blue-500 mt-2">{{ result_nb }}</p>
            </div>
            <div class="flex-1 bg-white p-4 rounded-lg shadow-md border border-gray-300">
                <h3 class="text-lg font-semibold text-center">Hasil SVM</h3>
                <p class="text-xl text-center text-blue-500 mt-2">{{ result_svm }}</p>
            </div>
        </div>

        <!-- Detail Proses Preprocessing -->
        {# One card per preprocessing step, in pipeline order, joined by arrows. #}
        <div class="mt-8">
            <h2 class="text-2xl font-bold mb-4 text-center">Detail Proses Preprocessing</h2>
            {% set step_keys = ["case_folding", "cleansing", "normalization", "convert_negation", "stopwords", "tokenizing", "stemming"] %}
            <div class="flex flex-wrap items-center justify-center gap-4">
                {% for key in step_keys %}
                    {% if processed_steps[key] is defined %}
                        <div class="bg-white p-4 rounded-lg shadow-md border border-gray-300">
                            <h3 class="font-semibold capitalize">{{ key | replace('_', ' ') }}</h3>
                            {# 'tokenizing' is a list; join it, render other steps as plain strings. #}
                            {% if processed_steps[key] is iterable and processed_steps[key] is not string %}
                                <p>{{ processed_steps[key] | join(', ') }}</p>
                            {% else %}
                                <p>{{ processed_steps[key] }}</p>
                            {% endif %}
                        </div>
                        {% if not loop.last %}
                            <div class="text-3xl font-bold">→</div>
                        {% endif %}
                    {% endif %}
                {% endfor %}
            </div>
        </div>

        <!-- Nilai TF-IDF -->
        {# Table of non-zero TF-IDF weights for the submitted (stemmed) text. #}
        <div class="mt-8">
            <h2 class="text-2xl font-bold mb-4 text-center">Nilai TF-IDF</h2>
            {% if tfidf_dict %}
                <div class="overflow-x-auto">
                    <table class="min-w-full bg-white border border-gray-300">
                        <thead>
                            <tr>
                                <th class="px-4 py-2 border">Fitur</th>
                                <th class="px-4 py-2 border">Nilai</th>
                            </tr>
                        </thead>
                        <tbody>
                            {% for feature, value in tfidf_dict.items() %}
                            <tr>
                                <td class="px-4 py-2 border">{{ feature }}</td>
                                <td class="px-4 py-2 border">{{ value }}</td>
                            </tr>
                            {% endfor %}
                        </tbody>
                    </table>
                </div>
            {% else %}
                <p class="text-center">Tidak ada nilai TF-IDF untuk ditampilkan.</p>
            {% endif %}
        </div>

        {% endif %}
    </div>
</body>
</html>
|
Loading…
Reference in New Issue