# TIF_E41210369/ems-model/model.py

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os
import subprocess
# Seed NumPy's global random generator with a fixed value so that any
# NumPy-based randomness is repeatable from one run to the next.
np.random.seed(42)

def load_data(file_path):
    """
    Read a CSV file into a DataFrame.

    Prints the shape of the loaded dataset on success. If the file does
    not exist, prints an error message and returns None instead of raising.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: CSV File {file_path} not found!")
        return None
    else:
        print("CSV Loaded, Dataset shape:", frame.shape)
        return frame

def preprocess_data(df):
    """
    Preprocess data:
    - LabelEncoder to species (categorical)
    - MinMaxScaling for soakDuration, lowestTemp and highestTemp
      (emsConcentration is passed through unscaled)

    The fitted encoder and scaler are pickled to
    pickles/label_encoding.pkl and pickles/scaler_encoding.pkl.
    The DataFrame passed in is left unmodified.

    Returns a tuple (X, y): X holds the five feature columns and y the
    'result' column.
    """
    # exist_ok avoids the separate exists() check and its race window.
    os.makedirs("pickles", exist_ok=True)

    # Work on a copy: transforming the caller's frame in place would make a
    # second call fit the encoder/scaler on already-transformed values.
    df = df.copy()

    le = LabelEncoder()
    df['species'] = le.fit_transform(df['species'])

    with open('pickles/label_encoding.pkl', 'wb') as f:
        pickle.dump(le, f)

    scaler = MinMaxScaler()
    numerical_columns = ['soakDuration', 'lowestTemp', 'highestTemp']
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    with open('pickles/scaler_encoding.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    X = df[['species', 'emsConcentration', 'soakDuration',
            'lowestTemp', 'highestTemp']]
    y = df['result']

    return X, y

def train_model(X, y):
    """
    Split data and train Random Forest model

    Holds out 30% of the rows as a test set, fits a 100-tree
    RandomForestClassifier on the remainder and pickles the fitted model
    to pickles/ems_model.pkl.

    Returns (rf_model, X_train, X_test, y_train, y_test).
    """
    # NOTE(review): this hold-out split is not stratified, whereas
    # evaluate_model cross-validates with StratifiedShuffleSplit; consider
    # stratify=y if the classes are imbalanced.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # initialize and train model
    # random_state=42 is just a fixed seed for reproducibility; the value
    # is a nod to The Hitchhiker's Guide to the Galaxy.
    rf_model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        random_state=42
    )

    rf_model.fit(X_train, y_train)

    # Make sure the output directory exists so this function also works
    # when it is called without preprocess_data having created it.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/ems_model.pkl', 'wb') as f:
        pickle.dump(rf_model, f)

    return rf_model, X_train, X_test, y_train, y_test

def evaluate_model(model, X_test, y_test, X, y):
    """
    Evaluate model performance

    Prints, in order: the cross-validation scores (10 stratified shuffle
    splits over X, y), the classification report for the hold-out set,
    the hold-out accuracy and a labelled confusion matrix. The confusion
    matrix is also drawn as a heatmap on a new matplotlib figure; the
    figure is created here but not shown.

    Parameters:
    - model: fitted classifier providing predict() and score()
    - X_test, y_test: hold-out features and labels
    - X, y: full features and labels used for cross-validation
    """
    y_pred = model.predict(X_test)
    y_test = np.array(y_test)

    print("\n!!! Evaluation !!!\n")

    print("\n!!! K - Fold (Stratified Shuffle Split) !!!\n")
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
    scores = cross_val_score(model, X, y, cv=sss)

    print("Cross-validation scores:", scores)
    print("Mean accuracy:", scores.mean())
    print("Standard deviation:", scores.std())

    print("\n!!! Classification Report !!!\n")
    print(classification_report(y_test, y_pred))

    # Fix the class list up front so the matrix keeps one row and one column
    # per class even when a class is absent from y_test or y_pred.
    classes = np.unique(np.concatenate([
        np.asarray(y).ravel(),
        y_test.ravel(),
        np.asarray(y_pred).ravel(),
    ]))
    cm = confusion_matrix(y_test, y_pred, labels=classes)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    print(f"Model Accuracy Score: {round(model.score(X_test, y_test) * 100,1)}% \n")

    print("\n!!! Confusion Matrix !!!\n")

    if len(classes) == 2:
        # Assumes the lower-sorted class means failure and the higher one
        # success, as the original labelling did -- TODO confirm with data.
        class_names = ['Fail rate', 'Success rate']
    else:
        # Fall back to the raw class values instead of failing on a
        # label-count mismatch.
        class_names = [str(label) for label in classes]

    # Build the table from the matrix computed above; named Index objects
    # keep the 'Actual' / 'Predicted' axis titles in the printed output.
    cm_table = pd.DataFrame(
        cm,
        index=pd.Index(class_names, name='Actual'),
        columns=pd.Index(class_names, name='Predicted'),
    )
    print(cm_table)

def feature_importance(model):
    """
    Draw a horizontal bar chart of the model's feature importances.

    The importances are read from model.feature_importances_, paired with
    five fixed display names, sorted from most to least important and
    annotated with their value to three decimals. The figure is created
    but not shown.
    """
    display_names = [
        'Species',
        'EMS Concentration',
        'Soak Duration',
        'Lowest Temperature',
        'Highest Temperature',
    ]
    ranked = pd.DataFrame(
        {'feature': display_names, 'importance': model.feature_importances_}
    ).sort_values('importance', ascending=False)

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(
        data=ranked,
        x='importance',
        y='feature',
        hue='feature',
        legend=False,
    )

    # Write each score just past the end of its bar; bars follow row order.
    for bar_position, score in enumerate(ranked['importance']):
        ax.text(score, bar_position, f' {score:.3f}', va='center')

    plt.title('Feature Importance', pad=20, fontsize=12, fontweight='bold')
    plt.xlabel('Importance Score', fontsize=10)
    plt.ylabel('Features', fontsize=10)

    plt.tight_layout()

def main():
    """
    Run the whole pipeline: load the CSV, preprocess it, train the model,
    evaluate it and build the feature-importance plot.

    Raises SystemExit when the dataset cannot be loaded.
    """
    print("Hello from ems-model!")

    # randomize (shuffle) the data
    # subprocess.run(
    #         ["python", "shuffle.py"], capture_output=True, text=True, check=True
    #     )

    # load from csv file
    # df = load_data("csv/ems_data_randomized.csv")
    df = load_data("csv/ems_data.csv")

    # load_data returns None (after printing an error) when the file is
    # missing; stop here instead of failing later with an opaque TypeError.
    if df is None:
        raise SystemExit("Aborting: training data could not be loaded.")

    # preprocess
    X, y = preprocess_data(df)

    # train
    model, _, X_test, _, y_test = train_model(X, y)

    # evaluate
    evaluate_model(model, X_test, y_test, X, y)

    # feature importance
    feature_importance(model)

    # show evaluation on confusion matrix and feature importance
    # plt.show()

# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()