Update examples for MultiMha

thieu1995 · thieu1995 · commit 50ab11630d0d · 2023-07-10T22:57:44.000+07:00
diff --git a/README.md b/README.md
@@ -236,6 +236,45 @@ print(feat_selector.SUPPORT)
 ```
 Or you better read the document from: https://mafese.readthedocs.io/en/latest/
 
+3) I got this type of error
+```python 
+raise ValueError("Existed at least one new label in y_pred.")
+ValueError: Existed at least one new label in y_pred.
+``` 
+How to solve this?
+
++ This occurs only when you are working on a classification problem with a small dataset that has many classes. For
+  instance, the "Zoo" dataset contains only 101 samples, but it has 7 classes. If you split the dataset into
+  training and testing sets with a ratio of around 80% - 20%, there is a chance that one or more classes will appear
+  in the testing set but not in the training set. As a result, when you calculate the performance metrics, you may
+  encounter this error. A model cannot predict a label that it never saw during training, because it has no
+  knowledge of that label. There are several solutions to this problem:
+
++ 1st: Use the SMOTE method to oversample the minority classes, so that all classes have the same number of samples.
+
+```python 
+from imblearn.over_sampling import SMOTE
+import pandas as pd
+from mafese import Data
+
+dataset = pd.read_csv('examples/dataset.csv', index_col=0).values
+X, y = dataset[:, 0:-1], dataset[:, -1]
+
+X_new, y_new = SMOTE().fit_resample(X, y)
+data = Data(X_new, y_new)
+```
+
++ 2nd: Try a different random_state value in the split_train_test() function.
+```python
+import pandas as pd 
+from mafese import Data 
+
+dataset = pd.read_csv('examples/dataset.csv', index_col=0).values
+X, y = dataset[:, 0:-1], dataset[:, -1]
+data = Data(X, y)
+data.split_train_test(test_size=0.2, random_state=10)   # Try a different random_state value
+```
+
 
 For more usage examples please look at [examples](/examples) folder.
 
diff --git a/examples/wrapper/exam_multimha.py b/examples/wrapper/exam_multimha.py
@@ -11,12 +11,17 @@
 data.split_train_test(test_size=0.2)
 
 list_optimizers = ("OriginalWOA", "OriginalGWO", "OriginalTLO", "OriginalGSKA")
-list_paras = [{"epoch": 50, "pop_size": 30}, ]*4
+list_paras = [
+    {"name": "WOA", "epoch": 20, "pop_size": 30},
+    {"name": "GWO", "epoch": 20, "pop_size": 30},
+    {"name": "TLO", "epoch": 20, "pop_size": 30},
+    {"name": "GSKA", "epoch": 20, "pop_size": 30}
+]
 feat_selector = MultiMhaSelector(problem="classification", estimator="knn",
                             list_optimizers=list_optimizers, list_optimizer_paras=list_paras,
                             transfer_func="vstf_01", obj_name="AS")
 
-feat_selector.fit(data.X_train, data.y_train, n_trials=3, n_jobs=3, verbose=False)
+feat_selector.fit(data.X_train, data.y_train, n_trials=2, n_jobs=2, verbose=False)
 feat_selector.export_boxplot_figures()
 feat_selector.export_convergence_figures()
 
diff --git a/run_fs.py b/run_fs.py
@@ -8,12 +8,12 @@
 from sklearn.svm import SVC
 
 
-data = get_dataset("Arrhythmia")
-data.split_train_test(test_size=0.2)
+data = get_dataset("ecoli")
+data.split_train_test(test_size=0.2, random_state=2)
 print(data.X_train.shape, data.X_test.shape)            # (361, 279) (91, 279)
 
 feat_selector = MhaSelector(problem="classification", estimator="knn",
-                            optimizer="OriginalTLO", optimizer_paras=None,
+                            optimizer="OriginalTLO", optimizer_paras={"epoch": 50, "pop_size": 30},
                             transfer_func="vstf_01", obj_name="AS")
 feat_selector.fit(data.X_train, data.y_train, fit_weights=(0.9, 0.1), verbose=True)
 X_selected = feat_selector.transform(data.X_train)