mirror of
https://github.com/microsoft/FLAML.git
synced 2026-02-09 02:09:16 +08:00
* Initial plan * Fix training/test set overlap in holdout classification by only adding missing labels when needed Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Address code review feedback: add bounds checking and fix edge cases Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix bounds checking: use correct comparison operator for array indexing Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix potential ValueError with max() on empty lists and simplify test assertions Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add extra bounds checking for label_matches indices Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix pandas_on_spark compatibility by using iloc_pandas_on_spark util method Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Run pre-commit to fix formatting issues Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Improve missing label handling to avoid overlap when possible For classes with multiple instances that end up in one set, properly re-split them between train and val instead of duplicating. Only add to both sets when the class has exactly 1 instance (unavoidable overlap). Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix: ensure at least 1 instance remains in original set when re-splitting Addresses comments on lines 580 and 724 - prevents moving ALL instances from validation to training or vice versa by using min(num_instances - 1, ...). 
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Refactor: Extract missing label handling into two methods (fast/slow) - Created _handle_missing_labels_fast(): Fast version that adds first instance to both sets (may overlap) - Created _handle_missing_labels_no_overlap(): Precise version that avoids overlap when possible - Added allow_label_overlap parameter to AutoML.fit() (default=True for fast version) - Updated documentation with new parameter - Both versions maintain label completeness while offering speed/precision tradeoff Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add usage example for allow_label_overlap to Best-Practices.md - Added comprehensive documentation in Classification section - Included code examples showing both fast and precise versions - Explained tradeoffs between speed and precision - Noted that parameter only affects holdout evaluation Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Address code review feedback: update documentation and tests - Updated docstrings to clarify fast version only adds instances to missing sets - Fixed documentation to reflect actual behavior (not "both sets" but "set with missing label") - Completely rewrote test_no_overlap.py to test both allow_label_overlap modes - Added tests with sample_weights for better code coverage - Added test for single-instance class handling - All 5 tests passing Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix AttributeError: initialize _allow_label_overlap in settings and retrain_from_log - Added allow_label_overlap to settings initialization with default=True - Added parameter defaulting in fit() method to use settings value if not provided - Added _allow_label_overlap initialization in retrain_from_log method - Fixes test failures in test_multiclass, test_regression, and spark tests Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add docstring to fit() 
--------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> Co-authored-by: Li Jiang <bnujli@gmail.com> Co-authored-by: Li Jiang <lijiang1@microsoft.com>
273 lines
8.9 KiB
Python
273 lines
8.9 KiB
Python
"""Test to ensure correct label overlap handling for classification tasks"""
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.datasets import load_iris, make_classification
|
|
|
|
from flaml import AutoML
|
|
|
|
|
|
def test_allow_label_overlap_true():
    """Test with allow_label_overlap=True (fast mode, default)"""
    # Load the iris dataset as a pandas frame.
    frame = load_iris(as_frame=True)["frame"]

    # Prepare features / target.
    feature_cols = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]
    x_train = frame[feature_cols].to_numpy()
    y_train = frame["target"]

    # Train with fast mode (default).
    automl = AutoML()
    automl.fit(
        x_train,
        y_train,
        max_iter=5,
        metric="accuracy",
        task="classification",
        estimator_list=["lgbm"],
        eval_method="holdout",
        split_type="stratified",
        keep_search_state=True,
        retrain_full=False,
        auto_augment=False,
        verbose=0,
        allow_label_overlap=True,  # Fast mode
    )

    # Check results: together the split sets must cover at least the input.
    input_size = len(x_train)
    train_size = len(automl._state.X_train)
    val_size = len(automl._state.X_val)

    # With stratified split on balanced data, fast mode may have no overlap
    assert (
        train_size + val_size >= input_size
    ), f"Inconsistent sizes. Input: {input_size}, Train: {train_size}, Val: {val_size}"

    # Verify all classes are represented in both sets
    train_labels = set(np.unique(automl._state.y_train))
    val_labels = set(np.unique(automl._state.y_val))
    all_labels = set(np.unique(y_train))

    assert train_labels == all_labels, f"Not all labels in train. All: {all_labels}, Train: {train_labels}"
    assert val_labels == all_labels, f"Not all labels in val. All: {all_labels}, Val: {val_labels}"

    print(
        f"✓ Test passed (fast mode): Input: {input_size}, Train: {train_size}, Val: {val_size}, "
        f"Overlap: {train_size + val_size - input_size}"
    )
|
|
|
|
|
|
def test_allow_label_overlap_false():
    """Test with allow_label_overlap=False (precise mode)"""
    # Load the iris dataset as a pandas frame.
    frame = load_iris(as_frame=True)["frame"]

    # Prepare features / target.
    feature_cols = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]
    x_train = frame[feature_cols].to_numpy()
    y_train = frame["target"]

    # Train with precise mode.
    automl = AutoML()
    automl.fit(
        x_train,
        y_train,
        max_iter=5,
        metric="accuracy",
        task="classification",
        estimator_list=["lgbm"],
        eval_method="holdout",
        split_type="stratified",
        keep_search_state=True,
        retrain_full=False,
        auto_augment=False,
        verbose=0,
        allow_label_overlap=False,  # Precise mode
    )

    # Check that there's no overlap (or minimal overlap for single-instance classes)
    input_size = len(x_train)
    train_size = len(automl._state.X_train)
    val_size = len(automl._state.X_val)

    all_labels = set(np.unique(y_train))

    # Should have no overlap or minimal overlap
    overlap = train_size + val_size - input_size
    assert overlap <= len(all_labels), f"Excessive overlap: {overlap}"

    # Verify all classes are represented across the two sets combined.
    train_labels = set(np.unique(automl._state.y_train))
    val_labels = set(np.unique(automl._state.y_val))

    combined_labels = train_labels.union(val_labels)
    assert combined_labels == all_labels, f"Not all labels present. All: {all_labels}, Combined: {combined_labels}"

    print(
        f"✓ Test passed (precise mode): Input: {input_size}, Train: {train_size}, Val: {val_size}, "
        f"Overlap: {overlap}"
    )
|
|
|
|
|
|
def test_uniform_split_with_overlap_control():
    """Test precise overlap mode (allow_label_overlap=False) with a uniform split.

    NOTE: the original docstring claimed "both overlap modes" were tested here,
    but only the precise mode is exercised; the fast mode is covered by
    test_allow_label_overlap_true. This test guards the uniform-split code
    path, which handles missing labels separately from stratified splits.
    """
    # Load iris dataset
    dic_data = load_iris(as_frame=True)
    iris_data = dic_data["frame"]

    # Prepare data
    x_train = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]].to_numpy()
    y_train = iris_data["target"]

    # Test precise mode with uniform split
    automl = AutoML()
    automl_settings = {
        "max_iter": 5,
        "metric": "accuracy",
        "task": "classification",
        "estimator_list": ["lgbm"],
        "eval_method": "holdout",
        "split_type": "uniform",
        "keep_search_state": True,
        "retrain_full": False,
        "auto_augment": False,
        "verbose": 0,
        "allow_label_overlap": False,  # Precise mode
    }
    automl.fit(x_train, y_train, **automl_settings)

    input_size = len(x_train)
    train_size = len(automl._state.X_train)
    val_size = len(automl._state.X_val)

    # Verify all classes are represented across train and val combined.
    train_labels = set(np.unique(automl._state.y_train))
    val_labels = set(np.unique(automl._state.y_val))
    all_labels = set(np.unique(y_train))

    combined_labels = train_labels.union(val_labels)
    assert combined_labels == all_labels, "Not all labels present with uniform split"

    print(f"✓ Test passed (uniform split): Input: {input_size}, Train: {train_size}, Val: {val_size}")
|
|
|
|
|
|
def test_with_sample_weights():
    """Test label overlap handling with sample weights.

    Runs both the fast (allow_label_overlap=True) and precise
    (allow_label_overlap=False) modes on a synthetic 3-class dataset with
    per-sample weights, and asserts label completeness in each case.
    """
    # Create a simple dataset
    X, y = make_classification(
        n_samples=200,
        n_features=10,
        n_informative=5,
        n_redundant=2,
        n_classes=3,
        n_clusters_per_class=1,
        random_state=42,
    )

    # Create sample weights (giving more weight to some samples).
    # Seeded generator: the original used the unseeded global np.random state,
    # which made the test non-reproducible between runs.
    rng = np.random.default_rng(42)
    sample_weight = rng.uniform(0.5, 2.0, size=len(y))

    # Test fast mode with sample weights
    automl_fast = AutoML()
    automl_fast.fit(
        X,
        y,
        task="classification",
        metric="accuracy",
        estimator_list=["lgbm"],
        eval_method="holdout",
        split_type="stratified",
        max_iter=3,
        keep_search_state=True,
        retrain_full=False,
        auto_augment=False,
        verbose=0,
        allow_label_overlap=True,  # Fast mode
        sample_weight=sample_weight,
    )

    # Verify all labels present in BOTH sets (fast mode guarantees this).
    train_labels_fast = set(np.unique(automl_fast._state.y_train))
    val_labels_fast = set(np.unique(automl_fast._state.y_val))
    all_labels = set(np.unique(y))

    assert train_labels_fast == all_labels, "Not all labels in train (fast mode with weights)"
    assert val_labels_fast == all_labels, "Not all labels in val (fast mode with weights)"

    # Test precise mode with sample weights
    automl_precise = AutoML()
    automl_precise.fit(
        X,
        y,
        task="classification",
        metric="accuracy",
        estimator_list=["lgbm"],
        eval_method="holdout",
        split_type="stratified",
        max_iter=3,
        keep_search_state=True,
        retrain_full=False,
        auto_augment=False,
        verbose=0,
        allow_label_overlap=False,  # Precise mode
        sample_weight=sample_weight,
    )

    # Verify all labels present across train and val combined (precise mode
    # only guarantees combined coverage, not per-set coverage).
    train_labels_precise = set(np.unique(automl_precise._state.y_train))
    val_labels_precise = set(np.unique(automl_precise._state.y_val))

    combined_labels = train_labels_precise.union(val_labels_precise)
    assert combined_labels == all_labels, "Not all labels present (precise mode with weights)"

    print("✓ Test passed with sample weights (fast and precise modes)")
|
|
|
|
|
|
def test_single_instance_class():
    """Test handling of single-instance classes.

    With allow_label_overlap=False, a class that has exactly one instance
    cannot be split between train and val, so the single instance must be
    placed in both sets (unavoidable overlap).
    """
    # Create imbalanced dataset where one class has only 1 instance.
    # Seeded generator: the original used unseeded np.random.randn, which
    # made the test fixture non-reproducible between runs.
    rng = np.random.default_rng(42)
    X = rng.standard_normal((50, 4))
    y = np.array([0] * 40 + [1] * 9 + [2] * 1)  # Class 2 has only 1 instance

    # Test precise mode - should add single instance to both sets
    automl = AutoML()
    automl.fit(
        X,
        y,
        task="classification",
        metric="accuracy",
        estimator_list=["lgbm"],
        eval_method="holdout",
        split_type="uniform",
        max_iter=3,
        keep_search_state=True,
        retrain_full=False,
        auto_augment=False,
        verbose=0,
        allow_label_overlap=False,  # Precise mode
    )

    # Verify all labels present
    train_labels = set(np.unique(automl._state.y_train))
    val_labels = set(np.unique(automl._state.y_val))
    all_labels = set(np.unique(y))

    # Single-instance class should be in both sets
    combined_labels = train_labels.union(val_labels)
    assert combined_labels == all_labels, "Not all labels present with single-instance class"

    # Check that single-instance class (label 2) is in both sets
    assert 2 in train_labels, "Single-instance class not in train"
    assert 2 in val_labels, "Single-instance class not in val"

    print("✓ Test passed with single-instance class")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run every test in declaration order when executed as a script.
    for _test in (
        test_allow_label_overlap_true,
        test_allow_label_overlap_false,
        test_uniform_split_with_overlap_control,
        test_with_sample_weights,
        test_single_instance_class,
    ):
        _test()
    print("\n✓ All tests passed!")
|