Preliminary Analysis¶
- Notebook Review:
  - Open the Preliminary_analysis notebook to identify key features.
  - Visualize the data transformations and analyze the trend of the target variable.
  - All decisions in feature engineering are based on this analysis.
import importlib
import custom_utils
importlib.reload(custom_utils)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.encoding import RareLabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
Project Overview¶
Research Objective¶
The primary goal of this project is to predict house prices by applying a comprehensive machine learning workflow. The study focuses on identifying the key factors influencing house prices and developing a robust model that generalizes well to unseen data. This is achieved through detailed exploratory analysis, rigorous feature engineering, feature selection via cross-validation, and prediction using linear regression.
Different Aspects of the Study¶
- Data Acquisition and Cleaning:
  The dataset was downloaded and loaded into a DataFrame, where initial data inconsistencies (e.g., duplicate labels in categorical variables) were addressed. Exploratory Data Analysis (EDA) laid the foundation for the subsequent preprocessing tasks.
- Feature Engineering:
  The workflow includes both numerical and categorical transformations:
  - Numerical features undergo transformations such as logarithmic and square-root scaling, imputation, and binning to normalize distributions and enhance model performance.
  - Categorical variables are treated with custom encoding, rare-label grouping, and one-hot encoding to address sparsity and noisy features.
- Feature Selection:
  Recursive Feature Elimination with Cross-Validation (RFECV) was employed to automatically select the most informative features, ensuring that the final model is not overburdened with irrelevant or redundant predictors.
- Model Training and Evaluation:
  The final model integrates the preprocessing pipelines and uses a linear regression estimator. Model performance is validated through cross-validation and test-set evaluation, with metrics including R² and MSE used to assess prediction accuracy.
- Post Processing and Interpretation:
  The project includes visualizations for model interpretation, where predicted values are compared against actual house prices. Feature importance is assessed from the model coefficients; because the selected features are standardized before the regression step, coefficient magnitudes are roughly comparable as a measure of influence.
Conclusion¶
This project demonstrates an end-to-end approach to building a predictive model for house prices. The blend of advanced feature engineering, effective feature selection, and robust evaluation provides a clear methodology for identifying key drivers of house price variance. The outcome not only highlights strong predictive performance but also offers insights into the relative importance of various features, contributing to informed decision-making in a real estate context.
Dataset Preparation¶
- Data Acquisition:
  - Download the dataset directly from Google Drive and move it into the designated dataset folder (a possible sketch of the download helper is shown below).
- Data Import and Cleaning:
  - Import the dataset into a DataFrame.
  - During exploratory data analysis (EDA), it was observed that the SaleCondition categorical column contains two labels, 'normal' and 'Normal', which refer to the same category. Correct this inconsistency.
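The download helper lives in custom_utils and its implementation is not shown in this notebook. The following is a minimal sketch of what custom_utils.download_dataset could look like, assuming the gdown package is available and the dataset/dataset.csv layout used below; the helper's actual name, signature, and behaviour may differ.

import os

import gdown  # assumption: gdown is used for Google Drive downloads

def download_dataset_sketch(drive_link, output_dir="dataset", filename="dataset.csv"):
    """Download a file shared via a Google Drive link into output_dir (illustrative only)."""
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)
    # fuzzy=True lets gdown extract the file id from a full sharing URL
    gdown.download(drive_link, output_path, quiet=False, fuzzy=True)
    return output_path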
google_drive_link = "https://drive.google.com/file/d/1LqK2BvE6eGKIdbLHXaxN3aLx6dekx6B7/view?usp=drive_link"
custom_utils.download_dataset(google_drive_link)
house_data = pd.read_csv("dataset/dataset.csv")
house_data['SaleCondition'] = house_data['SaleCondition'].replace('normal', 'Normal')
X_train, X_test, y_train, y_test = train_test_split(
house_data.drop('SalePrice', axis=1), # predictive variables
house_data['SalePrice'], # target
test_size=0.1, # portion of dataset to allocate to test set
random_state=0, # we are setting the seed here
)
X_train.shape, X_test.shape
((1314, 21), (146, 21))
FEATURE ENGINEERING¶
- Target Transformation:
  - Apply a logarithmic transformation to the SalePrice variable to reduce skewness in its distribution (a small sanity check of the transform and its inverse is sketched below).
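Because the model is trained on log(SalePrice), predictions and error metrics later in the notebook are mapped back to the original price scale with np.exp. A quick illustrative check (the price value is made up for the example):

import numpy as np

price = 200_000                               # illustrative sale price
log_price = np.log(price)                     # ~12.21, the scale the model is trained on
assert np.isclose(np.exp(log_price), price)   # np.exp inverts the log transform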
Numerical Pipeline¶
- Column Removal and Transformation:
  - Remove HalfBath and LotType, as they do not show significant variance with respect to the target variable based on EDA (refer to Preliminary_analysis for visualization).
  - For the Alley column, NaN values were replaced with 0 and all other values with 1; however, this variable did not significantly impact performance, so it was removed as well.
- House Age Processing:
  - Compute HouseAge as the difference between YearBuilt and the reference YearSold, given its clear trend with SalePrice, and drop the now-redundant year column (see the transformer sketch after this list).
  - Apply a square root transformation to HouseAge to better distribute the data (refer to Preliminary_analysis for visualization).
- Garage Area Imputation:
  - Impute missing values in GarageArea using the mean, keeping the approach simple due to its distribution.
- Additional Transformations:
  - Apply the Yeo-Johnson transformation to LotArea and GrLivArea to bring their distributions closer to Gaussian.
- Variable Binning:
  - Use equal-frequency binning for the continuous numerical variables listed in var_binning to reduce noise in their relationship with SalePrice.
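The temporal and square-root steps rely on custom transformers from custom_utils that are not shown in this notebook. Below is a minimal sketch of how such transformers might be written; the class names mirror the pipeline steps, but the constructor arguments and behaviour are assumptions rather than the project's actual code.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalVariableTransformerSketch(BaseEstimator, TransformerMixin):
    """Replace each year column with its distance from a reference year column."""

    def __init__(self, variables, reference_variable):
        self.variables = variables                     # e.g. ["YearBuilt"]
        self.reference_variable = reference_variable   # e.g. "YearSold"

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the training data
        return self

    def transform(self, X):
        X = X.copy()
        for var in self.variables:
            # e.g. YearSold - YearBuilt -> age of the house at the time of sale
            X[var] = X[self.reference_variable] - X[var]
        return X


class SqrtTransformerSketch(BaseEstimator, TransformerMixin):
    """Apply a square-root transformation to spread out right-skewed variables."""

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.variables] = np.sqrt(X[self.variables])
        return X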
Categorical Pipeline¶
- Handling Rare Categories:
  - Replace infrequent categories in the columns listed in categorical_columns with the label 'Rare'. This grouping aids model generalization and prevents evaluation errors caused by previously unseen labels.
- Foundation Column Processing:
  - In the Foundation column, two labels (PConc and CBlock) account for 82% of the data.
  - A label such as 'Do Not use this Field in the Model' is grouped with the other infrequent labels, as these are considered sensitive or outliers.
  - Apply an encoder that keeps at most three categories (the two dominant labels plus 'Rare').
- Garage Type Imputation:
  - Impute missing values in the GarageType column using a custom probabilistic imputer that follows the training data distribution, since this feature influences SalePrice significantly (a sketch of such an imputer follows this list).
- Label Mapping:
  - Map labels based on a predefined encoding dictionary. The ordinal values are chosen according to the visual trend observed with SalePrice (higher values indicate a higher sale price); a sketch of the mapper appears after the encoding_dict definition below.
- One-Hot Encoding:
  - Apply one-hot encoding to low-cardinality features that do not exhibit an ordinal relationship with SalePrice.
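The probabilistic imputer referenced above is another helper from custom_utils that is not shown in this notebook. Below is a minimal sketch of how it might work, assuming it fills NaN values by sampling labels with the fixed probabilities passed in; the actual implementation may differ (for example, it could estimate the probabilities from the training data during fit).

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ProbabilisticCategoricalImputerSketch(BaseEstimator, TransformerMixin):
    """Fill NaN in categorical columns by sampling labels with fixed probabilities."""

    def __init__(self, variables, probabilities, random_state=0):
        # Accept a single column name or a list of names (assumption)
        self.variables = [variables] if isinstance(variables, str) else variables
        self.probabilities = probabilities  # e.g. {'Attchd': 0.60, 'Detchd': 0.25, ...}
        self.random_state = random_state

    def fit(self, X, y=None):
        # Nothing is learned here: the sampling distribution is supplied up front
        return self

    def transform(self, X):
        X = X.copy()
        rng = np.random.default_rng(self.random_state)
        labels = list(self.probabilities.keys())
        probs = list(self.probabilities.values())
        for col in self.variables:
            mask = X[col].isna()
            # Draw replacement labels according to the supplied distribution
            X.loc[mask, col] = rng.choice(labels, size=int(mask.sum()), p=probs)
        return X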
# Target variable transformation:
y_train = np.log(y_train)
y_test = np.log(y_test)
var_remove = ['HalfBath', 'Alley', 'LotType']
var_binning = ['LotArea', 'GrLivArea', 'TotalBsmtSF', 'GarageArea']
categorical_columns = [
'SaleType',
'HouseStyle',
'SaleCondition',
'Foundation',
'GarageType',
'BldgType',
'Street',
'CentralAir'
]
var_onehot_encode = ['SaleCondition', 'CentralAir', 'Foundation']
encoding_dict = {
'SaleType': {'Rare': 0, 'WD': 1, 'New': 2},
'HouseStyle': {'Rare': 0, '1.5Fin': 1, '1Story': 2, '2Story': 3},
'GarageType': {'Rare': 0, 'Detchd': 0, 'Attchd': 1, 'BuiltIn': 2},
'BldgType': {'Rare': 0, 'TwnhsE': 1, '1Fam': 2},
'Street': {'Pave': 0, 'Rare': 1}
}
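The mapping above is applied by custom_utils.LabelMapper inside the categorical pipeline. Its implementation is not shown here; the following is a minimal sketch of how such a mapper could work, and the real class may handle unseen labels differently.

from sklearn.base import BaseEstimator, TransformerMixin

class LabelMapperSketch(BaseEstimator, TransformerMixin):
    """Replace categorical labels with ordinal codes from a per-column mapping."""

    def __init__(self, mapping):
        self.mapping = mapping  # e.g. {'SaleType': {'Rare': 0, 'WD': 1, 'New': 2}, ...}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col, col_map in self.mapping.items():
            # Unmapped labels become NaN, which makes unexpected categories easy to spot
            X[col] = X[col].map(col_map)
        return X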
numerical_pipeline = Pipeline([
# Step 1: Remove specified columns
('column_remover', custom_utils.ColumnRemover(variables_to_remove=var_remove)),
# Step 2: Compute difference between YearBuilt and YearSold
('year_transformer', custom_utils.TemporalVariableTransformer(variables=["YearBuilt"], reference_variable="YearSold")),
# Renaming and dropping step
('rename_drop', custom_utils.RenameDropTransformer(rename_dict={"YearBuilt": "HouseAge"}, drop_cols=["YearSold"])),
# Step 3: Apply sqrt transformation to HouseAge
('sqrt_transformer', custom_utils.SqrtTransformer(variables=["HouseAge"])),
# Step 4: Impute missing GarageArea values with the mean
('median_imputer', MeanMedianImputer(imputation_method='mean', variables=['GarageArea'])),
# Step 5: Apply Yeo-Johnson transformation
('yeo_johnson', YeoJohnsonTransformer(variables=['LotArea', 'GrLivArea'])),
# Step 6: Apply equal frequency discretization
('discretiser', EqualFrequencyDiscretiser(variables=var_binning, q=200))
])
# Create the categorical pipeline
categorical_pipeline = Pipeline([
# Step 7: Rare-label encoding for categorical variables
('rare_label_encoder', RareLabelEncoder(n_categories=1, replace_with='Rare', missing_values='ignore',
variables=categorical_columns)),
# Step 8: Special treatment for Foundation column
('foundation_rare_encoder', RareLabelEncoder(n_categories=1, max_n_categories=2,
replace_with='Rare', missing_values='ignore',
variables=['Foundation'])),
# Step 9: Impute NaN values in GarageType with labels drawn according to the training distribution
('probabilistic_imputation', custom_utils.ProbabilisticCategoricalImputer(
variables='GarageType',
probabilities={'Attchd': 0.60, 'Detchd': 0.25, 'BuiltIn': 0.05, 'Rare': 0.10}
)),
# Step 10: Apply custom label mapping
('label_mapper', custom_utils.LabelMapper(mapping=encoding_dict)),
# Step 11: Apply one hot encoding
('onehot_encoder', custom_utils.OneHotEncodingTransformer(columns=var_onehot_encode))
])
preprocessing_pipeline = Pipeline([
('numerical_transformer', numerical_pipeline),
('categorical_transformer', categorical_pipeline)
])
preprocessed_X_train = preprocessing_pipeline.fit_transform(X_train)
FEATURE SELECTION¶
- Methodology:
  - Employed RFECV from scikit-learn to perform automatic feature selection based on cross-validation scores.
  - Utilized 5-fold cross-validation to evaluate model performance.
  - Visualized the number of selected features against the mean cross-validated MSE.
- Integration:
  - Incorporated this feature selection step into the final pipeline for automated feature selection.
preprocessed_X_train.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1314 entries, 930 to 684
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   LotArea                1314 non-null   int64
 1   GrLivArea              1314 non-null   int64
 2   Street                 1314 non-null   int64
 3   BldgType               1314 non-null   int64
 4   HouseStyle             1314 non-null   int64
 5   OverallQuality         1314 non-null   int64
 6   OverallCondition       1314 non-null   int64
 7   HouseAge               1314 non-null   float64
 8   TotalBsmtSF            1314 non-null   int64
 9   FullBath               1314 non-null   int64
 10  GarageType             1314 non-null   int64
 11  GarageCars             1314 non-null   int64
 12  GarageArea             1314 non-null   int64
 13  SaleType               1314 non-null   int64
 14  SaleCondition_Abnorml  1314 non-null   float64
 15  SaleCondition_Normal   1314 non-null   float64
 16  SaleCondition_Partial  1314 non-null   float64
 17  SaleCondition_Rare     1314 non-null   float64
 18  CentralAir_N           1314 non-null   float64
 19  CentralAir_Y           1314 non-null   float64
 20  Foundation_CBlock      1314 non-null   float64
 21  Foundation_PConc       1314 non-null   float64
 22  Foundation_Rare        1314 non-null   float64
dtypes: float64(10), int64(13)
memory usage: 246.4 KB
# Set up RFECV for regression
min_features_to_select = 1
estimator = LinearRegression()
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Use neg_mean_squared_error as the scoring metric for regression
rfecv = RFECV(
estimator=estimator,
step=1,
cv=cv,
scoring="neg_mean_squared_error",
min_features_to_select=min_features_to_select,
n_jobs=2
)
# Fit RFECV
rfecv.fit(preprocessed_X_train, y_train)
# Print optimal number of features
print(f"Optimal number of features: {rfecv.n_features_}")
# Plot number of features VS cross-validation scores
cv_results = pd.DataFrame(rfecv.cv_results_)
plt.figure(figsize=(10, 6))
plt.xlabel("Number of features selected")
plt.ylabel("Mean cross-validated MSE")
plt.plot(
cv_results["n_features"],
-cv_results["mean_test_score"], # Convert negative MSE back to positive for plotting
)
plt.fill_between(
cv_results["n_features"],
-cv_results["mean_test_score"] - cv_results["std_test_score"],
-cv_results["mean_test_score"] + cv_results["std_test_score"],
alpha=0.3
)
plt.title("Recursive Feature Elimination with Cross-Validation")
plt.grid(True)
plt.show()
Optimal number of features: 22
MODEL TRAINING¶
- Pipeline Composition:
  - Integrated feature engineering, feature selection, normalization, and a linear regression model into a unified pipeline.
  - Trained the complete pipeline end-to-end to streamline the modeling process.
feature_selector = RFECV(
estimator=LinearRegression(),
step=1, # Remove one feature at a time
cv=5, # 5-fold cross-validation
scoring='r2', # Use R² score as the evaluation metric
min_features_to_select=5 # Optionally set a minimum number of features to keep
)
ML_pipeline = Pipeline([
('feature_engineer', preprocessing_pipeline),
('feature_selector', feature_selector),
('normalizer', StandardScaler()),
('linear_regression', LinearRegression())
])
# Fit the complete pipeline on the training data
ML_pipeline.fit(X_train, y_train)
Pipeline(steps=[('feature_engineer', Pipeline(steps=[('numerical_transformer', Pipeline(steps=[('column_remover', ColumnRemover(variables_to_remove=['HalfBath', 'Alley', 'LotType'])), ('year_transformer', TemporalVariableTransformer(reference_variable='YearSold', variables=['YearBuilt'])), ('rename_drop', RenameDropTransformer(drop_cols=['YearSold'], rename_d... 'Rare': 0}, 'SaleType': {'New': 2, 'Rare': 0, 'WD': 1}, 'Street': {'Pave': 0, 'Rare': 1}})), ('onehot_encoder', OneHotEncodingTransformer(columns=['SaleCondition', 'CentralAir', 'Foundation']))]))])), ('feature_selector', RFECV(cv=5, estimator=LinearRegression(), min_features_to_select=5, scoring='r2')), ('normalizer', StandardScaler()), ('linear_regression', LinearRegression())])
MODEL EVALUATION¶
- Cross-Validation:
  - Conducted cross-validation to assess various performance metrics on both the training and validation sets.
  - Achieved a high R² score of 0.83 (±0.008) during training.
- Test Set Performance:
  - Evaluated the test dataset, obtaining a high R² score of 0.80 (a sketch of the metrics helper used below follows this list).
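Test-set metrics below come from custom_utils.calculate_metrics, whose implementation is not shown in this notebook. The following is a minimal sketch of what it might compute, based on the keys used later ('MSE', 'RMSE', 'MAE', 'R²', 'Residuals'); the real helper may differ.

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def calculate_metrics_sketch(y_true, y_pred):
    """Return a dictionary of regression metrics on the original price scale."""
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R²': r2_score(y_true, y_pred),
        # Residuals are kept alongside the scalar metrics but skipped when printing below
        'Residuals': np.asarray(y_true) - np.asarray(y_pred),
    }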
def custom_r2(y_true, y_pred):
return r2_score(np.exp(y_true), np.exp(y_pred))
def custom_mse(y_true, y_pred):
return mean_squared_error(np.exp(y_true), np.exp(y_pred))
# Define scoring metrics
scoring = {
'R2': make_scorer(custom_r2),
'MSE': make_scorer(custom_mse, greater_is_better=False)
}
# Set up a KFold cross-validator
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Run cross-validation on the training set
cv_results = cross_validate(ML_pipeline, X_train, y_train, cv=kfold, scoring=scoring, return_train_score=True)
test_pred = ML_pipeline.predict(X_test)
train_pred = ML_pipeline.predict(X_train)
test_metrics = custom_utils.calculate_metrics(np.exp(y_test), np.exp(test_pred))
# Display metrics
# Display the cross-validation results
print("\nCross-Validation Performance Metrics:")
print("=" * 50)
# Format and display metrics in a structured table
print(f"{'Metric':<10} {'Dataset':<10} {'Mean':<12} {'Std':<12}")
print("-" * 50)
# R2 scores
train_r2_scores = cv_results['train_R2']
print(f"{'R2':<10} {'Train':<10} {np.mean(train_r2_scores):.4f} {np.std(train_r2_scores):.4f}")
valid_r2_scores = cv_results['test_R2']
print(f"{'R2':<10} {'Valid':<10} {np.mean(valid_r2_scores):.4f} {np.std(valid_r2_scores):.4f}")
# MSE scores
train_mse_scores = cv_results['train_MSE']
print(f"{'MSE':<10} {'Train':<10} {np.mean(train_mse_scores):.4f} {np.std(train_mse_scores):.4f}")
valid_mse_scores = cv_results['test_MSE']
print(f"{'MSE':<10} {'Valid':<10} {np.mean(valid_mse_scores):.4f} {np.std(valid_mse_scores):.4f}")
# Display the test set performance in a nicely formatted way
print("\nTest Set Prediction Performance:")
print("=" * 50)
print(f"{'Metric':<15} {'Value':<10}")
print("-" * 50)
for metric, value in test_metrics.items():
if metric != 'Residuals':
print(f"{metric:<15} {value:.4f}")
Cross-Validation Performance Metrics:
==================================================
Metric     Dataset    Mean         Std
--------------------------------------------------
R2         Train      0.8331 0.0079
R2         Valid      0.8279 0.0325
MSE        Train      -1017217834.3478 55232596.0199
MSE        Valid      -1056092951.6024 249124040.5176

Test Set Prediction Performance:
==================================================
Metric          Value
--------------------------------------------------
MSE             1586941812.5927
RMSE            39836.4383
MAE             19739.6539
R²              0.8026
POST PROCESSING¶
plt.figure(figsize=(16, 12))
# Plot 1: Predicted vs Actual (Training set)
plt.subplot(2, 2, 1)
plt.scatter(np.exp(y_train), np.exp(train_pred), alpha=0.5)
plt.plot([np.exp(y_train).min(), np.exp(y_train).max()],
[np.exp(y_train).min(), np.exp(y_train).max()],
'r--', lw=2)
plt.xlabel('Actual House Price')
plt.ylabel('Predicted House Price')
plt.title('Training Set: Predicted vs Actual Prices')
plt.grid(True)
# Plot 2: Predicted vs Actual (Test set)
plt.subplot(2, 2, 2)
plt.scatter(np.exp(y_test), np.exp(test_pred), alpha=0.5, color='green')
plt.plot([np.exp(y_test).min(), np.exp(y_test).max()],
[np.exp(y_test).min(), np.exp(y_test).max()],
'r--', lw=2)
plt.xlabel('Actual House Price')
plt.ylabel('Predicted House Price')
plt.title('Test Set: Predicted vs Actual Prices')
plt.grid(True)
try:
# First transform X_train with the preprocessing pipeline
X_train_preprocessed = ML_pipeline.named_steps['feature_engineer'].transform(X_train)
# Get the feature selection mask from the fitted feature selector
selection_mask = ML_pipeline.named_steps['feature_selector'].support_
# Get the number of features
n_features_selected = np.sum(selection_mask)
# Use the actual column names from the transformed DataFrame
# (the preprocessing pipeline returns a pandas DataFrame, as shown by preprocessed_X_train.info() above)
preprocessed_feature_names = list(X_train_preprocessed.columns)
# Get the selected feature names
selected_feature_names = np.array(preprocessed_feature_names)[selection_mask]
print(f"Number of features after preprocessing: {len(preprocessed_feature_names)}")
print(f"Number of features selected: {n_features_selected}")
except Exception as e:
print(f"Method 1 failed: {e}")
# Fallback option
selected_feature_names = [f"Feature_{i}" for i in range(len(ML_pipeline.named_steps['linear_regression'].coef_))]
# Get coefficients from the linear regression model
coefficients = ML_pipeline.named_steps['linear_regression'].coef_
print(f"Number of coefficients: {len(coefficients)}")
# Create a DataFrame with selected feature names and coefficients
coef_df = pd.DataFrame({
'Feature': selected_feature_names,
'Coefficient': coefficients
})
# Sort by absolute coefficient value to determine importance
coef_df['abs_coef'] = coef_df['Coefficient'].abs()
top_features = coef_df.sort_values(by='abs_coef', ascending=False).head(15)
# Create visualization
plt.figure(figsize=(12, 8))
colors = ['#3498db' if coef >= 0 else '#e74c3c' for coef in top_features['Coefficient']]
bars = plt.barh(top_features['Feature'], top_features['Coefficient'], color=colors)
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Top 15 Features by Importance in Linear Regression Model')
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.5) # Add a vertical line at x=0
plt.gca().invert_yaxis() # Show highest importance at the top
plt.tight_layout()
plt.show()
Number of features after preprocessing: 23
Number of features selected: 22
Number of coefficients: 22