Not only does this protect against multicollinearity, but it also keeps the feature space manageable: if we one-hot encoded every current column, we'd end up with around 1,800 columns, which would be impractical and slow, so this approach is better.
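For context, that figure can be approximated directly from the data. The snippet below is a minimal sketch (it assumes data_filtered is the working DataFrame from the previous step and treats the continuous columns listed in the code below as the only non-categorical ones); it sums the number of unique levels across the remaining columns to estimate how wide a fully one-hot-encoded design matrix would be.
# Sketch: estimate how many columns full one-hot encoding would produce
continuous_cols = ['MAXVO21_', 'FC601_', 'x_BMI5', 'DROCDY4_', 'x_DRNKWK2', 'PHYSHLTH']
one_hot_width = sum(
    data_filtered[col].nunique()  # One dummy column per category level
    for col in data_filtered.columns
    if col not in continuous_cols
)
print(f"Estimated one-hot encoded width: {one_hot_width} columns")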
# Perform iterative correlation testing to determine which columns to keep
import numpy as np  # Used for the upper-triangle mask below
import pandas as pd  # Used for crosstabs and the Cramér's V matrix below
from scipy.stats import chi2_contingency  # Chi-square test behind Cramér's V
# Step 1: Compute correlation matrix for continuous predictors
continuous_vars = ['MAXVO21_', 'FC601_', 'x_BMI5', 'DROCDY4_', 'x_DRNKWK2', 'PHYSHLTH'] # Known continuous predictors
categorical_vars = [col for col in data_filtered.columns if col not in continuous_vars] # For Cramer's V (next)
corr_matrix = data_filtered[continuous_vars].corr().abs() # Absolute correlations
# Step 2: Remove duplicate correlations (upper triangle)
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Step 3: Identify predictors to drop based on threshold
threshold = 0.7 # Set correlation threshold to avoid high redundancy
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]
print(f"Columns to drop due to high correlation: {to_drop}")
Iterative correlation testing on our continuous variables identified one column to drop (FC601_); after removing it, along with the low-variance columns flagged earlier, we have 109 columns left, including our dependent variable (MENTHLTH).
# Print number of columns before removal
print(f"Columns before removal: {data_filtered.shape[1]}")
# Remove the columns identified above
data_filtered = data_filtered.drop(to_drop + low_variance_vars, axis=1)
# Remove dropped variables from lists
continuous_vars = [col for col in continuous_vars if col not in to_drop + low_variance_vars]
categorical_vars = [col for col in categorical_vars if col not in to_drop + low_variance_vars]
# Print number of columns after removal
print(f"Columns after removal: {data_filtered.shape[1]}")
# Function to compute Cramér's V
def cramers_v(x, y):
    """Calculate Cramér's V for two categorical variables."""
    confusion_matrix = pd.crosstab(x, y)  # Create a contingency table
    chi2, _, _, _ = chi2_contingency(confusion_matrix)  # Perform Chi-Square test
    n = confusion_matrix.sum().sum()  # Total number of observations
    phi2 = chi2 / n  # Compute phi-squared
    r, k = confusion_matrix.shape  # Number of rows and columns in the table
    phi2_corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))  # Bias-corrected phi-squared
    r_corr = max(1, r - ((r - 1) ** 2) / (n - 1))  # Bias-corrected row count (floored at 1)
    k_corr = max(1, k - ((k - 1) ** 2) / (n - 1))  # Bias-corrected column count (floored at 1)
    denominator = min((k_corr - 1), (r_corr - 1))
    if denominator <= 0:  # Avoid division by zero for degenerate tables
        return np.nan
    return np.sqrt(phi2_corr / denominator)  # Return bias-corrected Cramér's V
# Create a matrix to store Cramér's V values
cramers_matrix = pd.DataFrame(index=categorical_vars, columns=categorical_vars)
# Compute Cramér's V for all pairs of categorical variables
for col1 in categorical_vars:
    for col2 in categorical_vars:
        if col1 == col2:
            cramers_matrix.loc[col1, col2] = 0.0  # Self-correlation set to 0 for comparison
        else:
            cramers_matrix.loc[col1, col2] = cramers_v(data_filtered[col1], data_filtered[col2])
# Convert matrix values to floats for further processing
cramers_matrix = cramers_matrix.astype(float)
# Print Cramér's V matrix
print("Cramér's V Matrix:")
print(cramers_matrix)
# Identify variables with high association (V > 0.7)
threshold = 0.7 # Set threshold for high association
to_drop_cat = []
# Iterate over columns and ignore self-correlation (diagonal values)
for col in cramers_matrix.columns:
    high_association = cramers_matrix[col][cramers_matrix.index != col] > threshold  # Exclude diagonal
    if any(high_association):
        to_drop_cat.append(col)
print(f"Categorical variables to drop (V > {threshold}): {to_drop_cat}")