# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import KBinsDiscretizer
# Set styling for plots
# plt.style.use('seaborn')
Introduction
Data discretization is a powerful technique that can significantly improve your machine learning models’ performance. In this tutorial, we’ll explore practical implementations with real-world examples and visualizations.
Basic Age Discretization Example
Let’s start with a simple example using age data:
# Create sample data: 1000 ages drawn from N(40, 15), clipped to [18, 90]
np.random.seed(42)
data = {'age': np.random.normal(40, 15, 1000).clip(18, 90)}
df = pd.DataFrame(data)

# Create age bins (edges) and one human-readable label per interval
age_bins = [18, 30, 45, 60, 75, 90]
age_labels = ['18-30', '31-45', '46-60', '61-75', '76-90']
# include_lowest=True closes the first interval on the left. Without it the
# intervals are (18, 30], (30, 45], ... and every age clipped to exactly 18
# would fall outside all bins and become NaN.
df['age_group'] = pd.cut(
    df['age'], bins=age_bins, labels=age_labels, include_lowest=True
)
# Visualize the distribution: raw ages (left) vs. discretized groups (right)
plt.figure(figsize=(8, 6))

# Left panel: histogram of the continuous age values
plt.subplot(1, 2, 1)
sns.histplot(data=df, x='age', bins=20)
plt.title('Original Age Distribution')

# Right panel: number of rows in each age group
plt.subplot(1, 2, 2)
sns.countplot(data=df, x='age_group')
plt.title('Discretized Age Groups')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()
Advanced Discretization Techniques
1. Equal-Width Binning Using KBinsDiscretizer
# Generate sample data: right-skewed incomes with a floor of 30,000
np.random.seed(42)
data = {
    'income': np.random.exponential(50000, 1000) + 30000
}
df_income = pd.DataFrame(data)
# Apply KBinsDiscretizer: 5 equal-width bins, encoded as ordinal codes 0..4.
# subsample=None disables subsampling explicitly, which is what the
# scikit-learn FutureWarning about the changing default asks for.
est = KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='uniform', subsample=None
)
# fit_transform returns an (n_samples, 1) array; flatten it so a plain 1-D
# column is assigned.
df_income['income_binned'] = est.fit_transform(df_income[['income']]).ravel()
# Visualize: raw incomes (left) vs. equal-width bin codes (right)
plt.figure(figsize=(8, 6))

# Left panel: histogram of the continuous income values
plt.subplot(1, 2, 1)
sns.histplot(data=df_income, x='income', bins=30)
plt.title('Original Income Distribution')

# Right panel: histogram of the ordinal bin codes
plt.subplot(1, 2, 2)
sns.histplot(data=df_income, x='income_binned', bins=5)
plt.title('Discretized Income (Equal-Width)')

plt.tight_layout()
plt.show()
/Users/fabianlanderos/miniforge3/envs/quarto/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py:248: FutureWarning:
In version 1.5 onwards, subsample=200_000 will be used by default. Set subsample explicitly to silence this warning in the mean time. Set subsample=None to disable subsampling explicitly.
2. Quantile-Based Discretization
# Apply quantile-based discretization: 5 bins with (roughly) equal row counts
df_income['income_quantile'] = pd.qcut(
    df_income['income'], q=5, labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
)

# Calculate mean income per quantile.
# observed=False is passed explicitly because pandas is changing the default
# for categorical groupers; this keeps every category in the result.
quantile_means = df_income.groupby('income_quantile', observed=False)['income'].mean()
# Visualize: one bar per quantile, bar height = mean income in that quantile
plt.figure(figsize=(10, 6))
quantile_means.plot(kind='bar')
plt.title('Mean Income by Quantile')
plt.xlabel('Income Quantile')
plt.ylabel('Mean Income ($)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
/var/folders/nq/y2lyg7p15txfm5ksrw6f90340000gn/T/ipykernel_59379/873430559.py:5: FutureWarning:
The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Real-World Application: Housing Data
Let’s demonstrate the power of discretization with a housing dataset example:
# Create synthetic housing data: uniform coordinates, right-skewed prices
np.random.seed(42)
n_samples = 1000

housing_data = pd.DataFrame({
    'Latitude': np.random.uniform(32.0, 42.0, n_samples),
    'Longitude': np.random.uniform(-124.0, -114.0, n_samples),
    'Price': np.random.exponential(500000, n_samples) + 200000
})

# Discretize locations into 5 equal-width bands per axis
housing_data['LatitudeBin'] = pd.cut(
    housing_data['Latitude'], bins=5,
    labels=['South', 'South-Mid', 'Mid', 'Mid-North', 'North']
)
housing_data['LongitudeBin'] = pd.cut(
    housing_data['Longitude'], bins=5,
    labels=['West', 'West-Mid', 'Mid', 'Mid-East', 'East']
)

# Calculate average prices by region as a latitude-by-longitude table.
# observed=False is passed explicitly because pandas is changing the default
# for categorical groupers; this keeps all 5 x 5 cells in the result.
region_prices = (
    housing_data
    .groupby(['LatitudeBin', 'LongitudeBin'], observed=False)['Price']
    .mean()
    .unstack()
)
# Create heatmap: rows are latitude bands, columns are longitude bands,
# each cell is annotated with its mean price rounded to a whole number
plt.figure(figsize=(8, 8))
sns.heatmap(region_prices, annot=True, fmt='.0f', cmap='YlOrRd')
plt.title('Average House Prices by Region')
plt.tight_layout()
plt.show()
/var/folders/nq/y2lyg7p15txfm5ksrw6f90340000gn/T/ipykernel_59379/674675667.py:16: FutureWarning:
The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Target Discretization Example
Here’s an implementation of a strategy for converting continuous targets to discrete values: each continuous value is snapped to the nearest entry in a fixed set of allowed target values:
# Generate synthetic continuous target data
np.random.seed(42)
continuous_targets = np.random.normal(5, 2, 1000)

# Define unique target values (the only values the discrete target may take)
unique_targets = np.array([1.25, 2.25, 3.05, 4.05, 4.85, 5.75, 6.55, 7.75, 9.25])

# Transform to nearest discrete target.
# Broadcasting builds an (n_samples, n_targets) table of absolute distances;
# argmin along axis 1 picks the closest allowed value for each sample
# (the first one on ties, as in a per-element argmin).
nearest_idx = np.abs(continuous_targets[:, None] - unique_targets).argmin(axis=1)
discretized_targets = unique_targets[nearest_idx]
# Visualize the transformation: continuous targets (left) vs. snapped (right)
plt.figure(figsize=(8, 6))

# Left panel: histogram of the original continuous targets
plt.subplot(1, 2, 1)
plt.hist(continuous_targets, bins=30, alpha=0.7)
plt.title('Original Continuous Targets')
plt.xlabel('Value')
plt.ylabel('Count')

# Right panel: histogram of the discretized targets, with as many bins as
# there are allowed target values
plt.subplot(1, 2, 2)
plt.hist(discretized_targets, bins=len(unique_targets), alpha=0.7)
plt.title('Discretized Targets')
plt.xlabel('Value')
plt.ylabel('Count')

plt.tight_layout()
plt.show()
Conclusion
Data discretization is a powerful technique that can significantly improve model performance. Key takeaways:
- It reduces noise in continuous variables
- Improves model interpretability
- Can lead to better performance in certain algorithms
- Particularly effective in geographic and demographic data
Remember to experiment with different binning strategies and number of bins to find the optimal discretization for your specific problem.