I learned that a huge chunk of a machine learning engineer’s time isn’t spent on building fancy models; it’s spent cleaning and prepping data. It isn’t glamorous, but it is the secret sauce to making models perform well. This process, called preprocessing, involves filling in missing values, scaling numbers, and making sure everything is in the right format before feeding it into an algorithm. To save you some headaches, I’ve put together a handy cheat sheet below to make data prep a little easier.

Libraries: NumPy, Pandas, Scikit-learn, TensorFlow, PyTorch

Data Loading and Basic Inspection

import pandas as pd
import numpy as np
from sklearn.preprocessing import *

# Load data
df = pd.read_csv('data.csv')
df.head()                    # View first few rows
df.info()                    # Get data info (types, non-null values)
df.describe()                # Statistical summary
df.isnull().sum()           # Check missing values
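
A few more quick checks that often come in handy at this stage:

df.shape                     # (number of rows, number of columns)
df.dtypes                    # data type of each column
df.duplicated().sum()        # count of exact duplicate rows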

Missing Value Handling

Detection and Removal

# Remove rows with any missing values
df.dropna(inplace=True)

# Remove rows where all values are missing
df.dropna(how='all', inplace=True)

# Drop columns with more than 50% missing values
# (thresh is the minimum number of non-null values a column needs to be kept)
df.dropna(thresh=int(len(df) * 0.5), axis=1, inplace=True)

Imputation

# Sklearn imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')  # or median, most_frequent, constant
X_imputed = imputer.fit_transform(X)

# Advanced imputation
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
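
One thing worth keeping in mind: the imputer should learn its statistics from the training data only and then be applied to the test data. A minimal sketch, assuming X_train and X_test already exist:

imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)   # learn per-column medians from train
X_test_imputed = imputer.transform(X_test)         # reuse those medians on test
imputer.statistics_                                # the values learned during fit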

Data Split

from sklearn.model_selection import train_test_split

X = df.drop('target', axis=1)  
y = df['target'] 

# Binary split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train/validation/test split (60/20/20)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# 0.25 of the remaining 80% = 20% of the full data for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)
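
For classification problems it usually pays to keep the class balance identical in every split. A quick sketch of the same split with stratification:

# Stratified split preserves the class proportions of y in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)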

Feature Scaling

Standardization (Standard Scaler)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Mean = 0, STD = 1

Normalization (Min-Max Scaling)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
# Range: [0,1]

Robust Scaling

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# Robust to outliers
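
As with imputation, scalers should be fit on the training split only and then reused on validation/test data, otherwise information leaks from the test set. A minimal sketch, assuming X_train and X_test from the split above:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # learn mean/std from training data
X_test_scaled = scaler.transform(X_test)         # apply the same parameters to test data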

Categorical Data Encoding

Label Encoding

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
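
LabelEncoder is intended for the target column rather than input features, and the mapping is easy to reverse when you need the original labels back:

le.classes_                                   # original class labels, in encoded order
y_original = le.inverse_transform(y_encoded)  # map integers back to the original labels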

One-Hot Encoding

# Pandas method
df_encoded = pd.get_dummies(df, columns=['categorical_column'])

# Sklearn method
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)  # use sparse=False on scikit-learn < 1.2
X_encoded = encoder.fit_transform(X)
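
In practice it also helps to tell the encoder what to do with categories it never saw during fit, and to recover readable column names. A sketch, assuming X holds only the categorical columns (sparse_output needs scikit-learn 1.2+):

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = encoder.fit_transform(X)
encoder.get_feature_names_out()   # names of the generated dummy columns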

Ordinal Encoding

from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)
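
By default OrdinalEncoder assigns integers in alphabetical order, which is rarely the order you actually mean. If the categories are genuinely ordered, spell that order out; a sketch with a hypothetical 'size' column:

# One list per encoded column, from lowest to highest ('size' is a placeholder name)
encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])
X_encoded = encoder.fit_transform(df[['size']])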

Feature Engineering

Polynomial Features

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
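
Degree-2 features multiply quickly, so it is often worth dropping the constant bias column and checking what was actually generated. A small sketch:

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
poly.get_feature_names_out()   # e.g. 'x0', 'x1', 'x0^2', 'x0 x1', ...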

Binning/Discretization

# Equal-width binning
df['binned'] = pd.cut(df['column'], bins=5)

# Equal-frequency binning
df['binned'] = pd.qcut(df['column'], q=5)

# Custom binning
bins = [0, 20, 40, 60, 80, 100]
labels = ['very_low', 'low', 'medium', 'high', 'very_high']
df['binned'] = pd.cut(df['column'], bins=bins, labels=labels)
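
A quick sanity check on any of these is to count how many rows landed in each bin:

df['binned'].value_counts().sort_index()   # rows per bin, in bin order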

Deep Learning Specific Preprocessing

Image Data

# Using TensorFlow
import tensorflow as tf

# Resize images
resized = tf.image.resize(images, [224, 224])

# Normalize pixel values
normalized = resized / 255.0

# Data augmentation
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.2),
    tf.keras.layers.RandomZoom(0.2),
])
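
These augmentation layers only randomize their output in training mode, so when calling them directly you can pass training=True. A minimal sketch, assuming images is a batch of image tensors:

# Apply random flips/rotations/zooms to a batch
augmented = data_augmentation(images, training=True)

# Alternatively, put data_augmentation at the start of the model so it runs during fit()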

Text Data

# Using TensorFlow/Keras (Tokenizer and pad_sequences are deprecated in newer
# releases in favor of tf.keras.layers.TextVectorization, but still widely used)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding
padded = pad_sequences(sequences, maxlen=100, padding='post')
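
Two handy follow-ups: the fitted tokenizer exposes its vocabulary, and the padded result is a plain NumPy matrix ready for an embedding layer:

tokenizer.word_index   # dict mapping word -> integer index
padded.shape           # (number of texts, 100) after padding/truncating to maxlen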

Feature Selection

Statistical Selection

from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
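
SelectKBest returns a plain array, so it helps to map the kept columns back to their names. A sketch, assuming X is a DataFrame:

mask = selector.get_support()        # boolean mask over the original columns
selected_columns = X.columns[mask]   # names of the k features that were kept
selector.scores_                     # the f_classif score of every feature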

Correlation Analysis

# Calculate correlation matrix (numeric columns only)
correlation_matrix = df.corr(numeric_only=True)

# Remove highly correlated features
def remove_correlated_features(df, threshold=0.95):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(to_drop, axis=1)
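
A quick usage example for the helper above; restricting it to numeric columns is my own assumption, since correlations only make sense for numbers:

numeric_df = df.select_dtypes(include=np.number)
df_reduced = remove_correlated_features(numeric_df, threshold=0.95)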

Cross-Validation

from sklearn.model_selection import KFold, cross_val_score

# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf)
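
cross_val_score returns one score per fold, so the usual next step is to summarize them; for classification, StratifiedKFold keeps the class balance in every fold. A short sketch, assuming model is any scikit-learn estimator:

print(f"CV score: {scores.mean():.3f} +/- {scores.std():.3f}")

# For classification, preserve class balance in each fold
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)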

Pipeline Creation

from sklearn.pipeline import Pipeline

# Create preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=10))
])

# Full pipeline with model
from sklearn.ensemble import RandomForestClassifier

full_pipeline = Pipeline([
    ('preprocessor', preprocessing_pipeline),
    ('classifier', RandomForestClassifier())
])
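
The payoff of the pipeline is that fit/predict handles all the preprocessing for you, with every transformer fit on the training data only. A minimal usage sketch, assuming the train/test split from earlier:

full_pipeline.fit(X_train, y_train)      # impute, scale, select, then train
y_pred = full_pipeline.predict(X_test)   # the same preprocessing is applied automatically
accuracy = full_pipeline.score(X_test, y_test)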