
Real-World Project: Complete Data Analysis Pipeline

Welcome to the capstone project of our Data Science module! This is where we bring everything together - NumPy, Pandas, data cleaning, visualization, and statistical analysis - to solve a real-world problem.

Project Overview: Customer Churn Analysis

Business Problem: A telecom company wants to understand why customers are leaving and predict who might leave next. Customer churn (when customers stop using a service) costs businesses billions annually.

Our Mission: Build a complete data analysis pipeline to:

  1. Explore the customer data
  2. Clean and prepare the data
  3. Analyze patterns and relationships
  4. Visualize key insights
  5. Build a predictive model
  6. Present actionable recommendations

Step 1: Data Exploration

Loading and Initial Inspection

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the dataset
df = pd.read_csv('telecom_customer_churn.csv')
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nData types and missing values:")
print(df.info())

print("\nBasic statistics:")
print(df.describe())

Understanding the Variables

# Check unique values in categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical variables:")
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts().head())
    print(f"Unique values: {df[col].nunique()}")

# Check numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
print(f"\nNumerical variables ({len(numerical_cols)}):")
print(numerical_cols.tolist())

Target Variable Analysis

# Standardize the target: in the raw file Churn may be stored as
# Yes/No text, so map it to 1/0 for the rest of the analysis
if df['Churn'].dtype == object:
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Churn distribution
churn_counts = df['Churn'].value_counts()
print("Churn distribution:")
print(churn_counts)
print(f"Churn rate: {churn_counts[1] / len(df):.1%}")

# Visualize churn distribution
plt.figure(figsize=(8, 5))
plt.pie(churn_counts.values, labels=['Stayed', 'Churned'], autopct='%1.1f%%', 
        colors=['lightblue', 'salmon'], startangle=90)
plt.title('Customer Churn Distribution', fontsize=14, fontweight='bold')
plt.axis('equal')
plt.show()

Step 2: Data Cleaning and Preprocessing

Handling Missing Values

# Check for missing values
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
}).sort_values('Missing Percentage', ascending=False)

print("Missing values analysis:")
print(missing_df[missing_df['Missing Count'] > 0])

# Visualize missing values
plt.figure(figsize=(10, 6))
missing_df[missing_df['Missing Count'] > 0]['Missing Percentage'].plot(kind='barh')
plt.title('Missing Values by Column')
plt.xlabel('Percentage Missing')
plt.ylabel('Column')
plt.grid(True, alpha=0.3)
plt.show()

Handling Missing Values Strategies

# Strategy 1: Drop columns with too many missing values (>50%)
cols_to_drop = missing_df[missing_df['Missing Percentage'] > 50].index
print(f"Columns to drop: {cols_to_drop.tolist()}")

df_clean = df.drop(columns=cols_to_drop)

# Strategy 2: Fill numerical missing values with median
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if df_clean[col].isnull().sum() > 0:
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)  # assignment avoids chained-assignment warnings
        print(f"Filled {col} missing values with median: {median_val}")

# Strategy 3: Fill categorical missing values with mode
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_clean[col].isnull().sum() > 0:
        mode_val = df_clean[col].mode()[0]
        df_clean[col] = df_clean[col].fillna(mode_val)  # assignment avoids chained-assignment warnings
        print(f"Filled {col} missing values with mode: {mode_val}")

print(f"\nAfter cleaning: {df_clean.isnull().sum().sum()} missing values remaining")

Outlier Detection and Treatment

# Function to detect outliers using IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Check for outliers in numerical columns
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols[:6]):  # Show first 6 columns
    plt.subplot(2, 3, i+1)
    outliers, lower, upper = detect_outliers_iqr(df_clean, col)
    plt.boxplot(df_clean[col])
    plt.title(f'{col}\nOutliers: {len(outliers)}')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Handle outliers (capping extreme values)
for col in numerical_cols:
    outliers, lower, upper = detect_outliers_iqr(df_clean, col)
    if len(outliers) > 0:
        # Cap outliers at bounds
        df_clean[col] = np.clip(df_clean[col], lower, upper)
        print(f"Capped outliers in {col}: {len(outliers)} values adjusted")

Feature Engineering

# Create new features
# TotalCharges is stored as text in the raw file; coerce to numeric
# and fill any unparseable entries with the column median
df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')
df_clean['TotalCharges'] = df_clean['TotalCharges'].fillna(df_clean['TotalCharges'].median())

# Tenure groups
df_clean['TenureGroup'] = pd.cut(df_clean['Tenure'], 
                                bins=[0, 12, 24, 36, 48, 60, 72], 
                                labels=['0-1yr', '1-2yr', '2-3yr', '3-4yr', '4-5yr', '5-6yr'])

# Monthly charges categories
df_clean['MonthlyChargesGroup'] = pd.cut(df_clean['MonthlyCharges'], 
                                        bins=[0, 30, 50, 70, 90, 120], 
                                        labels=['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High'])

# Service usage score (number of services used)
service_cols = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

df_clean['ServiceCount'] = df_clean[service_cols].apply(
    lambda row: sum(1 for val in row if val in ['Yes', 'DSL', 'Fiber optic']), axis=1)

# Contract type encoding for analysis
contract_mapping = {'Month-to-month': 1, 'One year': 2, 'Two year': 3}
df_clean['ContractScore'] = df_clean['Contract'].map(contract_mapping)

print("New features created:")
print("- TenureGroup: Tenure categorized into yearly groups")
print("- MonthlyChargesGroup: Monthly charges categorized")
print("- ServiceCount: Number of services used")
print("- ContractScore: Contract type numerical score")

Step 3: Exploratory Data Analysis

Churn Analysis by Demographics

# Churn by gender
gender_churn = pd.crosstab(df_clean['Gender'], df_clean['Churn'], normalize='index') * 100
print("Churn rate by gender:")
print(gender_churn.round(2))

# Churn by senior citizen status
senior_churn = pd.crosstab(df_clean['SeniorCitizen'], df_clean['Churn'], normalize='index') * 100
print("\nChurn rate by senior citizen status:")
print(senior_churn.round(2))

# Visualize demographic analysis
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

gender_churn.plot(kind='bar', ax=axes[0], color=['lightblue', 'salmon'])
axes[0].set_title('Churn Rate by Gender')
axes[0].set_ylabel('Percentage')
axes[0].legend(['Stayed', 'Churned'])
axes[0].tick_params(axis='x', rotation=0)

senior_churn.plot(kind='bar', ax=axes[1], color=['lightblue', 'salmon'])
axes[1].set_title('Churn Rate by Senior Citizen Status')
axes[1].set_ylabel('Percentage')
axes[1].legend(['Stayed', 'Churned'])
axes[1].set_xticklabels(['Not Senior', 'Senior'])

plt.tight_layout()
plt.show()

Churn Analysis by Services

# Churn by internet service type
internet_churn = pd.crosstab(df_clean['InternetService'], df_clean['Churn'], normalize='index') * 100
print("Churn rate by internet service:")
print(internet_churn.round(2))

# Churn by contract type
contract_churn = pd.crosstab(df_clean['Contract'], df_clean['Churn'], normalize='index') * 100
print("\nChurn rate by contract type:")
print(contract_churn.round(2))

# Visualize service analysis
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

internet_churn.plot(kind='bar', ax=axes[0], color=['lightblue', 'salmon'])
axes[0].set_title('Churn Rate by Internet Service')
axes[0].set_ylabel('Percentage')
axes[0].legend(['Stayed', 'Churned'])
axes[0].tick_params(axis='x', rotation=45)

contract_churn.plot(kind='bar', ax=axes[1], color=['lightblue', 'salmon'])
axes[1].set_title('Churn Rate by Contract Type')
axes[1].set_ylabel('Percentage')
axes[1].legend(['Stayed', 'Churned'])
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

Numerical Variables Analysis

# Compare distributions for churned vs stayed customers
numerical_cols = ['Tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, col in enumerate(numerical_cols):
    # Separate data by churn status
    stayed = df_clean[df_clean['Churn'] == 0][col]
    churned = df_clean[df_clean['Churn'] == 1][col]
    
    axes[i].hist([stayed, churned], bins=20, alpha=0.7, label=['Stayed', 'Churned'], 
                 color=['lightblue', 'salmon'])
    axes[i].set_title(f'{col} Distribution')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Frequency')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistical comparison
print("Statistical comparison (Churned vs Stayed):")
for col in numerical_cols:
    stayed = df_clean[df_clean['Churn'] == 0][col]
    churned = df_clean[df_clean['Churn'] == 1][col]
    
    t_stat, p_value = stats.ttest_ind(stayed, churned)
    print(f"\n{col}:")
    print(f"  Stayed mean: {stayed.mean():.2f}")
    print(f"  Churned mean: {churned.mean():.2f}")
    print(f"  T-statistic: {t_stat:.3f}")
    print(f"  P-value: {p_value:.3f}")
    print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'}")

Step 4: Correlation Analysis

Correlation Matrix

# Select numerical columns for correlation
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
correlation_matrix = df_clean[numerical_cols].corr()

# Focus on correlations with Churn
churn_correlations = correlation_matrix['Churn'].sort_values(ascending=False)
print("Correlations with Churn:")
print(churn_correlations)

# Visualize correlation matrix
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', 
            center=0, fmt='.2f', square=True, linewidths=0.5)
plt.title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Key Insights from Correlations

# Top positive correlations with churn (risk factors)
# Drop Churn's correlation with itself (always 1.0) before ranking
positive_corr = churn_correlations.drop('Churn')
positive_corr = positive_corr[positive_corr > 0].head(5)
print("Top risk factors for churn:")
for feature, corr in positive_corr.items():
    print(f"  {feature}: {corr:.3f}")

# Top negative correlations with churn (protective factors)
negative_corr = churn_correlations[churn_correlations < 0].tail(5)
print("\nTop protective factors against churn:")
for feature, corr in negative_corr.items():
    print(f"  {feature}: {corr:.3f}")

Step 5: Advanced Visualizations

Churn Analysis Dashboard

# Create a comprehensive dashboard
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Churn rate by tenure group
tenure_churn = pd.crosstab(df_clean['TenureGroup'], df_clean['Churn'], normalize='index') * 100
tenure_churn[1].plot(kind='bar', ax=axes[0, 0], color='salmon')
axes[0, 0].set_title('Churn Rate by Tenure Group')
axes[0, 0].set_ylabel('Churn Rate (%)')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(True, alpha=0.3)

# 2. Churn rate by monthly charges group
monthly_churn = pd.crosstab(df_clean['MonthlyChargesGroup'], df_clean['Churn'], normalize='index') * 100
monthly_churn[1].plot(kind='bar', ax=axes[0, 1], color='salmon')
axes[0, 1].set_title('Churn Rate by Monthly Charges')
axes[0, 1].set_ylabel('Churn Rate (%)')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(True, alpha=0.3)

# 3. Churn rate by service count
service_churn = df_clean.groupby('ServiceCount')['Churn'].mean() * 100
service_churn.plot(kind='bar', ax=axes[0, 2], color='salmon')
axes[0, 2].set_title('Churn Rate by Service Count')
axes[0, 2].set_ylabel('Churn Rate (%)')
axes[0, 2].set_xlabel('Number of Services')
axes[0, 2].grid(True, alpha=0.3)

# 4. Churn rate by payment method
payment_churn = pd.crosstab(df_clean['PaymentMethod'], df_clean['Churn'], normalize='index') * 100
payment_churn[1].plot(kind='barh', ax=axes[1, 0], color='salmon')
axes[1, 0].set_title('Churn Rate by Payment Method')
axes[1, 0].set_xlabel('Churn Rate (%)')
axes[1, 0].grid(True, alpha=0.3)

# 5. Box plot of monthly charges by churn status
df_clean.boxplot(column='MonthlyCharges', by='Churn', ax=axes[1, 1])
axes[1, 1].set_title('Monthly Charges by Churn Status')
axes[1, 1].set_xlabel('Churn Status')
axes[1, 1].set_ylabel('Monthly Charges ($)')
axes[1, 1].grid(True, alpha=0.3)

# 6. Tenure distribution by churn status
df_clean[df_clean['Churn'] == 0]['Tenure'].hist(alpha=0.7, bins=20, ax=axes[1, 2], label='Stayed', color='lightblue')
df_clean[df_clean['Churn'] == 1]['Tenure'].hist(alpha=0.7, bins=20, ax=axes[1, 2], label='Churned', color='salmon')
axes[1, 2].set_title('Tenure Distribution')
axes[1, 2].set_xlabel('Tenure (months)')
axes[1, 2].set_ylabel('Frequency')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.suptitle('Customer Churn Analysis Dashboard', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

Customer Segmentation Analysis

# Create customer segments based on key metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Select features for clustering
cluster_features = ['Tenure', 'MonthlyCharges', 'TotalCharges', 'ServiceCount', 'ContractScore']
X = df_clean[cluster_features].copy()

# Handle any remaining missing values
X = X.fillna(X.median())

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform K-means clustering (fit on the scaled features, then label each customer)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df_clean['Segment'] = kmeans.fit_predict(X_scaled)

# Analyze segments
segment_analysis = df_clean.groupby('Segment').agg({
    'Tenure': 'mean',
    'MonthlyCharges': 'mean',
    'TotalCharges': 'mean',
    'ServiceCount': 'mean',
    'Churn': 'mean'
}).round(2)

print("Customer Segments Analysis:")
print(segment_analysis)

# Visualize segments
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Tenure vs Monthly Charges by segment
for segment in df_clean['Segment'].unique():
    segment_data = df_clean[df_clean['Segment'] == segment]
    axes[0, 0].scatter(segment_data['Tenure'], segment_data['MonthlyCharges'], 
                      label=f'Segment {segment}', alpha=0.6, s=50)

axes[0, 0].set_xlabel('Tenure (months)')
axes[0, 0].set_ylabel('Monthly Charges ($)')
axes[0, 0].set_title('Customer Segments: Tenure vs Monthly Charges')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Churn rate by segment
churn_by_segment = df_clean.groupby('Segment')['Churn'].mean() * 100
churn_by_segment.plot(kind='bar', ax=axes[0, 1], color='salmon')
axes[0, 1].set_title('Churn Rate by Customer Segment')
axes[0, 1].set_ylabel('Churn Rate (%)')
axes[0, 1].set_xlabel('Segment')
axes[0, 1].grid(True, alpha=0.3)

# Service count by segment
service_by_segment = df_clean.groupby('Segment')['ServiceCount'].mean()
service_by_segment.plot(kind='bar', ax=axes[1, 0], color='lightblue')
axes[1, 0].set_title('Average Services by Segment')
axes[1, 0].set_ylabel('Average Service Count')
axes[1, 0].set_xlabel('Segment')
axes[1, 0].grid(True, alpha=0.3)

# Segment sizes
segment_sizes = df_clean['Segment'].value_counts()
segment_sizes.plot(kind='pie', ax=axes[1, 1], autopct='%1.1f%%', 
                   colors=['lightblue', 'salmon', 'lightgreen', 'orange'])
axes[1, 1].set_title('Segment Distribution')

plt.suptitle('Customer Segmentation Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Step 6: Predictive Modeling

Prepare Data for Modeling

# Encode categorical variables
df_model = df_clean.copy()

# Binary encoding for simple yes/no variables
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    df_model[col] = df_model[col].map({'Yes': 1, 'No': 0})

# One-hot encoding for multi-category variables
categorical_cols = ['Gender', 'InternetService', 'Contract', 'PaymentMethod']
df_model = pd.get_dummies(df_model, columns=categorical_cols, drop_first=True)

# Convert SeniorCitizen to int if it's not already
df_model['SeniorCitizen'] = df_model['SeniorCitizen'].astype(int)

# Select features and target (exclude the target, identifiers, and
# derived columns that are not model inputs)
feature_cols = [col for col in df_model.columns
                if col not in ['Churn', 'customerID', 'TenureGroup', 'MonthlyChargesGroup', 'Segment']]
X = df_model[feature_cols]
y = df_model['Churn']

print(f"Features: {len(feature_cols)}")
print(f"Target: Churn (0=Stay, 1=Leave)")
print(f"Dataset shape: {X.shape}")

Build and Evaluate Models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Model 1: Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_proba = lr_model.predict_proba(X_test)[:, 1]

# Model 2: Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluate models
def evaluate_model(y_true, y_pred, y_proba, model_name):
    print(f"\n{model_name} Results:")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)
    
    # ROC AUC Score
    auc_score = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC Score: {auc_score:.3f}")
    
    return auc_score

lr_auc = evaluate_model(y_test, lr_pred, lr_proba, "Logistic Regression")
rf_auc = evaluate_model(y_test, rf_pred, rf_proba, "Random Forest")

# Compare ROC curves
plt.figure(figsize=(8, 6))
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba)

plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {lr_auc:.3f})', linewidth=2)
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {rf_auc:.3f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves: Model Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Feature Importance Analysis

# Feature importance from Random Forest
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

# Visualize feature importance
plt.figure(figsize=(10, 8))
top_features = feature_importance.head(10)
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 10 Features for Churn Prediction')
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3)
plt.show()

Step 7: Business Recommendations

Key Findings Summary

print("🔍 KEY FINDINGS FROM CHURN ANALYSIS:")
print("=" * 50)

# Churn rate
churn_rate = df_clean['Churn'].mean() * 100
print(f"Overall churn rate: {churn_rate:.1f}%")

# Top risk factors (excluding Churn's correlation with itself)
print("\n🚨 TOP RISK FACTORS FOR CHURN:")
risk_factors = churn_correlations.drop('Churn')
risk_factors = risk_factors[risk_factors > 0].head(3)
for factor, corr in risk_factors.items():
    print(f"  {factor}: {corr:.3f}")

# Top protective factors
print("\n✅ TOP PROTECTIVE FACTORS AGAINST CHURN:")
protective_factors = churn_correlations[churn_correlations < 0].tail(3)
for factor, corr in protective_factors.items():
    print(f"  {factor}: {corr:.3f}")

# Model performance
print("\n🤖 MODEL PERFORMANCE:")
print(f"Logistic Regression AUC: {lr_auc:.3f}")
print(f"Random Forest AUC: {rf_auc:.3f}")

Actionable Recommendations

print("\n💡 ACTIONABLE RECOMMENDATIONS:")
print("=" * 50)

print("\n1. 📞 CONTRACT STRATEGY:")
print("   - Month-to-month customers churn 4x more than long-term contract customers")
print("   - Offer incentives to switch to annual/two-year contracts")
print("   - Implement loyalty discounts for contract renewals")

print("\n2. 💰 PRICING OPTIMIZATION:")
print("   - High monthly charges ($70+) correlate strongly with churn")
print("   - Create tiered pricing with value-added services")
print("   - Offer bundled packages to reduce perceived cost")

print("\n3. 🎯 CUSTOMER SEGMENTATION:")
print("   - Focus retention efforts on high-risk segments:")
print("     * New customers (tenure < 12 months)")
print("     * High-spenders with multiple services")
print("     * Fiber optic customers with month-to-month contracts")

print("\n4. 📱 SERVICE ENHANCEMENT:")
print("   - Customers with fewer services churn more")
print("   - Promote additional services through personalized recommendations")
print("   - Bundle internet + phone + TV for better retention")

print("\n5. 🎯 TARGETED INTERVENTIONS:")
print("   - Proactively contact high-risk customers with personalized offers")
print("   - Implement early warning system for customers showing churn signals")
print("   - Create win-back campaigns for recently churned customers")

print("\n6. 📊 MONITORING & MEASUREMENT:")
print("   - Track churn rate monthly by customer segments")
print("   - Monitor impact of retention initiatives")
print("   - Set up automated alerts for rising churn rates")

Expected Impact

print("\n📈 EXPECTED BUSINESS IMPACT:")
print("=" * 50)

current_churn_rate = df_clean['Churn'].mean()
estimated_customer_lifetime = 24  # months
average_monthly_revenue = df_clean['MonthlyCharges'].mean()

print(f"Current churn rate: {current_churn_rate * 100:.1f}%")
print(f"Estimated customer lifetime: {estimated_customer_lifetime:.0f} months")
print(f"Average monthly revenue per customer: ${average_monthly_revenue:.0f}")

# Potential savings from 20% churn reduction
churn_reduction = 0.20
annual_savings = (current_churn_rate * churn_reduction * 
                 len(df_clean) * average_monthly_revenue * 12)
print(f"Estimated annual savings from a 20% churn reduction: ${annual_savings:,.0f}")

print("\n💰 Additional benefits:")
print("   - Reduced customer acquisition costs")
print("   - Improved customer lifetime value")
print("   - Better customer satisfaction scores")
print("   - Enhanced brand reputation")

Step 8: Create Executive Summary Report

# Generate comprehensive report
def create_executive_summary():
    """Create a formatted executive summary"""
    
    summary = f"""
# Customer Churn Analysis Executive Summary

## Executive Summary
This analysis examined customer churn patterns in our telecom customer base, 
identifying key drivers and providing actionable recommendations to reduce churn.

## Key Metrics
- **Total Customers Analyzed**: {len(df_clean):,}
- **Current Churn Rate**: {churn_rate:.1f}%
- **Average Customer Lifetime**: {df_clean['Tenure'].mean():.1f} months
- **Average Monthly Revenue**: ${average_monthly_revenue:.2f}

## Critical Findings

### High-Risk Customer Segments
1. **Contract Type**: Month-to-month customers churn at {contract_churn.loc['Month-to-month', 1]:.1f}%, 
   compared to {contract_churn.loc['Two year', 1]:.1f}% for two-year contracts
2. **Tenure**: Customers with < 12 months tenure show {tenure_churn.loc['0-1yr', 1]:.1f}% churn rate
3. **Monthly Charges**: Customers paying >$70/month show elevated churn risk

### Predictive Model Performance
- **Best Model**: Random Forest (AUC: {rf_auc:.3f})
- **Key Predictors**: Contract type, tenure, monthly charges, total charges
- **Model Accuracy**: {rf_auc*100:.1f}% ability to distinguish churners from non-churners

## Recommended Actions

### Immediate Actions (Next 30 Days)
1. Launch contract upgrade campaign targeting month-to-month customers
2. Implement early warning system for high-risk customers
3. Create personalized retention offers based on usage patterns

### Medium-term Actions (3-6 Months)
1. Optimize pricing strategy for high-churn price points
2. Enhance onboarding process for new customers
3. Develop targeted communication campaigns

### Long-term Strategy (6-12 Months)
1. Build comprehensive customer analytics platform
2. Implement predictive churn modeling in CRM system
3. Establish customer success team for high-value segments

## Expected ROI
- **Churn Reduction Potential**: 15-25% through targeted interventions
- **Annual Revenue Impact**: ${annual_savings:,.0f} from 20% churn reduction
- **Payback Period**: 3-6 months for retention program investments

## Next Steps
1. Form cross-functional task force (Marketing, Product, Customer Success)
2. Pilot retention programs with high-risk segments
3. Establish KPIs and monitoring dashboard
4. Schedule follow-up analysis in 3 months

---
*Analysis completed on {pd.Timestamp.now().strftime('%B %d, %Y')}*
*Prepared by Data Science Team*
"""
    
    # Save report
    with open('churn_analysis_executive_summary.md', 'w') as f:
        f.write(summary)
    
    print("Executive summary saved as 'churn_analysis_executive_summary.md'")
    return summary

# Create the report
executive_summary = create_executive_summary()
print("\n" + "="*60)
print("EXECUTIVE SUMMARY PREVIEW:")
print("="*60)
print(executive_summary[:1000] + "...")

Complete Project Code

# Complete customer churn analysis pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def complete_churn_analysis(filepath):
    """
    Complete end-to-end customer churn analysis pipeline
    """
    print("🚀 Starting Customer Churn Analysis Pipeline")
    print("=" * 50)
    
    # 1. Data Loading and Exploration
    print("\n📊 Step 1: Data Loading and Exploration")
    df = pd.read_csv(filepath)
    print(f"Loaded {len(df):,} customers with {len(df.columns)} features")
    
    # 2. Data Cleaning
    print("\n🧹 Step 2: Data Cleaning")
    df_clean = clean_data(df)
    print(f"Cleaned dataset: {len(df_clean)} customers")
    
    # 3. Exploratory Data Analysis
    print("\n🔍 Step 3: Exploratory Data Analysis")
    insights = perform_eda(df_clean)
    
    # 4. Predictive Modeling
    print("\n🤖 Step 4: Predictive Modeling")
    model_results = build_predictive_model(df_clean)
    
    # 5. Generate Recommendations
    print("\n💡 Step 5: Generate Business Recommendations")
    recommendations = generate_recommendations(df_clean, insights, model_results)
    
    # 6. Create Executive Summary
    print("\n📋 Step 6: Create Executive Summary")
    create_executive_summary(df_clean, insights, model_results, recommendations)
    
    print("\n✅ Analysis Complete!")
    print("Generated files:")
    print("- churn_analysis_report.md")
    print("- churn_analysis_visualizations.png")
    print("- churn_predictive_model.pkl")
    
    return df_clean, insights, model_results

def clean_data(df):
    """Clean and preprocess the data"""
    # Handle missing values, outliers, feature engineering
    # (fill in with the implementation from Step 2 above)
    df_clean = df.copy()
    return df_clean

def perform_eda(df):
    """Perform exploratory data analysis"""
    # Generate insights, correlations, visualizations
    # (fill in with the implementation from Steps 3-5 above)
    insights = {}
    return insights

def build_predictive_model(df):
    """Build and evaluate predictive models"""
    # Train models, evaluate performance
    # (fill in with the implementation from Step 6 above)
    model_results = {}
    return model_results

def generate_recommendations(df, insights, model_results):
    """Generate actionable business recommendations"""
    # Based on analysis results
    # (fill in with the implementation from Step 7 above)
    recommendations = []
    return recommendations

def create_executive_summary(df, insights, model_results, recommendations):
    """Create comprehensive executive summary"""
    # Generate final report
    # (fill in with the implementation from Step 8 above)
    pass

# Run the complete analysis
if __name__ == "__main__":
    # df_clean, insights, model_results = complete_churn_analysis('telecom_customer_churn.csv')
    print("Pipeline ready to run!")
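The pipeline above lists churn_predictive_model.pkl among its outputs but never shows the save step. A minimal sketch using joblib (installed alongside scikit-learn), demonstrated on a tiny synthetic stand-in for the real training data:

```python
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Tiny synthetic stand-in for the real training data
rng = np.random.default_rng(42)
X_demo = rng.normal(size=(100, 4))
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(int)

rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_demo, y_demo)

# Save the fitted estimator, then reload it; the restored model
# produces identical predictions
joblib.dump(rf_model, 'churn_predictive_model.pkl')
restored = joblib.load('churn_predictive_model.pkl')
print((restored.predict(X_demo) == rf_model.predict(X_demo)).all())
```

In the real pipeline you would dump the rf_model fitted in Step 6 and load it later to score new customers.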

Practice Exercises

Exercise 1: Data Exploration Challenge

  • Load a new dataset (e.g., retail sales data)
  • Perform initial data exploration
  • Identify data quality issues
  • Create summary statistics and visualizations
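As a starting point, the same first-look checks from Step 1 apply to any dataset. A minimal sketch, using a small hypothetical retail table in place of a real CSV file:

```python
import pandas as pd
import numpy as np

# Hypothetical retail sales data standing in for a real CSV
sales = pd.DataFrame({
    'order_id': [1001, 1002, 1003, 1004],
    'category': ['Toys', 'Books', 'Toys', None],
    'revenue': [19.99, 12.50, np.nan, 7.25],
})

# Quick data-quality checks: shape, types, missing values, distributions
print(sales.shape)
print(sales.dtypes)
print(sales.isnull().sum())                        # missing values per column
print(sales['category'].value_counts(dropna=False))  # include NaN in counts
print(sales['revenue'].describe())                 # summary statistics
```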

Exercise 2: Feature Engineering Challenge

  • Create new features from existing data
  • Handle categorical variables appropriately
  • Deal with missing values creatively
  • Validate feature importance
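One possible sketch, mirroring the ContractScore and get_dummies ideas from the project on hypothetical columns:

```python
import pandas as pd

df = pd.DataFrame({
    'tenure': [2, 30, 65],
    'monthly': [80.0, 55.0, 20.0],
    'contract': ['Month-to-month', 'One year', 'Two year'],
})

# Derived feature: rough total-spend estimate
df['est_total'] = df['tenure'] * df['monthly']

# Ordinal encoding for a category with a natural order
order = {'Month-to-month': 1, 'One year': 2, 'Two year': 3}
df['contract_score'] = df['contract'].map(order)

# One-hot encoding for the nominal version of the same column
df = pd.get_dummies(df, columns=['contract'], drop_first=True)
print(df.columns.tolist())
```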

Exercise 3: Advanced Visualization Challenge

  • Create a comprehensive dashboard
  • Use multiple chart types effectively
  • Apply consistent styling and branding
  • Make visualizations actionable
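A minimal two-panel sketch using the same subplots pattern as the Step 5 dashboard; the panel data here is made up purely for illustration:

```python
import matplotlib
matplotlib.use('Agg')  # headless backend so this runs in scripts/CI
import matplotlib.pyplot as plt
import numpy as np

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Panel 1: bar chart of (made-up) churn rates by segment
axes[0].bar(range(5), [30, 25, 20, 15, 10], color='salmon')
axes[0].set_title('Churn Rate by Segment')
axes[0].set_ylabel('Churn Rate (%)')

# Panel 2: histogram of a (made-up) tenure distribution
axes[1].hist(np.random.default_rng(0).normal(30, 10, size=200),
             bins=20, color='lightblue')
axes[1].set_title('Tenure Distribution')

fig.suptitle('Mini Dashboard')
fig.tight_layout()
fig.savefig('mini_dashboard.png')  # export for sharing
```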

Exercise 4: Model Comparison Challenge

  • Compare 3+ different machine learning models
  • Perform hyperparameter tuning
  • Evaluate models using multiple metrics
  • Explain model decisions to business stakeholders
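For the hyperparameter-tuning part, scikit-learn's GridSearchCV wraps the fit/score loop with cross-validation; a small sketch on synthetic data, scored with the same ROC AUC metric used to compare models above:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Synthetic binary-classification data standing in for the churn features
X, y = make_classification(n_samples=200, n_features=8, random_state=42)

# Small grid for illustration; real grids would cover more values
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, None],
}

search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',  # same metric as the model comparison above
    cv=3,
)
search.fit(X, y)
print(search.best_params_)
print(round(search.best_score_, 3))
```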

Exercise 5: Business Presentation Challenge

  • Create an executive summary
  • Design actionable recommendations
  • Calculate ROI and business impact
  • Prepare for stakeholder presentation
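The ROI arithmetic follows the annual_savings formula from Step 7; a self-contained sketch with made-up inputs (swap in the real churn rate and revenue figures from your analysis):

```python
# Hypothetical inputs for illustration only
n_customers = 7000
churn_rate = 0.265            # current annual churn rate
avg_monthly_revenue = 64.75   # dollars per customer per month
churn_reduction = 0.20        # 20% relative reduction target

# Customers retained = churners avoided by the retention program
customers_saved = n_customers * churn_rate * churn_reduction

# Annualize the protected revenue stream
annual_revenue_saved = customers_saved * avg_monthly_revenue * 12

print(f"Customers retained: {customers_saved:.0f}")
print(f"Annual revenue protected: ${annual_revenue_saved:,.0f}")
```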

Summary

Complete Data Analysis Pipeline:

# 1. Data Exploration
df.head(), df.info(), df.describe()

# 2. Data Cleaning
df.dropna(), df.fillna(), outlier_handling()

# 3. Feature Engineering
df['new_feature'] = transform_existing_features()

# 4. Exploratory Analysis
correlation_matrix, churn_patterns, visualizations()

# 5. Statistical Testing
hypothesis_tests, confidence_intervals()

# 6. Predictive Modeling
train_models(), evaluate_performance(), feature_importance()

# 7. Business Recommendations
identify_risk_factors(), calculate_roi(), create_action_plan()

Key Takeaways:

  • End-to-end thinking: From raw data to business decisions
  • Iterative process: Clean → analyze → model → insights → actions
  • Business focus: Technical analysis serves business objectives
  • Communication: Translate data insights into actionable recommendations
  • Measurement: Track impact and refine approach over time

Congratulations! 🎉 You've completed a comprehensive data science project that demonstrates real-world skills in data analysis, statistical modeling, and business intelligence. This same approach can be applied to any business problem with data.

Next Module: Projects - Put it all together in complete Python applications! 🚀