Statistical Analysis: Basic Statistics and Correlations
Welcome to Statistical Analysis! Think of statistics as data’s truth serum - it helps you separate signal from noise, find meaningful patterns, and make confident decisions based on evidence rather than guesswork.
Why Statistics Matters
Data without statistics is just numbers - statistics gives them meaning:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Two groups of raw sales figures -- eyeballing them is unreliable.
sales_a = [100, 120, 95, 110, 105]
sales_b = [90, 115, 100, 125, 95]
print("Group A sales:", sales_a)
print("Group B sales:", sales_b)
print("A seems higher... but is it significant?")

# An independent-samples t-test quantifies the evidence.
t_stat, p_value = stats.ttest_ind(sales_a, sales_b)
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
verdict = "Significant difference!" if p_value < 0.05 else "No significant difference"
print("Conclusion:", verdict)

# Side-by-side box plots make the comparison visual.
plt.figure(figsize=(8, 5))
plt.boxplot([sales_a, sales_b], labels=['Group A', 'Group B'])
plt.title('Sales Comparison: Statistical Significance')
plt.ylabel('Sales ($)')
plt.grid(True, alpha=0.3)
plt.show()
Descriptive Statistics
Central Tendency
# Sample data: ten exam scores.
scores = np.array([85, 92, 78, 96, 88, 91, 83, 89, 94, 87])
print("Data:", scores)
print(f"Count: {len(scores)}")
print(f"Sum: {np.sum(scores)}")
# Mean (average)
mean_score = np.mean(scores)
print(f"Mean: {mean_score:.2f}")
# Median (middle value)
median_score = np.median(scores)
print(f"Median: {median_score:.2f}")
# Mode (most frequent value).
# FIX: scipy.stats.mode changed its return shape in SciPy 1.11 -- the
# result fields are now scalars, so `mode_result.mode[0]` raises an
# IndexError on current installs. Computing the mode via np.unique works
# on every NumPy/SciPy version; ties resolve to the smallest value,
# matching scipy's historical behavior.
from scipy import stats as scipy_stats  # kept: later sections use this alias
unique_values, value_counts = np.unique(scores, return_counts=True)
mode_value = unique_values[np.argmax(value_counts)]
mode_count = value_counts.max()
print(f"Mode: {mode_value} (appears {mode_count} times)")
# Compare mean, median, mode on a histogram of the scores.
plt.figure(figsize=(10, 6))
plt.hist(scores, bins=8, alpha=0.7, edgecolor='black')
plt.axvline(mean_score, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_score:.1f}')
plt.axvline(median_score, color='green', linestyle='--', linewidth=2, label=f'Median: {median_score:.1f}')
plt.axvline(mode_value, color='blue', linestyle='--', linewidth=2, label=f'Mode: {mode_value}')
plt.title('Score Distribution with Central Tendency Measures')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Variability Measures
# Spread of the scores: range, variance, standard deviation, CV, IQR.
data_range = np.ptp(scores)  # max - min ("peak to peak")
print(f"Range: {data_range}")

variance = np.var(scores, ddof=1)  # ddof=1 -> sample (n-1) variance
print(f"Variance: {variance:.2f}")

std_dev = np.std(scores, ddof=1)  # sample standard deviation
print(f"Standard Deviation: {std_dev:.2f}")

# Coefficient of variation: spread relative to the mean, in percent.
cv = (std_dev / mean_score) * 100
print(f"Coefficient of Variation: {cv:.2f}%")

# All three quartiles in a single percentile call, then the IQR.
q1, q2, q3 = np.percentile(scores, [25, 50, 75])  # q2 == median
iqr = q3 - q1
print(f"Q1 (25th percentile): {q1}")
print(f"Q2 (50th percentile): {q2}")
print(f"Q3 (75th percentile): {q3}")
print(f"IQR: {iqr}")

# Horizontal box plot with the quartiles annotated above the box.
plt.figure(figsize=(8, 6))
plt.boxplot(scores, vert=False)
plt.title('Box Plot: Quartiles and Outliers')
plt.xlabel('Score')
plt.grid(True, alpha=0.3)
for position, label in ((q1, f'Q1: {q1}'), (q2, f'Q2: {q2}'), (q3, f'Q3: {q3}')):
    plt.text(position, 1.1, label, ha='center')
plt.text(q3 + 5, 1.1, f'IQR: {iqr}', ha='left')
plt.tight_layout()
plt.show()
Distribution Shape
# Shape of the distribution: skewness (asymmetry) and kurtosis (tails).
skewness = scipy_stats.skew(scores)
print(f"Skewness: {skewness:.3f}")
if skewness > 0.5:
    shape_label = "right-skewed"
elif skewness < -0.5:
    shape_label = "left-skewed"
else:
    shape_label = "approximately symmetric"
print("Distribution is:", shape_label)

kurtosis = scipy_stats.kurtosis(scores)  # excess kurtosis: normal -> 0
print(f"Kurtosis: {kurtosis:.3f}")
if kurtosis > 0.5:
    tail_label = "heavy tails"
elif kurtosis < -0.5:
    tail_label = "light tails"
else:
    tail_label = "normal tails"
print("Distribution has:", tail_label)

# Contrast a symmetric sample with a strongly right-skewed one.
normal_data = np.random.normal(75, 10, 1000)
skewed_data = np.random.exponential(2, 1000) * 20 + 50
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, sample, title in zip(axes,
                             (normal_data, skewed_data),
                             ('Normal Distribution', 'Skewed Distribution')):
    ax.hist(sample, bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    # Mean chases the long tail; the gap to the median reveals the skew.
    ax.axvline(np.mean(sample), color='red', linestyle='--', label='Mean')
    ax.axvline(np.median(sample), color='green', linestyle='--', label='Median')
    ax.legend()
plt.tight_layout()
plt.show()
Correlation Analysis
Pearson Correlation
# Does study time track exam performance?
study_hours = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18, 20])
exam_scores = np.array([65, 70, 75, 80, 82, 85, 88, 90, 92, 95])

# Pearson r measures the strength of the *linear* relationship.
correlation = np.corrcoef(study_hours, exam_scores)[0, 1]
print(f"Pearson correlation: {correlation:.3f}")

# Translate |r| into a rough verbal strength rating.
magnitude = abs(correlation)
if magnitude < 0.3:
    strength = "weak"
elif magnitude < 0.7:
    strength = "moderate"
else:
    strength = "strong"
direction = "positive" if correlation > 0 else "negative"
print(f"This is a {strength} {direction} correlation")

# Scatter plot with a least-squares trend line overlaid.
plt.figure(figsize=(8, 6))
plt.scatter(study_hours, exam_scores, s=50, alpha=0.7)
plt.title(f'Study Hours vs Exam Scores\nCorrelation: {correlation:.3f}')
plt.xlabel('Study Hours')
plt.ylabel('Exam Score')
plt.grid(True, alpha=0.3)
slope, intercept = np.polyfit(study_hours, exam_scores, 1)
trend = np.poly1d((slope, intercept))
plt.plot(study_hours, trend(study_hours), "r--", alpha=0.8)
plt.tight_layout()
plt.show()
Spearman Rank Correlation
# A perfect monotonic but non-linear relationship (y = x^2).
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 4, 9, 16, 25, 36, 49, 64, 81, 100])  # Perfect quadratic

# Pearson only captures the linear component of the relationship...
pearson_corr = np.corrcoef(x, y)[0, 1]
print(f"Pearson correlation: {pearson_corr:.3f}")

# ...while Spearman operates on ranks, so any monotonic curve scores 1.0.
spearman_corr = scipy_stats.spearmanr(x, y)[0]
print(f"Spearman correlation: {spearman_corr:.3f}")

# Plot the points with a quadratic fit for reference.
plt.figure(figsize=(8, 6))
plt.scatter(x, y, s=50, alpha=0.7)
plt.title('Non-linear Relationship')
plt.xlabel('X')
plt.ylabel('Y (X²)')
plt.grid(True, alpha=0.3)
coeffs = np.polyfit(x, y, 2)
quad_fit = np.poly1d(coeffs)
x_smooth = np.linspace(1, 10, 100)
plt.plot(x_smooth, quad_fit(x_smooth), "r--", alpha=0.8, label='Quadratic fit')
plt.legend()
plt.tight_layout()
plt.show()
Correlation Matrix
# Build a small synthetic dataset of five loosely related variables.
np.random.seed(42)
data = pd.DataFrame({
    'height': np.random.normal(170, 10, 100),
    'weight': np.random.normal(70, 15, 100),
    'age': np.random.normal(30, 8, 100),
    'income': np.random.normal(50000, 15000, 100),
    'education_years': np.random.normal(16, 3, 100),
})

# Blend columns so the matrix shows some genuine structure.
data['weight'] = data['height'] * 0.5 + data['weight'] * 0.5  # Height-weight correlation
data['income'] = data['education_years'] * 2000 + data['income'] * 0.3  # Education-income correlation

# Pairwise Pearson correlations for every column combination.
correlation_matrix = data.corr()
print("Correlation Matrix:")
print(correlation_matrix.round(3))

# Heatmap with each coefficient written into its cell.
plt.figure(figsize=(8, 6))
heatmap = plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar(heatmap, label='Correlation')
columns = correlation_matrix.columns
plt.xticks(range(len(columns)), columns, rotation=45)
plt.yticks(range(len(columns)), columns)
for row in range(len(columns)):
    for col in range(len(columns)):
        plt.text(col, row, f'{correlation_matrix.iloc[row, col]:.2f}',
                 ha='center', va='center', color='white', fontweight='bold')
plt.title('Variable Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
Hypothesis Testing
One-Sample T-Test
# One-sample t-test: is the average height different from 170 cm?
# (Unseeded draw, so the printed conclusion varies run to run.)
heights = np.random.normal(172, 8, 50)  # Sample data

# H0: mean height = 170 cm    H1: mean height != 170 cm
t_stat, p_value = scipy_stats.ttest_1samp(heights, 170)
sample_mean = np.mean(heights)
print(f"Sample mean height: {sample_mean:.2f} cm")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject null hypothesis - heights are significantly different from 170 cm")
else:
    print("Fail to reject null hypothesis - no significant difference from 170 cm")

# Histogram with the hypothesized and observed means marked.
plt.figure(figsize=(8, 5))
plt.hist(heights, bins=15, alpha=0.7, edgecolor='black')
plt.axvline(170, color='red', linestyle='--', linewidth=2, label='Hypothesized mean (170 cm)')
plt.axvline(sample_mean, color='blue', linestyle='--', linewidth=2,
            label=f'Sample mean ({sample_mean:.1f} cm)')
plt.title('Height Distribution with Hypothesis Test')
plt.xlabel('Height (cm)')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Two-Sample T-Test
# Independent-samples t-test: do two teaching methods differ?
method_a_scores = np.random.normal(78, 10, 30)
method_b_scores = np.random.normal(82, 12, 30)

# H0: mean scores are equal    H1: mean scores are different
t_stat, p_value = scipy_stats.ttest_ind(method_a_scores, method_b_scores)
print(f"Method A mean: {np.mean(method_a_scores):.2f}")
print(f"Method B mean: {np.mean(method_b_scores):.2f}")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
verdict = ("Significant difference between teaching methods!"
           if p_value < 0.05
           else "No significant difference between teaching methods.")
print(verdict)

# Box plots plus a banner stating whether the result was significant.
plt.figure(figsize=(8, 6))
plt.boxplot([method_a_scores, method_b_scores], labels=['Method A', 'Method B'])
plt.title('Teaching Method Comparison')
plt.ylabel('Test Score')
plt.grid(True, alpha=0.3)
sig_text = "p < 0.05" if p_value < 0.05 else "p ≥ 0.05"
tallest = max(max(method_a_scores), max(method_b_scores))
plt.text(1.5, tallest + 2,
         f'Significance: {sig_text}', ha='center', fontsize=12,
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
plt.tight_layout()
plt.show()
Chi-Square Test
# Chi-square test of independence: education level vs job satisfaction.
from scipy.stats import chi2_contingency

# Rows: education level; columns: Satisfied, Neutral, Dissatisfied.
contingency_table = np.array([
    [30, 10, 5],   # High school
    [45, 15, 10],  # Bachelor's
    [50, 20, 15],  # Graduate
])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("Contingency Table:")
print("Education Level vs Job Satisfaction")
print(contingency_table)
print(f"\nChi-square statistic: {chi2:.3f}")
print(f"P-value: {p_value:.3f}")
print(f"Degrees of freedom: {dof}")
if p_value < 0.05:
    print("Significant relationship between education and job satisfaction!")
else:
    print("No significant relationship between education and job satisfaction.")

# Grouped bar chart: one cluster of three bars per education level.
education_levels = ['High School', "Bachelor's", 'Graduate']
satisfaction_levels = ['Satisfied', 'Neutral', 'Dissatisfied']
fig, ax = plt.subplots(figsize=(10, 6))
bars = []
x = np.arange(len(education_levels))
width = 0.25
for offset, level in enumerate(satisfaction_levels):
    bars.append(ax.bar(x + offset * width, contingency_table[:, offset],
                       width=width, label=level, alpha=0.8))
ax.set_xlabel('Education Level')
ax.set_ylabel('Count')
ax.set_title('Education vs Job Satisfaction')
ax.set_xticks(x + width)
ax.set_xticklabels(education_levels)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Confidence Intervals
Mean Confidence Interval
def confidence_interval(data, confidence=0.95):
    """Return the (lower, upper) t-based confidence interval for the mean.

    Uses the Student-t critical value with n-1 degrees of freedom, so it
    is valid for small samples with unknown population variance.
    """
    center = np.mean(data)
    std_err = scipy_stats.sem(data)  # standard error of the mean
    t_crit = scipy_stats.t.ppf((1 + confidence) / 2, len(data) - 1)
    half_width = std_err * t_crit
    return center - half_width, center + half_width

# Sample data (unseeded, so the numbers vary run to run).
scores = np.random.normal(75, 10, 30)
mean_score = np.mean(scores)

ci_lower, ci_upper = confidence_interval(scores)
print(f"Sample mean: {mean_score:.2f}")
print(f"95% Confidence interval: ({ci_lower:.2f}, {ci_upper:.2f})")

# Histogram with the mean and both CI bounds marked.
plt.figure(figsize=(8, 5))
plt.hist(scores, bins=10, alpha=0.7, edgecolor='black')
plt.axvline(mean_score, color='red', linestyle='-', linewidth=2, label='Sample mean')
plt.axvline(ci_lower, color='blue', linestyle='--', linewidth=2, label='95% CI lower')
plt.axvline(ci_upper, color='blue', linestyle='--', linewidth=2, label='95% CI upper')
plt.title('Confidence Interval for Mean Score')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Interpretation
print("We are 95% confident that the true population mean lies between")
print(f"{ci_lower:.2f} and {ci_upper:.2f}")
Proportion Confidence Interval
def proportion_ci(successes, total, confidence=0.95):
    """Normal-approximation (Wald) confidence interval for a proportion."""
    p_hat = successes / total
    std_err = np.sqrt(p_hat * (1 - p_hat) / total)
    z_crit = scipy_stats.norm.ppf((1 + confidence) / 2)
    half_width = z_crit * std_err
    return p_hat - half_width, p_hat + half_width

# Example: survey of 1000 respondents, 750 of them satisfied.
total_respondents = 1000
satisfied_customers = 750
ci_lower, ci_upper = proportion_ci(satisfied_customers, total_respondents)
print(f"Sample proportion: {satisfied_customers/total_respondents:.1%}")
print(f"95% CI for satisfaction rate: {ci_lower:.1%} to {ci_upper:.1%}")

# Bar chart of the split, annotated with the point estimate and CI.
plt.figure(figsize=(8, 5))
plt.bar(['Satisfied', 'Not Satisfied'],
        [satisfied_customers, total_respondents - satisfied_customers],
        color=['green', 'red'], alpha=0.7)
plt.title('Customer Satisfaction Survey Results')
plt.ylabel('Number of Respondents')
plt.text(0, satisfied_customers + 20, f'{satisfied_customers/total_respondents:.1%}',
         ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')
plt.text(0.5, total_respondents * 0.8,
         f'95% CI: {ci_lower:.1%} - {ci_upper:.1%}',
         ha='center', fontsize=11,
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
plt.tight_layout()
plt.show()
Practical Examples
Example 1: A/B Testing Analysis
# A/B test: did the variant page convert better than the control?
np.random.seed(42)

# Each visitor is one Bernoulli trial (1 = converted, 0 = not).
control_conversions = np.random.binomial(1, 0.12, 10000)  # true rate 12%
variant_conversions = np.random.binomial(1, 0.15, 10000)  # true rate 15%
control_rate = np.mean(control_conversions)
variant_rate = np.mean(variant_conversions)
lift = variant_rate - control_rate

print("A/B Test Results:")
print(f"Control conversion rate: {control_rate:.2%}")
print(f"Variant conversion rate: {variant_rate:.2%}")
print(f"Absolute improvement: {lift:.2%}")
print(f"Relative improvement: {(lift / control_rate):.1%}")

# Significance test on the raw 0/1 outcomes.
# NOTE(review): a two-proportion z-test is the textbook choice here; with
# n = 10,000 per arm the t-test on 0/1 data gives practically the same answer.
t_stat, p_value = scipy_stats.ttest_ind(control_conversions, variant_conversions)
print(f"\nStatistical test:")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
if p_value < 0.05:
    print("✅ Significant improvement! The variant performs better.")
else:
    print("❌ No significant difference between control and variant.")

# Wald confidence intervals per arm (proportion_ci defined earlier in this lesson).
control_ci = proportion_ci(np.sum(control_conversions), len(control_conversions))
variant_ci = proportion_ci(np.sum(variant_conversions), len(variant_conversions))
print(f"\n95% CI for control: {control_ci[0]:.2%} - {control_ci[1]:.2%}")
print(f"95% CI for variant: {variant_ci[0]:.2%} - {variant_ci[1]:.2%}")

# Left panel: rates with CI error bars. Right panel: the raw 0/1 histograms.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
rates = [control_rate, variant_rate]
errors = [[control_rate - control_ci[0], variant_rate - variant_ci[0]],
          [control_ci[1] - control_rate, variant_ci[1] - variant_rate]]
axes[0].bar(['Control', 'Variant'], rates, yerr=errors, capsize=5,
            color=['skyblue', 'orange'], alpha=0.7)
axes[0].set_title('Conversion Rates with 95% CI')
axes[0].set_ylabel('Conversion Rate')
axes[0].set_ylim(0, max(variant_ci[1], control_ci[1]) * 1.1)
axes[0].grid(True, alpha=0.3, axis='y')
for idx, rate in enumerate(rates):
    axes[0].text(idx, rate + 0.005, f'{rate:.1%}', ha='center', va='bottom', fontweight='bold')
axes[1].hist(control_conversions, bins=2, alpha=0.7, label='Control', color='skyblue')
axes[1].hist(variant_conversions, bins=2, alpha=0.7, label='Variant', color='orange')
axes[1].set_title('Conversion Distributions')
axes[1].set_xlabel('Converted (1) vs Not (0)')
axes[1].set_ylabel('Frequency')
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Example 2: Customer Segmentation Analysis
# Build a synthetic customer table, then clip each column to a realistic range.
np.random.seed(42)
customers = pd.DataFrame({
    'customer_id': range(1, 1001),
    'age': np.random.normal(35, 10, 1000),
    'annual_income': np.random.normal(60000, 20000, 1000),
    'spending_score': np.random.normal(50, 25, 1000),  # 0-100 scale
    'membership_years': np.random.normal(3, 2, 1000)
})
for column, low, high in (('age', 18, 80),
                          ('annual_income', 20000, 150000),
                          ('spending_score', 0, 100),
                          ('membership_years', 0, 10)):
    customers[column] = np.clip(customers[column], low, high)

print("Customer Demographics Summary:")
print(customers[['age', 'annual_income', 'spending_score', 'membership_years']].describe())

def spending_segment(score):
    """Bucket a 0-100 spending score into High/Medium/Low spender tiers."""
    if score >= 75:
        return 'High Spender'
    if score >= 50:
        return 'Medium Spender'
    return 'Low Spender'

customers['spending_segment'] = customers['spending_score'].apply(spending_segment)
# Per-segment profile: size, mean age, income, and tenure.
segments = customers.groupby('spending_segment')
print("\nSegment Analysis:")
for name, group in segments:
    print(f"\n{name}:")
    print(f" Count: {len(group)}")
    print(f" Avg Age: {group['age'].mean():.1f}")
    print(f" Avg Income: ${group['annual_income'].mean():,.0f}")
    print(f" Avg Membership: {group['membership_years'].mean():.1f} years")

# Do high spenders earn significantly more than everyone else?
is_high = customers['spending_segment'] == 'High Spender'
high_spenders = customers[is_high]['annual_income']
others = customers[~is_high]['annual_income']
t_stat, p_value = scipy_stats.ttest_ind(high_spenders, others)
print(f"\nIncome comparison (High spenders vs Others):")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
print("High spenders have significantly higher income!" if p_value < 0.05 else "No significant income difference.")

# Pairwise correlations among the numeric customer attributes.
correlation_matrix = customers[['age', 'annual_income', 'spending_score', 'membership_years']].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix.round(3))
# Four-panel overview of the segmentation results.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Top row: spending score against age and against income.
scatter_specs = (
    (axes[0, 0], 'age', 'Age', 'Age vs Spending Score'),
    (axes[0, 1], 'annual_income', 'Annual Income', 'Income vs Spending Score'),
)
for ax, column, xlabel, title in scatter_specs:
    ax.scatter(customers[column], customers['spending_score'], alpha=0.6)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Spending Score')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# Bottom left: how many customers fall into each segment.
segment_counts = customers['spending_segment'].value_counts()
axes[1, 0].bar(segment_counts.index, segment_counts.values,
               color=['green', 'orange', 'red'], alpha=0.7)
axes[1, 0].set_title('Customer Segments')
axes[1, 0].set_ylabel('Number of Customers')
axes[1, 0].tick_params(axis='x', rotation=45)

# Bottom right: membership tenure distribution per segment.
segment_data = [customers[customers['spending_segment'] == seg]['membership_years']
                for seg in ['High Spender', 'Medium Spender', 'Low Spender']]
axes[1, 1].boxplot(segment_data, labels=['High', 'Medium', 'Low'])
axes[1, 1].set_title('Membership Years by Segment')
axes[1, 1].set_ylabel('Membership Years')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Best Practices
1. Understand Your Data First
# Always start with exploratory analysis
# Illustrative snippet -- assumes `df` is an already-loaded DataFrame
# with columns 'column', 'x', and 'y' (not defined in this lesson).
print(df.describe()) # Summary statistics
print(df.isnull().sum()) # Missing values
plt.hist(df['column']) # Distribution
plt.scatter(df['x'], df['y']) # Relationships
2. Choose Appropriate Tests
# Parametric tests (normal data)
# - One-sample t-test: compare sample mean to known value
# - Two-sample t-test: compare means of two groups
# - ANOVA: compare means of three or more groups
# Non-parametric tests (non-normal data)
# - Mann-Whitney U: compare two groups
# - Kruskal-Wallis: compare three or more groups
# - Chi-square: test relationships between categorical variables
3. Consider Effect Size
# P-values tell you if there's a difference, but not how big
from statistics import mean, stdev

def cohen_d(group1, group2):
    """Calculate Cohen's d effect size"""
    n1, n2 = len(group1), len(group2)
    mean_gap = mean(group1) - mean(group2)
    # Pooled variance weights each group's sample variance by its
    # degrees of freedom (n - 1), then divides by the combined df.
    pooled_var = ((n1 - 1) * stdev(group1) ** 2 +
                  (n2 - 1) * stdev(group2) ** 2) / (n1 + n2 - 2)
    return mean_gap / pooled_var ** 0.5

# Effect size interpretation:
# Small: 0.2, Medium: 0.5, Large: 0.8
4. Avoid Common Mistakes
# Don't confuse correlation with causation
# Correlation shows relationship, not cause-and-effect
# Don't ignore assumptions of statistical tests
# T-tests assume normal distribution and equal variances
# Don't over-rely on p-values
# P < 0.05 doesn't mean the effect is important
# Don't forget about multiple testing
# Testing many hypotheses increases false positive risk
Practice Exercises
Exercise 1: Descriptive Statistics
Analyze a dataset of student grades:
- Calculate mean, median, mode, standard deviation
- Create histograms and box plots
- Identify outliers using IQR method
- Compare performance across different subjects
Exercise 2: Correlation Analysis
Examine relationships in sales data:
- Calculate correlations between variables
- Create scatter plots with trend lines
- Identify strongest and weakest relationships
- Create a correlation heatmap
Exercise 3: Hypothesis Testing
Test hypotheses about customer behavior:
- Test if average purchase amount differs by customer segment
- Compare conversion rates between website versions
- Test relationship between customer satisfaction and loyalty
- Calculate confidence intervals for key metrics
Exercise 4: A/B Test Analysis
Design and analyze an A/B test:
- Calculate sample size requirements
- Perform statistical significance testing
- Calculate confidence intervals
- Create visualizations showing results
Exercise 5: Comprehensive Analysis
Perform a complete statistical analysis:
- Data exploration and cleaning
- Descriptive statistics
- Correlation analysis
- Hypothesis testing
- Create a statistical report with visualizations
Summary
Statistical analysis provides rigorous data insights:
Descriptive Statistics:
# Central tendency
mean = np.mean(data)
median = np.median(data)
mode = scipy_stats.mode(data)
# NOTE(review): since SciPy 1.11 `mode` returns scalar fields
# (use `mode.mode`, not `mode.mode[0]`) -- confirm installed version.
# Variability
std_dev = np.std(data, ddof=1)  # ddof=1 -> sample (n-1) statistics
variance = np.var(data, ddof=1)
iqr = np.subtract(*np.percentile(data, [75, 25]))  # Q3 - Q1
Correlation Analysis:
# Pearson correlation (linear relationships)
pearson_corr = np.corrcoef(x, y)[0, 1]  # [0, 1] picks the off-diagonal entry
# Spearman correlation (monotonic relationships)
spearman_corr = scipy_stats.spearmanr(x, y)[0]  # [0] = statistic; [1] would be the p-value
Hypothesis Testing:
# T-tests
t_stat, p_value = scipy_stats.ttest_ind(group1, group2)  # independent two-sample test
# Chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)  # test of independence
Key Concepts:
- Central tendency and variability measures
- Correlation vs causation
- Hypothesis testing and p-values
- Confidence intervals
- Effect size vs statistical significance
- Choosing appropriate statistical tests
Next: Real-World Project - Complete data analysis project! 🚀