Daily Tech Brief

Top startup stories in your inbox

Subscribe Free

© 2026 rakrisi Daily

Statistical Analysis - Basic Statistics and Correlations

Statistical Analysis: Basic Statistics and Correlations

Welcome to Statistical Analysis! Think of statistics as data’s truth serum - it helps you separate signal from noise, find meaningful patterns, and make confident decisions based on evidence rather than guesswork.

Why Statistics Matters

Data without statistics is just numbers - statistics gives them meaning:

# Motivating demo: eyeballing raw numbers vs running a formal test.
# These four imports are reused by every snippet in this lesson.
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Raw data - hard to interpret
sales_a = [100, 120, 95, 110, 105]
sales_b = [90, 115, 100, 125, 95]

print("Group A sales:", sales_a)
print("Group B sales:", sales_b)
print("A seems higher... but is it significant?")

# Statistical analysis - provides confidence
# Independent two-sample t-test (by default assumes equal variances).
t_stat, p_value = stats.ttest_ind(sales_a, sales_b)
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
print("Conclusion:", "Significant difference!" if p_value < 0.05 else "No significant difference")

# Visualize the comparison
plt.figure(figsize=(8, 5))
# NOTE(review): boxplot's `labels` kwarg is deprecated in Matplotlib >= 3.9
# (renamed `tick_labels`) — confirm the Matplotlib version this targets.
plt.boxplot([sales_a, sales_b], labels=['Group A', 'Group B'])
plt.title('Sales Comparison: Statistical Significance')
plt.ylabel('Sales ($)')
plt.grid(True, alpha=0.3)
plt.show()

Descriptive Statistics

Central Tendency

# Central tendency: mean, median, and mode of a small score sample.
scores = np.array([85, 92, 78, 96, 88, 91, 83, 89, 94, 87])

print("Data:", scores)
print(f"Count: {len(scores)}")
print(f"Sum: {np.sum(scores)}")

# Mean (average)
mean_score = np.mean(scores)
print(f"Mean: {mean_score:.2f}")

# Median (middle value)
median_score = np.median(scores)
print(f"Median: {median_score:.2f}")

# Mode (most frequent value)
from scipy import stats as scipy_stats
# SciPy 1.11 changed mode()'s default to keepdims=False, which returns
# scalars and makes the `.mode[0]` indexing below raise. Pin keepdims=True
# (available since SciPy 1.9) to keep the legacy length-1 array shape.
mode_result = scipy_stats.mode(scores, keepdims=True)
print(f"Mode: {mode_result.mode[0]} (appears {mode_result.count[0]} times)")

# Compare mean, median, mode on a histogram of the scores.
plt.figure(figsize=(10, 6))
plt.hist(scores, bins=8, alpha=0.7, edgecolor='black')
plt.axvline(mean_score, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_score:.1f}')
plt.axvline(median_score, color='green', linestyle='--', linewidth=2, label=f'Median: {median_score:.1f}')
# np.ravel(...)[0] works whether scipy_stats.mode returned a scalar
# (SciPy >= 1.11 default, keepdims=False) or a length-1 array (legacy
# behavior), so this line no longer raises IndexError on new SciPy.
mode_value = np.ravel(mode_result.mode)[0]
plt.axvline(mode_value, color='blue', linestyle='--', linewidth=2, label=f'Mode: {mode_value}')
plt.title('Score Distribution with Central Tendency Measures')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Variability Measures

# Spread / variability of the `scores` array from the previous section.
# Range
data_range = np.ptp(scores)  # peak-to-peak
print(f"Range: {data_range}")

# Variance
# ddof=1 divides by n-1 (Bessel's correction) for an unbiased sample estimate.
variance = np.var(scores, ddof=1)  # ddof=1 for sample variance
print(f"Variance: {variance:.2f}")

# Standard deviation
std_dev = np.std(scores, ddof=1)
print(f"Standard Deviation: {std_dev:.2f}")

# Coefficient of variation (relative variability)
# Expresses spread as a percentage of the mean; only meaningful when the
# mean is positive, as it is for these scores.
cv = (std_dev / mean_score) * 100
print(f"Coefficient of Variation: {cv:.2f}%")

# Quartiles and IQR
q1 = np.percentile(scores, 25)
q2 = np.percentile(scores, 50)  # Same as median
q3 = np.percentile(scores, 75)
iqr = q3 - q1  # middle 50% of the data

print(f"Q1 (25th percentile): {q1}")
print(f"Q2 (50th percentile): {q2}")
print(f"Q3 (75th percentile): {q3}")
print(f"IQR: {iqr}")

# Visualize quartiles
plt.figure(figsize=(8, 6))
plt.boxplot(scores, vert=False)
plt.title('Box Plot: Quartiles and Outliers')
plt.xlabel('Score')
plt.grid(True, alpha=0.3)

# Add quartile annotations (y=1.1 places the labels just above the box)
plt.text(q1, 1.1, f'Q1: {q1}', ha='center')
plt.text(q2, 1.1, f'Q2: {q2}', ha='center')
plt.text(q3, 1.1, f'Q3: {q3}', ha='center')
plt.text(q3 + 5, 1.1, f'IQR: {iqr}', ha='left')

plt.tight_layout()
plt.show()

Distribution Shape

# Shape of the distribution: skewness (asymmetry) and kurtosis (tails).
# Skewness (asymmetry)
skewness = scipy_stats.skew(scores)
print(f"Skewness: {skewness:.3f}")
print("Distribution is:", "right-skewed" if skewness > 0.5 else "left-skewed" if skewness < -0.5 else "approximately symmetric")

# Kurtosis (tail heaviness)
# scipy's kurtosis() is Fisher (excess) kurtosis by default, so 0 means
# normal-like tails — hence the 0-centered thresholds below.
kurtosis = scipy_stats.kurtosis(scores)
print(f"Kurtosis: {kurtosis:.3f}")
print("Distribution has:", "heavy tails" if kurtosis > 0.5 else "light tails" if kurtosis < -0.5 else "normal tails")

# Create different distributions to compare
# These draws depend on the global NumPy RNG state (no seed set here).
normal_data = np.random.normal(75, 10, 1000)
skewed_data = np.random.exponential(2, 1000) * 20 + 50

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].hist(normal_data, bins=30, alpha=0.7, edgecolor='black')
axes[0].set_title('Normal Distribution')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].axvline(np.mean(normal_data), color='red', linestyle='--', label='Mean')
axes[0].axvline(np.median(normal_data), color='green', linestyle='--', label='Median')
axes[0].legend()

# In the right-skewed panel the mean is pulled above the median by the tail.
axes[1].hist(skewed_data, bins=30, alpha=0.7, edgecolor='black')
axes[1].set_title('Skewed Distribution')
axes[1].set_xlabel('Value')
axes[1].set_ylabel('Frequency')
axes[1].axvline(np.mean(skewed_data), color='red', linestyle='--', label='Mean')
axes[1].axvline(np.median(skewed_data), color='green', linestyle='--', label='Median')
axes[1].legend()

plt.tight_layout()
plt.show()

Correlation Analysis

Pearson Correlation

# Pearson correlation on a strongly linear toy relationship.
# Study time vs exam scores
study_hours = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18, 20])
exam_scores = np.array([65, 70, 75, 80, 82, 85, 88, 90, 92, 95])

# Calculate correlation coefficient
# corrcoef returns a 2x2 matrix; [0, 1] is the off-diagonal correlation.
correlation = np.corrcoef(study_hours, exam_scores)[0, 1]
print(f"Pearson correlation: {correlation:.3f}")

# Interpretation (conventional rule-of-thumb thresholds)
if abs(correlation) < 0.3:
    strength = "weak"
elif abs(correlation) < 0.7:
    strength = "moderate"
else:
    strength = "strong"

direction = "positive" if correlation > 0 else "negative"
print(f"This is a {strength} {direction} correlation")

# Scatter plot with correlation
plt.figure(figsize=(8, 6))
plt.scatter(study_hours, exam_scores, s=50, alpha=0.7)
plt.title(f'Study Hours vs Exam Scores\nCorrelation: {correlation:.3f}')
plt.xlabel('Study Hours')
plt.ylabel('Exam Score')
plt.grid(True, alpha=0.3)

# Add trend line (degree-1 polyfit = least-squares regression line)
z = np.polyfit(study_hours, exam_scores, 1)
p = np.poly1d(z)
plt.plot(study_hours, p(study_hours), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

Spearman Rank Correlation

# Spearman vs Pearson on a monotonic but non-linear relationship.
# Non-linear relationship example
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 4, 9, 16, 25, 36, 49, 64, 81, 100])  # Perfect quadratic

# Pearson correlation (linear) — high but below 1 because y is not linear in x.
pearson_corr = np.corrcoef(x, y)[0, 1]
print(f"Pearson correlation: {pearson_corr:.3f}")

# Spearman correlation (monotonic) — exactly 1 here: ranks match perfectly.
spearman_corr = scipy_stats.spearmanr(x, y)[0]
print(f"Spearman correlation: {spearman_corr:.3f}")

# Visualize
plt.figure(figsize=(8, 6))
plt.scatter(x, y, s=50, alpha=0.7)
plt.title('Non-linear Relationship')
plt.xlabel('X')
plt.ylabel('Y (X²)')
plt.grid(True, alpha=0.3)

# Add quadratic fit
z = np.polyfit(x, y, 2)
p = np.poly1d(z)
x_smooth = np.linspace(1, 10, 100)
plt.plot(x_smooth, p(x_smooth), "r--", alpha=0.8, label='Quadratic fit')

plt.legend()
plt.tight_layout()
plt.show()

Correlation Matrix

# Pairwise correlations across several synthetic variables, as a heatmap.
# Multiple variables
np.random.seed(42)
data = pd.DataFrame({
    'height': np.random.normal(170, 10, 100),
    'weight': np.random.normal(70, 15, 100),
    'age': np.random.normal(30, 8, 100),
    'income': np.random.normal(50000, 15000, 100),
    'education_years': np.random.normal(16, 3, 100)
})

# Add some realistic correlations by mixing columns linearly
data['weight'] = data['height'] * 0.5 + data['weight'] * 0.5  # Height-weight correlation
data['income'] = data['education_years'] * 2000 + data['income'] * 0.3  # Education-income correlation

# Calculate correlation matrix (pairwise Pearson correlations)
correlation_matrix = data.corr()
print("Correlation Matrix:")
print(correlation_matrix.round(3))

# Visualize correlation matrix
plt.figure(figsize=(8, 6))
heatmap = plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto')

plt.colorbar(heatmap, label='Correlation')
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=45)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)

# Add correlation values
# NOTE(review): white text may be hard to read on the pale mid-range cells
# of the coolwarm colormap — consider a contrast-dependent text color.
for i in range(len(correlation_matrix.columns)):
    for j in range(len(correlation_matrix.columns)):
        plt.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}', 
                ha='center', va='center', color='white', fontweight='bold')

plt.title('Variable Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Hypothesis Testing

One-Sample T-Test

# One-sample t-test: is the sample mean consistent with a hypothesized value?
# Test if average height is different from 170 cm
heights = np.random.normal(172, 8, 50)  # Sample data (drawn with true mean 172)

# Hypotheses:
# H0: mean height = 170 cm (null hypothesis)
# H1: mean height ≠ 170 cm (alternative hypothesis)

t_stat, p_value = scipy_stats.ttest_1samp(heights, 170)

print(f"Sample mean height: {np.mean(heights):.2f} cm")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject null hypothesis - heights are significantly different from 170 cm")
else:
    print("Fail to reject null hypothesis - no significant difference from 170 cm")

# Visualize: histogram with hypothesized vs observed mean overlaid
plt.figure(figsize=(8, 5))
plt.hist(heights, bins=15, alpha=0.7, edgecolor='black')
plt.axvline(170, color='red', linestyle='--', linewidth=2, label='Hypothesized mean (170 cm)')
plt.axvline(np.mean(heights), color='blue', linestyle='--', linewidth=2, 
           label=f'Sample mean ({np.mean(heights):.1f} cm)')
plt.title('Height Distribution with Hypothesis Test')
plt.xlabel('Height (cm)')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Two-Sample T-Test

# Independent two-sample t-test: do two group means differ?
# Compare test scores between two teaching methods
method_a_scores = np.random.normal(78, 10, 30)
method_b_scores = np.random.normal(82, 12, 30)

# Hypotheses:
# H0: mean scores are equal
# H1: mean scores are different

# ttest_ind assumes equal variances by default (Student's t, not Welch's).
t_stat, p_value = scipy_stats.ttest_ind(method_a_scores, method_b_scores)

print(f"Method A mean: {np.mean(method_a_scores):.2f}")
print(f"Method B mean: {np.mean(method_b_scores):.2f}")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

if p_value < 0.05:
    print("Significant difference between teaching methods!")
else:
    print("No significant difference between teaching methods.")

# Visualize comparison
plt.figure(figsize=(8, 6))
plt.boxplot([method_a_scores, method_b_scores], labels=['Method A', 'Method B'])
plt.title('Teaching Method Comparison')
plt.ylabel('Test Score')
plt.grid(True, alpha=0.3)

# Add significance annotation above the taller box
sig_text = "p < 0.05" if p_value < 0.05 else "p ≥ 0.05"
plt.text(1.5, max(max(method_a_scores), max(method_b_scores)) + 2, 
         f'Significance: {sig_text}', ha='center', fontsize=12, 
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()

Chi-Square Test

# Chi-square test of independence on a 3x3 contingency table.
# Test if there's a relationship between education and job satisfaction
from scipy.stats import chi2_contingency

# Contingency table: rows = education level, columns = satisfaction level
contingency_table = np.array([
    [30, 10, 5],  # High school: Satisfied, Neutral, Dissatisfied
    [45, 15, 10], # Bachelor's: Satisfied, Neutral, Dissatisfied
    [50, 20, 15]  # Graduate: Satisfied, Neutral, Dissatisfied
])

# `expected` holds the counts implied by independence of rows and columns.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print("Contingency Table:")
print("Education Level vs Job Satisfaction")
print(contingency_table)
print(f"\nChi-square statistic: {chi2:.3f}")
print(f"P-value: {p_value:.3f}")
print(f"Degrees of freedom: {dof}")

if p_value < 0.05:
    print("Significant relationship between education and job satisfaction!")
else:
    print("No significant relationship between education and job satisfaction.")

# Visualize as grouped bars (one group per education level)
education_levels = ['High School', "Bachelor's", 'Graduate']
satisfaction_levels = ['Satisfied', 'Neutral', 'Dissatisfied']

fig, ax = plt.subplots(figsize=(10, 6))
bars = []
x = np.arange(len(education_levels))

# Offset each satisfaction level by 0.25 so the three bars sit side by side.
for i, level in enumerate(satisfaction_levels):
    bars.append(ax.bar(x + i*0.25, contingency_table[:, i], width=0.25, 
                      label=level, alpha=0.8))

ax.set_xlabel('Education Level')
ax.set_ylabel('Count')
ax.set_title('Education vs Job Satisfaction')
ax.set_xticks(x + 0.25)
ax.set_xticklabels(education_levels)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Confidence Intervals

Mean Confidence Interval

# Calculate 95% confidence interval for the mean
def confidence_interval(data, confidence=0.95):
    """Return the (lower, upper) t-based confidence interval for the mean of data.

    Uses the Student's t critical value with len(data) - 1 degrees of freedom,
    so it is appropriate for small samples with unknown population variance.
    """
    center = np.mean(data)
    std_err = scipy_stats.sem(data)  # standard error of the mean
    t_crit = scipy_stats.t.ppf((1 + confidence) / 2, len(data) - 1)
    half_width = std_err * t_crit
    return center - half_width, center + half_width

# Demo: CI around the mean of a fresh random sample.
# NOTE: this rebinds `scores` and `mean_score`, shadowing the integer score
# array used in the descriptive-statistics sections above.
# Sample data
scores = np.random.normal(75, 10, 30)
mean_score = np.mean(scores)

# Calculate confidence interval
ci_lower, ci_upper = confidence_interval(scores)
print(f"Sample mean: {mean_score:.2f}")
print(f"95% Confidence interval: ({ci_lower:.2f}, {ci_upper:.2f})")

# Visualize
plt.figure(figsize=(8, 5))
plt.hist(scores, bins=10, alpha=0.7, edgecolor='black')
plt.axvline(mean_score, color='red', linestyle='-', linewidth=2, label='Sample mean')
plt.axvline(ci_lower, color='blue', linestyle='--', linewidth=2, label='95% CI lower')
plt.axvline(ci_upper, color='blue', linestyle='--', linewidth=2, label='95% CI upper')
plt.title('Confidence Interval for Mean Score')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Interpretation
print("We are 95% confident that the true population mean lies between")
print(f"{ci_lower:.2f} and {ci_upper:.2f}")

Proportion Confidence Interval

# Confidence interval for proportion
def proportion_ci(successes, total, confidence=0.95):
    p = successes / total
    se = np.sqrt(p * (1 - p) / total)
    z = scipy_stats.norm.ppf((1 + confidence) / 2)
    margin = z * se
    return p - margin, p + margin

# Demo: CI for a survey satisfaction proportion, with a bar chart.
# Example: Survey results
total_respondents = 1000
satisfied_customers = 750

ci_lower, ci_upper = proportion_ci(satisfied_customers, total_respondents)
print(f"Sample proportion: {satisfied_customers/total_respondents:.1%}")
print(f"95% CI for satisfaction rate: {ci_lower:.1%} to {ci_upper:.1%}")

# Visualize
plt.figure(figsize=(8, 5))
plt.bar(['Satisfied', 'Not Satisfied'], 
        [satisfied_customers, total_respondents - satisfied_customers],
        color=['green', 'red'], alpha=0.7)
plt.title('Customer Satisfaction Survey Results')
plt.ylabel('Number of Respondents')
# Percentage label just above the 'Satisfied' bar (x position 0)
plt.text(0, satisfied_customers + 20, f'{satisfied_customers/total_respondents:.1%}', 
         ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')

# Add confidence interval annotation between the two bars
plt.text(0.5, total_respondents * 0.8, 
         f'95% CI: {ci_lower:.1%} - {ci_upper:.1%}', 
         ha='center', fontsize=11,
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()

Practical Examples

Example 1: A/B Testing Analysis

# Worked example: simulated A/B test with significance test and CIs.
# A/B test results for website conversion
np.random.seed(42)

# Simulate conversion rates (each draw is 0/1: converted or not)
control_conversions = np.random.binomial(1, 0.12, 10000)  # 12% conversion
variant_conversions = np.random.binomial(1, 0.15, 10000)  # 15% conversion

control_rate = np.mean(control_conversions)
variant_rate = np.mean(variant_conversions)

print("A/B Test Results:")
print(f"Control conversion rate: {control_rate:.2%}")
print(f"Variant conversion rate: {variant_rate:.2%}")
print(f"Absolute improvement: {(variant_rate - control_rate):.2%}")
print(f"Relative improvement: {((variant_rate - control_rate) / control_rate):.1%}")

# Statistical significance test
# A t-test on 0/1 outcomes approximates a two-proportion z-test; with
# n=10000 per arm the approximation is very close.
t_stat, p_value = scipy_stats.ttest_ind(control_conversions, variant_conversions)
print(f"\nStatistical test:")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

if p_value < 0.05:
    print("✅ Significant improvement! The variant performs better.")
else:
    print("❌ No significant difference between control and variant.")

# Confidence intervals (uses proportion_ci defined earlier)
control_ci = proportion_ci(np.sum(control_conversions), len(control_conversions))
variant_ci = proportion_ci(np.sum(variant_conversions), len(variant_conversions))

print(f"\n95% CI for control: {control_ci[0]:.2%} - {control_ci[1]:.2%}")
print(f"95% CI for variant: {variant_ci[0]:.2%} - {variant_ci[1]:.2%}")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Conversion rates comparison
rates = [control_rate, variant_rate]
# yerr wants [[lower offsets], [upper offsets]] — distances from the bar top.
errors = [[control_rate - control_ci[0], variant_rate - variant_ci[0]], 
          [control_ci[1] - control_rate, variant_ci[1] - variant_rate]]

axes[0].bar(['Control', 'Variant'], rates, yerr=errors, capsize=5, 
            color=['skyblue', 'orange'], alpha=0.7)
axes[0].set_title('Conversion Rates with 95% CI')
axes[0].set_ylabel('Conversion Rate')
axes[0].set_ylim(0, max(variant_ci[1], control_ci[1]) * 1.1)
axes[0].grid(True, alpha=0.3, axis='y')

# Add percentage labels (the `ci` loop variable is unused here)
for i, (rate, ci) in enumerate(zip(rates, [control_ci, variant_ci])):
    axes[0].text(i, rate + 0.005, f'{rate:.1%}', ha='center', va='bottom', fontweight='bold')

# Distribution comparison (two bins: converted vs not)
axes[1].hist(control_conversions, bins=2, alpha=0.7, label='Control', color='skyblue')
axes[1].hist(variant_conversions, bins=2, alpha=0.7, label='Variant', color='orange')
axes[1].set_title('Conversion Distributions')
axes[1].set_xlabel('Converted (1) vs Not (0)')
axes[1].set_ylabel('Frequency')
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Example 2: Customer Segmentation Analysis

# Build a synthetic 1000-row customer table for the segmentation example.
# Customer data analysis
np.random.seed(42)

customers = pd.DataFrame({
    'customer_id': range(1, 1001),
    'age': np.random.normal(35, 10, 1000),
    'annual_income': np.random.normal(60000, 20000, 1000),
    'spending_score': np.random.normal(50, 25, 1000),  # 0-100 scale
    'membership_years': np.random.normal(3, 2, 1000)
})

# Clip values to realistic ranges
# (clipping after sampling piles mass at the bounds rather than truncating
# the distribution, which is fine for a demo)
customers['age'] = np.clip(customers['age'], 18, 80)
customers['annual_income'] = np.clip(customers['annual_income'], 20000, 150000)
customers['spending_score'] = np.clip(customers['spending_score'], 0, 100)
customers['membership_years'] = np.clip(customers['membership_years'], 0, 10)

print("Customer Demographics Summary:")
print(customers[['age', 'annual_income', 'spending_score', 'membership_years']].describe())

# Segment customers by spending score
def spending_segment(score):
    """Map a 0-100 spending score onto one of three named spending tiers."""
    if score < 50:
        return 'Low Spender'
    if score < 75:
        return 'Medium Spender'
    return 'High Spender'

# Label every customer, then compare the segments statistically.
customers['spending_segment'] = customers['spending_score'].apply(spending_segment)

# Statistical comparison between segments
segments = customers.groupby('spending_segment')

print("\nSegment Analysis:")
for name, group in segments:
    print(f"\n{name}:")
    print(f"  Count: {len(group)}")
    print(f"  Avg Age: {group['age'].mean():.1f}")
    print(f"  Avg Income: ${group['annual_income'].mean():,.0f}")
    print(f"  Avg Membership: {group['membership_years'].mean():.1f} years")

# Test if high spenders have significantly higher income
high_spenders = customers[customers['spending_segment'] == 'High Spender']['annual_income']
others = customers[customers['spending_segment'] != 'High Spender']['annual_income']

t_stat, p_value = scipy_stats.ttest_ind(high_spenders, others)
print(f"\nIncome comparison (High spenders vs Others):")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
print("High spenders have significantly higher income!" if p_value < 0.05 else "No significant income difference.")

# Correlation analysis over the four numeric customer attributes
correlation_matrix = customers[['age', 'annual_income', 'spending_score', 'membership_years']].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix.round(3))

# Four-panel figure summarizing the customer segmentation results.
# Visualize key relationships
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Age vs Spending Score
axes[0, 0].scatter(customers['age'], customers['spending_score'], alpha=0.6)
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Spending Score')
axes[0, 0].set_title('Age vs Spending Score')
axes[0, 0].grid(True, alpha=0.3)

# Income vs Spending Score
axes[0, 1].scatter(customers['annual_income'], customers['spending_score'], alpha=0.6)
axes[0, 1].set_xlabel('Annual Income')
axes[0, 1].set_ylabel('Spending Score')
axes[0, 1].set_title('Income vs Spending Score')
axes[0, 1].grid(True, alpha=0.3)

# Segment distribution
# NOTE(review): value_counts() orders by count, so the green/orange/red
# colors may not line up with High/Medium/Low — confirm intended mapping.
segment_counts = customers['spending_segment'].value_counts()
axes[1, 0].bar(segment_counts.index, segment_counts.values, 
               color=['green', 'orange', 'red'], alpha=0.7)
axes[1, 0].set_title('Customer Segments')
axes[1, 0].set_ylabel('Number of Customers')
axes[1, 0].tick_params(axis='x', rotation=45)

# Membership years distribution by segment
segment_data = [customers[customers['spending_segment'] == seg]['membership_years'] 
                for seg in ['High Spender', 'Medium Spender', 'Low Spender']]
axes[1, 1].boxplot(segment_data, labels=['High', 'Medium', 'Low'])
axes[1, 1].set_title('Membership Years by Segment')
axes[1, 1].set_ylabel('Membership Years')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Best Practices

1. Understand Your Data First

# Always start with exploratory analysis
# NOTE: `df` and its columns are placeholders here — substitute your own
# DataFrame; this snippet is illustrative and not runnable as-is.
print(df.describe())  # Summary statistics
print(df.isnull().sum())  # Missing values
plt.hist(df['column'])  # Distribution
plt.scatter(df['x'], df['y'])  # Relationships

2. Choose Appropriate Tests

# Parametric tests (normal data)
# - One-sample t-test: compare sample mean to known value
# - Two-sample t-test: compare means of two groups
# - ANOVA: compare means of three or more groups

# Non-parametric tests (non-normal data)
# - Mann-Whitney U: compare two groups
# - Kruskal-Wallis: compare three or more groups
# - Chi-square: test relationships between categorical variables

3. Consider Effect Size

# P-values tell you if there's a difference, but not how big
from statistics import mean, stdev

def cohen_d(group1, group2):
    """Cohen's d effect size: standardized mean difference using the pooled sample SD.

    Rule-of-thumb magnitudes: small ~0.2, medium ~0.5, large ~0.8.
    """
    n1, n2 = len(group1), len(group2)
    mean_diff = mean(group1) - mean(group2)
    # Pooled variance weights each group's sample variance by its degrees of freedom.
    pooled_var = ((n1 - 1) * stdev(group1) ** 2 +
                  (n2 - 1) * stdev(group2) ** 2) / (n1 + n2 - 2)
    return mean_diff / pooled_var ** 0.5

# Effect size interpretation:
# Small: 0.2, Medium: 0.5, Large: 0.8

4. Avoid Common Mistakes

# Don't confuse correlation with causation
# Correlation shows relationship, not cause-and-effect

# Don't ignore assumptions of statistical tests
# T-tests assume normal distribution and equal variances

# Don't over-rely on p-values
# P < 0.05 doesn't mean the effect is important

# Don't forget about multiple testing
# Testing many hypotheses increases false positive risk

Practice Exercises

Exercise 1: Descriptive Statistics

Analyze a dataset of student grades:

  1. Calculate mean, median, mode, standard deviation
  2. Create histograms and box plots
  3. Identify outliers using IQR method
  4. Compare performance across different subjects

Exercise 2: Correlation Analysis

Examine relationships in sales data:

  1. Calculate correlations between variables
  2. Create scatter plots with trend lines
  3. Identify strongest and weakest relationships
  4. Create a correlation heatmap

Exercise 3: Hypothesis Testing

Test hypotheses about customer behavior:

  1. Test if average purchase amount differs by customer segment
  2. Compare conversion rates between website versions
  3. Test relationship between customer satisfaction and loyalty
  4. Calculate confidence intervals for key metrics

Exercise 4: A/B Test Analysis

Design and analyze an A/B test:

  1. Calculate sample size requirements
  2. Perform statistical significance testing
  3. Calculate confidence intervals
  4. Create visualizations showing results

Exercise 5: Comprehensive Analysis

Perform a complete statistical analysis:

  1. Data exploration and cleaning
  2. Descriptive statistics
  3. Correlation analysis
  4. Hypothesis testing
  5. Create a statistical report with visualizations

Summary

Statistical analysis provides rigorous data insights:

Descriptive Statistics:

# Recap snippet — `data` stands in for any 1-D numeric array.
# NOTE: binding `mean` here shadows the `statistics.mean` imported in the
# effect-size section above.
# Central tendency
mean = np.mean(data)
median = np.median(data)
mode = scipy_stats.mode(data)

# Variability
std_dev = np.std(data, ddof=1)
variance = np.var(data, ddof=1)
# IQR via unpacking: subtracts the 25th percentile from the 75th
iqr = np.subtract(*np.percentile(data, [75, 25]))

Correlation Analysis:

# Recap snippet — `x` and `y` stand in for any paired numeric arrays.
# Pearson correlation (linear relationships)
pearson_corr = np.corrcoef(x, y)[0, 1]

# Spearman correlation (monotonic relationships)
spearman_corr = scipy_stats.spearmanr(x, y)[0]

Hypothesis Testing:

# Recap snippet — `group1`/`group2`/`contingency_table` stand in for your data.
# T-tests
t_stat, p_value = scipy_stats.ttest_ind(group1, group2)

# Chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

Key Concepts:

  • Central tendency and variability measures
  • Correlation vs causation
  • Hypothesis testing and p-values
  • Confidence intervals
  • Effect size vs statistical significance
  • Choosing appropriate statistical tests

Next: Real-World Project - Complete data analysis project! 🚀