Statistical Analysis: Basic Statistics and Correlations
Welcome to Statistical Analysis! Think of statistics as data’s truth serum - it helps you separate signal from noise, find meaningful patterns, and make confident decisions based on evidence rather than guesswork.
Why Statistics Matters
Data without statistics is just numbers - statistics gives them meaning:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Two groups of raw sales figures -- eyeballing them is unreliable.
sales_a = [100, 120, 95, 110, 105]
sales_b = [90, 115, 100, 125, 95]
print("Group A sales:", sales_a)
print("Group B sales:", sales_b)
print("A seems higher... but is it significant?")

# An independent-samples t-test quantifies the evidence.
t_stat, p_value = stats.ttest_ind(sales_a, sales_b)
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
verdict = "Significant difference!" if p_value < 0.05 else "No significant difference"
print("Conclusion:", verdict)

# Side-by-side box plots make the comparison visual.
plt.figure(figsize=(8, 5))
plt.boxplot([sales_a, sales_b], labels=['Group A', 'Group B'])
plt.title('Sales Comparison: Statistical Significance')
plt.ylabel('Sales ($)')
plt.grid(True, alpha=0.3)
plt.show()
Descriptive Statistics
Central Tendency
# Sample data: ten exam scores.
scores = np.array([85, 92, 78, 96, 88, 91, 83, 89, 94, 87])
print("Data:", scores)
print(f"Count: {len(scores)}")
print(f"Sum: {np.sum(scores)}")
# Mean (average)
mean_score = np.mean(scores)
print(f"Mean: {mean_score:.2f}")
# Median (middle value)
median_score = np.median(scores)
print(f"Median: {median_score:.2f}")
# Mode (most frequent value).
# FIX: scipy.stats.mode changed its return shape in SciPy 1.11 -- the
# result fields are now scalars, so `mode_result.mode[0]` raises an
# IndexError on current installs. Computing the mode via np.unique works
# on every NumPy/SciPy version; ties resolve to the smallest value,
# matching scipy's historical behavior.
from scipy import stats as scipy_stats  # kept: later sections use this alias
unique_values, value_counts = np.unique(scores, return_counts=True)
mode_value = unique_values[np.argmax(value_counts)]
mode_count = value_counts.max()
print(f"Mode: {mode_value} (appears {mode_count} times)")
# Compare mean, median, mode on a histogram of the scores.
plt.figure(figsize=(10, 6))
plt.hist(scores, bins=8, alpha=0.7, edgecolor='black')
plt.axvline(mean_score, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_score:.1f}')
plt.axvline(median_score, color='green', linestyle='--', linewidth=2, label=f'Median: {median_score:.1f}')
plt.axvline(mode_value, color='blue', linestyle='--', linewidth=2, label=f'Mode: {mode_value}')
plt.title('Score Distribution with Central Tendency Measures')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Variability Measures
# Spread of the scores: range, variance, standard deviation, CV, IQR.
data_range = np.ptp(scores)  # max - min ("peak to peak")
print(f"Range: {data_range}")

variance = np.var(scores, ddof=1)  # ddof=1 -> sample (n-1) variance
print(f"Variance: {variance:.2f}")

std_dev = np.std(scores, ddof=1)  # sample standard deviation
print(f"Standard Deviation: {std_dev:.2f}")

# Coefficient of variation: spread relative to the mean, in percent.
cv = (std_dev / mean_score) * 100
print(f"Coefficient of Variation: {cv:.2f}%")

# All three quartiles in a single percentile call, then the IQR.
q1, q2, q3 = np.percentile(scores, [25, 50, 75])  # q2 == median
iqr = q3 - q1
print(f"Q1 (25th percentile): {q1}")
print(f"Q2 (50th percentile): {q2}")
print(f"Q3 (75th percentile): {q3}")
print(f"IQR: {iqr}")

# Horizontal box plot with the quartiles annotated above the box.
plt.figure(figsize=(8, 6))
plt.boxplot(scores, vert=False)
plt.title('Box Plot: Quartiles and Outliers')
plt.xlabel('Score')
plt.grid(True, alpha=0.3)
for position, label in ((q1, f'Q1: {q1}'), (q2, f'Q2: {q2}'), (q3, f'Q3: {q3}')):
    plt.text(position, 1.1, label, ha='center')
plt.text(q3 + 5, 1.1, f'IQR: {iqr}', ha='left')
plt.tight_layout()
plt.show()
Distribution Shape
# Shape of the distribution: skewness (asymmetry) and kurtosis (tails).
skewness = scipy_stats.skew(scores)
print(f"Skewness: {skewness:.3f}")
if skewness > 0.5:
    shape_label = "right-skewed"
elif skewness < -0.5:
    shape_label = "left-skewed"
else:
    shape_label = "approximately symmetric"
print("Distribution is:", shape_label)

kurtosis = scipy_stats.kurtosis(scores)  # excess kurtosis: normal -> 0
print(f"Kurtosis: {kurtosis:.3f}")
if kurtosis > 0.5:
    tail_label = "heavy tails"
elif kurtosis < -0.5:
    tail_label = "light tails"
else:
    tail_label = "normal tails"
print("Distribution has:", tail_label)

# Contrast a symmetric sample with a strongly right-skewed one.
normal_data = np.random.normal(75, 10, 1000)
skewed_data = np.random.exponential(2, 1000) * 20 + 50
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, sample, title in zip(axes,
                             (normal_data, skewed_data),
                             ('Normal Distribution', 'Skewed Distribution')):
    ax.hist(sample, bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    # Mean chases the long tail; the gap to the median reveals the skew.
    ax.axvline(np.mean(sample), color='red', linestyle='--', label='Mean')
    ax.axvline(np.median(sample), color='green', linestyle='--', label='Median')
    ax.legend()
plt.tight_layout()
plt.show()
Correlation Analysis
Pearson Correlation
# Does study time track exam performance?
study_hours = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18, 20])
exam_scores = np.array([65, 70, 75, 80, 82, 85, 88, 90, 92, 95])

# Pearson r measures the strength of the *linear* relationship.
correlation = np.corrcoef(study_hours, exam_scores)[0, 1]
print(f"Pearson correlation: {correlation:.3f}")

# Translate |r| into a rough verbal strength rating.
magnitude = abs(correlation)
if magnitude < 0.3:
    strength = "weak"
elif magnitude < 0.7:
    strength = "moderate"
else:
    strength = "strong"
direction = "positive" if correlation > 0 else "negative"
print(f"This is a {strength} {direction} correlation")

# Scatter plot with a least-squares trend line overlaid.
plt.figure(figsize=(8, 6))
plt.scatter(study_hours, exam_scores, s=50, alpha=0.7)
plt.title(f'Study Hours vs Exam Scores\nCorrelation: {correlation:.3f}')
plt.xlabel('Study Hours')
plt.ylabel('Exam Score')
plt.grid(True, alpha=0.3)
slope, intercept = np.polyfit(study_hours, exam_scores, 1)
trend = np.poly1d((slope, intercept))
plt.plot(study_hours, trend(study_hours), "r--", alpha=0.8)
plt.tight_layout()
plt.show()
Spearman Rank Correlation
# A perfect monotonic but non-linear relationship (y = x^2).
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 4, 9, 16, 25, 36, 49, 64, 81, 100])  # Perfect quadratic

# Pearson only captures the linear component of the relationship...
pearson_corr = np.corrcoef(x, y)[0, 1]
print(f"Pearson correlation: {pearson_corr:.3f}")

# ...while Spearman operates on ranks, so any monotonic curve scores 1.0.
spearman_corr = scipy_stats.spearmanr(x, y)[0]
print(f"Spearman correlation: {spearman_corr:.3f}")

# Plot the points with a quadratic fit for reference.
plt.figure(figsize=(8, 6))
plt.scatter(x, y, s=50, alpha=0.7)
plt.title('Non-linear Relationship')
plt.xlabel('X')
plt.ylabel('Y (X²)')
plt.grid(True, alpha=0.3)
coeffs = np.polyfit(x, y, 2)
quad_fit = np.poly1d(coeffs)
x_smooth = np.linspace(1, 10, 100)
plt.plot(x_smooth, quad_fit(x_smooth), "r--", alpha=0.8, label='Quadratic fit')
plt.legend()
plt.tight_layout()
plt.show()
Correlation Matrix
# Build a small synthetic dataset of five loosely related variables.
np.random.seed(42)
data = pd.DataFrame({
    'height': np.random.normal(170, 10, 100),
    'weight': np.random.normal(70, 15, 100),
    'age': np.random.normal(30, 8, 100),
    'income': np.random.normal(50000, 15000, 100),
    'education_years': np.random.normal(16, 3, 100),
})

# Blend columns so the matrix shows some genuine structure.
data['weight'] = data['height'] * 0.5 + data['weight'] * 0.5  # Height-weight correlation
data['income'] = data['education_years'] * 2000 + data['income'] * 0.3  # Education-income correlation

# Pairwise Pearson correlations for every column combination.
correlation_matrix = data.corr()
print("Correlation Matrix:")
print(correlation_matrix.round(3))

# Heatmap with each coefficient written into its cell.
plt.figure(figsize=(8, 6))
heatmap = plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar(heatmap, label='Correlation')
columns = correlation_matrix.columns
plt.xticks(range(len(columns)), columns, rotation=45)
plt.yticks(range(len(columns)), columns)
for row in range(len(columns)):
    for col in range(len(columns)):
        plt.text(col, row, f'{correlation_matrix.iloc[row, col]:.2f}',
                 ha='center', va='center', color='white', fontweight='bold')
plt.title('Variable Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
Hypothesis Testing
One-Sample T-Test
# One-sample t-test: is the average height different from 170 cm?
# (Unseeded draw, so the printed conclusion varies run to run.)
heights = np.random.normal(172, 8, 50)  # Sample data

# H0: mean height = 170 cm    H1: mean height != 170 cm
t_stat, p_value = scipy_stats.ttest_1samp(heights, 170)
sample_mean = np.mean(heights)
print(f"Sample mean height: {sample_mean:.2f} cm")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject null hypothesis - heights are significantly different from 170 cm")
else:
    print("Fail to reject null hypothesis - no significant difference from 170 cm")

# Histogram with the hypothesized and observed means marked.
plt.figure(figsize=(8, 5))
plt.hist(heights, bins=15, alpha=0.7, edgecolor='black')
plt.axvline(170, color='red', linestyle='--', linewidth=2, label='Hypothesized mean (170 cm)')
plt.axvline(sample_mean, color='blue', linestyle='--', linewidth=2,
            label=f'Sample mean ({sample_mean:.1f} cm)')
plt.title('Height Distribution with Hypothesis Test')
plt.xlabel('Height (cm)')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Two-Sample T-Test
# Independent-samples t-test: do two teaching methods differ?
method_a_scores = np.random.normal(78, 10, 30)
method_b_scores = np.random.normal(82, 12, 30)

# H0: mean scores are equal    H1: mean scores are different
t_stat, p_value = scipy_stats.ttest_ind(method_a_scores, method_b_scores)
print(f"Method A mean: {np.mean(method_a_scores):.2f}")
print(f"Method B mean: {np.mean(method_b_scores):.2f}")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
verdict = ("Significant difference between teaching methods!"
           if p_value < 0.05
           else "No significant difference between teaching methods.")
print(verdict)

# Box plots plus a banner stating whether the result was significant.
plt.figure(figsize=(8, 6))
plt.boxplot([method_a_scores, method_b_scores], labels=['Method A', 'Method B'])
plt.title('Teaching Method Comparison')
plt.ylabel('Test Score')
plt.grid(True, alpha=0.3)
sig_text = "p < 0.05" if p_value < 0.05 else "p ≥ 0.05"
tallest = max(max(method_a_scores), max(method_b_scores))
plt.text(1.5, tallest + 2,
         f'Significance: {sig_text}', ha='center', fontsize=12,
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
plt.tight_layout()
plt.show()
Chi-Square Test
# Chi-square test of independence: education level vs job satisfaction.
from scipy.stats import chi2_contingency

# Rows: education level; columns: Satisfied, Neutral, Dissatisfied.
contingency_table = np.array([
    [30, 10, 5],   # High school
    [45, 15, 10],  # Bachelor's
    [50, 20, 15],  # Graduate
])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("Contingency Table:")
print("Education Level vs Job Satisfaction")
print(contingency_table)
print(f"\nChi-square statistic: {chi2:.3f}")
print(f"P-value: {p_value:.3f}")
print(f"Degrees of freedom: {dof}")
if p_value < 0.05:
    print("Significant relationship between education and job satisfaction!")
else:
    print("No significant relationship between education and job satisfaction.")

# Grouped bar chart: one cluster of three bars per education level.
education_levels = ['High School', "Bachelor's", 'Graduate']
satisfaction_levels = ['Satisfied', 'Neutral', 'Dissatisfied']
fig, ax = plt.subplots(figsize=(10, 6))
bars = []
x = np.arange(len(education_levels))
width = 0.25
for offset, level in enumerate(satisfaction_levels):
    bars.append(ax.bar(x + offset * width, contingency_table[:, offset],
                       width=width, label=level, alpha=0.8))
ax.set_xlabel('Education Level')
ax.set_ylabel('Count')
ax.set_title('Education vs Job Satisfaction')
ax.set_xticks(x + width)
ax.set_xticklabels(education_levels)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Confidence Intervals
Mean Confidence Interval
def confidence_interval(data, confidence=0.95):
    """Return the (lower, upper) t-based confidence interval for the mean.

    Uses the Student-t critical value with n-1 degrees of freedom, so it
    is valid for small samples with unknown population variance.
    """
    center = np.mean(data)
    std_err = scipy_stats.sem(data)  # standard error of the mean
    t_crit = scipy_stats.t.ppf((1 + confidence) / 2, len(data) - 1)
    half_width = std_err * t_crit
    return center - half_width, center + half_width

# Sample data (unseeded, so the numbers vary run to run).
scores = np.random.normal(75, 10, 30)
mean_score = np.mean(scores)

ci_lower, ci_upper = confidence_interval(scores)
print(f"Sample mean: {mean_score:.2f}")
print(f"95% Confidence interval: ({ci_lower:.2f}, {ci_upper:.2f})")

# Histogram with the mean and both CI bounds marked.
plt.figure(figsize=(8, 5))
plt.hist(scores, bins=10, alpha=0.7, edgecolor='black')
plt.axvline(mean_score, color='red', linestyle='-', linewidth=2, label='Sample mean')
plt.axvline(ci_lower, color='blue', linestyle='--', linewidth=2, label='95% CI lower')
plt.axvline(ci_upper, color='blue', linestyle='--', linewidth=2, label='95% CI upper')
plt.title('Confidence Interval for Mean Score')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Interpretation
print("We are 95% confident that the true population mean lies between")
print(f"{ci_lower:.2f} and {ci_upper:.2f}")
Proportion Confidence Interval
def proportion_ci(successes, total, confidence=0.95):
    """Normal-approximation (Wald) confidence interval for a proportion."""
    p_hat = successes / total
    std_err = np.sqrt(p_hat * (1 - p_hat) / total)
    z_crit = scipy_stats.norm.ppf((1 + confidence) / 2)
    half_width = z_crit * std_err
    return p_hat - half_width, p_hat + half_width

# Example: survey of 1000 respondents, 750 of them satisfied.
total_respondents = 1000
satisfied_customers = 750
ci_lower, ci_upper = proportion_ci(satisfied_customers, total_respondents)
print(f"Sample proportion: {satisfied_customers/total_respondents:.1%}")
print(f"95% CI for satisfaction rate: {ci_lower:.1%} to {ci_upper:.1%}")

# Bar chart of the split, annotated with the point estimate and CI.
plt.figure(figsize=(8, 5))
plt.bar(['Satisfied', 'Not Satisfied'],
        [satisfied_customers, total_respondents - satisfied_customers],
        color=['green', 'red'], alpha=0.7)
plt.title('Customer Satisfaction Survey Results')
plt.ylabel('Number of Respondents')
plt.text(0, satisfied_customers + 20, f'{satisfied_customers/total_respondents:.1%}',
         ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')
plt.text(0.5, total_respondents * 0.8,
         f'95% CI: {ci_lower:.1%} - {ci_upper:.1%}',
         ha='center', fontsize=11,
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
plt.tight_layout()
plt.show()
Practical Examples
Example 1: A/B Testing Analysis
# A/B test: did the variant page convert better than the control?
np.random.seed(42)

# Each visitor is one Bernoulli trial (1 = converted, 0 = not).
control_conversions = np.random.binomial(1, 0.12, 10000)  # true rate 12%
variant_conversions = np.random.binomial(1, 0.15, 10000)  # true rate 15%
control_rate = np.mean(control_conversions)
variant_rate = np.mean(variant_conversions)
lift = variant_rate - control_rate

print("A/B Test Results:")
print(f"Control conversion rate: {control_rate:.2%}")
print(f"Variant conversion rate: {variant_rate:.2%}")
print(f"Absolute improvement: {lift:.2%}")
print(f"Relative improvement: {(lift / control_rate):.1%}")

# Significance test on the raw 0/1 outcomes.
# NOTE(review): a two-proportion z-test is the textbook choice here; with
# n = 10,000 per arm the t-test on 0/1 data gives practically the same answer.
t_stat, p_value = scipy_stats.ttest_ind(control_conversions, variant_conversions)
print(f"\nStatistical test:")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
if p_value < 0.05:
    print("✅ Significant improvement! The variant performs better.")
else:
    print("❌ No significant difference between control and variant.")

# Wald confidence intervals per arm (proportion_ci defined earlier in this lesson).
control_ci = proportion_ci(np.sum(control_conversions), len(control_conversions))
variant_ci = proportion_ci(np.sum(variant_conversions), len(variant_conversions))
print(f"\n95% CI for control: {control_ci[0]:.2%} - {control_ci[1]:.2%}")
print(f"95% CI for variant: {variant_ci[0]:.2%} - {variant_ci[1]:.2%}")

# Left panel: rates with CI error bars. Right panel: the raw 0/1 histograms.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
rates = [control_rate, variant_rate]
errors = [[control_rate - control_ci[0], variant_rate - variant_ci[0]],
          [control_ci[1] - control_rate, variant_ci[1] - variant_rate]]
axes[0].bar(['Control', 'Variant'], rates, yerr=errors, capsize=5,
            color=['skyblue', 'orange'], alpha=0.7)
axes[0].set_title('Conversion Rates with 95% CI')
axes[0].set_ylabel('Conversion Rate')
axes[0].set_ylim(0, max(variant_ci[1], control_ci[1]) * 1.1)
axes[0].grid(True, alpha=0.3, axis='y')
for idx, rate in enumerate(rates):
    axes[0].text(idx, rate + 0.005, f'{rate:.1%}', ha='center', va='bottom', fontweight='bold')
axes[1].hist(control_conversions, bins=2, alpha=0.7, label='Control', color='skyblue')
axes[1].hist(variant_conversions, bins=2, alpha=0.7, label='Variant', color='orange')
axes[1].set_title('Conversion Distributions')
axes[1].set_xlabel('Converted (1) vs Not (0)')
axes[1].set_ylabel('Frequency')
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Example 2: Customer Segmentation Analysis
# Build a synthetic customer table, then clip each column to a realistic range.
np.random.seed(42)
customers = pd.DataFrame({
    'customer_id': range(1, 1001),
    'age': np.random.normal(35, 10, 1000),
    'annual_income': np.random.normal(60000, 20000, 1000),
    'spending_score': np.random.normal(50, 25, 1000),  # 0-100 scale
    'membership_years': np.random.normal(3, 2, 1000)
})
for column, low, high in (('age', 18, 80),
                          ('annual_income', 20000, 150000),
                          ('spending_score', 0, 100),
                          ('membership_years', 0, 10)):
    customers[column] = np.clip(customers[column], low, high)

print("Customer Demographics Summary:")
print(customers[['age', 'annual_income', 'spending_score', 'membership_years']].describe())

def spending_segment(score):
    """Bucket a 0-100 spending score into High/Medium/Low spender tiers."""
    if score >= 75:
        return 'High Spender'
    if score >= 50:
        return 'Medium Spender'
    return 'Low Spender'

customers['spending_segment'] = customers['spending_score'].apply(spending_segment)
# Per-segment profile: size, mean age, income, and tenure.
segments = customers.groupby('spending_segment')
print("\nSegment Analysis:")
for name, group in segments:
    print(f"\n{name}:")
    print(f" Count: {len(group)}")
    print(f" Avg Age: {group['age'].mean():.1f}")
    print(f" Avg Income: ${group['annual_income'].mean():,.0f}")
    print(f" Avg Membership: {group['membership_years'].mean():.1f} years")

# Do high spenders earn significantly more than everyone else?
is_high = customers['spending_segment'] == 'High Spender'
high_spenders = customers[is_high]['annual_income']
others = customers[~is_high]['annual_income']
t_stat, p_value = scipy_stats.ttest_ind(high_spenders, others)
print(f"\nIncome comparison (High spenders vs Others):")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
print("High spenders have significantly higher income!" if p_value < 0.05 else "No significant income difference.")

# Pairwise correlations among the numeric customer attributes.
correlation_matrix = customers[['age', 'annual_income', 'spending_score', 'membership_years']].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix.round(3))
# Four-panel overview of the segmentation results.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Top row: spending score against age and against income.
scatter_specs = (
    (axes[0, 0], 'age', 'Age', 'Age vs Spending Score'),
    (axes[0, 1], 'annual_income', 'Annual Income', 'Income vs Spending Score'),
)
for ax, column, xlabel, title in scatter_specs:
    ax.scatter(customers[column], customers['spending_score'], alpha=0.6)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Spending Score')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# Bottom left: how many customers fall into each segment.
segment_counts = customers['spending_segment'].value_counts()
axes[1, 0].bar(segment_counts.index, segment_counts.values,
               color=['green', 'orange', 'red'], alpha=0.7)
axes[1, 0].set_title('Customer Segments')
axes[1, 0].set_ylabel('Number of Customers')
axes[1, 0].tick_params(axis='x', rotation=45)

# Bottom right: membership tenure distribution per segment.
segment_data = [customers[customers['spending_segment'] == seg]['membership_years']
                for seg in ['High Spender', 'Medium Spender', 'Low Spender']]
axes[1, 1].boxplot(segment_data, labels=['High', 'Medium', 'Low'])
axes[1, 1].set_title('Membership Years by Segment')
axes[1, 1].set_ylabel('Membership Years')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Best Practices
1. Understand Your Data First
# Always start with exploratory analysis
# Illustrative snippet -- assumes `df` is an already-loaded DataFrame
# with columns 'column', 'x', and 'y' (not defined in this lesson).
print(df.describe()) # Summary statistics
print(df.isnull().sum()) # Missing values
plt.hist(df['column']) # Distribution
plt.scatter(df['x'], df['y']) # Relationships
2. Choose Appropriate Tests
# Parametric tests (normal data)
# - One-sample t-test: compare sample mean to known value
# - Two-sample t-test: compare means of two groups
# - ANOVA: compare means of three or more groups
# Non-parametric tests (non-normal data)
# - Mann-Whitney U: compare two groups
# - Kruskal-Wallis: compare three or more groups
# - Chi-square: test relationships between categorical variables
3. Consider Effect Size
# P-values tell you if there's a difference, but not how big
from statistics import mean, stdev

def cohen_d(group1, group2):
    """Calculate Cohen's d effect size"""
    n1, n2 = len(group1), len(group2)
    mean_gap = mean(group1) - mean(group2)
    # Pooled variance weights each group's sample variance by its
    # degrees of freedom (n - 1), then divides by the combined df.
    pooled_var = ((n1 - 1) * stdev(group1) ** 2 +
                  (n2 - 1) * stdev(group2) ** 2) / (n1 + n2 - 2)
    return mean_gap / pooled_var ** 0.5

# Effect size interpretation:
# Small: 0.2, Medium: 0.5, Large: 0.8
4. Avoid Common Mistakes
# Don't confuse correlation with causation
# Correlation shows relationship, not cause-and-effect
# Don't ignore assumptions of statistical tests
# T-tests assume normal distribution and equal variances
# Don't over-rely on p-values
# P < 0.05 doesn't mean the effect is important
# Don't forget about multiple testing
# Testing many hypotheses increases false positive risk
Practice Exercises
Exercise 1: Descriptive Statistics
Analyze a dataset of student grades:
- Calculate mean, median, mode, standard deviation
- Create histograms and box plots
- Identify outliers using IQR method
- Compare performance across different subjects
Exercise 2: Correlation Analysis
Examine relationships in sales data:
- Calculate correlations between variables
- Create scatter plots with trend lines
- Identify strongest and weakest relationships
- Create a correlation heatmap
Exercise 3: Hypothesis Testing
Test hypotheses about customer behavior:
- Test if average purchase amount differs by customer segment
- Compare conversion rates between website versions
- Test relationship between customer satisfaction and loyalty
- Calculate confidence intervals for key metrics
Exercise 4: A/B Test Analysis
Design and analyze an A/B test:
- Calculate sample size requirements
- Perform statistical significance testing
- Calculate confidence intervals
- Create visualizations showing results
Exercise 5: Comprehensive Analysis
Perform a complete statistical analysis:
- Data exploration and cleaning
- Descriptive statistics
- Correlation analysis
- Hypothesis testing
- Create a statistical report with visualizations
Summary
Statistical analysis provides rigorous data insights:
Descriptive Statistics:
# Central tendency
mean = np.mean(data)
median = np.median(data)
mode = scipy_stats.mode(data)
# NOTE(review): since SciPy 1.11 `mode` returns scalar fields
# (use `mode.mode`, not `mode.mode[0]`) -- confirm installed version.
# Variability
std_dev = np.std(data, ddof=1)  # ddof=1 -> sample (n-1) statistics
variance = np.var(data, ddof=1)
iqr = np.subtract(*np.percentile(data, [75, 25]))  # Q3 - Q1
Correlation Analysis:
# Pearson correlation (linear relationships)
pearson_corr = np.corrcoef(x, y)[0, 1]  # [0, 1] picks the off-diagonal entry
# Spearman correlation (monotonic relationships)
spearman_corr = scipy_stats.spearmanr(x, y)[0]  # [0] = statistic; [1] would be the p-value
Hypothesis Testing:
# T-tests
t_stat, p_value = scipy_stats.ttest_ind(group1, group2)  # independent two-sample test
# Chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)  # test of independence
Key Concepts:
- Central tendency and variability measures
- Correlation vs causation
- Hypothesis testing and p-values
- Confidence intervals
- Effect size vs statistical significance
- Choosing appropriate statistical tests
Next: Real-World Project - Complete data analysis project! 🚀