Data and Sampling Distributions¶
Random sampling and sample bias¶
Random sampling is the process of selecting a subset of individuals or items from a larger population in such a way that each member of the population has an equal chance of being chosen. Sample bias occurs when the individuals or elements selected for a sample are not representative of the larger population, leading to inaccurate and non-generalizable results.
Terms:
- Sample: A subset of a larger dataset.
- Population: The larger dataset.
- N (n): Size of the population (sample).
- Random sampling: Drawing elements into a sample at random from the population.
- Stratified sampling: Stratified sampling is a sampling method where the population is divided into distinct subgroups, or strata, and random samples are independently collected from each stratum to ensure representation from all segments of the population.
- Sample bias: A sample that misrepresents the population.
Notes:
- Random sampling is very important in data science.
- Data quality is often more important than data quantity, and random sampling can reduce bias and enable quality improvements that would otherwise be prohibitively expensive.
Selection Bias¶
Selection bias refers to a systematic error introduced into a study or sample when the selection of participants is not random or representative, leading to a distorted or unrepresentative sample that may not accurately reflect the larger population.
Terms:
- Bias: Systematic error or deviation from the true value in a measurement or estimate, often resulting from flaws in the sampling, data collection, or analysis process, and potentially leading to inaccurate and non-representative results.
- Data snooping: Data snooping refers to the practice of repeatedly analyzing data or testing hypotheses until a desired result is found, potentially leading to false or overly optimistic conclusions.
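As an aside (a minimal sketch, not part of the original notebook), data snooping can be illustrated by running many hypothesis tests on pure noise: roughly 5% of them will come out "significant" at the 0.05 level by chance alone.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
labels = rng.integers(0, 2, size=200)  # a random binary "outcome"

false_positives = 0
for _ in range(100):
    noise = rng.normal(size=200)  # a feature that is pure noise
    # two-sample t-test between the two label groups
    result = stats.ttest_ind(noise[labels == 0], noise[labels == 1])
    if result.pvalue < 0.05:
        false_positives += 1

print(f"'Significant' results found by snooping: {false_positives} / 100")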
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors
import scipy.stats as stats
data_path = f"./data/"
raw_players_data = pd.read_csv(f'{data_path}players.csv')
players_data = raw_players_data.copy()
print(players_data.shape)
players_data.head(15)
(1683, 7)
 | nflId | height | weight | birthDate | collegeName | position | displayName
---|---|---|---|---|---|---|---
0 | 25511 | 6-4 | 225 | 1977-08-03 | Michigan | QB | Tom Brady |
1 | 29550 | 6-4 | 328 | 1982-01-22 | Arkansas | T | Jason Peters |
2 | 29851 | 6-2 | 225 | 1983-12-02 | California | QB | Aaron Rodgers |
3 | 30842 | 6-6 | 267 | 1984-05-19 | UCLA | TE | Marcedes Lewis |
4 | 33084 | 6-4 | 217 | 1985-05-17 | Boston College | QB | Matt Ryan |
5 | 33099 | 6-6 | 245 | 1985-01-16 | Delaware | QB | Joe Flacco |
6 | 33107 | 6-4 | 315 | 1985-08-30 | Virginia Tech | T | Duane Brown |
7 | 33130 | 5-10 | 175 | 1986-12-01 | California | WR | DeSean Jackson |
8 | 33131 | 6-8 | 300 | 1986-09-01 | Miami | DE | Calais Campbell |
9 | 33138 | 6-3 | 222 | 1985-07-02 | Michigan | QB | Chad Henne |
10 | 34452 | 6-3 | 220 | 1988-02-07 | Georgia | QB | Matthew Stafford |
11 | 34754 | 6-0 | 229 | 1986-10-07 | Missouri | QB | Chase Daniel |
12 | 34843 | 6-2 | 215 | 1985-10-13 | Michigan State | QB | Brian Hoyer |
13 | 35443 | 6-5 | 320 | 1988-07-19 | Oklahoma | T | Trent Williams |
14 | 35449 | 6-3 | 304 | 1987-05-12 | California | NT | Tyson Alualu |
players_data.dtypes.sort_values()
nflId           int64
weight          int64
height         object
birthDate      object
collegeName    object
position       object
displayName    object
dtype: object
players_data["height"].unique()
array(['6-4', '6-2', '6-6', '5-10', '6-8', '6-3', '6-0', '6-5', '6-1', '5-9', '5-11', '5-8', '6-7', '6-9', '5-6', '5-7'], dtype=object)
def height_to_inches(h):
    # convert a height string like "6-4" (feet-inches) to total inches
    try:
        feet, inches = h.split("-")
        return int(feet) * 12 + int(inches)
    except (ValueError, AttributeError):
        raise ValueError(f"height should be a 'feet-inches' string such as '6-4', got {h!r}")
players_data["height"] = players_data["height"].apply(height_to_inches)
players_data['height'].sample()
1540 77 Name: height, dtype: int64
# Random sampling the data
r_sample = players_data.sample(frac=0.3, random_state=45)
print(f'random sample size: {r_sample.shape}')
r_sample.head()
random sample size: (505, 7)
 | nflId | height | weight | birthDate | collegeName | position | displayName
---|---|---|---|---|---|---|---
780 | 47801 | 75 | 305 | 1995-06-20 | North Carolina State | C | Garrett Bradbury |
378 | 43510 | 76 | 305 | 1993-03-15 | Illinois | C | Ted Karras |
1034 | 52457 | 76 | 227 | 1998-07-07 | Notre Dame | WR | Chase Claypool |
690 | 46284 | 77 | 317 | 1994-11-24 | Alabama | C | Bradley Bozeman |
1570 | 54607 | 71 | 175 | NaN | South Carolina State | CB | Cobie Durant |
# stratified sampling: take a 30% random sample within each position group
def stratify_sample(stratum):
    return stratum.sample(frac=0.3, random_state=45)
def biased_stratify_sample(stratum):
    # deliberately biased: keep only heights above the overall mean (~74.5 inches)
    stratum = stratum.loc[stratum["height"] > 74, "height"]
    return stratum.sample(frac=0.3, random_state=45)
strat_sample = players_data.groupby("position").apply(stratify_sample)
biased_strat_sample = players_data.groupby("position").apply(biased_stratify_sample)
print(f"stratified sample size: {strat_sample.shape}")
strat_sample.sample(10)
stratified sample size: (503, 7)
position |  | nflId | height | weight | birthDate | collegeName | position | displayName
---|---|---|---|---|---|---|---|---
TE | 345 | 43399 | 78 | 257 | 1993-01-01 | Western Kentucky | TE | Tyler Higbee |
SS | 606 | 46122 | 71 | 205 | 1995-09-16 | North Carolina | SS | M.J. Stewart |
DE | 466 | 44892 | 76 | 266 | 1994-03-18 | Ohio | DE | Tarell Basham |
G | 248 | 42477 | 76 | 310 | 1992-05-03 | West Virginia | G | Mark Glowinski |
ILB | 405 | 44174 | 73 | 236 | 1993-02-22 | Texas Tech | ILB | Sam Eguavoen |
QB | 867 | 47916 | 75 | 214 | 1996-08-08 | Auburn | QB | Jarrett Stidham |
OLB | 1270 | 53481 | 73 | 215 | NaN | Notre Dame | OLB | Jeremiah Owusu-Koramoah |
T | 843 | 47874 | 78 | 307 | 1996-09-05 | Sioux Falls | T | Trey Pipkins |
G | 1114 | 52557 | 76 | 300 | 1996-06-19 | Ball State | G | Danny Pinter |
OLB | 139 | 41269 | 75 | 250 | 1991-03-26 | Brigham Young | OLB | Kyle Van Noy |
# Create histograms using Plotly Express
fig_pop = px.histogram(players_data, x="height", nbins=30, color_discrete_sequence=['orange'])
fig_random = px.histogram(r_sample, x='height', nbins=30, color_discrete_sequence=['purple'])
fig_stratified = px.histogram(strat_sample, x='height', nbins=30, color_discrete_sequence=['green'])
fig_biased = px.histogram(biased_strat_sample, x="height", nbins=30)
#add legends
fig_pop.update_traces(name='Population', showlegend=True)
fig_random.update_traces(name='Random', showlegend=True)
fig_stratified.update_traces(name='Stratified', showlegend=True)
fig_biased.update_traces(name='Biased Stratified', showlegend=True)
# Combine the two histograms into one figure
fig_combined = go.Figure(data=[fig_pop['data'][0], fig_random['data'][0], fig_stratified['data'][0], fig_biased["data"][0]])
# Update layout for better visualization
fig_combined.update_layout(title_text='Distribution Comparison: Population vs. Random vs. Stratified vs. Biased Stratified Sample', xaxis_title='Heights in inches', yaxis_title='Frequency')
# Show the plot
fig_combined.show()
The height distribution looks reasonable, with a mean of roughly 74.5 inches. In the biased stratified sample, however, we kept only heights greater than the mean, so its distribution is shifted (skewed) to the right of the others.
Sampling Distribution of a Statistic¶
- Data Distribution: The frequency distribution of individual values in a data set. It provides insights into the spread, central tendency, and shape of the data, helping to understand its characteristics and make informed statistical analyses.
- Sampling Distribution: Sampling distribution is the distribution of a statistic (such as the mean or standard deviation) calculated from multiple samples drawn from the same population, providing insights into the variability of the statistic and aiding in statistical inference.
- Central limit theorem: The Central Limit Theorem states that, for a sufficiently large sample size, the distribution of the sample mean of a random variable will be approximately normally distributed, regardless of the shape of the original population distribution.
- Standard error: Standard error is a measure of the variability or spread of sample statistics, representing the standard deviation of the sampling distribution for a particular statistic (e.g., the mean), providing an estimate of how much the sample statistic is expected to vary from the true population parameter.
$$ \text{Standard error (SE)} = \frac{s}{\sqrt{n}} $$ where $s$ is the sample standard deviation and $n$ is the sample size.
Note: The standard error describes the variability of a sample statistic, whereas the standard deviation describes the variability of individual data values.
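As a quick sanity check (a sketch, not in the original notebook; the sample size n = 50 is arbitrary), the formula $s/\sqrt{n}$ should roughly agree with the spread of many simulated sample means:
n = 50
heights = players_data["height"]
# formula-based standard error from a single sample
one_sample = heights.sample(n, random_state=0)
se_formula = one_sample.std() / np.sqrt(n)
# empirical standard error: the SD of many simulated sample means
sim_means = [heights.sample(n).mean() for _ in range(2000)]
se_empirical = np.std(sim_means)
print(f"SE from formula:    {se_formula:.3f}")
print(f"SE from simulation: {se_empirical:.3f}")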
samples = range(10, 150, 25)
sample_means = {}
for size in samples:
    # draw 1000 samples of this size and record each sample's mean height
    sample_mean = []
    for i in range(1000):
        sample_mean.append(round(players_data["height"].sample(size).mean(), 2))
    sample_means[size] = sample_mean
    plt.hist(sample_mean)
plt.legend([f'{str(lbl)} samples' for lbl in samples])
plt.title("Central Limit Theorem")
Text(0.5, 1.0, 'Central Limit Theorem')
# compute the standard error for each sample size:
# the SD of the simulated sample means estimates the standard error of the mean
error = {}
for k, v in sample_means.items():
    error[str(k)] = pd.Series(v).std()
# Create a bar chart
fig = go.Figure()
fig.add_trace(go.Bar(x=list(error.keys()), y=list(error.values()), marker_color='orange'))
# Update layout
fig.update_layout(title='Standard Error for each sample size',
                  xaxis_title='Sample size',
                  yaxis_title='Error')
# Show the plot
fig.show()
In this figure, smaller sample sizes have a higher standard error, and larger sample sizes a lower one.
Confidence intervals¶
A confidence interval is a statistical range that expresses the uncertainty or margin of error around an estimated value, providing a level of confidence that the true value lies within that range.
Process:
Understand the Type of Data:
- Identify whether you are working with a sample or the entire population.
- Ensure that the data is approximately normally distributed or the sample size is sufficiently large for the Central Limit Theorem to apply.
Choose the Confidence Level:
- Decide on the confidence level you want (common choices are 95%, 99%).
Collect and Analyze the Data:
- Collect a sample of data.
Calculate the Sample Statistic of Interest, typically the Mean ($\bar{X}$), or another statistic of your choice (e.g., the median):
- Compute the mean of the sample data.
Determine the Sample Standard Deviation (s):
- Calculate the standard deviation of the sample data.
Find the Standard Error of the Mean (SE):
- Use the formula $SE = \frac{s}{\sqrt{n}}$, where $s$ is the sample standard deviation and $n$ is the sample size.
Determine the Critical Value (z or t):
- Based on the chosen confidence level and sample size, find the critical value from the standard normal distribution (z) or t-distribution.
Calculate the Margin of Error (ME):
- Multiply the standard error by the critical value to obtain the margin of error: $ME = \text{Critical Value} \times SE$.
Compute the Confidence Interval:
- Use the formula: $\text{Confidence Interval} = \bar{X} \pm \text{Margin of Error}$.
Interpret the Results:
- Communicate the confidence interval along with its interpretation. For example, if you have a 95% confidence interval, you can say, "We are 95% confident that the true population mean falls within this interval."
Note that the choice between the z-score and the t-score depends on the sample size. For large samples ($n \geq 30$), the z-score is often used; for smaller samples, the t-distribution is more appropriate. In the code below, however, we use the z-score when the sample is larger than thirty percent of the population.
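For reference (a compact sketch using the same 20% sample and 98% level as the call below), the t-based interval can also be obtained directly with scipy's `t.interval` helper, mirroring the small-sample branch of the function that follows:
sample = players_data["height"].sample(frac=0.2, random_state=45)
ci_level = 0.98
# t-based interval: mean ± critical value × standard error of the mean
t_low, t_high = stats.t.interval(ci_level, df=len(sample) - 1,
                                 loc=sample.mean(), scale=sample.sem())
print(f"{int(ci_level * 100)}% CI for the mean height: ({t_low:.2f}, {t_high:.2f}) inches")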
def compute_plot_ci(df: pd.Series, sample_size: float, ci: float):
    if sample_size > 1 or ci > 1:
        raise ValueError("check either sample size or CI value")
    sample_data = df.sample(frac=sample_size, random_state=45)
    sample_mean = sample_data.mean()
    # standard error
    se = sample_data.sem() if sample_size < 0.3 else sample_data.std()
    # Determine critical value (t for smaller samples, z otherwise)
    # t.ppf (percent point function) for Student's t-distribution
    # norm.ppf (percent point function) for the Normal (Gaussian) distribution
    critical_value = stats.t.ppf((1 + ci) / 2, df=len(sample_data)-1) if sample_size < 0.3 else stats.norm.ppf((1 + ci) / 2)
    # compute margin of error
    margin_of_error = critical_value * se
    # compute CI
    confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)
    # plotting
    # Create a histogram
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=sample_data, nbinsx=20, opacity=0.7, marker_color='blue', name=sample_data.name))
    # Add vertical line for the mean
    fig.add_trace(go.Scatter(x=[sample_mean, sample_mean], y=[0, 12], mode='lines', line=dict(color='red'), name='Mean'))
    # Add shaded region for the confidence interval
    fig.add_trace(go.Scatter(x=[confidence_interval[0], confidence_interval[1], confidence_interval[1], confidence_interval[0]],
                             y=[0, 0, 12, 12], fill='toself', fillcolor='rgba(255,0,0,0.3)', line=dict(color='rgba(255,0,0,0)'),
                             name=f'{ci*100}% Confidence Interval'))
    # Update layout
    fig.update_layout(title=f'Height Confidence Interval, sample: {int(sample_size * len(df))} CI: ({round(confidence_interval[0], 2)} - {round(confidence_interval[1], 2)})',
                      xaxis_title=sample_data.name,
                      yaxis_title='Frequency')
    # Show the plot
    fig.show()
compute_plot_ci(players_data["height"], 0.2, 0.98)
- A CI is a way to present an estimate as an interval range rather than a single number.
- The lower the confidence level, the narrower the CI will be, as the example below illustrates.
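For example (an illustrative re-run of the function defined above; output not shown), dropping the confidence level from 98% to 80% noticeably narrows the shaded interval:
compute_plot_ci(players_data["height"], 0.2, 0.80)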
Normal Distribution | Gaussian¶
A symmetric probability distribution that forms a bell-shaped curve. It is defined by two parameters: the mean (center of the curve) and the standard deviation (spread of the curve). In a standard normal distribution, the mean is 0, and the standard deviation is 1. The distribution is characterized by its bell-shaped, continuous curve.
- Error: The difference between a data point and an average or predicted value.
- Standardize: Subtract the mean and divide by the standard deviation.
- z-score: A z-score is a statistical measure that quantifies the number of standard deviations a data point is from the mean of a dataset, providing a standardized way to assess the position of an individual data point within the distribution.
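A minimal sketch (not in the original notebook) of standardizing the heights and inspecting a few z-scores:
heights = players_data["height"]
# standardize: subtract the mean, divide by the standard deviation
z_scores = (heights - heights.mean()) / heights.std()
# z-scores of the tallest players (6-8, i.e. 80 inches)
print(z_scores[heights >= 80].round(2).head())
print(f"mean of z-scores: {z_scores.mean():.2f}, SD: {z_scores.std():.2f}")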
data = players_data["height"]
# compute mean and standard deviation
mean_val = np.mean(data)
std_dev = np.std(data)
# Set up figure and axis
fig, ax = plt.subplots(figsize=(15, 6))
# histogram
counts, bins, _ = plt.hist(data, bins=30, density=True, alpha=0.7, color='gray', label='Histogram')
# normal distribution curve
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mean_val, std_dev)
plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution')
# standard deviations
for i, color in zip(range(-3, 4), ['red', 'orange', 'green', 'blue', 'purple', 'brown', 'pink']):
    plt.axvline(mean_val + i * std_dev, color=color, linestyle='dashed', linewidth=1, alpha=0.7)
    # Markings for first, second, and third standard deviations
    if i == 1:
        plt.text(mean_val + i * std_dev, 0.25, '1st Std', color=color, rotation=90, verticalalignment='bottom')
    elif i == 2:
        plt.text(mean_val + i * std_dev, 0.25, '2nd Std', color=color, rotation=90, verticalalignment='bottom')
    elif i == 3:
        plt.text(mean_val + i * std_dev, 0.25, '3rd Std', color=color, rotation=90, verticalalignment='bottom')
# confidence intervals (68% and 95%)
conf_interval_68 = stats.norm.interval(0.68, loc=mean_val, scale=std_dev)
conf_interval_95 = stats.norm.interval(0.95, loc=mean_val, scale=std_dev)
plt.axvline(conf_interval_68[0], color='violet', linestyle='dashed', linewidth=2, label='68% Confidence Interval')
plt.axvline(conf_interval_68[1], color='violet', linestyle='dashed', linewidth=2)
plt.axvline(conf_interval_95[0], color='green', linestyle='dashed', linewidth=2, label='95% Confidence Interval')
plt.axvline(conf_interval_95[1], color='green', linestyle='dashed', linewidth=2)
# labels and legend
plt.xlabel('Height')
plt.ylabel('Density')
plt.title('Height Distribution with Standard Deviations and Confidence Intervals')
plt.legend()
plt.show()
QQ Plot¶
A QQ-Plot is used to visually determine how close a sample is to the normal distribution. The QQ-Plot orders the z-scores from low to high, and plots each value's z-score on the y-axis; the x-axis is the corresponding quantile of a normal distribution for that value's rank. Since the data are standardized, the units correspond to the number of standard deviations away from the mean. If the points roughly fall on the diagonal line, then the sample distribution can be considered close to normal.
# QQ plot
stats.probplot(players_data["height"], dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Sample Quantiles')
# Show the plot
plt.show()
- The Gaussian distribution is essential, as it allows mathematical approximation of uncertainty and variability.
- Raw data are often not normally distributed and frequently contain errors.
Student's t-distribution¶
Student's t-distribution, in short, is a probability distribution used for estimating population parameters when the sample size is small and the population standard deviation is unknown; it is similar to the normal distribution but has heavier tails.
- $n$: sample size
- Degrees of freedom: A parameter that allows the t-distribution to adjust to different sample sizes, statistics, and number of groups.
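A quick sketch (not part of the original analysis) makes the "heavier tails" point concrete by overlaying t-densities for a few degrees of freedom on the standard normal:
x = np.linspace(-4, 4, 400)
plt.figure(figsize=(8, 4))
plt.plot(x, stats.norm.pdf(x), 'k', linewidth=2, label='Standard normal')
for dof in [2, 5, 30]:
    # heavier tails for small degrees of freedom; approaches the normal as dof grows
    plt.plot(x, stats.t.pdf(x, df=dof), linestyle='dashed', label=f't, df={dof}')
plt.title("Student's t-distribution vs. standard normal")
plt.xlabel('Standardized value')
plt.ylabel('Density')
plt.legend()
plt.show()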
Binomial Distribution¶
It models the probability of obtaining a specific number of successes in a fixed number of independent and identical Bernoulli trials, where each trial has two possible outcomes (success or failure) with a constant probability of success.
- Trial: An event with a discrete outcome.
- Success: The outcome of interest.
- Binomial distribution: The distribution of the number of successes in $x$ trials; the Bernoulli distribution is the special case of a single trial.
The probability of success is $p$ and the probability of failure is $1-p$, or vice versa.
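A small sketch of the binomial PMF with scipy (illustrative parameters: 10 trials, success probability 0.3), mirroring the style of the Poisson example below:
from scipy.stats import binom
n_trials, p_success = 10, 0.3
k_values = np.arange(0, n_trials + 1)
# P(exactly k successes in n_trials independent trials)
pmf_values = binom.pmf(k_values, n_trials, p_success)
plt.bar(k_values, pmf_values, color='skyblue', edgecolor='black')
plt.title(f'Example Binomial Distribution (n={n_trials}, p={p_success})')
plt.xlabel('Number of Successes (k)')
plt.ylabel('Probability')
plt.xticks(k_values)
plt.show()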
Poisson Distribution¶
(Events spread over time)
The probability of a specific number of events occurring in a fixed interval of time or space, given the average rate of occurrence, with events happening independently and at a constant average rate. For instance, the number of visitors arriving at a website, or the number of emails a person receives in an hour.
- Lambda ($\lambda$): The average rate of occurrence (e.g., the average number of emails per hour); for a Poisson distribution, this is also the variance.
- Poisson distribution: The frequency distribution of the number of events in sampled units of time or space.
- Exponential distribution: The frequency distribution of the time or distance from one event to the next event. It is commonly used to model the time until the next event in a sequence of events, and it is characterized by its memoryless property, meaning that the probability of an event occurring in the future is independent of the past.
- Weibull distribution: A generalized version of the exponential, in which the event rate is allowed to shift over time. The Weibull distribution is a continuous probability distribution that is often used to model the distribution of lifetimes of objects or the time until an event occurs. It is characterized by its flexibility to capture a variety of shapes, including exponential, normal, and heavy-tailed distributions.
from scipy.stats import poisson
# Average rate
lambda_value = 3
# Values of k (number of events)
k_values = np.arange(0, 11)
# PMF for each k
pmf_values = poisson.pmf(k_values, lambda_value)
# Plotting
plt.bar(k_values, pmf_values, color='skyblue', edgecolor='black')
plt.title('Example Poisson Distribution')
plt.xlabel('Number of Events (k)')
plt.ylabel('Probability')
plt.xticks(k_values)
plt.show()
from scipy.stats import expon
# Rate parameter
lambda_value = 0.5
# Values of x (time)
x_values = np.linspace(0, 10, 1000)
# Calculate PDF for each x
pdf_values = expon.pdf(x_values, scale=1/lambda_value)
# Plotting
plt.plot(x_values, pdf_values, color='orange', label=f'Exponential Distribution (lambda={lambda_value})')
plt.title('Example Exponential Distribution')
plt.xlabel('Time (x)')
plt.ylabel('Probability Density')
plt.legend()
plt.show()
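The memoryless property mentioned above can be checked numerically (a quick sketch reusing `lambda_value` from the cell above): for an exponential variable, $P(X > s + t \mid X > s) = P(X > t)$.
def survival(x):
    # P(X > x) for an exponential distribution with rate lambda_value
    return expon.sf(x, scale=1/lambda_value)

s, t = 2.0, 3.0
conditional = survival(s + t) / survival(s)   # P(X > s + t | X > s)
unconditional = survival(t)                   # P(X > t)
print(f"P(X > s+t | X > s) = {conditional:.4f}")
print(f"P(X > t)           = {unconditional:.4f}")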
Estimating the failure rate¶
Estimating the failure rate involves determining the rate at which a system or process experiences failures over time. The failure rate is a crucial measure in reliability engineering and is often used to assess the reliability and maintainability of systems.
The failure rate ($\lambda$) is typically estimated using historical data or through observations during the operation of a system. There are several methods to estimate the failure rate:
Historical Data Analysis: Analyzing data from past failures can provide valuable insights into the frequency and pattern of failures. The failure rate can be estimated by dividing the number of observed failures by the total operating time.
$$ \lambda = \frac{\text{Number of Failures}}{\text{Total Operating Time}} $$
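For instance (hypothetical numbers, purely to illustrate the formula), 12 observed failures over 5,000 hours of operation give:
n_failures = 12                 # hypothetical count
total_operating_hours = 5_000   # hypothetical operating time
failure_rate = n_failures / total_operating_hours
print(f"Estimated failure rate: {failure_rate:.4f} failures per hour")  # 0.0024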
Maximum Likelihood Estimation (MLE): MLE is a statistical method that seeks the parameter values that maximize the likelihood of observing the given data. For failure rate estimation, MLE involves finding the rate that maximizes the likelihood function based on observed failure times.
Survival Analysis: Survival analysis techniques, such as the Kaplan-Meier estimator or Cox proportional hazards model, can be used to estimate the failure rate. These methods consider the time until an event (failure) occurs and account for censoring (incomplete observations).
Understanding the failure rate is crucial for predicting the reliability of systems, planning maintenance activities, and making informed decisions about system design and operation. It is a key parameter in reliability engineering and plays a vital role in ensuring the optimal performance and longevity of systems.
from lifelines import KaplanMeierFitter
# Generate data for time until failure (in hours)
np.random.seed(42)
failure_times = np.random.exponential(scale=50, size=100)
# survival function (complementary cumulative distribution function)
kmf = KaplanMeierFitter()
kmf.fit(durations=failure_times, event_observed=np.ones_like(failure_times))
kmf.plot_survival_function(show_censors=True)
plt.title('Survival Function (Time until Failure)')
plt.xlabel('Time (hours)')
plt.ylabel('Survival Probability')
plt.show()
# Estimate failure rate using MLE
lambda_mle = 1 / np.mean(failure_times)
print(f"Estimated Failure Rate (MLE): {lambda_mle:.4f} per hour")
# Alternatively, fit an exponential distribution and convert its scale to a rate
loc_fit, scale_fit = expon.fit(failure_times, floc=0)
lambda_fit = 1 / scale_fit  # expon.fit returns (loc, scale); the failure rate is 1/scale
print(f"Estimated Failure Rate (Exponential Fit): {lambda_fit:.4f} per hour")
Estimated Failure Rate (MLE): 0.0219 per hour Estimated Failure Rate (Exponential Fit): 0.0219 per hour
from scipy.stats import weibull_min
# Example data for Weibull distribution
x_values = np.linspace(0, 5, 1000)
# Plotting with different (lambda, k) combinations
plt.figure(figsize=(10, 6))
for lambda_val, k in [(1, 0.5), (1, 1), (0.5, 1.5), (2, 5)]:
    # convert the rate-style parameter lambda to the scale parameter beta
    beta = lambda_val**(-1/k)
    # Probability density function (PDF) for the Weibull distribution
    pdf_values = weibull_min.pdf(x_values, k, scale=beta)
    # Plotting
    plt.plot(x_values, pdf_values, label=f'(lambda, k) = ({lambda_val}, {k})')
plt.title('Weibull Distribution with Different (lambda, k) Combinations')
plt.xlabel('x')
plt.ylabel('Probability Density Function (PDF)')
plt.legend()
plt.show()
In this example, we use different combinations of $\lambda$ and $k$, computing the scale $\beta = \lambda^{-1/k}$ for each combination. Adjust the $\lambda$ and $k$ values to explore different shapes and scales of the Weibull distribution.
The plot illustrates the Weibull distribution with different combinations of the rate-style parameter ($\lambda$) and the shape parameter ($k$).
($\lambda$, $k$) = (1, 0.5):
- A shape parameter ($k$) below 1 results in a decreasing hazard function; the probability density is highest near zero and falls off gradually.
($\lambda$, $k$) = (1, 1):
- These parameters reduce to the standard exponential distribution ($k = 1$), which is often used as a baseline for comparison.
($\lambda$, $k$) = (0.5, 1.5):
- A smaller $\lambda$ (and hence a larger scale $\beta = \lambda^{-1/k}$) combined with a shape parameter ($k$) greater than 1 leads to a distribution with a peak and a slower decrease in probability density, indicating a higher probability of events occurring later in time.
($\lambda$, $k$) = (2, 5):
- A larger $\lambda$ and a shape parameter ($k$) much greater than 1 produce a pronounced, nearly symmetric peak concentrated around the scale $\beta$, so most events occur within a narrow window of time.
In summary, the plot demonstrates how different combinations of $\lambda$ and $k$ values influence the shape and scale of the Weibull distribution. Adjusting these parameters allows you to model distributions with varying characteristics, providing flexibility in reliability and survival analysis.
Summary¶
- For events that occur at a constant rate, the number of events per unit of time or space can be modeled as a Poisson distribution.
- In this scenario, you can also model the time or distance between one event and the next as an exponential distribution.
- A changing event rate over time (e.g., an increasing probability of device failure) can be modeled with the Weibull distribution.