Statistical Experiments and Significance Testing¶
- Form a hypothesis
- Design experiment
- Collect data
- Inference or Conclusion
A/B Testing¶
A/B testing, also known as split testing, is a method of experimentation used in marketing, product development, and other fields to compare two versions of a product or process and determine which one performs better. The "A" and "B" refer to the two different versions being tested.
Note:
- Treatment: something to which a subject is exposed, e.g., a price or a news headline.
- Treatment group: the group of subjects exposed to a specific treatment.
- Control group: the group of subjects exposed to no treatment (or to the standard treatment).
- Randomization: the process of randomly assigning subjects to treatments.
- Subject: a participant (e.g., a web visitor or patient) exposed to a treatment.
- Test statistic: the metric used to measure the effect of the treatment.
Hypothesis Testing | Significance Testing¶
Hypothesis testing is a statistical method used to make inferences about population parameters based on a sample of data. The process involves formulating a hypothesis (a statement about the population parameter) and using sample data to assess whether there is enough evidence to reject or not reject the hypothesis. The primary purpose is to understand whether random chance might be responsible for an observed effect.
Notes:
Null hypothesis ($H_0$): Represents a default or status quo assumption. It often states that there is no effect or no difference, i.e., any observed effect is due to chance.
Alternate hypothesis ($H_1$ or $H_a$): Represents the opposite of the null hypothesis. It states that there is an effect or a difference. Counterpoint to the null (what you hope to prove).
One-way test: A one-way test, also known as a one-tailed test, is a type of statistical hypothesis test where the rejection region is located entirely on one side of the probability distribution. In other words, it assesses the evidence for a specific direction of an effect or difference.
One-Way Test for Means:
Left-Tailed Test: The null hypothesis ($H_0$) assumes no effect or a specific condition (e.g., no decrease in the mean).
- $H_0: \mu \geq \mu_0$ or $H_0: \mu = \mu_0$
The alternative hypothesis ($H_1$ or $H_a$) states a directional effect (e.g., a decrease in the mean).
- $H_1: \mu < \mu_0$
Right-Tailed Test: The null hypothesis assumes no effect or a specific condition (e.g., no increase in the mean).
- $H_0: \mu \leq \mu_0$ or $H_0: \mu = \mu_0$
The alternative hypothesis states a directional effect (e.g., an increase in the mean).
- $H_1: \mu > \mu_0$
One-Way Test for Proportions:
- Similar to one-way tests for means, one-way tests for proportions can be left-tailed or right-tailed.
- The null hypothesis ($H_0$) assumes no effect or a specific condition (e.g., no increase or a decrease in the proportion).
- The alternative hypothesis ($H_1$ or $H_a$) states a directional effect (e.g., a decrease or an increase in the proportion).
The choice between a one-tailed or two-tailed test depends on the research question and the specific hypothesis being tested. One-tailed tests are often used when there is a specific expectation or prediction about the direction of the effect. However, researchers should carefully consider the choice of the test based on the nature of the study and the available evidence.
Two-way test: A two-way test, also known as a two-tailed test, is a type of statistical hypothesis test where the rejection region is divided between both tails of the probability distribution. It is used to assess evidence for the existence of an effect in either direction, without specifying the expected direction of the effect.
Here are the key components of a two-way test:
- Two-Way Test for Means:
- Two-Tailed Test: The null hypothesis ($H_0$) assumes no effect or a specific condition (e.g., no difference in the means).
- $H_0: \mu_1 = \mu_2$ or $H_0: \mu_1 - \mu_2 = 0$
- The alternative hypothesis ($H_1$ or $H_a$) states that there is a significant difference, without specifying the direction.
- $H_1: \mu_1 \neq \mu_2$ or $H_1: \mu_1 - \mu_2 \neq 0$
- Two-Way Test for Proportions:
- Similar to two-way tests for means, two-way tests for proportions can be two-tailed.
- The null hypothesis ($H_0$) assumes no effect or a specific condition (e.g., no difference in proportions).
- The alternative hypothesis ($H_1$ or $H_a$) states that there is a significant difference in proportions, without specifying the direction.
The decision to use a two-tailed test is often made when there is no specific expectation about the direction of the effect or when the researcher wants to detect any significant difference, whether it be an increase or a decrease. Two-tailed tests are generally more conservative because they require stronger evidence to reject the null hypothesis compared to one-tailed tests. A two-way test is a statistical approach that considers the possibility of an effect in either direction and is commonly used when the researcher is interested in detecting any significant difference, regardless of the expected direction.
One-tail versus two-tail is a confusing subject, and not that relevant for data science, where the precision of p-value calculations is not terribly important.
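Still, the mechanics are simple. As a minimal sketch (the z statistic of 1.8 is an assumed value purely for illustration), a one-tailed p-value uses only one tail of the null distribution, while a two-tailed p-value doubles it:
from scipy.stats import norm
# assumed z statistic, for illustration only
z_stat = 1.8
p_one_tailed = norm.sf(z_stat)            # P(Z >= z): right tail only
p_two_tailed = 2 * norm.sf(abs(z_stat))   # both tails
print(f'One-tailed p-value: {p_one_tailed:.4f}, two-tailed p-value: {p_two_tailed:.4f}')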
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import pandas as pd
import plotly.express as px
import random
import plotly.graph_objects as go
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Parameters
mu_0 = 0 # Null hypothesis mean
sigma = 1 # Standard deviation (assumed)
alpha = 0.05 # Significance level
# Generate data under the null hypothesis
null_distribution = norm(loc=mu_0, scale=sigma)
x_values = np.linspace(-3, 3, 1000)
y_values_null = null_distribution.pdf(x_values)
# Plot the null distribution
plt.figure(figsize=(10, 6))
plt.plot(x_values, y_values_null, label='Null Distribution (H0: μ = μ0)', color='blue')
# Right-tailed test: Shade the rejection region
critical_value_right = null_distribution.ppf(1 - alpha)
plt.fill_between(x_values[x_values > critical_value_right], y_values_null[x_values > critical_value_right], color='red', alpha=0.3, label='Rejection Region (Right-Tailed)')
# Left-tailed test: Shade the rejection region
critical_value_left = null_distribution.ppf(alpha)
plt.fill_between(x_values[x_values < critical_value_left], y_values_null[x_values < critical_value_left], color='green', alpha=0.3, label='Rejection Region (Left-Tailed)')
# Add labels and legend
plt.title('One-Tailed Tests: Left- and Right-Tailed Rejection Regions')
plt.xlabel('Sample Mean')
plt.ylabel('Probability Density Function')
plt.axvline(critical_value_right, color='black', linestyle='--', label=f'Critical Value (Right-Tailed, {alpha} Level)')
plt.axvline(critical_value_left, color='purple', linestyle='--', label=f'Critical Value (Left-Tailed, {alpha} Level)')
plt.legend()
# Show the plot
plt.show()
data_path = f"./data/"
web_page_data = pd.read_csv(f'{data_path}web_page_data.csv')
four_session_data = pd.read_csv(f'{data_path}four_sessions.csv')
click_rates_data = pd.read_csv(f'{data_path}click_rates.csv')
ecom_data = pd.read_csv(f'{data_path}ab_data.csv')
print(f'Web page: {web_page_data.shape}, Four session: {four_session_data.shape}, click rates: {click_rates_data.shape}, ecom data: {ecom_data.shape}')
Web page: (36, 2), Four session: (20, 2), click rates: (6, 3), ecom data: (294480, 5)
web_page_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Page    36 non-null     object
 1   Time    36 non-null     float64
dtypes: float64(1), object(1)
memory usage: 704.0+ bytes
click_rates_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Headline  6 non-null      object
 1   Click     6 non-null      object
 2   Rate      6 non-null      int64
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes
four_session_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Page    20 non-null     object
 1   Time    20 non-null     int64
dtypes: int64(1), object(1)
memory usage: 448.0+ bytes
ecom_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294480 entries, 0 to 294479
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   user_id       294480 non-null  int64
 1   timestamp     294480 non-null  object
 2   group         294480 non-null  object
 3   landing_page  294480 non-null  object
 4   converted     294480 non-null  int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB
ecom_data["landing_page"].value_counts()
landing_page
new_page    147241
old_page    147239
Name: count, dtype: int64
web_page_data[web_page_data["Page"] == "Page B"]["Time"].mean()
1.62
diff = abs(web_page_data[web_page_data["Page"] == "Page B"]["Time"].mean() - web_page_data[web_page_data["Page"] == "Page A"]["Time"].mean())
print(f'The absolute difference in average time spent on Page A and Page B: {diff}')
The absolute difference in average time spent on Page A and Page B: 0.3566666666666669
fig = px.box(web_page_data, x="Page", y="Time", title="Distribution of time spent in Pages", labels={'Time': 'Time spent (sec)', 'Page': 'Page'})
fig.show()
Resampling (Cross-Validation and Bootstrap)¶
Resampling is a statistical technique that involves repeatedly drawing samples from the available data to perform statistical inference. It is commonly used when the data available is limited, and the goal is to make inferences about a population or to assess the variability of a statistical estimate.
Two common types of resampling techniques are:
Bootstrapping:
- Method: Involves drawing random samples with replacement from the observed data.
- Purpose: Used to estimate the sampling distribution of a statistic, such as the mean or standard deviation, by generating multiple bootstrap samples and computing the statistic of interest for each sample.
- Application: Commonly used for constructing confidence intervals, assessing variability, and making statistical inferences (a short sketch follows at the end of this subsection).
Cross-Validation:
- Method: Involves partitioning the data into training and testing sets multiple times.
- Purpose: Used in machine learning to assess the performance of a predictive model by training and testing the model on different subsets of the data.
- Application: Commonly used for model evaluation and selection, helping to avoid overfitting and providing a more robust assessment of a model's generalization performance.
Resampling techniques are valuable tools in statistics and machine learning as they provide a way to leverage available data more effectively, especially when data is limited or when the goal is to understand the variability and robustness of statistical estimates or machine learning models.
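As a minimal sketch of the bootstrap described above, the snippet below resamples the Page A session times from web_page_data (loaded earlier) with replacement to estimate a 95% confidence interval for the mean; the 5,000 resamples and the seed are arbitrary choices for illustration.
page_a_times = web_page_data[web_page_data["Page"] == "Page A"]["Time"].values
np.random.seed(0)
# draw bootstrap samples with replacement and record the mean of each
boot_means = [np.random.choice(page_a_times, size=len(page_a_times), replace=True).mean() for _ in range(5000)]
ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])
print(f'Bootstrap 95% CI for mean Page A session time: ({ci_lower:.3f}, {ci_upper:.3f})')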
Permutation test¶
A permutation test is a non-parametric statistical method that assesses the significance of a statistical hypothesis by repeatedly randomizing the assignment of observations to different groups, generating a distribution of the test statistic under the assumption of no effect, and comparing the observed statistic to this distribution to determine statistical significance.
def permute_function(df: pd.Series, n_a: int, n_b: int):
    combined = n_a + n_b
    # randomly pick n_b indices (without replacement); the remaining indices form group A
    idx_b = set(random.sample(range(combined), n_b))
    idx_a = set(range(combined)) - idx_b
    # compute and return the difference in group means
    return df.loc[list(idx_b)].mean() - df.loc[list(idx_a)].mean()
n_a = web_page_data[web_page_data["Page"] == "Page A"].shape[0]
n_b = web_page_data[web_page_data["Page"] == "Page B"].shape[0]
print(permute_function(web_page_data["Time"], n_a, n_b))
0.14866666666666672
random.seed(45)
# let's take 1,000 permuted samples
permutation_diff = [permute_function(web_page_data["Time"], n_a, n_b) for _ in range(1000)]
fig = px.histogram(permutation_diff, nbins=12, labels={'value': 'session time difference (sec)'})
# Add a vertical line at difference
fig.add_trace(go.Scatter(x=[diff,diff], y=[0, len(permutation_diff)], mode='lines', name='Observed difference of mean', line=dict(color='red', width=2)))
fig.show()
According to the chart, the observed difference between the groups lies well within the distribution of permuted differences. Hence the result is not statistically significant, and we fail to reject the null hypothesis. If the observed difference had fallen outside the distribution of permuted differences, the result would be statistically significant and we would reject the null hypothesis.
Statistical significance and P-values¶
Statistical significance refers to the likelihood that observed differences or effects in data are not due to random chance but rather indicate a true underlying relationship or effect.
- P-value: Given a chance model that embodies the null hypothesis, the p-value is the probability of obtaining results as unusual or extreme as the observed results.
- Alpha: Alpha is the significance level, a predetermined threshold for the probability of making a Type I error (incorrectly rejecting a true null hypothesis).
- Type I error (False Positive): Occurs when the null hypothesis is incorrectly rejected when it is actually true.
- Type II error (False Negative): Occurs when the null hypothesis is not rejected when there is a true effect or difference.
grouped_data = ecom_data.groupby("group")["converted"].value_counts()
grouped_data
group      converted
control    0            129479
           1             17723
treatment  0            129764
           1             17514
Name: count, dtype: int64
grouped_data.index
MultiIndex([( 'control', 0), ( 'control', 1), ('treatment', 0), ('treatment', 1)], names=['group', 'converted'])
grouped_data["treatment"].sum()
147278
ecom_data['converted'].mean()
0.1196583808747623
obs_pct_diff = 100 * ( grouped_data.loc[('control', 1)] / grouped_data["control"].sum() - grouped_data.loc[('treatment', 1)] / grouped_data["treatment"].sum() )
print(f'1. Observed difference: {obs_pct_diff:.4f}%')
zeros, ones = ecom_data['converted'].value_counts()
print(f'2. control pop: {grouped_data["control"].sum()}, treatment pop: {grouped_data["treatment"].sum()}')
print(f'3. #0s: {zeros} %: {round(zeros/(ones +zeros), 3)}, #1s: {ones} %: {round(ones /(ones +zeros), 3)}')
1. Observed difference: 0.1481%
2. control pop: 147202, treatment pop: 147278
3. #0s: 259243 %: 0.88, #1s: 35237 %: 0.12
Test: How often was the difference >= 0.1481% (observed difference)?
permutation_diff_conv = [100*permute_function(ecom_data["converted"], grouped_data.loc[('control', 1)], grouped_data.loc[('treatment', 1)]) for _ in range(1000)]
fig = px.histogram(permutation_diff_conv, nbins=12, labels={'value': 'Conversion rate (%)'}, title="Frequency distribution for the difference in conversion rates between control and treatment")
# Add a vertical line at difference
fig.add_trace(go.Scatter(x=[obs_pct_diff,obs_pct_diff], y=[0, len(permutation_diff_conv)/3], mode='lines', name='Observed difference', line=dict(color='red', width=2)))
fig.show()
In the graph, the observed difference of 0.1481% is well within the range of chance variation, hence the result is not statistically significant.
P-Value
Eyeballing the graph above is not a precise way to measure statistical significance, so instead we compute the p-value: the frequency with which the chance model produces a result as extreme as, or more extreme than, the observed result.
Let's take the proportion of permuted differences that exceed the observed difference.
print(f'P-value: {np.mean([dif > obs_pct_diff for dif in permutation_diff_conv])}')
P-value: 0.32
The p-value is 0.32, which means that we would expect to achieve a result as extreme as this, or more extreme, by random chance over 30% of the time.
The ASA statement stressed six principles for researchers and journal editors:
- P-values can indicate how incompatible the data are with a specified statistical model.
- P-values do not measure the probability that the studied hypothesis is true, or the probability that the data were produced by random chance alone.
- Scientific conclusions and business or policy decisions should not be based only on whether a p-value passes a specific threshold.
- Proper inference requires full reporting and transparency.
- A p-value, or statistical significance, does not measure the size of an effect or the importance of a result.
- By itself, a p-value does not provide a good measure of evidence regarding a model or hypothesis.
cnv_rates = np.array([[grouped_data.loc[('control', 1)], grouped_data["control"].sum() - grouped_data.loc[('control', 1)]], [grouped_data.loc[('treatment', 1)], grouped_data["treatment"].sum() - grouped_data.loc[('treatment', 1)]]])
chi2, p_value, df, _ = stats.chi2_contingency(cnv_rates)
print(f'p-value for single sided test: {p_value / 2:.4f}')
p-value for single sided test: 0.1089
t-Tests¶
A t-test is a statistical test used to compare the means of two groups and determine if the observed differences between them are statistically significant. The test is based on the t-distribution and assesses whether the sample means are likely to have originated from populations with the same mean. There are different types of t-tests, including the independent samples t-test for comparing means of two independent groups and the paired samples t-test for comparing means of two related groups (paired observations). The t-test produces a p-value, and if this p-value is below a predetermined significance level (often 0.05), the null hypothesis of equal means is rejected in favor of the alternative hypothesis that the means are significantly different.
result = stats.ttest_ind(web_page_data[web_page_data["Page"] == 'Page A'].Time, web_page_data[web_page_data["Page"] == 'Page B'].Time, equal_var=False)
print(f'p-value for single sided test: {result.pvalue / 2:.4f}')
p-value for single sided test: 0.1408
tstat, p_val, _ = sm.stats.ttest_ind(web_page_data[web_page_data["Page"] == "Page A"]["Time"], web_page_data[web_page_data["Page"] == "Page B"]["Time"], usevar='unequal', alternative='smaller')
print(f"tstat:{tstat}, p-value:{p_val}")
tstat:-1.0983155623638103, p-value:0.14076218622850328
Multiple testing¶
Multiple testing in data science refers to the practice of conducting multiple hypothesis tests simultaneously on a dataset. This could involve testing various hypotheses, comparing multiple groups, or examining different features in a dataset. The term "multiple testing" is often associated with the issue of inflated Type I error rates.
When conducting multiple tests without appropriate corrections, there is an increased risk of obtaining false positives (Type I errors). This is because, as more tests are performed, the likelihood of observing statistically significant results purely by chance increases. Some common scenarios where multiple testing arises include:
Feature Selection in Machine Learning:
- When testing the significance of individual features or variables, especially in the context of model building.
Comparing Multiple Groups:
- When comparing means or proportions across multiple groups, leading to multiple pairwise comparisons.
Genomic Studies:
- In genetic research, where thousands of genes or variants may be tested simultaneously.
A/B Testing:
- When evaluating multiple variations or versions of a product or strategy simultaneously.
To address the issue of multiple testing, various correction methods are employed, such as the Bonferroni correction, Holm's method, False Discovery Rate (FDR) control, and others. These methods adjust the significance threshold to control the overall Type I error rate or control the expected proportion of false positives.
It is essential to be mindful of multiple testing issues to maintain the integrity and reliability of statistical inferences in data science analyses. Ignoring these considerations can lead to an increased likelihood of making incorrect conclusions based on the observed data.
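As a minimal sketch of these corrections (the raw p-values below are made up purely for illustration), statsmodels' multipletests can apply Bonferroni, Holm, or Benjamini-Hochberg FDR adjustments:
from statsmodels.stats.multitest import multipletests
# hypothetical raw p-values from five simultaneous tests (made up for illustration)
raw_pvalues = np.array([0.001, 0.008, 0.020, 0.041, 0.30])
for method in ['bonferroni', 'holm', 'fdr_bh']:
    reject, adjusted, _, _ = multipletests(raw_pvalues, alpha=0.05, method=method)
    print(f'{method}: adjusted p-values = {np.round(adjusted, 4)}, reject H0 = {reject}')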
Degree of freedom¶
In statistics, degrees of freedom (df) refers to the number of values or quantities in the final calculation of a statistic that are free to vary. The concept is used in various statistical tests and distributions, and its specific interpretation depends on the context. For example, for a contingency table in a chi-square test of independence, $$ df = (r-1) \times (c-1)$$
where $r$ and $c$ are the numbers of rows and columns of the table.
T-Distribution:
- In the context of the t-distribution, the degrees of freedom are determined by the sample size (e.g., $n - 1$ for a one-sample t-test). The t-distribution is used in hypothesis testing when the population standard deviation is unknown.
Chi-Square Distribution:
- In the chi-square distribution, the degrees of freedom indicate the number of categories in a chi-square test that are free to vary. It is commonly used in tests of independence and goodness-of-fit.
Regression Analysis:
- In linear regression, the degrees of freedom can represent the difference between the total number of observations and the number of parameters estimated in the model. For example, in simple linear regression, the degrees of freedom for residuals (error) would be n - 2, where n is the number of observations.
ANOVA (Analysis of Variance):
- In analysis of variance, degrees of freedom are used to describe the number of groups and the number of observations in each group. They are used to calculate the F-statistic.
The concept of degrees of freedom is crucial in statistical inference because it helps determine the appropriate distributions and critical values for hypothesis testing and constructing confidence intervals. It essentially represents the amount of variability in a statistic that is not fixed by the data, allowing for a more accurate interpretation of the statistical results.
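A small sketch of how degrees of freedom matter in practice: the two-sided 5% critical value of the t-distribution shrinks toward the normal critical value as the degrees of freedom grow, which is why the distinction matters most for small samples.
for dof in [2, 5, 10, 30, 100]:
    t_crit = stats.t.ppf(0.975, dof)  # two-sided 5% critical value for dof degrees of freedom
    print(f'df = {dof:>3}: t critical value = {t_crit:.3f}')
print(f"Normal critical value: {stats.norm.ppf(0.975):.3f}")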
ANOVA¶
ANOVA, or Analysis of Variance, is a statistical method used to compare means across multiple groups to determine whether there are any statistically significant differences among them. ANOVA assesses whether the variability between group means is greater than the variability within the groups. It is a parametric test that assumes the data are normally distributed.
ANOVA is particularly useful when comparing means across three or more groups, as opposed to t-tests, which are designed for comparing two groups. ANOVA provides an overall test of equality of means, but it does not identify which specific groups differ from each other. If ANOVA indicates significant differences, post hoc tests (e.g., Tukey's HSD or Bonferroni correction) can be performed to identify the differing groups.
There are different types of ANOVA based on the study design:
One-Way ANOVA:
- Used when comparing means across three or more independent groups. It tests whether there are any statistically significant differences among the group means.
Two-Way ANOVA:
- Extends the analysis to two independent variables (factors). It assesses the impact of each variable on the dependent variable and whether there is an interaction effect between the variables.
Repeated Measures ANOVA:
- Used when measurements are taken on the same subjects at multiple time points or under different conditions. It assesses whether there are significant differences over time or conditions.
The null hypothesis in ANOVA is that there are no significant differences among the group means. If the p-value is below a predetermined significance level (e.g., 0.05), the null hypothesis is rejected, suggesting that at least one group mean is significantly different from the others. ANOVA does not identify which specific groups differ; additional tests are needed for pairwise comparisons.
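If the omnibus ANOVA is significant, a post hoc procedure such as Tukey's HSD can be used for those pairwise comparisons. A minimal sketch on the four_sessions data loaded earlier, using statsmodels' pairwise_tukeyhsd (the 0.05 level is an assumption for illustration):
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# pairwise comparisons of mean session time across the four pages, with family-wise error control
tukey = pairwise_tukeyhsd(endog=four_session_data["Time"], groups=four_session_data["Page"], alpha=0.05)
print(tukey.summary())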
Notes:
F-statistic: The F-statistic is a statistical measure used in the context of analysis of variance (ANOVA) and regression analysis. It assesses whether the variability in the means of different groups is larger than what would be expected by chance. $$ F = \frac{\text{Variance between groups}}{\text{Variance within groups}}$$
- Variance Between Groups: Measures the variability in the means of different groups. A larger between-group variance suggests that there are significant differences in means.
- Variance Within Groups: Measures the variability within each group. It represents random variability or variability that is not due to systematic group differences.
In regression analysis, the F-statistic is also used to test the overall significance of the regression model. It compares the fit of the model with predictors to a null model with no predictors.
Omnibus test: The term "omnibus test" refers to a statistical test that provides a global assessment of the overall fit or equality of several groups or conditions in a dataset. Instead of testing specific pairwise differences between groups, an omnibus test evaluates the null hypothesis that all groups are equal or that a model fits the data equally well across groups.
Decomposition of Variance: The decomposition of variance, often referred to as the analysis of variance (ANOVA) decomposition, is a statistical technique used to partition the total variability observed in a set of data into different sources or components. This decomposition helps identify and quantify the relative contributions of various factors or sources of variation in a statistical model.
The general idea is to understand how much of the total variability in the data can be attributed to different factors or sources. The total variance is decomposed into several components, typically including:
Between-Group Variance:
- This component represents the variability in the means of different groups or categories. In the context of ANOVA, it captures the extent to which group means differ from each other.
Within-Group Variance:
- Also known as residual or error variance, this component represents the variability within each group. It reflects the random variation that is not explained by the factors considered in the model.
Interaction Variance (if applicable):
- In some models, particularly in factorial designs, there might be interaction effects between factors. Interaction variance captures the variability associated with the combined effects of multiple factors.
The decomposition is often visualized as a diagram or a table that shows the distribution of variance across different components. The goal is to understand which factors contribute significantly to the observed variability and which do not.
The decomposition of variance is a fundamental concept in ANOVA and regression analysis. It helps to gain insights into the relative importance of different factors in explaining the variation in the data. Additionally, it provides a basis for hypothesis testing regarding the significance of specific effects or factors.
Sum of Squares: The sum of squares (SS) is a measure of the total variability or dispersion in a set of data. It is calculated as the sum of the squared differences between each data point and the overall mean of the data. The concept of sum of squares is commonly used in the context of analysis of variance (ANOVA) and regression analysis.
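As a minimal sketch of this decomposition on the four_sessions data loaded earlier, the between-group and within-group sums of squares can be computed directly and combined into the F-statistic:
grand_mean = four_session_data["Time"].mean()
groups = four_session_data.groupby("Page")["Time"]
# between-group SS: group size times squared deviation of each group mean from the grand mean
ss_between = sum(g.size * (g.mean() - grand_mean) ** 2 for _, g in groups)
# within-group SS: squared deviations of each observation from its own group mean
ss_within = sum(((g - g.mean()) ** 2).sum() for _, g in groups)
k = four_session_data["Page"].nunique()  # number of groups
n = len(four_session_data)               # total number of observations
f_stat = (ss_between / (k - 1)) / (ss_within / (n - k))
print(f'SS between: {ss_between:.1f}, SS within: {ss_within:.1f}, F: {f_stat:.3f}')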
four_session_data.head()
 | Page | Time
---|---|---
0 | Page 1 | 164
1 | Page 2 | 178
2 | Page 3 | 175
3 | Page 4 | 155
4 | Page 1 | 172
fig = px.box(four_session_data, x="Page", y="Time", title="Distribution of time spent in Pages", labels={'Time': 'Time spent (sec)', 'Page': 'Page'})
fig.show()
We could compare each page with the rest of the pages: Page 1 against Page 2, Page 3, and Page 4; Page 2 against Page 3 and Page 4; and so on. With so many pairwise comparisons there is a chance of being fooled by randomness. Instead, we can run a single overall omnibus test that answers: "Could all the pages have the same underlying stickiness, and the differences among them be due to the random way in which a common set of session times got allocated among the four pages?" Let's use a permutation test for this ANOVA.
ob_var = four_session_data.groupby("Page")["Time"].mean().var()
ob_mean = four_session_data.groupby("Page").mean().values.ravel()
print(f'Observed variance among groups: {ob_var}\nGroup Mean, Page 1:{ob_mean[0]}, Page 2: {ob_mean[1]}, Page 3: {ob_mean[2]}, Page 4: {ob_mean[3]}')
Observed variance among groups: 55.426666666666655
Group Mean, Page 1:172.8, Page 2: 182.6, Page 3: 175.6, Page 4: 164.6
Data Permutation (why)
Permuting the values in a permutation test serves the purpose of creating a null distribution under the assumption that there is no effect or difference between the groups. By randomly shuffling the observations among the different groups, you simulate a scenario where the group assignments have no impact on the observed values. This allows you to assess the variability in the test statistic (such as the F-statistic in ANOVA) that would occur by chance alone.
Rationale behind permuting values in a permutation test:
Null Hypothesis (H₀):
- The null hypothesis in an ANOVA test assumes that there is no significant difference among the group means. Group assignments do not affect the observed values.
Creating a Null Distribution:
- To simulate the null hypothesis, you permute (shuffle) the values among the different groups, effectively breaking any potential association between the group labels and the observed values.
Calculation of Test Statistic:
- For each permutation, you calculate the test statistic of interest (e.g., F-statistic) on the permuted data.
Comparison with Observed Statistic:
- The observed test statistic computed on the actual (non-permuted) data is then compared to the distribution of test statistics obtained from permutations.
P-Value Calculation:
- The p-value is calculated as the proportion of permuted test statistics that are as extreme as or more extreme than the observed test statistic.
If the observed test statistic falls in the tail of the distribution of permuted statistics, it suggests that the observed data is unlikely to have occurred under the assumption of the null hypothesis. This leads to rejecting the null hypothesis in favor of the alternative, indicating that there is evidence of a significant difference among the group means.
Permutation tests are particularly useful when the assumptions of parametric tests (like traditional ANOVA) are not met, or when you want a non-parametric approach that doesn't rely on specific distributional assumptions. They provide a distribution-free way to assess the significance of observed differences.
def perm_test(df: pd.DataFrame):
    data = df.copy()
    data["Time"] = np.random.permutation(data["Time"].values)
    # compute and return the variance of the group (Page) means
    return data.groupby("Page")["Time"].mean().var()
perm_test(four_session_data)
62.573333333333345
np.random.seed(45)  # perm_test uses np.random.permutation, so seed numpy's RNG
permuted_vars = [perm_test(four_session_data) for _ in range(2500)]
pr_pb = np.mean([var > ob_var for var in permuted_vars])
print(f"Pr(Prob):{pr_pb}")
Pr(Prob):0.0808
fig = px.histogram(permuted_vars, nbins=12, labels={'value': 'Variance'}, title="Frequency distribution for Variance among the group")
# Add a vertical line at difference
fig.add_trace(go.Scatter(x=[ob_var,ob_var], y=[0, len(permuted_vars)/3], mode='lines', name='Observed difference', line=dict(color='red', width=2)))
fig.show()
F-statistic
$$ F = \frac{\text{Variance between groups}}{\text{Variance within groups}}$$
anova_stats = stats.f_oneway(four_session_data[four_session_data.Page == 'Page 1'].Time,
four_session_data[four_session_data.Page == 'Page 2'].Time,
four_session_data[four_session_data.Page == 'Page 3'].Time,
four_session_data[four_session_data.Page == 'Page 4'].Time)
print(f'F-stat: {anova_stats.statistic:.3f}, p_value: {anova_stats.pvalue:.3f}')
F-stat: 2.740, p_value: 0.078
Chi-Square Test¶
The Chi-Square Test is a statistical test used to determine whether there is a significant association between categorical variables. It is often employed to analyze contingency tables, which display the frequency distribution of two or more categorical variables. The test assesses whether the observed distribution of frequencies in the table is significantly different from what would be expected under the assumption of independence between the variables.
The Chi-Square Test involves comparing the observed frequencies (counted in the data) with the expected frequencies (calculated based on the assumption of independence). The test statistic, chi-square ($X^2$), is calculated as the sum of the squared differences between observed and expected frequencies, normalized by the expected frequencies. The degrees of freedom in the test depend on the number of categories in the variables being analyzed.
The null hypothesis ($H_0$) for the Chi-Square Test asserts that there is no significant association between the variables, meaning they are independent. A rejection of the null hypothesis implies that there is evidence of a significant association.
The Chi-Square Test is commonly used in fields such as epidemiology, social sciences, and market research to investigate relationships between categorical variables, such as the relationship between smoking habits and the occurrence of a certain health condition.
click_rates_data.head()
 | Headline | Click | Rate
---|---|---|---
0 | Headline A | Click | 14
1 | Headline A | No-click | 986
2 | Headline B | Click | 8
3 | Headline B | No-click | 992
4 | Headline C | Click | 12
clicks = click_rates_data.pivot(index="Click", columns="Headline", values="Rate")
clicks
 | Headline A | Headline B | Headline C
---|---|---|---
Click | 14 | 8 | 12
No-click | 986 | 992 | 988
Testing: Null hypothesis ($H_0$): all three headlines have the same click rate.
Let's compute the expected counts under this null hypothesis, i.e., assuming all headlines share the same click rate.
row_avg = clicks.mean(axis=1)
expected = pd.DataFrame({"Headline A": row_avg, "Headline B": row_avg, "Headline C": row_avg})
expected
 | Headline A | Headline B | Headline C
---|---|---|---
Click | 11.333333 | 11.333333 | 11.333333
No-click | 988.666667 | 988.666667 | 988.666667
The Pearson residual: $$ R = \frac{\text{Observed} - \text{Expected}}{\sqrt{\text{Expected}}} $$
The chi-square statistic: $$ \chi^2 = \sum_{i=1}^{r} \sum_{j=1}^{c} R_{ij}^2 $$
where $r$ and $c$ are the number of rows and the number of columns.
def chi2(ob_data: list, exp_data: list):
    pearson_res = []
    for row, expect in zip(ob_data, exp_data):
        pearson_res.append([(ob - expect) ** 2 / expect for ob in row])
    # return the sum of squared residuals
    return np.sum(pearson_res)
# permutation test for the chi-square statistic: reshuffle clicks across headlines
def permute_chi(df: pd.DataFrame, with_replacement: bool):
    if df.shape[0] > 2:
        raise Exception("you have non-binary data for each column")
    ones, zeros = df.sum(axis=1)
    bag = np.hstack((np.ones(ones), np.zeros(zeros))).tolist()
    random.shuffle(bag)
    sample_clicks, sample_no_clicks, init_r = [], [], 0
    for col in df.columns.tolist():
        if with_replacement:
            category_clicks = sum(random.sample(bag, df[col].sum()))
        else:
            category_clicks = sum(bag[init_r: init_r + df[col].sum()])
        sample_clicks.append(category_clicks)
        sample_no_clicks.append(df[col].sum() - category_clicks)
        init_r += df[col].sum()
    # H0 => expected counts under the null hypothesis, for clicked vs non-clicked
    expected_clicks, expected_no_clicks = expected.sum(axis=1).tolist()
    return chi2([sample_clicks, sample_no_clicks],
                [expected_clicks / expected.shape[1], expected_no_clicks / expected.shape[1]])
permute_chi(clicks, False)
0.2379913529808417
ob_chisquared = chi2(clicks.values, expected["Headline A"].tolist())
print(f"observed chi: {ob_chisquared}")
observed chi: 1.6659394708658917
permuted_chi_data = [permute_chi(clicks, False) for _ in range(2500)]
resampled_pval = np.mean([chi_val > ob_chisquared for chi_val in permuted_chi_data])
print(f"Random sample p-value: {resampled_pval}")
Random sample p-value: 0.4616
permuted_chi_data_ = [permute_chi(clicks, True) for _ in range(2500)]
resampled_pval_ = np.mean([chi_val > ob_chisquared for chi_val in permuted_chi_data_])
print(f"Random sample with replacement p-value: {resampled_pval_}")
Random sample with replacement p-value: 0.4692
fig = px.histogram(permuted_chi_data, nbins=12, labels={'value': 'Chi-square score'}, title="Chi-Square score Frequency distribution for click rate among different Headlines for 2500 trials")
# Add a vertical line at difference
fig.add_trace(go.Scatter(x=[ob_chisquared,ob_chisquared], y=[0, len(permuted_chi_data)/3], mode='lines', name='Observed difference', line=dict(color='red', width=2)))
fig.show()
According to the plot, the observed chi-square statistic lies well within the distribution of permuted values, so the click rates of the three headlines are consistent with chance variation. We do not reject the null hypothesis.
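For comparison with the resampling approach, the classical chi-square test can be run directly on the observed table with scipy (a quick check; its p-value should be in the same ballpark as the resampled one above):
chisq_stat, chisq_pvalue, dof, expected_counts = stats.chi2_contingency(clicks)
print(f'Chi-square statistic: {chisq_stat:.4f}, p-value: {chisq_pvalue:.4f}, degrees of freedom: {dof}')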
# Degrees of freedom
df_values = [1, 2, 5, 10, 20]
# Create figure
fig = go.Figure()
# Plot Chi-Square distributions for different degrees of freedom
for df in df_values:
    x = np.linspace(1, 30, 100)
    y = np.random.chisquare(df, size=100)
    # theoretical chi-square density for this many degrees of freedom
    fig.add_trace(go.Scatter(x=x, y=stats.chi2.pdf(x, df), mode='lines', name=f'PDF, DF = {df}'))
    # simulated draws, normalized to a density so they are comparable with the PDF
    fig.add_trace(go.Histogram(x=y, nbinsx=50, histnorm='probability density', name=f'Samples, DF = {df}', opacity=0.5))
# Update layout
fig.update_layout(
title='Chi-Square Distribution with Different Degrees of Freedom',
xaxis_title='Value',
yaxis_title='Probability Density',
showlegend=True
)
# Show the plot
fig.show()
Fisher's Exact Test¶
It is a statistical test used to determine if there are nonrandom associations between two categorical variables in a 2x2 contingency table. It is particularly useful when the sample size is small, and the usual chi-square test may not be appropriate.
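A minimal sketch with scipy's fisher_exact on a small, made-up 2x2 table (e.g., clicks vs. no-clicks for two headlines; the counts are hypothetical):
# hypothetical 2x2 contingency table: rows are headlines, columns are click / no-click counts
table_2x2 = [[8, 2],
             [1, 5]]
odds_ratio, fisher_pvalue = stats.fisher_exact(table_2x2, alternative='two-sided')
print(f'Odds ratio: {odds_ratio:.3f}, p-value: {fisher_pvalue:.4f}')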
im_data = pd.read_csv(f"{data_path}imanishi_data.csv")
im_data
 | Digit | Frequency
---|---|---
0 | 0 | 14
1 | 1 | 71
2 | 2 | 7
3 | 3 | 65
4 | 4 | 23
5 | 5 | 19
6 | 6 | 12
7 | 7 | 45
8 | 8 | 53
9 | 9 | 6
fig = go.Figure()
fig.add_trace(go.Bar(x=im_data['Digit'], y=im_data['Frequency ']))
fig.update_layout(title='Bar Chart for Digits with Frequencies',
xaxis=dict(title='Digit'),
yaxis=dict(title='Frequency'))
fig.show()
Statistical power¶
Statistical power, often denoted $1 - \beta$ (where $\beta$ is the probability of a Type II error), is the probability of correctly rejecting a null hypothesis when it is false. In other words, it is the probability of detecting a true effect or difference if it exists.
Here are the key components of statistical power:
Null Hypothesis ($H_0$): The hypothesis that there is no real effect, no difference, or no relationship in the population.
Alternative Hypothesis ($H_1$ or $H_a$): The hypothesis that there is a real effect, difference, or relationship in the population.
Type I Error ($\alpha$): The probability of rejecting a true null hypothesis. It is also known as the significance level and is typically set at 0.05.
Type II Error ($\beta$): The probability of failing to reject a false null hypothesis. It is the complement of statistical power ($1 - \beta$).
Effect Size: The magnitude of the true effect or difference in the population. Larger effect sizes generally lead to higher power.
Sample Size: The number of observations or data points in the sample. Larger sample sizes generally lead to higher power.
The relationship between these components can be summarized as follows:
$$ Power = 1 - \beta = 1 - P(\text{Type II Error}) $$
Statistical power is crucial in experimental design and hypothesis testing. A study with high statistical power is more likely to detect true effects, providing confidence in the results. Researchers often perform power analyses during the planning phase of a study to determine an appropriate sample size that ensures a high probability of detecting meaningful effects if they exist.
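In practice, power analyses are usually done with a library rather than by hand. A minimal sketch with statsmodels' TTestIndPower (the effect size of 0.2, in Cohen's d units, and the other settings are assumptions for illustration):
from statsmodels.stats.power import TTestIndPower
analysis = TTestIndPower()
# sample size per group needed to detect a standardized effect of 0.2 with 80% power at alpha = 0.05
required_n = analysis.solve_power(effect_size=0.2, alpha=0.05, power=0.8, alternative='two-sided')
print(f'Required sample size per group: {required_n:.1f}')
# conversely, the power achieved with 100 observations per group
achieved_power = analysis.power(effect_size=0.2, nobs1=100, alpha=0.05, alternative='two-sided')
print(f'Power with n = 100 per group: {achieved_power:.3f}')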
# Parameters for the null hypothesis distribution (no effect)
mu_0 = 0 # mean under null hypothesis
sigma = 1 # standard deviation
# Parameters for the alternative hypothesis distribution (true effect)
mu_1 = 1.5 # mean under alternative hypothesis
# Significance level (Type I error rate)
alpha = 0.05
# Critical value for a two-tailed test
z_critical = norm.ppf(1 - alpha / 2)
# Effect size (difference in means)
effect_size = mu_1 - mu_0
# Desired power
desired_power = 0.80
# Calculate required sample size for the desired power (standard z-test formula, ignoring the far tail)
sample_size = ((norm.ppf(1 - alpha / 2) + norm.ppf(desired_power)) * sigma / effect_size) ** 2
# Calculate the power achieved with that sample size
power = 1 - norm.cdf(z_critical - effect_size * np.sqrt(sample_size) / sigma)
# Plot the distributions under null and alternative hypotheses
x = np.linspace(-3, 5, 1000)
plt.plot(x, norm.pdf(x, loc=mu_0, scale=sigma), label='Null Hypothesis (No Effect)', color='blue')
plt.plot(x, norm.pdf(x, loc=mu_1, scale=sigma), label='Alternative Hypothesis (True Effect)', color='orange')
# Shade the area representing Type II error: the alternative distribution falling below the critical value
x_fill = np.linspace(-3, z_critical, 1000)
plt.fill_between(x_fill, norm.pdf(x_fill, loc=mu_1, scale=sigma), color='orange', alpha=0.3, label='Type II Error')
# Mark the critical value for Type I error
plt.axvline(x=z_critical, color='red', linestyle='--', label=f'Critical Value\n(Type I Error Rate = {alpha})')
# Mark the region of statistical power
plt.text(mu_1 + 0.5, 0.3, f'Statistical Power\n(1 - Type II Error Rate = {power:.2f})', color='green')
# Display the required sample size
plt.text(mu_1 +1, 0.4, f'Required Sample Size = {int(sample_size)}', color='purple')
# Customize the plot
plt.title('Statistical Power Example with Sample Size')
plt.xlabel('Effect')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()
This plot illustrates the null hypothesis distribution (blue) and the alternative hypothesis distribution (orange). The shaded area represents the Type II error, which occurs when we fail to reject the null hypothesis despite there being a true effect. The critical value for Type I error $(\alpha)$ is marked with a red dashed line. The area to the right of this critical value represents the rejection region for the null hypothesis.
The goal is to maximize the area under the orange curve (alternative hypothesis) in the rejection region, indicating higher statistical power. A larger effect size or a larger sample size can increase statistical power.
In this example, the required sample size is calculated to achieve a desired power of 0.80.
Multi-Arm Bandit¶
A multi-arm bandit is a type of problem or framework in decision-making and optimization where an agent must allocate resources (such as time, money, or opportunities) among several options or "arms" to maximize cumulative reward over time. The term "bandit" is derived from the idea of a row of slot machines (one-armed bandits) in a casino, where the agent must decide which machine to pull to maximize overall winnings.
Key characteristics of a multi-arm bandit problem include:
Arms: The available options or actions that the agent can choose from. Each arm is associated with an unknown reward distribution.
Reward Distributions: Each arm has an underlying probability distribution that governs the rewards it can generate. The agent does not initially know these distributions but learns them through exploration.
Exploration-Exploitation Dilemma: The agent faces a trade-off between exploring new arms to discover their potential rewards and exploiting known arms to maximize immediate rewards.
Regret: The difference between the cumulative reward obtained by the agent and the maximum cumulative reward that could have been obtained by always choosing the best arm. Minimizing regret is a common objective in multi-arm bandit problems.
Adaptability: The agent should adapt its strategy over time based on the information gained from previous actions.
Multi-arm bandit problems find applications in various fields, such as:
- Online advertising: Deciding which ad to display to maximize click-through rates.
- Clinical trials: Allocating patients to different treatments to maximize the discovery of effective treatments.
- Internet of Things (IoT): Allocating resources among different sensors or devices for optimal monitoring.
- Algorithmic trading: Choosing investment strategies to maximize returns.
Different algorithms, such as epsilon-greedy, UCB (Upper Confidence Bound), and Thompson Sampling, are used to address the exploration-exploitation trade-off and solve multi-arm bandit problems. These algorithms aim to balance the need for exploration to gather information and exploitation to maximize rewards.
# Define parameters
num_arms = 3
num_steps = 1000
# True mean rewards for each arm (unknown to the agent)
true_means = [2, 4, 1]
# Initialize variables
cumulative_rewards = np.zeros((num_arms, num_steps))
chosen_arms = np.zeros(num_steps, dtype=int)
arm_counts = np.zeros(num_arms)       # number of times each arm has been pulled
arm_reward_sums = np.zeros(num_arms)  # total reward collected from each arm
# Agent's strategy: Epsilon-Greedy
epsilon = 0.1
# Simulation loop
for step in range(num_steps):
    # Explore (with probability epsilon)
    if np.random.rand() < epsilon:
        chosen_arm = np.random.choice(num_arms)
    # Exploit (with probability 1-epsilon): pick the arm with the highest estimated mean reward per pull
    else:
        estimated_means = arm_reward_sums / np.maximum(arm_counts, 1)
        chosen_arm = np.argmax(estimated_means)
    # Simulate pulling the chosen arm and observe the reward
    reward = np.random.normal(true_means[chosen_arm], 1)
    # Update per-arm statistics, cumulative rewards, and record the chosen arm
    arm_counts[chosen_arm] += 1
    arm_reward_sums[chosen_arm] += reward
    cumulative_rewards[chosen_arm, step] = reward
    chosen_arms[step] = chosen_arm
# Plot the results
plt.figure(figsize=(10, 6))
# Plot cumulative rewards over time
for arm in range(num_arms):
    plt.plot(np.cumsum(cumulative_rewards[arm]) / np.arange(1, num_steps + 1), label=f'Arm {arm + 1}')
plt.title('Multi-Arm Bandit Simulation')
plt.xlabel('Steps')
plt.ylabel('Average Cumulative Reward')
plt.legend()
plt.grid(True)
plt.show()
In this simulation, the epsilon-greedy strategy is used, where the agent explores with a probability of $\epsilon$ and exploits the best-known option with a probability of $1 - \epsilon$. The plot shows the average cumulative rewards for each arm over time. The agent learns to prefer the arm with the highest true mean reward as it explores and exploits the arms.
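For contrast with epsilon-greedy, here is a minimal sketch of Thompson Sampling for Bernoulli-reward arms (e.g., conversions), one of the alternative bandit algorithms mentioned above. The conversion rates, number of steps, and seed are made-up values for illustration.
np.random.seed(0)
true_rates = [0.10, 0.12, 0.11]            # hypothetical true conversion rates per arm
alpha_params = np.ones(len(true_rates))    # Beta posterior parameters (successes + 1)
beta_params = np.ones(len(true_rates))     # Beta posterior parameters (failures + 1)
for step in range(5000):
    # sample a plausible conversion rate for each arm from its Beta posterior
    sampled_rates = np.random.beta(alpha_params, beta_params)
    arm = int(np.argmax(sampled_rates))
    # pull the chosen arm and observe a Bernoulli reward
    reward = np.random.rand() < true_rates[arm]
    # update the posterior for the chosen arm
    alpha_params[arm] += reward
    beta_params[arm] += 1 - reward
print('Posterior mean estimates:', np.round(alpha_params / (alpha_params + beta_params), 3))
print('Pulls per arm:', (alpha_params + beta_params - 2).astype(int))
Thompson Sampling allocates more pulls to the arm whose posterior suggests the highest reward, while still occasionally exploring the others.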