import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import scipy.stats as stats
data_path = "./data/"
raw_medal_data = pd.read_excel(f"{data_path}Medals.xlsx")
medal_data = raw_medal_data.copy()
medal_data.head()
|   | Rank | Team/NOC | Gold | Silver | Bronze | Total | Rank by Total |
|---|---|---|---|---|---|---|---|
| 0 | 1 | United States of America | 39 | 41 | 33 | 113 | 1 |
| 1 | 2 | People's Republic of China | 38 | 32 | 18 | 88 | 2 |
| 2 | 3 | Japan | 27 | 14 | 17 | 58 | 5 |
| 3 | 4 | Great Britain | 22 | 21 | 22 | 65 | 4 |
| 4 | 5 | ROC | 20 | 28 | 23 | 71 | 3 |
Estimates of location (Where is my data?)¶
Mean: the average value. $$ \text{mean} = \frac{\sum_{i=1}^n x_i}{n} $$
Weighted Mean:
$$ \text{weighted mean} = \frac{\sum_{i=1}^n w_i x_i}{\sum_{i=1}^n w_i} $$
Reasons to use a weighted mean:
- Some values are more variable than others, and highly variable observations are given less weight. For instance, if we are averaging salaries across multiple groups of employees, we might reduce the weight of the more variable data from the younger group, as in the sketch below.
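A minimal sketch of that idea; all the salary figures and the choice of weights below are invented purely for illustration:
# Hypothetical example: salaries from two employee groups; the more
# variable junior group is given half the weight of the senior group.
junior_salaries = np.array([42_000, 38_000, 55_000, 35_000])  # more variable
senior_salaries = np.array([72_000, 74_000, 71_000, 73_000])  # more stable
salaries = np.concatenate([junior_salaries, senior_salaries])
weights = np.concatenate([np.full(4, 0.5), np.full(4, 1.0)])  # down-weight juniors
print(f'Weighted salary average: {np.average(salaries, weights=weights)}')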
Trimmed mean:
The mean computed after dropping a fixed number of sorted values at each end. It helps eliminate the influence of outliers. $$ \text{trimmed mean} = \frac{\sum_{i=p+1}^{n-p} x_i}{n-2p} $$
# Mean
medal_data["Total"].mean()
11.612903225806452
print(f'The weighted average of total by gold medals: {np.average(medal_data["Total"], weights=medal_data["Gold"])}')
The weighted average of total by gold medals: 46.832352941176474
def trimmed_mean(data, pct):
    """Mean after dropping the lowest and highest `pct` fraction of sorted values."""
    if not 0 <= pct < 0.5:
        raise ValueError("trim percentage must be in [0, 0.5)")
    trim_count = int(len(data) * pct)
    sorted_data = np.sort(data)
    # slice explicitly so pct=0 keeps all the data (sorted_data[0:-0] would be empty)
    trimmed_data = sorted_data[trim_count: len(sorted_data) - trim_count]
    t_mean = trimmed_data.mean()
    return t_mean, trimmed_data
t_mean, t_data = trimmed_mean(medal_data["Total"], 0.1)
print(f'10% trimmed mean of total medals: {t_mean}')
10% trimmed mean of total medals: 6.8933333333333335
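SciPy ships a ready-made trimmed mean, so we can sanity-check the hand-rolled version (scipy.stats is already imported as stats; trim_mean cuts the given proportion from each tail):
# cross-check against SciPy's trimmed mean
print(f'scipy trim_mean (10%): {stats.trim_mean(medal_data["Total"], 0.1)}')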
# Plot the original and trimmed distributions
plt.figure(figsize=(10, 6))
# Original Distribution
plt.subplot(2, 1, 1)
sns.histplot(medal_data["Total"], bins=10, kde=True, color='skyblue')
plt.title('Original Distribution')
# Trimmed Distribution
plt.subplot(2, 1, 2)
sns.histplot(t_data, bins=10, kde=True, color='salmon')
plt.title('Trimmed Distribution')
plt.tight_layout()
plt.show()
Median¶
The middle number of the sorted data. If there is an even number of examples, it is the average of the two middle values. The median is robust to outliers.
print(f'Median on Total: {medal_data["Total"].median()}')
Median on Total: 4.0
Takeaways¶
- The basic metric for location is the mean, but it can be sensitive to extreme values (outliers).
- Other metrics (median, trimmed mean) are more robust.
Estimates of Variability (Dispersion - How spread out is my data?)¶
Deviation:
The difference between an observed value and the mean. Terms: errors, residuals. $$ \text{deviation} = x - \bar{x} $$
Variance:
The sum of squared deviations from the mean divided by $n-1$, where $n$ is the number of data examples. Term: mean squared error.
$$ s^2 = \frac{\sum_{i=1}^n (x_i - \bar{x})^2}{n-1} $$
Standard deviation:
The square root of the variance. Terms: L2-norm, Euclidean norm.
$$ s = \sqrt{s^2} $$
Mean absolute deviation:
The mean of the absolute deviations from the mean. Terms: L1-norm, Manhattan norm.
$$ \text{mean absolute deviation} = \frac{\sum_{i=1}^n |x_i - \bar{x}|}{n} $$
Median Absolute Deviation:
The median of the absolute deviations from the median. It is a robust estimate of variability.
$$ MAD = \text{Median}(|x_1 - m|, |x_2 - m|, \ldots, |x_n - m|) $$
Range:
The difference between the smallest and largest values in the data set.
$$ \text{Range} = \max(X) - \min(X) $$
Order statistics:
Metrics based on the data values sorted from smallest to largest. Term: rank.
IQR:
The difference between the 75th and 25th percentiles.
$$ IQR = Q3 - Q1 $$
medal_data.describe()
|   | Rank | Gold | Silver | Bronze | Total | Rank by Total |
|---|---|---|---|---|---|---|
| count | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 |
| mean | 46.333333 | 3.655914 | 3.634409 | 4.322581 | 11.612903 | 43.494624 |
| std | 26.219116 | 7.022471 | 6.626339 | 6.210372 | 19.091332 | 24.171769 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| 25% | 24.000000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 23.000000 |
| 50% | 46.000000 | 1.000000 | 1.000000 | 2.000000 | 4.000000 | 47.000000 |
| 75% | 70.000000 | 3.000000 | 4.000000 | 5.000000 | 11.000000 | 66.000000 |
| max | 86.000000 | 39.000000 | 41.000000 | 33.000000 | 113.000000 | 77.000000 |
# box plot
plt.figure(figsize=(6, 6))
sns.boxplot(y=medal_data["Gold"], color='skyblue')
# title
plt.title('Gold Medal Counts')
plt.show()
print(f'Gold Medal variance: {medal_data["Gold"].var()}, standard deviation: {medal_data["Gold"].std()}')
Gold Medal variance: 49.31510051425906, standard deviation: 7.022471111671379
mean_ab_dev = abs(medal_data['Gold'] - medal_data["Gold"].mean()).sum() / medal_data.shape[0]
median_ab_dev = abs(medal_data["Gold"] - medal_data["Gold"].median()).median()  # median of |x - m|; no pre-sorting needed
print(f'Mean absolute deviation: {mean_ab_dev}, median absolute deviation: {median_ab_dev}')
Mean absolute deviation: 4.0048560527228565, median absolute deviation: 1.0
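SciPy (version 1.5+) also provides a MAD helper; with scale=1 it matches the manual calculation above:
# cross-check with SciPy; scale=1 returns the raw (unscaled) MAD
print(f'scipy MAD: {stats.median_abs_deviation(medal_data["Gold"], scale=1)}')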
q1, q3 = medal_data["Total"].quantile([0.25, 0.75])
iqr = q3 - q1
print(f'Q1: {q1}, Q3: {q3}, IQR: {iqr}')
Q1: 2.0, Q3: 11.0, IQR: 9.0
Takeaways¶
- The most popular statistics of variability are the variance and the standard deviation.
- Both are sensitive to outliers.
- The mean absolute deviation, median absolute deviation, and percentile-based estimates (like the IQR) are more robust to outliers.
Distribution¶
- Boxplot: A plot with the top and bottom of the box at the 75th and 25th percentiles, respectively; it gives a quick sense of the distribution and is often used in side-by-side displays to compare distributions.
- Frequency table: A tally of the count of data values falling into a set of intervals (bins).
- Histogram: A plot of the frequency table, with frequency counts on the y-axis and the binned variable values on the x-axis; it gives a sense of the distribution of the data at a glance.
- Density plot: A smoothed version of a histogram; it requires a function (typically a kernel density estimate) to estimate the density from the data.
# frequency histogram
plt.hist(medal_data["Gold"], bins=5, color="skyblue", edgecolor="black")
plt.title("Frequency plot")
plt.xlabel("Gold")
plt.ylabel('Frequency')
plt.show()
sns.kdeplot(medal_data["Gold"],color="skyblue", fill=True)
plt.title("Density plot of Gold Medals")
plt.xlabel("Gold")
plt.ylabel('Density')
plt.show()
The distribution is significantly skewed to the right. We can use the following methods to bring it closer to a normal distribution (a sketch follows the list):
- Log Transformation - for positively skewed data
- Square root Transformation
- Box-Cox Transformation: requires all values to be greater than zero.
- Yeo-Johnson Transformation: an extension of Box-Cox that handles zero or negative values.
- Winsorizing: capping values at a specified percentile.
- Categorizing or Binning
- Removing Outliers
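A minimal sketch of these transforms, assuming a +1 shift where a method needs strictly positive input (Gold contains zeros):
# sketch: skew-reducing transforms applied to the Gold column
gold = medal_data["Gold"]
log_gold = np.log1p(gold)                 # log(1 + x) handles the zeros
sqrt_gold = np.sqrt(gold)                 # square-root transform
boxcox_gold, _ = stats.boxcox(gold + 1)   # Box-Cox needs strictly positive input
yj_gold, _ = stats.yeojohnson(gold)       # Yeo-Johnson tolerates zeros and negatives
winsor_gold = np.asarray(stats.mstats.winsorize(gold, limits=[0.05, 0.05]))  # cap at the 5th/95th percentiles
transforms = [("log1p", log_gold), ("sqrt", sqrt_gold), ("Box-Cox", boxcox_gold),
              ("Yeo-Johnson", yj_gold), ("Winsorized", winsor_gold)]
fig, axes = plt.subplots(1, 5, figsize=(18, 3))
for ax, (name, values) in zip(axes, transforms):
    sns.histplot(values, bins=10, ax=ax, color="skyblue")
    ax.set_title(name)
plt.tight_layout()
plt.show()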
Moments in statistics¶
In statistical theory, location and variability are referred to as the first and second moments of a distribution. The third and fourth moments are called skewness and kurtosis. Skewness refers to whether the data is skewed toward larger or smaller values, and kurtosis indicates the propensity of the data to have extreme values.
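We can read the third and fourth moments directly off pandas; note that pandas reports sample skewness and excess kurtosis, so a normal distribution scores near zero on both:
# third and fourth moments of the Total column
print(f'Skewness of Total: {medal_data["Total"].skew()}')
print(f'Kurtosis of Total: {medal_data["Total"].kurtosis()}')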
Exploring Binary or Categorical Data¶
- Mode: The most commonly occurring category or value.
- Expected value: The sum of values times their probability of occurrence, often used to sum up factor variable levels (see the sketch after this list).
- Bar chart: The frequency or proportion for each category plotted as bars.
- Pie chart: The frequency or proportion for each category plotted as slices of a pie.
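As a quick illustration, we can compute an expected value by treating the empirical frequencies of Gold counts as probabilities (by construction this equals the plain mean), and draw a pie chart of the overall medal-type shares:
# expected value: sum of values times their probability of occurrence
probs = medal_data["Gold"].value_counts(normalize=True)  # empirical probabilities
expected_gold = (probs.index.to_numpy() * probs.to_numpy()).sum()
print(f'Expected gold medals per team: {expected_gold}')
# pie chart of medal-type proportions across all teams
medal_totals = medal_data[["Gold", "Silver", "Bronze"]].sum()
plt.pie(medal_totals, labels=medal_totals.index, autopct='%1.1f%%')
plt.title('Share of medals by type')
plt.show()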
categorical_cols = medal_data.select_dtypes(include="object").columns.to_list()
print(f'Categorical features: {categorical_cols}')
Categorical features: ['Team/NOC']
print(f"Mode of dataset:\n {medal_data.loc[:, ['Rank', 'Gold', 'Silver', 'Bronze', 'Total','Rank by Total']].mode()}")
Mode of dataset:
   Rank  Gold  Silver  Bronze  Total  Rank by Total
0    86     0       1       1      1             77
# let's plot the number of gold, silver, and bronze medals by country.
# Create the grouped bar plot
fig = px.bar(medal_data, x='Team/NOC', y=['Gold', 'Silver', 'Bronze'],
labels={'value': 'Number of Medals', 'variable': 'Medal Type'},
title='Number of Medals by Country and Type',
barmode='group')
# Show the plot
fig.show()
Correlation¶
It examines the correlation among features, and between the feature (independent) and target (dependent) variables. It ranges from -1 to 1.
Pearson's correlation coefficient:
$$ r = \frac{\sum_{i=1}^N (x_i - \bar x)(y_i - \bar y)}{\sqrt{\sum_{i=1}^N (x_i - \bar x)^2 \sum_{i=1}^N (y_i - \bar y)^2}} = \frac{\sum_{i=1}^N (x_i - \bar x)(y_i - \bar y)}{(N-1)\, s_x s_y} $$
where $s_x$ and $s_y$ are the standard deviations of $x$ and $y$ (computed with $N-1$ degrees of freedom). Equivalently,
$$ r = \frac{\mathrm{Cov}(X, Y)}{\sigma_x \sigma_y} $$
Interpretation:
- r=1 indicates a perfect positive linear relationship.
- r=−1 indicates a perfect negative linear relationship.
- r=0 indicates no linear relationship.
Terms:
- Correlation coefficient: a numerical value that quantifies the association between two variables.
- Correlation matrix: an $M \times M$ matrix of the correlation coefficients between each pair of variables, where $M$ is the number of variables in the data.
- Scatterplot
- Pairplot
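Before computing the full matrix, we can check a single pair with scipy.stats.pearsonr, which also returns a p-value for the null hypothesis of zero correlation:
# Pearson correlation between Gold and Silver, with a significance test
r, p_value = stats.pearsonr(medal_data["Gold"], medal_data["Silver"])
print(f'Pearson r (Gold vs Silver): {r:.3f}, p-value: {p_value:.2e}')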
# compute correlation matrix
corr_mat = medal_data.drop(columns=["Team/NOC"], axis=1).corr()
# Create a heatmap of the correlation matrix
fig = go.Figure(data=go.Heatmap(z=corr_mat.values, x=corr_mat.columns, y=corr_mat.columns, colorscale='Viridis'))
fig.update_layout(title='Correlation Matrix Heatmap')
# Show the plot
fig.show()
from plotly.subplots import make_subplots
scatter_data = medal_data.drop(columns=['Team/NOC'], axis=1)
# make the scatter-plot grid
fig = make_subplots(rows=len(scatter_data.columns), cols=len(scatter_data.columns), subplot_titles=scatter_data.columns)
# fill the subplots
for i in range(len(scatter_data.columns)):
for j in range(len(scatter_data.columns)):
fig.add_trace(go.Scatter(x=scatter_data.iloc[:, i], y=scatter_data.iloc[:, j], mode='markers'), row=i + 1, col=j + 1)
if j == 0:
fig.update_yaxes(title_text=scatter_data.columns[i], row=i + 1, col=j + 1)
# update layout
fig.update_layout(title='Pair Plot', height=800, width=1800)
fig.show()
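For reference, Plotly Express can build the same kind of grid in one call (a sketch; by default px.scatter_matrix plots every column of the frame against every other):
# one-call pair plot with Plotly Express
fig = px.scatter_matrix(scatter_data, title='Pair Plot (scatter_matrix)')
fig.update_layout(height=800, width=1800)
fig.show()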
Exploring two or more variables (Bivariate or Multivariate Analysis)¶
- Hexagonal binning: A plot of two numeric variables with the records binned into hexagons. In hexagonal binning, the scatter plot area is divided into hexagonal bins, and the number of data points falling within each hexagon is represented by color intensity or height. This provides a more interpretable representation of the data distribution, especially when dealing with dense datasets.
- Contour plots: A plot showing the density of two numeric variables like a topographical map.
- Violin plots: A box plot with a density estimate.
sns.set_theme(style='darkgrid')
# label via the JointGrid so the labels land on the joint (hexbin) axes,
# not on whichever marginal axes happen to be current
g = sns.jointplot(x=scatter_data["Gold"], y=scatter_data['Silver'], kind='hex', color='c')
g.set_axis_labels("Gold", "Silver")
plt.show()
# contour plot
# sns.kdeplot(x=scatter_data["Gold"], y=scatter_data["Silver"])
# plt.xlabel("Gold")
# plt.ylabel('Silver')
# plt.show()
# contour plot
fig = px.density_contour(scatter_data, x="Gold", y="Silver", marginal_x="histogram", marginal_y="histogram")
fig.update_layout(width=800, height=400, title='Contour plot among Gold and Silver medals')
fig.show()
# let's plot violin plots
medal_data["Team/NOC"]
fig = px.violin(medal_data, x="Gold", box=True, points='all', hover_data=medal_data.columns)
fig.update_layout(width=800, height=400, title='Violin plot of Gold Medals')
fig.show()