# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
from scipy.stats import gamma
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from mplsoccer import FontManager
from adjustText import adjust_text

# Describe the two Source Sans Pro faces used by the charts
fe_regular = fm.FontEntry(
    name='SourceSansPro-Regular',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
)
fe_semibold = fm.FontEntry(
    name='SourceSansPro-SemiBold',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
)

# Prepend both entries to matplotlib's font list (regular at index 0, semibold at index 1)
fm.fontManager.ttflist[:0] = [fe_regular, fe_semibold]

# Make the regular face the default font family
matplotlib.rcParams['font.family'] = fe_regular.name

# Load datasets from CSV files; in every file the first column is used as the index
# NOTE(review): presumably xG holds per-action xG values, positions the clustered
# player positions and fb team information - confirm against the files
xG = pd.read_csv("xGactions.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col=0)

# One players file per season (suffix 2021 ... 2425)
players0 = pd.read_csv("players2021.csv", index_col = 0)
players1 = pd.read_csv("players2122.csv", index_col = 0)
players2 = pd.read_csv("players2223.csv", index_col = 0)
players3 = pd.read_csv("players2324.csv", index_col = 0)
players4 = pd.read_csv("players2425.csv", index_col = 0)

# One games file per season
games0 = pd.read_csv("games2021.csv", index_col = 0)
games1 = pd.read_csv("games2122.csv", index_col = 0)
games2 = pd.read_csv("games2223.csv", index_col = 0)
games3 = pd.read_csv("games2324.csv", index_col = 0)
games4 = pd.read_csv("games2425.csv", index_col = 0)

# One actions file per season
actions0 = pd.read_csv("actions2021.csv", index_col = 0)
actions1 = pd.read_csv("actions2122.csv", index_col = 0)
actions2 = pd.read_csv("actions2223.csv", index_col = 0)
actions3 = pd.read_csv("actions2324.csv", index_col = 0)
actions4 = pd.read_csv("actions2425.csv", index_col = 0)

# Stack the five seasons into one DataFrame per entity.
# The original row indices are kept, so index values can repeat across seasons.
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4])

# The per-season action ids repeat after the concat, so replace them with a
# fresh 0..n-1 counter stored as the first column
actions = actions.drop(columns=['action_id']).reset_index(drop=True)
actions.insert(0, 'action_id', actions.index)

# Let socceraction attach its name columns to the actions
actions = spadl.add_names(actions)

# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    """Turn a ``"MM:SS"`` value into fractional minutes.

    The value is stringified first, so NaN and other non-string inputs are
    accepted. Anything that is not exactly two colon-separated integers
    (NaN, empty text, ``"H:MM:SS"``, a bare number, ...) yields ``0``.
    """
    pieces = str(time_str).split(':')
    # Exactly one colon is required: minutes on the left, seconds on the right
    if len(pieces) != 2:
        return 0
    try:
        whole_minutes = int(pieces[0])
        extra_seconds = int(pieces[1])
    except ValueError:
        # Non-numeric component (e.g. "nan" or a malformed entry)
        return 0
    # Seconds contribute as a fraction of a minute
    return whole_minutes + extra_seconds / 60

# Add a numeric minutes column derived from the 'minutes_played' values
players['minutes_played_converted'] = players['minutes_played'].map(convert_to_minutes)

# Keep only the game columns needed downstream
games = games.loc[:, ["game_id", "game_date", "competition_id", "season_id"]]

# Build the working frame: left-join team, xG, game and player information
# onto the actions, one table at a time. No keys are given, so each merge
# joins on the column names the two frames share.
df = actions.merge(fb, how="left")
df = df.merge(xG, how="left")
df = df.merge(games, how="left")
df = df.merge(players, how="left")

# Keep only the actions whose type is 'shot'.
# .copy() makes df1 an independent frame, so adding a column below does not
# write to a slice of df (this avoids pandas' SettingWithCopyWarning).
df1 = df[df["type_name"] == 'shot'].copy()

# Minimum number of shots a player needs to be included in the analysis
threshold = 250

# Flag the shots that ended in a goal. Only an explicit 'success' result counts;
# missing or other non-'fail' results (e.g. offside, owngoal) are not goals.
df1['goals'] = (df1['result_name'] == 'success').astype(int)

# Aggregate goals, xG and shot count at the player level
weighted_agg = (
    df1.groupby(["player_name", "player_id"], observed=True)
    .agg(
        goals=('goals', 'sum'),
        xG=('xG', 'sum'),
        shots=('type_name', 'count'),  # number of shots taken
    )
    .reset_index()
)

# Clean the goal counts and compute the raw goals/xG ratio.
# The 'weighted_' prefix is kept only because later cells refer to these names;
# no weighting is applied here.
weighted_agg['goals'] = weighted_agg['goals'].fillna(0).astype(int)
weighted_agg['weighted_raw_ratio'] = weighted_agg['goals'] / weighted_agg['xG']

# Keep players with enough shots, at least one goal and a positive xG total
# (zero xG would give an infinite ratio, which the gamma fit cannot handle).
# .copy() again so later column assignments on B are safe.
B = weighted_agg[
    (weighted_agg['shots'] >= threshold)
    & (weighted_agg['goals'] > 0)
    & (weighted_agg['xG'] > 0)
].copy()

/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66395/292290907.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['goals'] = df1['result_name'].apply(lambda x: 1 if x != "fail" else 0)

# Names of all qualifying players, then the same names without missing
# values and in alphabetical order
playerlist = B['player_name'].unique().tolist()
cleaned_playerlist = sorted(name for name in playerlist if pd.notna(name))

# simulate_gamma_posterior combines the prior distribution with each
# player's observed data (goals and xG) to produce a posterior distribution.
# This accounts for uncertainty in performance estimates, especially for smaller sample sizes
def simulate_gamma_posterior(successes, trials, prior_shape, prior_rate, n_sims=10000, seed=42):
    """
    Summarise the gamma posterior of a goals/xG ratio by simulation.

    The posterior has shape ``prior_shape + successes`` and rate
    ``prior_rate + trials``. ``n_sims`` values are drawn from it and
    summarised.

    Args:
        successes: Observed goals
        trials: Expected goals (xG) accumulated over the same shots
        prior_shape: Shape parameter of prior gamma distribution
        prior_rate: Rate parameter of prior gamma distribution
        n_sims: Number of simulated draws
        seed: Random seed; the same seed always gives the same draws

    Returns:
        dict: ``'mean'`` and ``'sd'`` of the simulated posterior, plus
        ``'lower_95'`` / ``'upper_95'`` (its 2.5th and 97.5th percentiles)
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.gamma, but leaves the global RNG state untouched.
    rng = np.random.RandomState(seed)
    posterior_shape = prior_shape + successes
    posterior_rate = prior_rate + trials
    # NumPy parameterises the gamma by scale, i.e. 1 / rate
    posterior_sample = rng.gamma(posterior_shape, 1 / posterior_rate, n_sims)
    return {
        'mean': np.mean(posterior_sample),
        'sd': np.std(posterior_sample),
        'lower_95': np.percentile(posterior_sample, 2.5),  # 2.5th percentile
        'upper_95': np.percentile(posterior_sample, 97.5)  # 97.5th percentile
    }

# Apply the Bayesian adjustment to each player's raw ratio
# The resulting adj_ratio_mean represents our best estimate of their true shooting skill
# adj_ratio_sd represents our uncertainty in that estimate

# In our visualization, we use:
# - Position on x-axis: Player's adjusted ratio (best estimate of true skill)
# - Size of points: Number of shots (more data = more reliable estimates)
# - Error bars: 95% interval (2.5th to 97.5th percentile of the simulated posterior)

# Without qualifying players there is nothing to fit, and every later step
# needs C, so stop here with a clear message instead of leaving C undefined.
if B.empty:
    raise ValueError(
        "B is empty: no player passed the shots/goals filter, "
        "so the gamma prior cannot be fitted."
    )

# Fit a gamma distribution (location fixed at 0) to the observed raw goals/xG
# ratios to establish the prior distribution
shape, loc, scale = gamma.fit(B['weighted_raw_ratio'], floc=0)
prior_shape = shape
prior_rate = 1 / scale  # scipy returns a scale; the posterior update uses a rate

print(f"Prior shape: {round(prior_shape, 2)}, Prior rate: {round(prior_rate, 2)}")

# Posterior summary (a dict) for every player, from their goals and xG.
# assign() returns a new frame, so nothing is written into a slice.
B = B.assign(
    adj_ratio=B.apply(
        lambda row: simulate_gamma_posterior(row['goals'], row['xG'], prior_shape, prior_rate),
        axis=1
    )
)

# Unnest the adj_ratio dicts into one column per key, aligned on B's index
adj_ratio_df = pd.DataFrame(B['adj_ratio'].tolist(), index=B.index)
B = pd.concat([B, adj_ratio_df], axis=1)

# Rename columns for clarity
B = B.rename(columns={'mean': 'adj_ratio_mean', 'sd': 'adj_ratio_sd'})

# Sort the DataFrame by adjusted ratio mean, best first
C = B.sort_values(by='adj_ratio_mean', ascending=False)

Prior shape: 31.45, Prior rate: 29.64

/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66395/1218299227.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  B['adj_ratio'] = B.apply(

import html

from IPython.display import display, HTML

# Generate the HTML dropdown to easily search for players.
# Names come from the data, so escape them before placing them in markup:
# a quote or angle bracket in a name would otherwise break the option tags.
options_html = ''.join(
    f'<option value="{html.escape(str(name))}">{html.escape(str(name))}</option>'
    for name in cleaned_playerlist
)

# The typed value is echoed with textContent so it is shown as plain text
# rather than being parsed as HTML.
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
    {options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
    var input = document.getElementById("dropdown").value;
    var output = document.getElementById("output");
    output.textContent = "Selected: " + input;
}}
</script>
"""

# Display the dropdown
display(HTML(dropdown_html))

# List the distinct position labels so they can be grouped below
positions['position'].unique()

array(['RCB', nan, 'SS', 'ST', 'GK', 'LW', 'LWB', 'LCB', 'AM', 'RWB',
       'RW', 'SV', 'CB', 'AMR', 'DM', 'CM', 'AML'], dtype=object)

# Define the position group function
def map_position_group(pos):
    """Return the broad position group for a detailed position label.

    Labels that belong to no group (including missing values) map to
    ``'Other'``.
    """
    # (group, member labels) pairs, checked in order
    groups = (
        ('CB', ('RCB', 'LCB', 'CB')),
        ('WB', ('RWB', 'LWB')),
        ('CDM', ('CM', 'DM', 'SV')),
        ('AMW', ('LW', 'AM', 'RW', 'AMR', 'AML')),
        ('ST', ('SS', 'ST')),
        ('GK', ('GK',)),
    )
    for group, members in groups:
        if pos in members:
            return group
    return 'Other'

# Derive each row's position group from its detailed position label
positions['position_group'] = positions['position'].map(map_position_group)
positions['position_group'].unique()

array(['CB', 'Other', 'ST', 'GK', 'AMW', 'WB', 'CDM'], dtype=object)

# Collect, per player, the distinct position groups found in the positions data
pl_positions = (
    positions
    .groupby(["player_name", "player_id"], observed=True)['position_group']
    .unique()
    .reset_index(name='position')
)

# Attach those groups to the ranked players and flatten them into one
# comma-separated string per player
Cc = C.merge(pl_positions)
Cc['position'] = Cc['position'].map(', '.join)

# Position group to explore
PS = 'AMW'

# Optional hand-picked list of players to look at instead
player_names = ['Rafael Leão', 'Vinícius Júnior', 'Kylian Mbappé', 'Khvicha Kvaratskhelia', 'Federico Chiesa', 'Luis Díaz', 'Cody Gakpo']

# Keep the first 7 players whose position string is exactly PS
# (players listed under several groups do not match)
Cd = Cc.loc[Cc['position'] == PS].head(7)

Cd

# Cd holds the position-filtered players with their adjusted ratios
# (presumably still in descending adj_ratio_mean order - confirm).
# Keep at most the first 20 rows for plotting.
D = Cd.head(20)

# Create the figure and axes for the scatter plot with interval error bars
fig = plt.figure(figsize = (10, 6), facecolor = '#D7D1CF')
ax = plt.subplot(111, facecolor = '#D7D1CF')

# Hide the top, right and bottom spines; recolor the left one
ax.spines["top"].set(visible = False)
ax.spines["right"].set(visible = False)
ax.spines["bottom"].set(visible = False)
ax.spines["left"].set_color('#ACA7A5')

# Grid customization: the major grid is styled, then switched off on both axes
ax.grid(which='major', linestyle='-', linewidth='.8', color='#ACA7A5', zorder = -1)
ax.xaxis.grid(False)
ax.yaxis.grid(False)

# Axis labels (the y-axis label is left empty)
ax.xaxis.set_label_text("Adjusted goals/xG Ratio", size = 10, color = "#000000")
ax.yaxis.set_label_text('', size = 7, color = "#4E616C")
ax.xaxis.set_label_coords(0.8, -0.07)

# NOTE(review): invert_yaxis looks redundant - the set_ylim call below already
# passes its limits in descending order; confirm before removing
ax.invert_yaxis()
# x-axis runs from 0.5 to just past the largest upper interval bound
ax.set_xlim(.5, D.upper_95.max() + 0.01)
# One y slot per row of D, with half a slot of padding at each end
ax.set_ylim( (len(D) - 1) + 0.5, -.5)
ax.xaxis.set_major_locator(ticker.MultipleLocator(.5))
ax.tick_params(axis = 'both', labelsize = 12, color = '#D7D1CF', labelcolor = '#000000')

# Dashed reference line at a ratio of 1.0 (goals equal to xG)
plt.axvline(x = 1.0, color = '#1976D2', linestyle = '--', linewidth = 2, zorder = 1, alpha=0.5)

# Create the scatter plot: one point per player, sized by shot count
points = plt.scatter(
    D['adj_ratio_mean'], 
    D['player_name'],  # Use the player names directly for y-values
    s = D['shots'] * 2.8,  # Adjust size multiplier as needed
    color = '#1565C0', 
    edgecolor = '#FFFFFF', linewidths=0.5, 
    zorder = 3
)

# Add horizontal error bars from lower_95 to upper_95, i.e. the 2.5th-97.5th
# percentile interval of each player's simulated posterior (same color for all players)
for index, row in D.iterrows():
    ax.errorbar(
        row['adj_ratio_mean'],
        row['player_name'],
        # xerr takes distances from the point: [[to the left], [to the right]]
        xerr=[[row['adj_ratio_mean'] - row['lower_95']], [row['upper_95'] - row['adj_ratio_mean']]],
        color='#1565C0', capsize = 5, capthick = 2)

# Annotate the scatter points with the number of shots
for index, row in D.iterrows():
    plt.text(
        row['adj_ratio_mean'], 
        row['player_name'], 
        str(int(row['shots'])),  # The number of shots
        color = '#FFFFFF', 
        fontsize = row['shots'] * 0.032,  # Adjust the text size proportional to the scatter size
        ha = 'center', 
        va = 'center', 
        zorder = 4
    )
    
# Add titles and annotations.
# All ax.text positions are in data coordinates: x = .3 is left of the x-axis
# start (.5) and negative y values are above the first row.
ax.text(
    x = .3, y = -1.1,
    s = "Selected players shooting ability",
    color = "#000000",
    size = 20, fontfamily='SourceSansPro-SemiBold')
ax.text(
    x = .3, y = -0.8,
    s = f"From 20/21 to 24/25 seasons | {PS}s",
    color = "#000000",
    size = 12)

# NOTE(review): y = 7.5 and y = 8 sit just below the last row only when D has
# 7 rows (lower y limit 6.5); adjust them if the number of players changes
ax.text(
    x = .3, y = 7.5,
    s = f"Players sorted by descending Adjusted G/xG ratio (minimum {threshold} shots).\nAn Adjusted G/xG Ratio of 1.0 indicates a player is performing as expected in terms of goal-scoring efficacy.\nRatios above 1.0 indicate overperformance, while ratios below 1.0 indicate underperformance.",
    color = "#000000",
    size = 9)
ax.text(
    x = .3, y = 8,
    s = "Twitter : @gualanodavide | Bluesky : @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com",
    color = "#000000",
    size = 9)

# Save the figure, named after the selected position group
plt.savefig(f"ScoringOverperformance{PS}.png", dpi=300, bbox_inches = "tight")

	player_name	player_id	goals	xG	shots	weighted_raw_ratio	adj_ratio	adj_ratio_mean	adj_ratio_sd	lower_95	upper_95	position
3	Morgan Whittaker	381127.0	38	24.550852	388	1.547808	{'mean': 1.2842619043781835, 'sd': 0.152658826...	1.284262	0.152659	0.999374	1.598106	AMW
5	Václav Cerny	291964.0	41	27.552025	281	1.488094	{'mean': 1.2693855410571513, 'sd': 0.147730624...	1.269386	0.147731	0.993323	1.572771	AMW
11	Harvey Barnes	331382.0	46	32.309478	306	1.423731	{'mean': 1.2527038365090641, 'sd': 0.141000897...	1.252704	0.141001	0.988673	1.541789	AMW
12	Son Heung-Min	91909.0	77	57.416811	413	1.341071	{'mean': 1.247887785739785, 'sd': 0.1187490902...	1.247888	0.118749	1.023458	1.489436	AMW
17	Igor Paixão	436553.0	35	25.138151	259	1.392306	{'mean': 1.2156417054953035, 'sd': 0.147693449...	1.215642	0.147693	0.940367	1.519563	AMW
20	Paulo Dybala	114863.0	38	27.820173	356	1.365915	{'mean': 1.2111961584938988, 'sd': 0.143973580...	1.211196	0.143974	0.942517	1.507185	AMW
32	Phil Foden	331254.0	68	53.827964	428	1.263284	{'mean': 1.19361728921748, 'sd': 0.11861535240...	1.193617	0.118615	0.969942	1.435342	AMW

Explanation