# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
from scipy.stats import gamma
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from mplsoccer import FontManager
from adjustText import adjust_text
# Register the Source Sans Pro font files with matplotlib's font manager
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular',
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold',
)
# Put the custom fonts at the front of the list so they take priority
for position, entry in enumerate((fe_regular, fe_semibold)):
    fm.fontManager.ttflist.insert(position, entry)
# Default all text rendering to the regular weight
matplotlib.rcParams['font.family'] = fe_regular.name
# Season file suffixes, oldest to newest
seasons = ["2021", "2122", "2223", "2324", "2425"]

# Event-level expected-pass values, positional clusters and team info
xP = pd.read_csv("xPactions.csv", index_col=0)
positions = pd.read_csv("clustered_position.csv", index_col=0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col=0)

# Load and stack the per-season player/game/action tables into one frame each
players = pd.concat([pd.read_csv(f"players{s}.csv", index_col=0) for s in seasons])
games = pd.concat([pd.read_csv(f"games{s}.csv", index_col=0) for s in seasons])
actions = pd.concat([pd.read_csv(f"actions{s}.csv", index_col=0) for s in seasons])

# Rebuild a globally unique action_id, since per-season ids collide after concat
actions = actions.drop(columns=['action_id']).reset_index(drop=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)

# Attach human-readable SPADL names (type_name, result_name, ...) to each event
actions = spadl.add_names(actions)
def convert_to_minutes(time_str):
    """Convert a 'MM:SS' string into a float number of minutes.

    Seconds become a fractional part of a minute. Values that cannot be
    parsed (NaN, malformed strings) are treated as 0 minutes played.
    """
    try:
        # str() guards against non-string inputs such as float NaN
        minutes_part, seconds_part = str(time_str).split(':')
        return int(minutes_part) + int(seconds_part) / 60
    except (ValueError, AttributeError):
        # NaN or unexpected format: count as zero minutes
        return 0
# Derive minutes played as a float for every player-season row
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)

# Keep only the game columns needed for the join below
games = games[["game_id", "game_date", "competition_id", "season_id"]]

# Build the analysis frame: actions enriched with team, xP, game and player info
df = (
    actions
    .merge(fb, how="left")
    .merge(xP, how="left")
    .merge(games, how="left")
    .merge(players, how="left")
)

# Keep only pass events. Take an explicit copy so later column
# assignments write to an independent frame instead of a slice of `df`
# (avoids pandas' SettingWithCopyWarning).
df1 = df[df["type_name"] == 'pass'].copy()
# Minimum pass volume for a player to enter the analysis
threshold = 2000

# Flag successful passes. Vectorized comparison replaces the per-row
# lambda, and `assign` builds a new frame rather than writing into a
# slice, which avoids pandas' SettingWithCopyWarning.
df1 = df1.assign(successful_passes=(df1['result_name'] == "success").astype(int))

# Aggregate to player level: successful passes, total xP, and pass count
weighted_agg = df1.groupby(["player_name", "player_id"], observed=True).agg({
    'successful_passes': 'sum',
    'xP': 'sum',
    'type_name': 'count'  # counts the number of pass events
}).reset_index()

# Rename the count column for clarity
weighted_agg.rename(columns={'type_name': 'passes'}, inplace=True)

# Clean types and compute the raw successful-passes-per-xP ratio
weighted_agg['successful_passes'] = weighted_agg['successful_passes'].fillna(0).astype(int)
weighted_agg['weighted_raw_ratio'] = weighted_agg['successful_passes'] / weighted_agg['xP']

# Keep players above the pass threshold with at least one success
B = weighted_agg[(weighted_agg['passes'] >= threshold) & (weighted_agg['successful_passes'] > 0)]
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66066/1299763970.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df1['successful_passes'] = df1['result_name'].apply(lambda x: 1 if x == "success" else 0)
# All player names present in the thresholded data
playerlist = B['player_name'].unique().tolist()
# Drop NaNs and sort alphabetically for the search dropdown
cleaned_playerlist = sorted(name for name in playerlist if pd.notna(name))
# simulate_gamma_posterior combines the prior distribution with each
# player's observed data (successful passes and xP) to produce a posterior
# distribution, accounting for uncertainty at smaller sample sizes.
def simulate_gamma_posterior(successes, trials, prior_shape, prior_rate, n_sims=10000, seed=42):
    """
    Sample the posterior distribution of a player's passing skill.

    Gamma-conjugate update: the gamma prior on the success/xP ratio is
    combined with the observed counts to give a gamma posterior, which
    is then sampled by Monte Carlo.

    Args:
        successes: Number of successful passes
        trials: Expected passes (xP)
        prior_shape: Shape parameter of prior gamma distribution
        prior_rate: Rate parameter of prior gamma distribution
        n_sims: Number of simulations
        seed: Seed for a local random generator

    Returns:
        dict: mean, sd and 95% credible bounds of the posterior sample
    """
    # Use a dedicated Generator rather than np.random.seed so this
    # function does not mutate NumPy's global random state as a side
    # effect (calls stay deterministic for a fixed seed).
    rng = np.random.default_rng(seed)
    posterior_shape = prior_shape + successes
    posterior_rate = prior_rate + trials
    # NumPy's gamma takes a scale parameter, hence 1/rate
    posterior_sample = rng.gamma(posterior_shape, 1 / posterior_rate, n_sims)
    return {
        'mean': np.mean(posterior_sample),
        'sd': np.std(posterior_sample),
        'lower_95': np.percentile(posterior_sample, 2.5),   # 2.5th percentile
        'upper_95': np.percentile(posterior_sample, 97.5)   # 97.5th percentile
    }
# Apply the Bayesian adjustment to each player's raw ratio.
# adj_ratio_mean is the best estimate of true passing skill,
# adj_ratio_sd our uncertainty in that estimate.
# In the visualization: x-position = adjusted ratio, point size = number
# of passes (more data = more reliable), error bars = 95% interval.
# A gamma distribution fitted to the observed raw ratios serves as the
# prior informing each player's posterior.
if not B.empty:
    # Fit the gamma prior to the raw ratios, location fixed at 0
    shape, loc, scale = gamma.fit(B['weighted_raw_ratio'], floc=0)
    prior_shape = shape
    prior_rate = 1 / scale
    print(f"Prior shape: {round(prior_shape, 2)}, Prior rate: {round(prior_rate, 2)}")
    # Work on an explicit copy: B is a filtered slice of weighted_agg, and
    # assigning columns into it raises SettingWithCopyWarning otherwise
    B = B.copy()
    # Posterior summary per player, using successful passes and xP
    B['adj_ratio'] = B.apply(
        lambda row: simulate_gamma_posterior(row['successful_passes'], row['xP'], prior_shape, prior_rate),
        axis=1
    )
    # Expand the dict column into mean/sd/lower_95/upper_95 columns
    adj_ratio_df = B['adj_ratio'].apply(pd.Series)
    B = pd.concat([B, adj_ratio_df], axis=1)
    # Rename columns for clarity
    B = B.rename(columns={'mean': 'adj_ratio_mean', 'sd': 'adj_ratio_sd'})

# Sort players by adjusted ratio, best first
C = B.sort_values(by='adj_ratio_mean', ascending=False)
Prior shape: 373.9, Prior rate: 372.67
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66066/4224291177.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy B['adj_ratio'] = B.apply(
from IPython.display import display, HTML

# One <option> element per player for the searchable datalist
options_html = ''.join(
    f'<option value="{name}">{name}</option>' for name in cleaned_playerlist
)

# Text input backed by a datalist: typing filters the names, and the
# chosen one is echoed in the paragraph below the box
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
{options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
var input = document.getElementById("dropdown").value;
var output = document.getElementById("output");
output.innerHTML = "Selected: " + input;
}}
</script>
"""

# Render the widget in the notebook
display(HTML(dropdown_html))
# Inspect the distinct raw position labels before grouping them
positions['position'].unique()
array(['RCB', nan, 'SS', 'ST', 'GK', 'LW', 'LWB', 'LCB', 'AM', 'RWB',
'RW', 'SV', 'CB', 'AMR', 'DM', 'CM', 'AML'], dtype=object)
# Raw position label -> coarse position group lookup
_POSITION_GROUPS = {
    'RCB': 'CB', 'LCB': 'CB', 'CB': 'CB',
    'RWB': 'WB', 'LWB': 'WB',
    'CM': 'CDM', 'DM': 'CDM', 'SV': 'CDM',
    'LW': 'AMW', 'AM': 'AMW', 'RW': 'AMW', 'AMR': 'AMW', 'AML': 'AMW',
    'SS': 'ST', 'ST': 'ST',
    'GK': 'GK',
}

def map_position_group(pos):
    """Return the coarse group for a raw position label, 'Other' if unknown."""
    # NaN / unseen labels fall through to the 'Other' default
    return _POSITION_GROUPS.get(pos, 'Other')
# Attach the coarse grouping to every row, then sanity-check the groups
positions['position_group'] = positions['position'].map(map_position_group)
positions.position_group.unique()
array(['CB', 'Other', 'ST', 'GK', 'AMW', 'WB', 'CDM'], dtype=object)
# For every player, collect the unique position groups covered across
# their seasons into a single comma-separated 'position' column
pl_positions = (
    positions
    .groupby(["player_name", "player_id"], observed=True)['position_group']
    .unique()
    .reset_index(name='position')
)
Cc = C.merge(pl_positions)
Cc['position'] = Cc['position'].apply(', '.join)

# Position group under analysis
PS = 'ST'

# Optional explicit shortlist of players (not applied in the filter below)
player_names = ['Alessandro Bastoni', 'Nathan Aké', 'Iñigo Martínez', 'Bremer', 'Marquinhos', 'Jurriën Timber', 'Lisandro Martínez']

# Top seven players whose only recorded position group matches PS
Cd = Cc[Cc['position'] == PS].head(7)
Cd
| player_name | player_id | successful_passes | xP | passes | weighted_raw_ratio | adj_ratio | adj_ratio_mean | adj_ratio_sd | lower_95 | upper_95 | position | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 160 | Mateo Cassierra | 322162.0 | 1936 | 1788.492798 | 2345 | 1.082476 | {'mean': 1.069254420815771, 'sd': 0.0220609701... | 1.069254 | 0.022061 | 1.025946 | 1.112597 | ST |
| 778 | Karim Benzema | 14296.0 | 3990 | 3813.206210 | 4666 | 1.046364 | {'mean': 1.0428412539843843, 'sd': 0.015658889... | 1.042841 | 0.015659 | 1.012019 | 1.073558 | ST |
| 915 | Jonathan David | 383855.0 | 3077 | 2950.237217 | 3781 | 1.042967 | {'mean': 1.0388671678709405, 'sd': 0.017541573... | 1.038867 | 0.017542 | 1.004371 | 1.073307 | ST |
| 965 | Abel Ruiz | 372828.0 | 1810 | 1733.096621 | 2230 | 1.044373 | {'mean': 1.0375351928937395, 'sd': 0.022015321... | 1.037535 | 0.022015 | 0.994329 | 1.080800 | ST |
| 1078 | Ludovic Ajorque | 234364.0 | 2537 | 2440.691336 | 3583 | 1.039460 | {'mean': 1.0350480327482996, 'sd': 0.019029059... | 1.035048 | 0.019029 | 0.997655 | 1.072435 | ST |
| 1160 | Ollie Watkins | 148503.0 | 2323 | 2239.159097 | 3128 | 1.037443 | {'mean': 1.0329634546765005, 'sd': 0.019729725... | 1.032963 | 0.019730 | 0.994208 | 1.071740 | ST |
| 1263 | Serhou Guirassy | 236506.0 | 2221 | 2146.539201 | 2820 | 1.034689 | {'mean': 1.0304441305418732, 'sd': 0.020064651... | 1.030444 | 0.020065 | 0.991038 | 1.069886 | ST |
# Assuming C is the DataFrame with adjusted ratios and player_name
# Filter the top 10 players by adjusted goal-scoring ratio mean
# NOTE(review): Cd was already cut to 7 rows above, so head(20) is a no-op safeguard here
D = Cd.head(20)
# Create the scatter plot with error bars showing 95% confidence interval
fig = plt.figure(figsize = (10, 6), facecolor = '#D7D1CF')
ax = plt.subplot(111, facecolor = '#D7D1CF')
# Remove the bottom spine of the x-axis
ax.spines["top"].set(visible = False)
ax.spines["right"].set(visible = False)
ax.spines["bottom"].set(visible = False)
ax.spines["left"].set_color('#ACA7A5')
# Grid customization (both axis grids are switched off just below)
ax.grid(which='major', linestyle='-', linewidth='.8', color='#ACA7A5', zorder = -1)
ax.xaxis.grid(False)
ax.yaxis.grid(False)
# Add labels and title
ax.xaxis.set_label_text("Adjusted Successful Passes/xP Ratio", size = 10, color = "#000000")
ax.yaxis.set_label_text('', size = 7, color = "#4E616C")
ax.xaxis.set_label_coords(0.8, -0.07)
# Invert the y-axis so the best-ranked player appears at the top
ax.invert_yaxis()
ax.set_xlim(.5, D.upper_95.max() + 0.01)
ax.set_ylim( (len(D) - 1) + 0.5, -.5)
ax.xaxis.set_major_locator(ticker.MultipleLocator(.5))
ax.tick_params(axis = 'both', labelsize = 12, color = '#D7D1CF', labelcolor = '#000000')
# Dashed reference line at 1.0 = performing exactly as expected by xP
plt.axvline(x = 1.0, color = '#1976D2', linestyle = '--', linewidth = 2, zorder = 1, alpha=0.5)
# Create the scatter plot
points = plt.scatter(
    D['adj_ratio_mean'],
    D['player_name'], # Use the player names directly for y-values
    s = D['passes'] * 0.15, # Adjust size multiplier as needed
    color = '#1565C0',
    edgecolor = '#FFFFFF', linewidths=0.5,
    zorder = 3
)
# Add error bars for 95% confidence interval (2 standard deviations) - use #D32F2F color for specific players
for index, row in D.iterrows():
    ax.errorbar(
        row['adj_ratio_mean'],
        row['player_name'],
        # asymmetric extents taken from the posterior 2.5/97.5 percentiles
        xerr=[[row['adj_ratio_mean'] - row['lower_95']], [row['upper_95'] - row['adj_ratio_mean']]],
        color='#1565C0', capsize = 5, capthick = 2)
# Annotate the scatter points with the number of shots
for index, row in D.iterrows():
    plt.text(
        row['adj_ratio_mean'],
        row['player_name'],
        str(int(row['passes'])), # The number of shots
        color = '#FFFFFF',
        fontsize = row['passes'] * 0.002, # Adjust the text size proportional to the scatter size
        ha = 'center',
        va = 'center',
        zorder = 4
    )
# Add titles and annotations
ax.text(
    x = .4, y = -1.1,
    s = "Selected players passing ability",
    color = "#000000",
    size = 20, fontfamily='SourceSansPro-SemiBold')
ax.text(
    x = .4, y = -0.8,
    s = f"From 20/21 to 24/25 seasons | {PS}s",
    color = "#000000",
    size = 12)
# Methodology note below the chart
ax.text(
    x = .4, y = 7.5,
    s = f"Players sorted by descending Adjusted Successful Passes/xP ratio (minimum {threshold} passes).\nAn Adjusted Ratio of 1.0 indicates a player is performing as expected in terms of passing efficacy.\nRatios above 1.0 indicate overperformance, while ratios below 1.0 indicate underperformance.",
    color = "#000000",
    size = 9)
# Author / social credit line
ax.text(
    x = .4, y = 8,
    s = "Twitter : @gualanodavide | Bluesky : @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com",
    color = "#000000",
    size = 9)
# Export at print resolution, trimming surplus whitespace
plt.savefig(f"PassingOverperformance{PS}.png", dpi=300, bbox_inches = "tight")
Explanation¶
Prior Distribution: A gamma distribution is fitted to the weighted raw ratios of players with a substantial number of passes (here, a minimum of 2000 passes) to establish a prior distribution. This helps to inform the posterior distribution.
Posterior Distribution: The adjusted Successful Passes/xP ratio is derived from the posterior distribution, which combines the prior distribution with the player’s observed data (number of successful passes and expected passes). This Bayesian approach helps to mitigate the effect of small sample sizes and extreme values.
Adjusted Successful Passes/xP Ratio: This is the result of the Bayesian approach, where the posterior distribution of the player’s successful passes per expected passes (Successful Passes/xP) is estimated using a gamma distribution.
Scatter Plot Points: The size of each scatter plot point represents the number of passes done by the player. Larger points indicate more passes, which typically means more reliable estimates of their passing performance. The position of each point on the x-axis represents the player’s adjusted Successful Passes/xP ratio, which is an estimate of their passing ability after adjusting for variability.
Error Bars: The error bars around each point represent the range of uncertainty in the player's adjusted Successful Passes/xP ratio. They are derived from the standard deviation of the posterior distribution. These bars indicate the confidence interval for the player's true passing ability, with wider bars indicating more uncertainty and narrower bars indicating more precision. They are all set to represent a 95% confidence interval.