# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
from scipy.stats import gamma
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from mplsoccer import FontManager
from adjustText import adjust_text
# Describe the two Source Sans Pro weights used throughout the charts
fe_regular = fm.FontEntry(
    name='SourceSansPro-Regular',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
)
fe_semibold = fm.FontEntry(
    name='SourceSansPro-SemiBold',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
)
# Place both entries at the head of matplotlib's font list (regular first, semibold second)
fm.fontManager.ttflist[:0] = [fe_regular, fe_semibold]
# Use the regular weight as the default font family for every figure
matplotlib.rcParams['font.family'] = fe_regular.name
# Load datasets from CSV files
def _read_indexed(csv_name):
    """Read a CSV file, using its first column as the DataFrame index."""
    return pd.read_csv(csv_name, index_col=0)


xG = _read_indexed("xGactions.csv")
positions = _read_indexed("clustered_position.csv")
fb = _read_indexed("teamsFOTMOB.csv")
players0 = _read_indexed("players2021.csv")
players1 = _read_indexed("players2122.csv")
players2 = _read_indexed("players2223.csv")
players3 = _read_indexed("players2324.csv")
players4 = _read_indexed("players2425.csv")
games0 = _read_indexed("games2021.csv")
games1 = _read_indexed("games2122.csv")
games2 = _read_indexed("games2223.csv")
games3 = _read_indexed("games2324.csv")
games4 = _read_indexed("games2425.csv")
actions0 = _read_indexed("actions2021.csv")
actions1 = _read_indexed("actions2122.csv")
actions2 = _read_indexed("actions2223.csv")
actions3 = _read_indexed("actions2324.csv")
actions4 = _read_indexed("actions2425.csv")
# Stack the per-season frames into one frame per entity
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4])
# The per-season action ids are discarded after the concat: rebuild a 0..n-1 row index
# and expose it as the new 'action_id' column
actions = actions.drop(columns=['action_id']).reset_index(drop=True)
actions = actions.reset_index().rename(columns={'index': 'action_id'})
# Let socceraction's SPADL helper add its name columns to the actions
actions = spadl.add_names(actions)
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    """Turn an ``"MM:SS"`` value into a float number of minutes.

    The value is stringified first, so non-string inputs (e.g. a float NaN)
    are accepted. Any value that does not split around ``:`` into exactly two
    integer fields yields ``0``.

    Args:
        time_str: Playing time, expected as ``"minutes:seconds"``.

    Returns:
        Minutes plus seconds expressed as a fraction of a minute, or ``0``
        when the value cannot be parsed.
    """
    try:
        fields = str(time_str).split(':')
        whole_minutes, extra_seconds = [int(field) for field in fields]
    except (ValueError, AttributeError):
        # Unparseable input (NaN, missing colon, non-numeric parts) counts as no minutes;
        # switch to `np.nan` here if such rows should be marked as missing instead
        return 0
    return whole_minutes + extra_seconds / 60
# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
# Keep only the game columns that are needed downstream
games = games[["game_id", "game_date", "competition_id", "season_id"]]
# Build the working frame. No join keys are given, so every left merge joins on
# whatever columns the two frames have in common.
df = (
    actions
    .merge(fb, how="left")
    .merge(xG, how="left")
    .merge(games, how="left")
    .merge(players, how="left")
)
# Keep only shots. The explicit copy makes df1 an independent frame, so adding the
# 'goals' column below no longer raises pandas' SettingWithCopyWarning.
df1 = df[df["type_name"] == 'shot'].copy()
# Minimum number of shots a player needs to be kept in the analysis
threshold = 250
# Flag goals. A scored shot is expected to carry result_name 'success' (SPADL convention);
# testing for 'success' instead of "not 'fail'" keeps missing or unexpected result labels
# from being counted as goals.
df1['goals'] = (df1['result_name'] == 'success').astype(int)
# Aggregate at player level: total goals, total xG and number of shots
weighted_agg = (
    df1.groupby(["player_name", "player_id"], observed=True)
    .agg(
        goals=('goals', 'sum'),
        xG=('xG', 'sum'),
        shots=('type_name', 'count'),  # one row per shot, so the count is the shot total
    )
    .reset_index()
)
# Clean up types and compute the raw goals/xG ratio
weighted_agg['goals'] = weighted_agg['goals'].fillna(0).astype(int)
weighted_agg['weighted_raw_ratio'] = weighted_agg['goals'] / weighted_agg['xG']
# Keep players above the shot threshold with at least one goal. Copy so that later
# column assignments on B do not write into a slice of weighted_agg.
B = weighted_agg[(weighted_agg['shots'] >= threshold) & (weighted_agg['goals'] > 0)].copy()
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66395/292290907.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df1['goals'] = df1['result_name'].apply(lambda x: 1 if x != "fail" else 0)
# Names of every player kept in B, then the same names without missing values, sorted
playerlist = B['player_name'].unique().tolist()
cleaned_playerlist = sorted(name for name in playerlist if pd.notna(name))
# The simulate_gamma_posterior function combines the prior distribution with each
# player's observed data (goals and xG) to produce a posterior distribution
# This accounts for uncertainty in performance estimates, especially for smaller sample sizes
def simulate_gamma_posterior(successes, trials, prior_shape, prior_rate, n_sims=10000, seed=42):
    """
    Summarise a player's goals/xG posterior distribution by simulation.

    The gamma prior is updated with the observed data (shape + successes,
    rate + trials) and ``n_sims`` values are drawn from the resulting gamma
    distribution.

    Args:
        successes: Number of goals scored
        trials: Expected goals (xG)
        prior_shape: Shape parameter of prior gamma distribution
        prior_rate: Rate parameter of prior gamma distribution
        n_sims: Number of simulations
        seed: Random seed

    Returns:
        dict: 'mean' and 'sd' of the simulated posterior sample, plus its
        2.5th ('lower_95') and 97.5th ('upper_95') percentiles
    """
    # A private legacy generator yields the same draws as np.random.seed(seed) followed by
    # np.random.gamma(...), but leaves NumPy's global random state untouched for the caller.
    rng = np.random.RandomState(seed)
    posterior_shape = prior_shape + successes
    posterior_rate = prior_rate + trials
    # NumPy parameterises the gamma by scale, i.e. 1 / rate
    posterior_sample = rng.gamma(posterior_shape, 1 / posterior_rate, n_sims)
    return {
        'mean': np.mean(posterior_sample),
        'sd': np.std(posterior_sample),
        'lower_95': np.percentile(posterior_sample, 2.5),   # 2.5th percentile
        'upper_95': np.percentile(posterior_sample, 97.5),  # 97.5th percentile
    }
# Apply the Bayesian adjustment to each player's raw ratio
# The resulting adj_ratio_mean represents our best estimate of their true shooting skill
# adj_ratio_sd represents our uncertainty in that estimate
# In our visualization, we use:
# - Position on x-axis: Player's adjusted ratio (best estimate of true skill)
# - Size of points: Number of shots (more data = more reliable estimates)
# - Error bars: 95% interval (lower_95 to upper_95 of the posterior sample)
# Fit a gamma distribution to the observed raw ratios to establish a prior distribution
# This helps inform our estimates of players' true skill levels
if not B.empty:
    # Work on an independent copy so the column assignment below cannot hit a slice
    # of another frame (the cause of pandas' SettingWithCopyWarning)
    B = B.copy()
    # Fit the gamma distribution to the raw goals/xG ratio, with the location fixed at 0
    shape, loc, scale = gamma.fit(B['weighted_raw_ratio'], floc=0)
    prior_shape = shape
    prior_rate = 1 / scale  # scipy returns a scale; the posterior update uses a rate
    print(f"Prior shape: {round(prior_shape, 2)}, Prior rate: {round(prior_rate, 2)}")
    # Posterior summary (a dict) for each player from their goals and xG
    B['adj_ratio'] = B.apply(
        lambda row: simulate_gamma_posterior(row['goals'], row['xG'], prior_shape, prior_rate),
        axis=1
    )
    # Unnest the adj_ratio dicts into one column per key, built in a single pass
    adj_ratio_df = pd.DataFrame(B['adj_ratio'].tolist(), index=B.index)
    B = pd.concat([B, adj_ratio_df], axis=1)
    # Rename columns for clarity
    B = B.rename(columns={'mean': 'adj_ratio_mean', 'sd': 'adj_ratio_sd'})
# Sort the DataFrame by adjusted ratio mean
C = B.sort_values(by='adj_ratio_mean', ascending=False)
Prior shape: 31.45, Prior rate: 29.64
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66395/1218299227.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy B['adj_ratio'] = B.apply(
import html

from IPython.display import display, HTML
# Generate the HTML dropdown to easily search for players.
# Names are HTML-escaped so characters such as quotes, '&' or '<' inside a player name
# cannot break the attribute or the surrounding markup.
options_html = ''.join(
    f'<option value="{html.escape(str(name))}">{html.escape(str(name))}</option>'
    for name in cleaned_playerlist
)
# The typed value is echoed with textContent so it is shown as plain text, never parsed as HTML
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
{options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
var input = document.getElementById("dropdown").value;
var output = document.getElementById("output");
output.textContent = "Selected: " + input;
}}
</script>
"""
# Display the dropdown
display(HTML(dropdown_html))
# Inspect the distinct raw position labels before grouping them
positions['position'].unique()
array(['RCB', nan, 'SS', 'ST', 'GK', 'LW', 'LWB', 'LCB', 'AM', 'RWB', 'RW', 'SV', 'CB', 'AMR', 'DM', 'CM', 'AML'], dtype=object)
# Define the position group function
def map_position_group(pos):
    """Collapse a raw position label into its broader position group.

    Args:
        pos: Raw position label (e.g. 'RCB', 'AM', 'ST').

    Returns:
        One of 'CB', 'WB', 'CDM', 'AMW', 'ST', 'GK', or 'Other' for any
        label that belongs to none of the groups.
    """
    groups = (
        ('CB', ('RCB', 'LCB', 'CB')),
        ('WB', ('RWB', 'LWB')),
        ('CDM', ('CM', 'DM', 'SV')),
        ('AMW', ('LW', 'AM', 'RW', 'AMR', 'AML')),
        ('ST', ('SS', 'ST')),
        ('GK', ('GK',)),
    )
    # First group whose label set contains the position wins
    for group_name, labels in groups:
        if pos in labels:
            return group_name
    return 'Other'
# Attach each row's position group as a new column, then list the groups that occur
positions['position_group'] = positions['position'].map(map_position_group)
positions['position_group'].unique()
array(['CB', 'Other', 'ST', 'GK', 'AMW', 'WB', 'CDM'], dtype=object)
# Collect, for every player, the distinct position groups recorded for them
position_sets = positions.groupby(["player_name", "player_id"], observed=True)['position_group'].unique()
pl_positions = position_sets.reset_index(name='position')
# Attach the groups to the ranked players and flatten them into a comma-separated string
Cc = C.merge(pl_positions)
Cc['position'] = Cc['position'].map(', '.join)
# Position group to explore
PS = 'AMW'
# Optional hand-picked list of players
player_names = ['Rafael Leão', 'Vinícius Júnior', 'Kylian Mbappé', 'Khvicha Kvaratskhelia', 'Federico Chiesa', 'Luis Díaz', 'Cody Gakpo']
# Keep the first 7 rows whose position string is exactly the selected group
is_selected_group = Cc['position'] == PS
Cd = Cc.loc[is_selected_group].head(7)
Cd
player_name | player_id | goals | xG | shots | weighted_raw_ratio | adj_ratio | adj_ratio_mean | adj_ratio_sd | lower_95 | upper_95 | position | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | Morgan Whittaker | 381127.0 | 38 | 24.550852 | 388 | 1.547808 | {'mean': 1.2842619043781835, 'sd': 0.152658826... | 1.284262 | 0.152659 | 0.999374 | 1.598106 | AMW |
5 | Václav Cerny | 291964.0 | 41 | 27.552025 | 281 | 1.488094 | {'mean': 1.2693855410571513, 'sd': 0.147730624... | 1.269386 | 0.147731 | 0.993323 | 1.572771 | AMW |
11 | Harvey Barnes | 331382.0 | 46 | 32.309478 | 306 | 1.423731 | {'mean': 1.2527038365090641, 'sd': 0.141000897... | 1.252704 | 0.141001 | 0.988673 | 1.541789 | AMW |
12 | Son Heung-Min | 91909.0 | 77 | 57.416811 | 413 | 1.341071 | {'mean': 1.247887785739785, 'sd': 0.1187490902... | 1.247888 | 0.118749 | 1.023458 | 1.489436 | AMW |
17 | Igor Paixão | 436553.0 | 35 | 25.138151 | 259 | 1.392306 | {'mean': 1.2156417054953035, 'sd': 0.147693449... | 1.215642 | 0.147693 | 0.940367 | 1.519563 | AMW |
20 | Paulo Dybala | 114863.0 | 38 | 27.820173 | 356 | 1.365915 | {'mean': 1.2111961584938988, 'sd': 0.143973580... | 1.211196 | 0.143974 | 0.942517 | 1.507185 | AMW |
32 | Phil Foden | 331254.0 | 68 | 53.827964 | 428 | 1.263284 | {'mean': 1.19361728921748, 'sd': 0.11861535240... | 1.193617 | 0.118615 | 0.969942 | 1.435342 | AMW |
# Rows to plot: at most the first 20 rows of Cd
D = Cd.head(20)
n_rows = len(D)
# The y-axis runs from -0.5 (top) to n_rows - 0.5 (bottom), i.e. n_rows data units over a
# fixed axes height. The title/footer offsets were tuned for a 7-row chart, so they are
# scaled by n_rows / 7 to stay at the same physical distance for any number of rows
# (for 7 rows the positions are exactly the original -1.1, -0.8, 7.5 and 8).
y_top = -.5
y_bottom = (n_rows - 1) + 0.5
row_scale = n_rows / 7
# Create the figure and axes
fig = plt.figure(figsize = (10, 6), facecolor = '#D7D1CF')
ax = plt.subplot(111, facecolor = '#D7D1CF')
# Hide every spine except the left one, which is recoloured
ax.spines["top"].set(visible = False)
ax.spines["right"].set(visible = False)
ax.spines["bottom"].set(visible = False)
ax.spines["left"].set_color('#ACA7A5')
# Grid customization (the grid lines are styled, then switched off on both axes)
ax.grid(which='major', linestyle='-', linewidth='.8', color='#ACA7A5', zorder = -1)
ax.xaxis.grid(False)
ax.yaxis.grid(False)
# Axis labels, limits and ticks
ax.xaxis.set_label_text("Adjusted goals/xG Ratio", size = 10, color = "#000000")
ax.yaxis.set_label_text('', size = 7, color = "#4E616C")
ax.xaxis.set_label_coords(0.8, -0.07)
ax.set_xlim(.5, D.upper_95.max() + 0.01)
# Bottom limit larger than top limit: the first row of D is drawn at the top of the chart
ax.set_ylim(y_bottom, y_top)
ax.xaxis.set_major_locator(ticker.MultipleLocator(.5))
ax.tick_params(axis = 'both', labelsize = 12, color = '#D7D1CF', labelcolor = '#000000')
# Reference line at ratio 1.0 (goals equal to xG)
ax.axvline(x = 1.0, color = '#1976D2', linestyle = '--', linewidth = 2, zorder = 1, alpha=0.5)
# Create the scatter plot
points = ax.scatter(
    D['adj_ratio_mean'],
    D['player_name'],  # Use the player names directly for y-values
    s = D['shots'] * 2.8,  # Marker area grows with the number of shots
    color = '#1565C0',
    edgecolor = '#FFFFFF', linewidths=0.5,
    zorder = 3
)
for _, row in D.iterrows():
    # Error bar from lower_95 to upper_95 (the posterior's 2.5th and 97.5th percentiles),
    # expressed as distances below/above the mean as errorbar expects
    ax.errorbar(
        row['adj_ratio_mean'],
        row['player_name'],
        xerr=[[row['adj_ratio_mean'] - row['lower_95']], [row['upper_95'] - row['adj_ratio_mean']]],
        color='#1565C0', capsize = 5, capthick = 2)
    # Number of shots written inside the marker
    ax.text(
        row['adj_ratio_mean'],
        row['player_name'],
        str(int(row['shots'])),  # The number of shots
        color = '#FFFFFF',
        fontsize = row['shots'] * 0.032,  # Text size proportional to the marker size
        ha = 'center',
        va = 'center',
        zorder = 4
    )
# Add titles and annotations
ax.text(
    x = .3, y = y_top - 0.6 * row_scale,
    s = "Selected players shooting ability",
    color = "#000000",
    size = 20, fontfamily='SourceSansPro-SemiBold')
ax.text(
    x = .3, y = y_top - 0.3 * row_scale,
    s = f"From 20/21 to 24/25 seasons | {PS}s",
    color = "#000000",
    size = 12)
ax.text(
    x = .3, y = y_bottom + 1.0 * row_scale,
    s = f"Players sorted by descending Adjusted G/xG ratio (minimum {threshold} shots).\nAn Adjusted G/xG Ratio of 1.0 indicates a player is performing as expected in terms of goal-scoring efficacy.\nRatios above 1.0 indicate overperformance, while ratios below 1.0 indicate underperformance.",
    color = "#000000",
    size = 9)
ax.text(
    x = .3, y = y_bottom + 1.5 * row_scale,
    s = "Twitter : @gualanodavide | Bluesky : @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com",
    color = "#000000",
    size = 9)
plt.savefig(f"ScoringOverperformance{PS}.png", dpi=300, bbox_inches = "tight")
Explanation¶
Prior Distribution: A gamma distribution is fitted to the raw goals/xG ratios (the `weighted_raw_ratio` column) of players with a substantial number of shots (here, at least 250 shots, the `threshold` value set in the code) to establish a prior distribution. This prior informs the posterior distribution.
Posterior Distribution: The adjusted goals/xG ratio is derived from the posterior distribution, which combines the prior distribution with the player’s observed data (number of goals and expected goals). This Bayesian approach helps to mitigate the effect of small sample sizes and extreme values.
Adjusted Goals/xG Ratio: This is the result of the Bayesian approach, where the posterior distribution of the player’s goals per expected goals is estimated using a gamma distribution.
Scatter Plot Points: The size of each scatter plot point represents the number of shots taken by the player. Larger points indicate more shots, which typically means more reliable estimates of their shooting performance. The position of each point on the x-axis represents the player’s adjusted goals/xG ratio, which is an estimate of their shooting ability after adjusting for variability.
Error Bars: The error bars around each point represent the range of uncertainty in the player’s adjusted ratio. They span the 2.5th to 97.5th percentiles of the simulated posterior distribution, i.e. a 95% interval for the player’s true shooting ability, with wider bars indicating more uncertainty and narrower bars indicating more precision.