# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
from scipy.stats import gamma
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from mplsoccer import FontManager
from adjustText import adjust_text
# Register the Source Sans Pro font files with matplotlib's font manager
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular',
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold',
)
# Put the custom fonts at the front of the list so they take priority
for position, entry in enumerate((fe_regular, fe_semibold)):
    fm.fontManager.ttflist.insert(position, entry)
# Default all text rendering to the regular weight
matplotlib.rcParams['font.family'] = fe_regular.name
# Season file suffixes, oldest to newest
seasons = ["2021", "2122", "2223", "2324", "2425"]

# Event-level expected-pass values, positional clusters and team info
xP = pd.read_csv("xPactions.csv", index_col=0)
positions = pd.read_csv("clustered_position.csv", index_col=0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col=0)

# Load and stack the per-season player/game/action tables into one frame each
players = pd.concat([pd.read_csv(f"players{s}.csv", index_col=0) for s in seasons])
games = pd.concat([pd.read_csv(f"games{s}.csv", index_col=0) for s in seasons])
actions = pd.concat([pd.read_csv(f"actions{s}.csv", index_col=0) for s in seasons])

# Rebuild a globally unique action_id, since per-season ids collide after concat
actions = actions.drop(columns=['action_id']).reset_index(drop=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)

# Attach human-readable SPADL names (type_name, result_name, ...) to each event
actions = spadl.add_names(actions)
def convert_to_minutes(time_str):
    """Convert a 'MM:SS' string into a float number of minutes.

    Seconds become a fractional part of a minute. Values that cannot be
    parsed (NaN, malformed strings) are treated as 0 minutes played.
    """
    try:
        # str() guards against non-string inputs such as float NaN
        minutes_part, seconds_part = str(time_str).split(':')
        return int(minutes_part) + int(seconds_part) / 60
    except (ValueError, AttributeError):
        # NaN or unexpected format: count as zero minutes
        return 0
# Derive minutes played as a float for every player-season row
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)

# Keep only the game columns needed for the join below
games = games[["game_id", "game_date", "competition_id", "season_id"]]

# Build the analysis frame: actions enriched with team, xP, game and player info
df = (
    actions
    .merge(fb, how="left")
    .merge(xP, how="left")
    .merge(games, how="left")
    .merge(players, how="left")
)

# Keep only pass events. Take an explicit copy so later column
# assignments write to an independent frame instead of a slice of `df`
# (avoids pandas' SettingWithCopyWarning).
df1 = df[df["type_name"] == 'pass'].copy()
# Minimum pass volume for a player to enter the analysis
threshold = 2000

# Flag successful passes. Vectorized comparison replaces the per-row
# lambda, and `assign` builds a new frame rather than writing into a
# slice, which avoids pandas' SettingWithCopyWarning.
df1 = df1.assign(successful_passes=(df1['result_name'] == "success").astype(int))

# Aggregate to player level: successful passes, total xP, and pass count
weighted_agg = df1.groupby(["player_name", "player_id"], observed=True).agg({
    'successful_passes': 'sum',
    'xP': 'sum',
    'type_name': 'count'  # counts the number of pass events
}).reset_index()

# Rename the count column for clarity
weighted_agg.rename(columns={'type_name': 'passes'}, inplace=True)

# Clean types and compute the raw successful-passes-per-xP ratio
weighted_agg['successful_passes'] = weighted_agg['successful_passes'].fillna(0).astype(int)
weighted_agg['weighted_raw_ratio'] = weighted_agg['successful_passes'] / weighted_agg['xP']

# Keep players above the pass threshold with at least one success
B = weighted_agg[(weighted_agg['passes'] >= threshold) & (weighted_agg['successful_passes'] > 0)]
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66066/1299763970.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df1['successful_passes'] = df1['result_name'].apply(lambda x: 1 if x == "success" else 0)
# All player names present in the thresholded data
playerlist = B['player_name'].unique().tolist()
# Drop NaNs and sort alphabetically for the search dropdown
cleaned_playerlist = sorted(name for name in playerlist if pd.notna(name))
# simulate_gamma_posterior combines the prior distribution with each
# player's observed data (successful passes and xP) to produce a posterior
# distribution, accounting for uncertainty at smaller sample sizes.
def simulate_gamma_posterior(successes, trials, prior_shape, prior_rate, n_sims=10000, seed=42):
    """
    Sample the posterior distribution of a player's passing skill.

    Gamma-conjugate update: the gamma prior on the success/xP ratio is
    combined with the observed counts to give a gamma posterior, which
    is then sampled by Monte Carlo.

    Args:
        successes: Number of successful passes
        trials: Expected passes (xP)
        prior_shape: Shape parameter of prior gamma distribution
        prior_rate: Rate parameter of prior gamma distribution
        n_sims: Number of simulations
        seed: Seed for a local random generator

    Returns:
        dict: mean, sd and 95% credible bounds of the posterior sample
    """
    # Use a dedicated Generator rather than np.random.seed so this
    # function does not mutate NumPy's global random state as a side
    # effect (calls stay deterministic for a fixed seed).
    rng = np.random.default_rng(seed)
    posterior_shape = prior_shape + successes
    posterior_rate = prior_rate + trials
    # NumPy's gamma takes a scale parameter, hence 1/rate
    posterior_sample = rng.gamma(posterior_shape, 1 / posterior_rate, n_sims)
    return {
        'mean': np.mean(posterior_sample),
        'sd': np.std(posterior_sample),
        'lower_95': np.percentile(posterior_sample, 2.5),   # 2.5th percentile
        'upper_95': np.percentile(posterior_sample, 97.5)   # 97.5th percentile
    }
# Apply the Bayesian adjustment to each player's raw ratio.
# adj_ratio_mean is the best estimate of true passing skill,
# adj_ratio_sd our uncertainty in that estimate.
# In the visualization: x-position = adjusted ratio, point size = number
# of passes (more data = more reliable), error bars = 95% interval.
# A gamma distribution fitted to the observed raw ratios serves as the
# prior informing each player's posterior.
if not B.empty:
    # Fit the gamma prior to the raw ratios, location fixed at 0
    shape, loc, scale = gamma.fit(B['weighted_raw_ratio'], floc=0)
    prior_shape = shape
    prior_rate = 1 / scale
    print(f"Prior shape: {round(prior_shape, 2)}, Prior rate: {round(prior_rate, 2)}")
    # Work on an explicit copy: B is a filtered slice of weighted_agg, and
    # assigning columns into it raises SettingWithCopyWarning otherwise
    B = B.copy()
    # Posterior summary per player, using successful passes and xP
    B['adj_ratio'] = B.apply(
        lambda row: simulate_gamma_posterior(row['successful_passes'], row['xP'], prior_shape, prior_rate),
        axis=1
    )
    # Expand the dict column into mean/sd/lower_95/upper_95 columns
    adj_ratio_df = B['adj_ratio'].apply(pd.Series)
    B = pd.concat([B, adj_ratio_df], axis=1)
    # Rename columns for clarity
    B = B.rename(columns={'mean': 'adj_ratio_mean', 'sd': 'adj_ratio_sd'})

# Sort players by adjusted ratio, best first
C = B.sort_values(by='adj_ratio_mean', ascending=False)
Prior shape: 373.9, Prior rate: 372.67
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_66066/4224291177.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy B['adj_ratio'] = B.apply(
from IPython.display import display, HTML

# One <option> element per player for the searchable datalist
options_html = ''.join(
    f'<option value="{name}">{name}</option>' for name in cleaned_playerlist
)

# Text input backed by a datalist: typing filters the names, and the
# chosen one is echoed in the paragraph below the box
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
{options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
var input = document.getElementById("dropdown").value;
var output = document.getElementById("output");
output.innerHTML = "Selected: " + input;
}}
</script>
"""

# Render the widget in the notebook
display(HTML(dropdown_html))
# Inspect the distinct raw position labels before grouping them
positions['position'].unique()
array(['RCB', nan, 'SS', 'ST', 'GK', 'LW', 'LWB', 'LCB', 'AM', 'RWB',
'RW', 'SV', 'CB', 'AMR', 'DM', 'CM', 'AML'], dtype=object)
# Raw position label -> coarse position group lookup
_POSITION_GROUPS = {
    'RCB': 'CB', 'LCB': 'CB', 'CB': 'CB',
    'RWB': 'WB', 'LWB': 'WB',
    'CM': 'CDM', 'DM': 'CDM', 'SV': 'CDM',
    'LW': 'AMW', 'AM': 'AMW', 'RW': 'AMW', 'AMR': 'AMW', 'AML': 'AMW',
    'SS': 'ST', 'ST': 'ST',
    'GK': 'GK',
}

def map_position_group(pos):
    """Return the coarse group for a raw position label, 'Other' if unknown."""
    # NaN / unseen labels fall through to the 'Other' default
    return _POSITION_GROUPS.get(pos, 'Other')
# Attach the coarse grouping to every row, then sanity-check the groups
positions['position_group'] = positions['position'].map(map_position_group)
positions.position_group.unique()
array(['CB', 'Other', 'ST', 'GK', 'AMW', 'WB', 'CDM'], dtype=object)
# For every player, collect the unique position groups covered across
# their seasons into a single comma-separated 'position' column
pl_positions = (
    positions
    .groupby(["player_name", "player_id"], observed=True)['position_group']
    .unique()
    .reset_index(name='position')
)
Cc = C.merge(pl_positions)
Cc['position'] = Cc['position'].apply(', '.join)

# Position group under analysis
PS = 'ST'

# Optional explicit shortlist of players (not applied in the filter below)
player_names = ['Alessandro Bastoni', 'Nathan Aké', 'Iñigo Martínez', 'Bremer', 'Marquinhos', 'Jurriën Timber', 'Lisandro Martínez']

# Top seven players whose only recorded position group matches PS
Cd = Cc[Cc['position'] == PS].head(7)
Cd
| player_name | player_id | successful_passes | xP | passes | weighted_raw_ratio | adj_ratio | adj_ratio_mean | adj_ratio_sd | lower_95 | upper_95 | position | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 160 | Mateo Cassierra | 322162.0 | 1936 | 1788.492798 | 2345 | 1.082476 | {'mean': 1.069254420815771, 'sd': 0.0220609701... | 1.069254 | 0.022061 | 1.025946 | 1.112597 | ST |
| 778 | Karim Benzema | 14296.0 | 3990 | 3813.206210 | 4666 | 1.046364 | {'mean': 1.0428412539843843, 'sd': 0.015658889... | 1.042841 | 0.015659 | 1.012019 | 1.073558 | ST |
| 915 | Jonathan David | 383855.0 | 3077 | 2950.237217 | 3781 | 1.042967 | {'mean': 1.0388671678709405, 'sd': 0.017541573... | 1.038867 | 0.017542 | 1.004371 | 1.073307 | ST |
| 965 | Abel Ruiz | 372828.0 | 1810 | 1733.096621 | 2230 | 1.044373 | {'mean': 1.0375351928937395, 'sd': 0.022015321... | 1.037535 | 0.022015 | 0.994329 | 1.080800 | ST |
| 1078 | Ludovic Ajorque | 234364.0 | 2537 | 2440.691336 | 3583 | 1.039460 | {'mean': 1.0350480327482996, 'sd': 0.019029059... | 1.035048 | 0.019029 | 0.997655 | 1.072435 | ST |
| 1160 | Ollie Watkins | 148503.0 | 2323 | 2239.159097 | 3128 | 1.037443 | {'mean': 1.0329634546765005, 'sd': 0.019729725... | 1.032963 | 0.019730 | 0.994208 | 1.071740 | ST |
| 1263 | Serhou Guirassy | 236506.0 | 2221 | 2146.539201 | 2820 | 1.034689 | {'mean': 1.0304441305418732, 'sd': 0.020064651... | 1.030444 | 0.020065 | 0.991038 | 1.069886 | ST |
# Assuming C is the DataFrame with adjusted ratios and player_name
# Filter the top 10 players by adjusted goal-scoring ratio mean
# NOTE(review): Cd was already cut to 7 rows above, so head(20) is a no-op safeguard here
D = Cd.head(20)
# Create the scatter plot with error bars showing 95% confidence interval
fig = plt.figure(figsize = (10, 6), facecolor = '#D7D1CF')
ax = plt.subplot(111, facecolor = '#D7D1CF')
# Remove the bottom spine of the x-axis
ax.spines["top"].set(visible = False)
ax.spines["right"].set(visible = False)
ax.spines["bottom"].set(visible = False)
ax.spines["left"].set_color('#ACA7A5')
# Grid customization (both axis grids are switched off just below)
ax.grid(which='major', linestyle='-', linewidth='.8', color='#ACA7A5', zorder = -1)
ax.xaxis.grid(False)
ax.yaxis.grid(False)
# Add labels and title
ax.xaxis.set_label_text("Adjusted Successful Passes/xP Ratio", size = 10, color = "#000000")
ax.yaxis.set_label_text('', size = 7, color = "#4E616C")
ax.xaxis.set_label_coords(0.8, -0.07)
# Invert the y-axis so the best-ranked player appears at the top
ax.invert_yaxis()
ax.set_xlim(.5, D.upper_95.max() + 0.01)
ax.set_ylim( (len(D) - 1) + 0.5, -.5)
ax.xaxis.set_major_locator(ticker.MultipleLocator(.5))
ax.tick_params(axis = 'both', labelsize = 12, color = '#D7D1CF', labelcolor = '#000000')
# Dashed reference line at 1.0 = performing exactly as expected by xP
plt.axvline(x = 1.0, color = '#1976D2', linestyle = '--', linewidth = 2, zorder = 1, alpha=0.5)
# Create the scatter plot
points = plt.scatter(
    D['adj_ratio_mean'],
    D['player_name'], # Use the player names directly for y-values
    s = D['passes'] * 0.15, # Adjust size multiplier as needed
    color = '#1565C0',
    edgecolor = '#FFFFFF', linewidths=0.5,
    zorder = 3
)
# Add error bars for 95% confidence interval (2 standard deviations) - use #D32F2F color for specific players
for index, row in D.iterrows():
    ax.errorbar(
        row['adj_ratio_mean'],
        row['player_name'],
        # asymmetric extents taken from the posterior 2.5/97.5 percentiles
        xerr=[[row['adj_ratio_mean'] - row['lower_95']], [row['upper_95'] - row['adj_ratio_mean']]],
        color='#1565C0', capsize = 5, capthick = 2)
# Annotate the scatter points with the number of shots
for index, row in D.iterrows():
    plt.text(
        row['adj_ratio_mean'],
        row['player_name'],
        str(int(row['passes'])), # The number of shots
        color = '#FFFFFF',
        fontsize = row['passes'] * 0.002, # Adjust the text size proportional to the scatter size
        ha = 'center',
        va = 'center',
        zorder = 4
    )
# Add titles and annotations
ax.text(
    x = .4, y = -1.1,
    s = "Selected players passing ability",
    color = "#000000",
    size = 20, fontfamily='SourceSansPro-SemiBold')
ax.text(
    x = .4, y = -0.8,
    s = f"From 20/21 to 24/25 seasons | {PS}s",
    color = "#000000",
    size = 12)
# Methodology note below the chart
ax.text(
    x = .4, y = 7.5,
    s = f"Players sorted by descending Adjusted Successful Passes/xP ratio (minimum {threshold} passes).\nAn Adjusted Ratio of 1.0 indicates a player is performing as expected in terms of passing efficacy.\nRatios above 1.0 indicate overperformance, while ratios below 1.0 indicate underperformance.",
    color = "#000000",
    size = 9)
# Author / social credit line
ax.text(
    x = .4, y = 8,
    s = "Twitter : @gualanodavide | Bluesky : @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com",
    color = "#000000",
    size = 9)
# Export at print resolution, trimming surplus whitespace
plt.savefig(f"PassingOverperformance{PS}.png", dpi=300, bbox_inches = "tight")
Explanation¶
Prior Distribution: A gamma distribution is fitted to the weighted raw ratios of players with a substantial number of passes (here, a minimum of 2000 passes) to establish a prior distribution. This helps to inform the posterior distribution.
Posterior Distribution: The adjusted Successful Passes/xP ratio is derived from the posterior distribution, which combines the prior distribution with the player’s observed data (number of successful passes and expected passes). This Bayesian approach helps to mitigate the effect of small sample sizes and extreme values.
Adjusted Successful Passes/xP Ratio: This is the result of the Bayesian approach, where the posterior distribution of the player’s successful passes per expected passes (Successful Passes/xP) is estimated using a gamma distribution.
Scatter Plot Points: The size of each scatter plot point represents the number of passes done by the player. Larger points indicate more passes, which typically means more reliable estimates of their passing performance. The position of each point on the x-axis represents the player’s adjusted Successful Passes/xP ratio, which is an estimate of their passing ability after adjusting for variability.
Error Bars: The error bars around each point represent the range of uncertainty in the player's adjusted Successful Passes/xP ratio. They are derived from the standard deviation of the posterior distribution. These bars indicate the confidence interval for the player's true passing ability, with wider bars indicating more uncertainty and narrower bars indicating more precision. They are all set to represent a 95% confidence interval.